McKernel blocks forever waiting for mcklogd to retrieve kmsg when the kmsg buffer is full with the boot log and mcklogd isn't running.
280 lines
13 KiB
Bash
280 lines
13 KiB
Bash
#!/bin/bash

# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
# Note that the script does not output anything unless an error occurs.

# Installation paths; the @...@ placeholders are substituted at configure time.
prefix="@prefix@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"

# Default resources: overridable with -m and -c.
mem="512M@0"
cpus=""

# mcklogd defaults: overridable with -i, -k and -f.
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"

# Owner applied to the McKernel device files after boot: overridable with -o.
chown_option=$(logname 2> /dev/null)

# Remember whether either irqbalance unit is currently active so that it can
# be stopped before boot and restarted afterwards.
if systemctl status irqbalance_mck.service 2> /dev/null | grep -qE 'Active: active' \
  || systemctl status irqbalance.service 2> /dev/null | grep -qE 'Active: active'; then
  irqbalance_used="yes"
else
  irqbalance_used="no"
fi

#######################################
# Parse the script's command-line options.
# Globals written: facility (-f), chown_option (-o), INTERVAL (-i),
#                  LOGMODE (-k), cpus (-c), mem (-m)
# Arguments: the script's positional parameters
# Exits with status 1 (message on stderr) on an unknown option, a missing
# option argument, a non-positive or non-numeric -i value, or a -k value
# outside 0..2.
#######################################
parse_options() {
  local OPT OPTARG OPTIND=1

  # Leading ':' selects silent error reporting: getopts then sets OPT to '?'
  # (unknown option) or ':' (missing argument) and puts the offending option
  # letter in OPTARG.
  while getopts :i:k:c:m:o:f: OPT; do
    case ${OPT} in
      f) facility=${OPTARG} ;;
      o) chown_option=${OPTARG} ;;
      i)
        INTERVAL=${OPTARG}
        case ${INTERVAL} in
          '' | *[!0-9]*)
            echo "invalid -i value" >&2
            exit 1
            ;;
        esac
        if [ "${INTERVAL}" -le 0 ]; then
          echo "invalid -i value" >&2
          exit 1
        fi
        ;;
      k)
        LOGMODE=${OPTARG}
        case ${LOGMODE} in
          '' | *[!0-9]*)
            echo "invalid -k value" >&2
            exit 1
            ;;
        esac
        if [ "${LOGMODE}" -gt 2 ]; then
          echo "invalid -k value" >&2
          exit 1
        fi
        ;;
      c) cpus=${OPTARG} ;;
      m) mem=${OPTARG} ;;
      :)
        echo "option -${OPTARG} requires an argument" >&2
        exit 1
        ;;
      *)
        echo "invalid option -${OPTARG}" >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"

# Value passed to ihk-smp-x86.ko as its ihk_ikc_irq_core module parameter.
ihk_ikc_irq_core=0

#######################################
# Print major*65536 + minor*256 + patch for a kernel release string.
# Arguments: $1 - release string as printed by `uname -r`
# A component that cannot be extracted is treated as 0.
#######################################
linux_version_code_of() {
  local rel=$1 major minor patch
  major=$(echo "${rel}" | sed -e 's/^\([0-9]*\).*/\1/')
  minor=$(echo "${rel}" | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
  patch=$(echo "${rel}" | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
  echo $(( ${major:-0} * 65536 + ${minor:-0} * 256 + ${patch:-0} ))
}

#######################################
# Print the number that follows "X.Y.Z-" in a kernel release string, or an
# empty string when the release has no such suffix.
# Arguments: $1 - release string as printed by `uname -r`
#######################################
rhel_release_of() {
  local rel=$1 rhel
  rhel=$(echo "${rel}" | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
  # sed echoes its input unchanged when the pattern does not match.
  if [ "${rel}" == "${rhel}" ]; then rhel=""; fi
  echo "${rhel}"
}

release=$(uname -r)
linux_version_code=$(linux_version_code_of "${release}")
rhel_release=$(rhel_release_of "${release}")

# mcoverlayfs is enabled only when requested at configure time and the running
# kernel is either a 4.0.x kernel without a release suffix
# (262144 <= code < 262400) or a 3.10.0 kernel (code 199168) whose release
# suffix is at least 327.
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
  if [ "${rhel_release}" == "" ]; then
    if [ "${linux_version_code}" -ge 262144 ] && [ "${linux_version_code}" -lt 262400 ]; then
      enable_mcoverlay="yes"
    fi
  else
    if [ "${linux_version_code}" -eq 199168 ] && [ "${rhel_release}" -ge 327 ]; then
      enable_mcoverlay="yes"
    fi
  fi
fi

# When no CPU list was given with -c, pick the second half of the CPUs that
# lscpu reports on NUMA node 0.
if [ "$cpus" == "" ]; then
  # CPU ids whose Node column is 0. Header lines start with '#' and are
  # skipped explicitly: awk treats a nonexistent 4th field as equal to 0, so
  # short header lines would otherwise be counted as node-0 CPUs.
  node0_cpus=$(lscpu --parse | awk -F"," '!/^#/ && $4 == 0 { print $1 }')

  # Number of CPUs on NUMA node 0, halved
  nr_cpus=$(printf '%s\n' "${node0_cpus}" | grep -c .)
  nr_cpus=$(( nr_cpus / 2 ))

  # Use the second half of the cores, joined with commas
  cpus=$(printf '%s\n' "${node0_cpus}" | tail -n "${nr_cpus}" | xargs echo -n | sed 's/ /,/g')
  if [ "$cpus" == "" ]; then
    echo "error: no available CPUs on NUMA node 0?" >&2
    exit 1
  fi
fi

# Remove mcoverlay if loaded: lazily unmount everything below /tmp/mcos
# (innermost mounts first), delete the directory and unload the module.
if [ "$enable_mcoverlay" == "yes" ]; then
  if lsmod | grep -q mcoverlay; then
    for mcos_mount in /tmp/mcos/mcos0_sys /tmp/mcos/mcos0_proc /tmp/mcos/linux_proc /tmp/mcos; do
      if grep -q -- "${mcos_mount}" /proc/mounts; then umount -l "${mcos_mount}"; fi
    done
    if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
    if ! rmmod mcoverlay; then
      echo "error: removing mcoverlay" >&2
      exit 1
    fi
  fi
fi

# Stop irqbalance. The result of stopping irqbalance_mck.service is ignored;
# a failure to stop irqbalance.service aborts the script.
if [ "${irqbalance_used}" == "yes" ]; then
  systemctl stop irqbalance_mck.service 2> /dev/null
  if ! systemctl stop irqbalance.service 2> /dev/null; then
    echo "error: stopping irqbalance" >&2
    exit 1
  fi
fi

# Start mcklogd. Note that with '-k 1' McKernel blocks when the kmsg buffer is
# full until mcklogd unblocks it, so starting mcklogd must precede booting
# McKernel.
if [ "${LOGMODE}" -ne 0 ]; then
  # Stop any mcklogd that survived a McKernel shutdown because
  # mcstop+release.sh was not used
  pkill mcklogd
  SBINDIR=${SBINDIR} "${SBINDIR}/mcklogd" -i "${INTERVAL}" -f "${facility}"
fi

# Load IHK if not loaded (any lsmod line containing "ihk" counts as loaded)
if ! lsmod | grep -q ihk; then
  if ! insmod "${KMODDIR}/ihk.ko"; then
    echo "error: loading ihk" >&2
    exit 1
  fi
fi

# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches

# Load IHK-SMP if not loaded and reserve CPUs and memory
if ! lsmod | grep -q ihk_smp_x86; then
  # Numeric IRQs currently listed in /proc/interrupts, one per line.
  used_irqs=$(awk '/:/ { print $1 }' /proc/interrupts | grep -o '[0-9]*')

  # Pick the first IRQ in 64..255 that has no /proc/irq entry and does not
  # appear in /proc/interrupts. The match is a whole-line comparison against
  # the expanded value of $i.
  ihk_irq=""
  for i in $(seq 64 255); do
    if [ ! -d "/proc/irq/$i" ] && ! grep -qx -- "$i" <<< "${used_irqs}"; then
      ihk_irq=$i
      break
    fi
  done
  if [ "$ihk_irq" == "" ]; then
    echo "error: no IRQ available" >&2
    exit 1
  fi

  if ! insmod "${KMODDIR}/ihk-smp-x86.ko" ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then
    echo "error: loading ihk-smp-x86" >&2
    exit 1
  fi
  if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
    echo "error: reserving CPUs" >&2
    exit 1
  fi
  if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
    echo "error: reserving memory" >&2
    exit 1
  fi
# If loaded, but no resources allocated, get CPUs and memory
else
  # NOTE(review): the CPU guard below queries ihkconfig while the value is
  # read through ihkosctl, and the memory guard uses ihkosctl for both;
  # confirm which tool is intended when no OS instance exists yet.
  if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
    echo "error: querying cpus" >&2
    exit 1
  fi
  cpus_allocated=$(${SBINDIR}/ihkosctl 0 query cpu)
  if [ "$cpus_allocated" == "" ]; then
    if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
      echo "error: reserving CPUs" >&2
      exit 1
    fi
  fi

  if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then
    echo "error: querying memory" >&2
    exit 1
  fi
  mem_allocated=$(${SBINDIR}/ihkosctl 0 query mem)
  if [ "$mem_allocated" == "" ]; then
    if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
      echo "error: reserving memory" >&2
      exit 1
    fi
  fi
fi

# Load mcctrl if not loaded
if ! lsmod | grep -q mcctrl; then
  if ! insmod "${KMODDIR}/mcctrl.ko"; then
    echo "error: inserting mcctrl.ko" >&2
    exit 1
  fi
fi

# Decide which CPUs and memory the new OS instance gets. Each query is run
# twice: once to check its exit status, once to capture its output.
if [ -c /dev/mcos0 ]; then
  # An OS instance already exists: query its CPU cores and memory so that the
  # same values are used as previously, then destroy it.
  if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then
    echo "error: querying cpus" >&2
    exit 1
  fi
  cpus=$(${SBINDIR}/ihkosctl 0 query cpu)
  if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then
    echo "error: querying memory" >&2
    exit 1
  fi
  mem=$(${SBINDIR}/ihkosctl 0 query mem)

  # A failed destroy is reported but does not abort the script.
  if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
  # Otherwise query IHK-SMP for resources
  if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
    echo "error: querying cpus" >&2
    exit 1
  fi
  cpus=$(${SBINDIR}/ihkconfig 0 query cpu)
  if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
    echo "error: querying memory" >&2
    exit 1
  fi
  mem=$(${SBINDIR}/ihkconfig 0 query mem)
fi

# Create OS instance 0, assign its resources, load and boot the kernel image,
# then hand the device files to ${chown_option}. Every failure exits with
# status 1.
if ! ${SBINDIR}/ihkconfig 0 create; then
  echo "error: create" >&2
  exit 1
fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
  echo "error: assign CPUs" >&2
  exit 1
fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
  echo "error: assign memory" >&2
  exit 1
fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
  echo "error: loading kernel image" >&2
  exit 1
fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then
  echo "error: setting kernel arguments" >&2
  exit 1
fi
if ! ${SBINDIR}/ihkosctl 0 boot; then
  echo "error: booting" >&2
  exit 1
fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
  echo "error: chowning device files" >&2
  exit 1
fi

# Build the overlay views of /proc and /sys for OS instance 0 under /tmp/mcos
# and remove the CPU, node and memory entries that the McKernel sysfs tree
# does not provide.
if [ "$enable_mcoverlay" == "yes" ]; then
  # sysfs tree exported for OS instance 0, and the overlay mounted on top of it
  mcos_sys=/sys/devices/virtual/mcos/mcos0/sys
  overlay_sys=/tmp/mcos/mcos0_sys

  if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
  if ! mount -t tmpfs tmpfs /tmp/mcos; then
    echo "error: mount /tmp/mcos" >&2
    exit 1
  fi
  if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
  if ! mount --bind /proc /tmp/mcos/linux_proc; then
    echo "error: mount /tmp/mcos/linux_proc" >&2
    exit 1
  fi
  if ! insmod "${KMODDIR}/mcoverlay.ko"; then
    echo "error: inserting mcoverlay.ko" >&2
    exit 1
  fi

  # Wait (without timeout) until /proc/mcos0 appears
  while [ ! -e /proc/mcos0 ]; do
    sleep 1
  done
  if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
  if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
  if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
  if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
    echo "error: mount /tmp/mcos/mcos0_proc" >&2
    exit 1
  fi
  mount --make-rprivate /proc

  # Wait (without timeout) until the setup_complete marker appears
  while [ ! -e "${mcos_sys}/setup_complete" ]; do
    sleep 1
  done
  if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
  if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
  if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
  if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
    echo "error: mount /tmp/mcos/mcos0_sys" >&2
    exit 1
  fi
  mount --make-rprivate /sys
  rm -rf "${overlay_sys}/setup_complete"

  # Per-CPU directories: drop CPUs missing from the McKernel tree, and for
  # the remaining ones drop node links that are missing there.
  for cpuid in $(find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "); do
    if [ ! -e "${mcos_sys}/devices/system/cpu/$cpuid" ]; then
      rm -rf "${overlay_sys}/devices/system/cpu/$cpuid"
    else
      for nodeid in $(find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "); do
        if [ ! -e "${mcos_sys}/devices/system/cpu/$cpuid/$nodeid" ]; then
          rm -f "${overlay_sys}/devices/system/cpu/$cpuid/$nodeid"
        fi
      done
    fi
  done

  # Per-node directories: drop nodes missing from the McKernel tree, and for
  # the remaining ones drop missing CPU links and all memory* entries.
  for nodeid in $(find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "); do
    if [ ! -e "${mcos_sys}/devices/system/node/$nodeid" ]; then
      rm -rf "${overlay_sys}/devices/system/node/$nodeid"
    else
      # Delete non-existent symlinks
      for cpuid in $(find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "); do
        if [ ! -e "${mcos_sys}/devices/system/node/$nodeid/$cpuid" ]; then
          rm -f "${overlay_sys}/devices/system/node/$nodeid/$cpuid"
        fi
      done

      rm -f "${overlay_sys}/devices/system/node/$nodeid"/memory*
    fi
  done

  # CPU bus devices: drop CPUs missing from the McKernel tree
  for cpuid in $(find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "); do
    if [ ! -e "${mcos_sys}/bus/cpu/devices/$cpuid" ]; then
      rm -rf "${overlay_sys}/bus/cpu/devices/$cpuid"
    fi
  done
fi

#######################################
# Convert a CPU list into a hexadecimal affinity mask.
# Arguments: $1 - comma-separated CPU ids and/or ranges (e.g. "4-7,12")
#            $2 - total number of CPUs; determines the mask width
# Outputs:   comma-separated 32-bit words, most significant word first.
#            Lower words are printed with 8 hex digits; the top word is
#            printed with as many digits as are needed to cover the CPUs it
#            holds (8 when the CPU count is a multiple of 32).
#######################################
cpu_list_to_affinity_mask() {
  local cpu_list=$1 ncpus=$2
  local nwords=$(( (ncpus + 31) / 32 ))
  local top_bits=$(( ncpus % 32 ))
  local -a words=()
  local token first last cpu w digits

  local IFS=,
  for token in ${cpu_list}; do
    first=${token%%-*}
    last=${token##*-}
    for (( cpu = first; cpu <= last; cpu++ )); do
      words[cpu / 32]=$(( ${words[cpu / 32]:-0} | (1 << (cpu % 32)) ))
    done
  done

  for (( w = nwords - 1; w >= 0; w-- )); do
    if (( w != nwords - 1 )); then printf ','; fi
    digits=8
    if (( w == nwords - 1 && top_bits != 0 )); then
      # Round up so a partially used nibble is not dropped
      digits=$(( (top_bits + 3) / 4 ))
    fi
    printf '%0*x' "${digits}" $(( ${words[w]:-0} & ((1 << (digits * 4)) - 1) ))
  done
}

# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
  # Save every /proc/irq/*/smp_affinity file under ${ETCDIR}, keeping the
  # proc/irq/<n>/ directory layout.
  if ! etcdir="${ETCDIR}" perl -e '
    use File::Copy qw(copy);
    $etcdir = $ENV{etcdir};
    @files = grep { -f } glob "/proc/irq/*/smp_affinity";
    foreach $file (@files) {
      $rel = substr($file, 1);
      $dir = substr($rel, 0, length($rel) - length("/smp_affinity"));
      if (system("mkdir -p $etcdir/$dir")) { exit 1; }
      if (!copy($file, "$etcdir/$rel")) { exit 1; }
    }'; then
    echo "error: saving /proc/irq/*/smp_affinity" >&2
    exit 1
  fi

  ncpus=$(lscpu | grep -E '^CPU\(s\):' | awk '{print $2}')
  smp_affinity_mask=$(cpu_list_to_affinity_mask "${cpus}" "${ncpus}")

  # For every IRQ whose current affinity overlaps the McKernel CPUs, write
  # the inverse of the McKernel mask. Write errors are discarded.
  if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '
    @dirs = grep { -d } glob "/proc/irq/*";
    @int32strs_mask = split /,/, $ENV{smp_affinity_mask};
    $nint32s = int(($ENV{ncpus} + 31) / 32);
    $rem = $ENV{ncpus} % 32;
    $len = $rem ? int(($rem + 3) / 4) : 8;
    for ($i = 0; $i <= $#int32strs_mask; $i++) {
      $int32strs_inv[$i] = sprintf("%08x", hex($int32strs_mask[$i]) ^ 0xffffffff);
      if ($i == 0) { $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); }
    }
    $inv = join(",", @int32strs_inv);
    foreach $dir (@dirs) {
      $hit = 0;
      $affinity_str = `cat $dir/smp_affinity`;
      chomp $affinity_str;
      @int32strs = split /,/, $affinity_str;
      for ($j = $nint32s - 1; $j >= 0; $j--) {
        if (hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }
      }
      if ($hit == 1) { system "echo $inv > $dir/smp_affinity 2>/dev/null"; }
    }'; then
    echo "error: modifying /proc/irq/*/smp_affinity" >&2
    exit 1
  fi

  # IRQ number(s) of the /proc/interrupts lines that end in "IHK-SMP"
  banirq=$(cat /proc/interrupts | perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}')

  # Fill in the irqbalance_mck template and start the unit
  sed -e "s/%mask%/$smp_affinity_mask/g" -e "s/%banirq%/$banirq/g" \
    "$ETCDIR/irqbalance_mck.in" > "$ETCDIR/irqbalance_mck"
  if ! systemctl link "$ETCDIR/irqbalance_mck.service" > /dev/null 2> /dev/null; then
    echo "error: linking irqbalance_mck" >&2
    exit 1
  fi
  if ! systemctl start irqbalance_mck.service 2> /dev/null; then
    echo "error: starting irqbalance_mck" >&2
    exit 1
  fi
  # echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi
|