Files
mckernel/arch/x86/tools/mcreboot-smp-x86.sh.in
e29005 0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when
kmsg bufer is full with boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00

280 lines
13 KiB
Bash

#!/bin/bash
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
else
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
;;
o) chown_option=${OPTARG}
;;
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value" >&2
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value" >&2
exit 1
fi
;;
k) LOGMODE=${OPTARG}
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
;;
c) cpus=${OPTARG}
;;
m) mem=${OPTARG}
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
ihk_ikc_irq_core=0
release=`uname -r`
major=`echo ${release} | sed -e 's/^\([0-9]*\).*/\1/'`
minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
fi
fi
fi
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?" >&2; exit 1; fi
fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay" >&2; exit 1; fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
fi
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
# with '-k 1' until mcklogd unblocks it so starting mcklogd must preceeds
# booting McKernel
if [ ${LOGMODE} -ne 0 ]
then
# Stop mcklogd which has survived McKernel shutdown because mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available" >&2; exit 1; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86" >&2; exit 1; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko" >&2; exit 1; fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create" >&2; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting" >&2; exit 1; fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then echo "error: chowning device files" >&2; exit 1; fi
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos" >&2; exit 1; fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko" >&2; exit 1; fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc" >&2; exit 1; fi
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys" >&2; exit 1; fi
mount --make-rprivate /sys
rm -rf /tmp/mcos/mcos0_sys/setup_complete
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi