From 9ba40dc0ffd3e35ab0486510d34099ecaeb70e7b Mon Sep 17 00:00:00 2001
From: Balazs Gerofi
Date: Mon, 10 Aug 2015 12:37:12 +0900
Subject: [PATCH] schedule(): hold runq lock for the entire duration of
 context switching

Releasing the runq lock after loading page tables but before the actual
context switch can leave execution in an inconsistent state if the
current process is descheduled by an IRQ between these two steps.
This patch holds the runq lock with IRQs disabled and makes the context
switch a single atomic operation.
---
 arch/x86/kernel/cpu.c       | 12 ++++++++++++
 arch/x86/kernel/interrupt.S |  1 +
 kernel/include/cls.h        |  1 +
 kernel/process.c            | 25 ++++++++++++++++++++-----
 4 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c
index 06860336..3890217d 100644
--- a/arch/x86/kernel/cpu.c
+++ b/arch/x86/kernel/cpu.c
@@ -1012,6 +1012,18 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
 }
 
 extern char enter_user_mode[];
+
+/*
+ * Release runq_lock before entering user space.
+ * This is needed because schedule() holds the runq lock throughout
+ * the context switch and when a new process is created it starts
+ * execution in enter_user_mode, which in turn calls this function.
+ */
+void release_runq_lock(void)
+{
+	ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)),
+			cpu_local_var(runq_irqstate));
+}
 
 /*@
 @ requires \valid(ctx);
diff --git a/arch/x86/kernel/interrupt.S b/arch/x86/kernel/interrupt.S
index f66c13eb..760c2722 100644
--- a/arch/x86/kernel/interrupt.S
+++ b/arch/x86/kernel/interrupt.S
@@ -206,6 +206,7 @@ x86_syscall:
 
 .globl enter_user_mode
 enter_user_mode:
+	callq	release_runq_lock
 	movq	$0, %rdi
 	movq	%rsp, %rsi
 	call	check_signal
diff --git a/kernel/include/cls.h b/kernel/include/cls.h
index 774443fe..9a735d32 100644
--- a/kernel/include/cls.h
+++ b/kernel/include/cls.h
@@ -46,6 +46,7 @@ struct cpu_local_var {
 	struct process_vm idle_vm;
 
 	ihk_spinlock_t runq_lock;
+	unsigned long runq_irqstate;
 	struct process *current;
 	struct list_head runq;
 	size_t runq_len;
diff --git a/kernel/process.c b/kernel/process.c
index e24fb97d..116baa3f 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -2062,6 +2062,12 @@ static void do_migrate(void);
 static void idle(void)
 {
 	struct cpu_local_var *v = get_this_cpu_local_var();
+
+	/* Release runq_lock before starting the idle loop.
+	 * See comments at release_runq_lock().
+	 */
+	ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)),
+			cpu_local_var(runq_irqstate));
 
 	if(v->status == CPU_STATUS_RUNNING)
 		v->status = CPU_STATUS_IDLE;
@@ -2236,7 +2242,6 @@ void schedule(void)
 	struct cpu_local_var *v;
 	struct process *next, *prev, *proc, *tmp = NULL;
 	int switch_ctx = 0;
-	unsigned long irqstate;
 	struct process *last;
 
 	if (cpu_local_var(no_preempt)) {
@@ -2250,7 +2255,8 @@
 	}
 
 redo:
-	irqstate = ihk_mc_spinlock_lock(&(get_this_cpu_local_var()->runq_lock));
+	cpu_local_var(runq_irqstate) =
+		ihk_mc_spinlock_lock(&(get_this_cpu_local_var()->runq_lock));
 
 	v = get_this_cpu_local_var();
 	next = NULL;
@@ -2337,14 +2343,22 @@ redo:
 		/* Set up new TLS.. */
 		do_arch_prctl(ARCH_SET_FS, next->thread.tlsblock_base);
 
-		ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate);
-
 		if (prev) {
 			last = ihk_mc_switch_context(&prev->ctx, &next->ctx, prev);
 		}
 		else {
 			last = ihk_mc_switch_context(NULL, &next->ctx, prev);
 		}
+
+		/*
+		 * We must hold the lock throughout the context switch, otherwise
+		 * an IRQ could deschedule this process between page table loading and
+		 * context switching and leave the execution in an inconsistent state.
+		 * Since we may be migrated to another core meanwhile, we refer
+		 * directly to cpu_local_var.
+		 */
+		ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)),
+				cpu_local_var(runq_irqstate));
 
 		/* Have we migrated to another core meanwhile? */
 		if (v != get_this_cpu_local_var()) {
@@ -2358,7 +2372,8 @@ redo:
 		}
 	}
 	else {
-		ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate);
+		ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)),
+				cpu_local_var(runq_irqstate));
 	}
 }
 
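
The locking discipline above is a hand-off: the path that enters
schedule() takes the runq lock, and whichever code runs next on that
CPU releases it: after ihk_mc_switch_context() returns (an already
running process), via release_runq_lock() from enter_user_mode (a newly
created process), or at the top of idle() (a freshly booted core). The
sketch below mirrors that hand-off in user space with <ucontext.h> and
a pthread spinlock. It is a minimal illustration only, not McKernel
code: the names toy_entry/toy_schedule are made up, and it assumes a
Linux/glibc system where the (obsolescent but still provided) ucontext
functions are available.

/* toy_switch.c: user-space sketch of the lock hand-off across a
 * context switch. Hypothetical names throughout.
 * Build: cc toy_switch.c -o toy_switch -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <ucontext.h>

static pthread_spinlock_t runq_lock;	/* stands in for v->runq_lock */
static ucontext_t sched_ctx, task_ctx;

/* Entry point of the "new process": its first duty is to drop the
 * lock taken by the scheduler, just as enter_user_mode calls
 * release_runq_lock() in the patch. */
static void toy_entry(void)
{
	pthread_spin_unlock(&runq_lock);
	printf("new context running, lock released\n");

	/* Retake the lock before switching away, so the switch back to
	 * the scheduler is covered by the lock as well. */
	pthread_spin_lock(&runq_lock);
}	/* returning resumes sched_ctx via uc_link */

static void toy_schedule(void)
{
	/* Hold the lock across the entire switch: there is no window in
	 * which the run queue can be seen in a half-switched state. */
	pthread_spin_lock(&runq_lock);
	swapcontext(&sched_ctx, &task_ctx);

	/* We run again only once the other side switched back while
	 * holding the lock; releasing it is now our job. */
	pthread_spin_unlock(&runq_lock);
}

int main(void)
{
	static char stack[64 * 1024];

	pthread_spin_init(&runq_lock, PTHREAD_PROCESS_PRIVATE);

	getcontext(&task_ctx);
	task_ctx.uc_stack.ss_sp = stack;
	task_ctx.uc_stack.ss_size = sizeof(stack);
	task_ctx.uc_link = &sched_ctx;	/* resume here when toy_entry ends */
	makecontext(&task_ctx, toy_entry, 0);

	toy_schedule();
	printf("back in scheduler context\n");
	return 0;
}

Run on Linux, the sketch prints its two messages in order; the property
it demonstrates is that the lock is never free while a switch is in
flight, which is exactly the window this patch closes in schedule().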