From 567dcd384698a51d1313872cacd8632b65314e07 Mon Sep 17 00:00:00 2001
From: Masamichi Takagi
Date: Mon, 3 Sep 2018 13:54:40 +0900
Subject: [PATCH] Fix deadlock involving mmap_sem and memory_range_lock

set_host_vma() forwards __NR_mprotect to the host while holding
memory_range_lock, and remote page fault handling ends up back in
do_page_fault_process_vm(), which also takes memory_range_lock, so the
two paths can deadlock across mmap_sem and memory_range_lock.

Fix this by recording in vm->is_memory_range_lock_taken that the lock
is already held while set_host_vma() forwards the call, and by letting
do_page_fault_process_vm() skip taking the lock in that case. This is
safe because other writers are warded off until the remote page fault
handling code releases mmap_sem, so vm_range stays consistent.

Change-Id: I187246271163e708af6542c057d0a8dfde5b211e
Fujitsu: TEMP_FIX_1
Refs: #986
---
 kernel/include/process.h |  2 ++
 kernel/include/syscall.h |  2 +-
 kernel/process.c         | 25 +++++++++++++++++++++---
 kernel/syscall.c         | 42 ++++++++++++++++++++++++++--------------
 4 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/kernel/include/process.h b/kernel/include/process.h
index 5e9499ab..6e4a0912 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -715,6 +715,8 @@ struct process_vm {
 	// 2. addition of process page table (allocate_pages, update_process_page_table)
 	// note that physical memory allocator (ihk_mc_alloc_pages, ihk_pagealloc_alloc)
 	// is protected by its own lock (see ihk/manycore/generic/page_alloc.c)
+	/* #986: Fix deadlock between do_page_fault_process_vm() and set_host_vma() */
+	unsigned long is_memory_range_lock_taken;
 
 	ihk_atomic_t refcount;
 	int exiting;
diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h
index e04ac921..ac269fa6 100644
--- a/kernel/include/syscall.h
+++ b/kernel/include/syscall.h
@@ -462,7 +462,7 @@ static inline unsigned long timespec_to_jiffy(const struct timespec *ats)
 
 void reset_cputime(void);
 void set_cputime(int mode);
-int do_munmap(void *addr, size_t len);
+int do_munmap(void *addr, size_t len, int holding_memory_range_lock);
 intptr_t do_mmap(intptr_t addr0, size_t len0, int prot, int flags, int fd,
 		off_t off0);
 void clear_host_pte(uintptr_t addr, size_t len);
diff --git a/kernel/process.c b/kernel/process.c
index b1157f21..a8a023d1 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -1952,11 +1952,28 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
 	int error;
 	const uintptr_t fault_addr = (uintptr_t)fault_addr0;
 	struct vm_range *range;
+	struct thread *thread = cpu_local_var(current);
+	int locked = 0;
 
 	dkprintf("[%d]do_page_fault_process_vm(%p,%lx,%lx)\n",
 		ihk_mc_get_processor_id(), vm, fault_addr0, reason);
-
-	ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
+
+	if (!thread->vm->is_memory_range_lock_taken) {
+		/* Handle the race where is_memory_range_lock_taken gets set by the lock holder after the check above */
+		while (1) {
+			if (thread->vm->is_memory_range_lock_taken) {
+				goto skip;
+			}
+			if (ihk_mc_spinlock_trylock_noirq(&vm->memory_range_lock)) {
+				locked = 1;
+				break;
+			}
+		}
+	} else {
+skip:;
+		dkprintf("%s: INFO: skip locking of memory_range_lock,pid=%d,tid=%d\n",
+			__func__, thread->proc->pid, thread->tid);
+	}
 
 	if (vm->exiting) {
 		error = -ECANCELED;
@@ -2065,7 +2082,9 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
 
 	error = 0;
 out:
-	ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
+	if (locked) {
+		ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
+	}
 
 	dkprintf("[%d]do_page_fault_process_vm(%p,%lx,%lx): %d\n",
 		ihk_mc_get_processor_id(), vm, fault_addr0, reason, error);
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 8248013e..4701b88f 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -1276,15 +1276,24 @@ void clear_host_pte(uintptr_t addr, size_t len)
 	return;
 }
 
-static int set_host_vma(uintptr_t addr, size_t len, int prot)
+static int set_host_vma(uintptr_t addr, size_t len, int prot, int holding_memory_range_lock)
 {
 	ihk_mc_user_context_t ctx;
 	long lerror;
+	struct thread *thread = cpu_local_var(current);
 
 	ihk_mc_syscall_arg0(&ctx) = addr;
 	ihk_mc_syscall_arg1(&ctx) = len;
 	ihk_mc_syscall_arg2(&ctx) = prot;
 
+	dkprintf("%s: offloading __NR_mprotect\n", __FUNCTION__);
+	/* #986: Let remote page fault code skip read-locking memory_range_lock.
+	 * This is safe because other writers are warded off until the remote PF
+	 * handling code calls up_write(&current->mm->mmap_sem), so vm_range is
+	 * consistent when this function is called. */
+	if (holding_memory_range_lock) {
+		thread->vm->is_memory_range_lock_taken = 1;
+	}
 	lerror = syscall_generic_forwarding(__NR_mprotect, &ctx);
 	if (lerror) {
 		kprintf("set_host_vma(%lx,%lx,%x) failed. %ld\n",
@@ -1294,10 +1303,13 @@ static int set_host_vma(uintptr_t addr, size_t len, int prot)
 	lerror = 0;
 
 out:
+	if (holding_memory_range_lock) {
+		thread->vm->is_memory_range_lock_taken = 0;
+	}
 	return (int)lerror;
 }
 
-int do_munmap(void *addr, size_t len)
+int do_munmap(void *addr, size_t len, int holding_memory_range_lock)
 {
 	int error;
 	int ro_freed;
@@ -1309,7 +1321,7 @@ int do_munmap(void *addr, size_t len)
 		clear_host_pte((uintptr_t)addr, len);
 	}
 	else {
-		error = set_host_vma((uintptr_t)addr, len, PROT_READ|PROT_WRITE);
+		error = set_host_vma((uintptr_t)addr, len, PROT_READ|PROT_WRITE, holding_memory_range_lock);
 		if (error) {
 			kprintf("sys_munmap:set_host_vma failed. %d\n", error);
 			/* through */
@@ -1445,7 +1457,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 
 	if (flags & MAP_FIXED) {
 		/* clear specified address range */
-		error = do_munmap((void *)addr, len);
+		error = do_munmap((void *)addr, len, 1/* holding memory_range_lock */);
 		if (error) {
 			ekprintf("do_mmap:do_munmap(%lx,%lx) failed. %d\n",
 				addr, len, error);
@@ -1487,7 +1499,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 	}
 
 	if (!(prot & PROT_WRITE)) {
-		error = set_host_vma(addr, len, PROT_READ);
+		error = set_host_vma(addr, len, PROT_READ, 1/* holding memory_range_lock */);
 		if (error) {
 			kprintf("do_mmap:set_host_vma failed. 
%d\n", error); goto out; @@ -1689,7 +1701,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, out: if (ro_vma_mapped) { - (void)set_host_vma(addr, len, PROT_READ|PROT_WRITE); + (void)set_host_vma(addr, len, PROT_READ|PROT_WRITE, 1/* holding memory_range_lock */); } ihk_mc_spinlock_unlock_noirq(&thread->vm->memory_range_lock); @@ -1760,7 +1772,7 @@ SYSCALL_DECLARE(munmap) } ihk_mc_spinlock_lock_noirq(&thread->vm->memory_range_lock); - error = do_munmap((void *)addr, len); + error = do_munmap((void *)addr, len, 1/* holding memory_range_lock */); ihk_mc_spinlock_unlock_noirq(&thread->vm->memory_range_lock); out: @@ -1899,7 +1911,7 @@ out: // XXX: TLB flush flush_tlb(); if (ro_changed && !error) { - error = set_host_vma(start, len, prot & (PROT_READ|PROT_WRITE)); + error = set_host_vma(start, len, prot & (PROT_READ|PROT_WRITE), 1/* holding memory_range_lock */); if (error) { kprintf("sys_mprotect:set_host_vma failed. %d\n", error); /* through */ @@ -2149,7 +2161,7 @@ static void munmap_all(void) addr = (void *)range->start; size = range->end - range->start; - error = do_munmap(addr, size); + error = do_munmap(addr, size, 1/* holding memory_range_lock */); if (error) { kprintf("munmap_all():do_munmap(%p,%lx) failed. %d\n", addr, size, error); @@ -4987,7 +4999,7 @@ SYSCALL_DECLARE(shmat) vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags); if (!(prot & PROT_WRITE)) { - error = set_host_vma(addr, len, PROT_READ); + error = set_host_vma(addr, len, PROT_READ, 1/* holding memory_range_lock */); if (error) { ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); shmobj_list_unlock(); @@ -5002,7 +5014,7 @@ SYSCALL_DECLARE(shmat) vrflags, &obj->memobj, 0, obj->pgshift, NULL); if (error) { if (!(prot & PROT_WRITE)) { - (void)set_host_vma(addr, len, PROT_READ|PROT_WRITE); + (void)set_host_vma(addr, len, PROT_READ|PROT_WRITE, 1/* holding memory_range_lock */); } memobj_release(&obj->memobj); ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); @@ -5297,7 +5309,7 @@ SYSCALL_DECLARE(shmdt) return -EINVAL; } - error = do_munmap((void *)range->start, (range->end - range->start)); + error = do_munmap((void *)range->start, (range->end - range->start), 1/* holding memory_range_lock */); if (error) { ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); dkprintf("shmdt(%p): %d\n", shmaddr, error); @@ -7825,7 +7837,7 @@ SYSCALL_DECLARE(mremap) /* do the remap */ if (need_relocate) { if (flags & MREMAP_FIXED) { - error = do_munmap((void *)newstart, newsize); + error = do_munmap((void *)newstart, newsize, 1/* holding memory_range_lock */); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "fixed:munmap failed. %d\n", @@ -7872,7 +7884,7 @@ SYSCALL_DECLARE(mremap) goto out; } - error = do_munmap((void *)oldstart, oldsize); + error = do_munmap((void *)oldstart, oldsize, 1/* holding memory_range_lock */); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "relocate:munmap failed. %d\n", @@ -7883,7 +7895,7 @@ SYSCALL_DECLARE(mremap) } } else if (newsize < oldsize) { - error = do_munmap((void *)newend, (oldend - newend)); + error = do_munmap((void *)newend, (oldend - newend), 1/* holding memory_range_lock */); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "shrink:munmap failed. %d\n",