diff --git a/kernel/include/process.h b/kernel/include/process.h
index d62ea257..7e96f978 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -369,6 +369,13 @@ struct vm_range {
 	int padding;
 };
 
+struct vm_range_numa_policy {
+	struct list_head list;
+	unsigned long start, end;
+	DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
+	int numa_mem_policy;
+};
+
 struct vm_regions {
 	unsigned long vm_start, vm_end;
 	unsigned long text_start, text_end;
@@ -660,6 +667,8 @@ struct process_vm {
 	long currss;
 	DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
 	int numa_mem_policy;
+	/* Protected by memory_range_lock */
+	struct list_head vm_range_numa_policy_list;
 };
 
 static inline int has_cap_ipc_lock(struct thread *th)
diff --git a/kernel/process.c b/kernel/process.c
index 858fc092..90821717 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -210,6 +210,7 @@ init_process_vm(struct process *owner, struct address_space *asp, struct process
 
 	ihk_atomic_set(&vm->refcount, 1);
 	INIT_LIST_HEAD(&vm->vm_range_list);
+	INIT_LIST_HEAD(&vm->vm_range_numa_policy_list);
 	vm->address_space = asp;
 	vm->proc = owner;
 	vm->exiting = 0;
@@ -2483,6 +2484,7 @@
 	ihk_mc_init_context(&idle_thread->ctx, NULL, idle);
 	ihk_mc_spinlock_init(&idle_thread->vm->memory_range_lock);
 	INIT_LIST_HEAD(&idle_thread->vm->vm_range_list);
+	INIT_LIST_HEAD(&idle_thread->vm->vm_range_numa_policy_list);
 
 	idle_thread->proc->pid = 0;
 	idle_thread->tid = ihk_mc_get_processor_id();
diff --git a/kernel/syscall.c b/kernel/syscall.c
index f7096a88..9c39a1a5 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -7081,7 +7081,332 @@ out:
 
 SYSCALL_DECLARE(mbind)
 {
-	return -ENOSYS;
+	unsigned long addr = ihk_mc_syscall_arg0(ctx);
+	unsigned long len = ihk_mc_syscall_arg1(ctx);
+	int mode = ihk_mc_syscall_arg2(ctx);
+	unsigned long *nodemask =
+		(unsigned long *)ihk_mc_syscall_arg3(ctx);
+	unsigned long maxnode = ihk_mc_syscall_arg4(ctx);
+	unsigned flags = ihk_mc_syscall_arg5(ctx);
+	struct process_vm *vm = cpu_local_var(current)->vm;
+	unsigned long nodemask_bits = 0;
+	int mode_flags = 0;
+	int error = 0;
+	int bit;
+	struct vm_range *range;
+	struct vm_range_numa_policy *range_policy, *range_policy_iter;
+	struct vm_range_numa_policy *range_policy_next = NULL;
+	DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
+
+	/* Validate arguments */
+	if (addr & ~PAGE_MASK) {
+		return -EINVAL;
+	}
+
+	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	if (addr + len < addr || addr == (addr + len)) {
+		return -EINVAL;
+	}
+
+	memset(numa_mask, 0, sizeof(numa_mask));
+
+	if (maxnode) {
+		nodemask_bits = ALIGN(maxnode, 8);
+		if (maxnode > (PAGE_SIZE << 3)) {
+			dkprintf("%s: ERROR: nodemask_bits bigger than PAGE_SIZE bits\n",
+				__FUNCTION__);
+			error = -EINVAL;
+			goto out;
+		}
+
+		if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
+			dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
+				__FUNCTION__);
+			nodemask_bits = PROCESS_NUMA_MASK_BITS;
+		}
+	}
+
+	if ((mode & MPOL_F_STATIC_NODES) && (mode & MPOL_F_RELATIVE_NODES)) {
+		dkprintf("%s: error: MPOL_F_STATIC_NODES & MPOL_F_RELATIVE_NODES\n",
+			__FUNCTION__);
+		error = -EINVAL;
+		goto out;
+	}
+
+	if ((flags & MPOL_MF_STRICT) && (flags & MPOL_MF_MOVE)) {
+		dkprintf("%s: error: MPOL_MF_STRICT & MPOL_MF_MOVE\n",
+			__FUNCTION__);
+		/*
+		 * XXX: man page claims the correct error code is EIO,
+		 * but LTP tests for EINVAL.
+		 */
+		error = -EINVAL;
+		goto out;
+	}
+
+	mode_flags = (mode & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES));
+	mode &= ~(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+
+	if (mode_flags & MPOL_F_RELATIVE_NODES) {
+		/* Not supported. */
+		dkprintf("%s: error: MPOL_F_RELATIVE_NODES not supported\n",
+			__FUNCTION__);
+		error = -EINVAL;
+		goto out;
+	}
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		if (nodemask && nodemask_bits) {
+			error = copy_from_user(numa_mask, nodemask,
+					(nodemask_bits >> 3));
+			if (error) {
+				dkprintf("%s: error: copy_from_user numa_mask\n",
+					__FUNCTION__);
+				error = -EFAULT;
+				goto out;
+			}
+
+			if (!bitmap_empty(numa_mask, nodemask_bits)) {
+				dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
+					__FUNCTION__);
+				error = -EINVAL;
+				goto out;
+			}
+		}
+		break;
+
+	case MPOL_BIND:
+	case MPOL_INTERLEAVE:
+	case MPOL_PREFERRED:
+		/* Special case for MPOL_PREFERRED with empty nodemask */
+		if (mode == MPOL_PREFERRED && !nodemask) {
+			error = 0;
+			break;
+		}
+
+		if (flags & MPOL_MF_STRICT) {
+			error = -EIO;
+			goto out;
+		}
+
+		error = copy_from_user(numa_mask, nodemask,
+				(nodemask_bits >> 3));
+		if (error) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		if (!nodemask || bitmap_empty(numa_mask, nodemask_bits)) {
+			dkprintf("%s: ERROR: nodemask not specified\n",
+				__FUNCTION__);
+			error = -EINVAL;
+			goto out;
+		}
+
+		/* Verify NUMA mask */
+		for_each_set_bit(bit, numa_mask, nodemask_bits) {
+			if (bit >= ihk_mc_get_nr_numa_nodes()) {
+				dkprintf("%s: %d is bigger than # of NUMA nodes\n",
+					__FUNCTION__, bit);
+				error = -EINVAL;
+				goto out;
+			}
+		}
+
+		break;
+
+	default:
+		error = -EINVAL;
+		goto out;
+	}
+
+	/* Validate address range */
+	ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
+
+	range = lookup_process_memory_range(vm, addr, addr + len);
+	if (!range) {
+		dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
+		error = -EFAULT;
+		goto unlock_out;
+	}
+
+	/* Do the actual policy setting */
+	switch (mode) {
+	/*
+	 * Man page claims MPOL_DEFAULT should remove any range-specific
+	 * policies so that the process-wide policy will be used. LTP on the
+	 * other hand seems to test if MPOL_DEFAULT is set as a range policy.
+	 * MPOL_DEFAULT thus behaves the same as the rest of the policies
+	 * for now.
+	 */
+#if 0
+	case MPOL_DEFAULT:
+		/* Delete or adjust any overlapping range settings */
+		list_for_each_entry_safe(range_policy_iter, range_policy_next,
+				&vm->vm_range_numa_policy_list, list) {
+			int keep = 0;
+			unsigned long orig_end = range_policy_iter->end;
+
+			if (range_policy_iter->end < addr ||
+					range_policy_iter->start > addr + len) {
+				continue;
+			}
+
+			/* Do we need to keep the front? */
+			if (range_policy_iter->start < addr) {
+				range_policy_iter->end = addr;
+				keep = 1;
+			}
+
+			/* Do we need to keep the end? */
+			if (orig_end > addr + len) {
+				/* Are we keeping the front already? */
+				if (keep) {
+					/* Add a new entry after */
+					range_policy = kmalloc(sizeof(*range_policy),
+							IHK_MC_AP_NOWAIT);
+					if (!range_policy) {
+						kprintf("%s: error allocating range_policy\n",
+							__FUNCTION__);
+						error = -ENOMEM;
+						goto unlock_out;
+					}
+
+					memcpy(range_policy, range_policy_iter,
+							sizeof(*range_policy));
+					range_policy->start = addr + len;
+					range_policy->end = orig_end;
+					list_add(&range_policy->list,
+							&range_policy_iter->list);
+				}
+				else {
+					range_policy_iter->start = addr + len;
+					keep = 1;
+				}
+			}
+
+			if (!keep) {
+				list_del(&range_policy_iter->list);
+				kfree(range_policy_iter);
+			}
+		}
+
+		break;
+#endif
+	case MPOL_DEFAULT:
+	case MPOL_BIND:
+	case MPOL_INTERLEAVE:
+	case MPOL_PREFERRED:
+		/* Adjust any overlapping range settings and add new one */
+		range_policy_next = NULL;
+		list_for_each_entry(range_policy_iter,
+				&vm->vm_range_numa_policy_list, list) {
+			int adjusted = 0;
+			unsigned long orig_end = range_policy_iter->end;
+
+			if (range_policy_iter->end < addr)
+				continue;
+
+			/* Special case of entirely overlapping */
+			if (range_policy_iter->start == addr &&
+					range_policy_iter->end == addr + len) {
+				range_policy = range_policy_iter;
+				goto mbind_update_only;
+			}
+
+			/* Overlapping partially? */
+			if (range_policy_iter->start < addr) {
+				orig_end = range_policy_iter->end;
+				range_policy_iter->end = addr;
+				adjusted = 1;
+			}
+
+			/* Do we need to keep the end? */
+			if (orig_end > addr + len) {
+				if (adjusted) {
+					/* Add a new entry after */
+					range_policy = kmalloc(sizeof(*range_policy),
+							IHK_MC_AP_NOWAIT);
+					if (!range_policy) {
+						dkprintf("%s: error allocating range_policy\n",
+							__FUNCTION__);
+						error = -ENOMEM;
+						goto unlock_out;
+					}
+
+					memcpy(range_policy, range_policy_iter,
+							sizeof(*range_policy));
+					range_policy->start = addr + len;
+					range_policy->end = orig_end;
+					list_add(&range_policy->list,
+							&range_policy_iter->list);
+					range_policy_next = range_policy;
+					break;
+				}
+				else {
+					range_policy_iter->start = addr + len;
+					range_policy_next = range_policy_iter;
+					break;
+				}
+			}
+
+			/* Next one in ascending address order? */
+			if (range_policy_iter->start >= addr + len) {
+				range_policy_next = range_policy_iter;
+				break;
+			}
+		}
+
+		/* Add a new entry */
+		range_policy = kmalloc(sizeof(*range_policy),
+				IHK_MC_AP_NOWAIT);
+		if (!range_policy) {
+			dkprintf("%s: error allocating range_policy\n",
+				__FUNCTION__);
+			error = -ENOMEM;
+			goto unlock_out;
+		}
+
+		memset(range_policy, 0, sizeof(*range_policy));
+		range_policy->start = addr;
+		range_policy->end = addr + len;
+
+		if (range_policy_next) {
+			list_add_tail(&range_policy->list,
+					&range_policy_next->list);
+		}
+		else {
+			list_add_tail(&range_policy->list,
+					&vm->vm_range_numa_policy_list);
+		}
+
+mbind_update_only:
+		if (mode == MPOL_DEFAULT) {
+			memset(range_policy->numa_mask, 0, sizeof(numa_mask));
+			for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
+				set_bit(bit, range_policy->numa_mask);
+			}
+		}
+		else {
+			memcpy(range_policy->numa_mask, &numa_mask,
+					sizeof(numa_mask));
+		}
+		range_policy->numa_mem_policy = mode;
+
+		break;
+
+	default:
+		error = -EINVAL;
+		goto unlock_out;
+	}
+
+	error = 0;
+
+unlock_out:
+	ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
+out:
+	return error;
 }	/* sys_mbind() */
 
 SYSCALL_DECLARE(set_mempolicy)
@@ -7094,6 +7419,8 @@ SYSCALL_DECLARE(set_mempolicy)
 	struct process_vm *vm = cpu_local_var(current)->vm;
 	int error = 0;
 	int bit, valid_mask;
+	struct vm_range_numa_policy *range_policy_iter;
+	struct vm_range_numa_policy *range_policy_next = NULL;
 	DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
 
 	memset(numa_mask, 0, sizeof(numa_mask));
@@ -7108,7 +7435,7 @@ SYSCALL_DECLARE(set_mempolicy)
 		}
 
 		if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
-			kprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
+			dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
 				__FUNCTION__);
 			nodemask_bits = PROCESS_NUMA_MASK_BITS;
 		}
@@ -7137,7 +7464,14 @@ SYSCALL_DECLARE(set_mempolicy)
 			set_bit(bit, vm->numa_mask);
 		}
 
-		/* TODO: delete all mbind() specified regions */
+		/* Delete all range settings */
+		ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
+		list_for_each_entry_safe(range_policy_iter, range_policy_next,
+				&vm->vm_range_numa_policy_list, list) {
+			list_del(&range_policy_iter->list);
+			kfree(range_policy_iter);
+		}
+		ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
 
 		vm->numa_mem_policy = mode;
 		error = 0;
@@ -7176,7 +7510,7 @@ SYSCALL_DECLARE(set_mempolicy)
 	valid_mask = 0;
 	for_each_set_bit(bit, numa_mask, maxnode) {
 		if (bit >= ihk_mc_get_nr_numa_nodes()) {
-                        dkprintf("%s: %d is bigger than # of NUMA nodes\n",
+			dkprintf("%s: %d is bigger than # of NUMA nodes\n",
 				__FUNCTION__, bit);
 			error = -EINVAL;
 			goto out;
@@ -7224,16 +7558,25 @@ SYSCALL_DECLARE(get_mempolicy)
 	unsigned long addr = ihk_mc_syscall_arg3(ctx);
 	unsigned long flags = ihk_mc_syscall_arg4(ctx);
 	struct process_vm *vm = cpu_local_var(current)->vm;
-	int error;
+	struct vm_range_numa_policy *range_policy = NULL;
+	int error = 0;
+	int policy;
 
-	if (((flags & MPOL_F_ADDR) && !addr) ||
-		(!(flags & MPOL_F_ADDR) && addr) ||
+	if ((!(flags & MPOL_F_ADDR) && addr) ||
 		(flags & ~(MPOL_F_ADDR | MPOL_F_NODE | MPOL_F_MEMS_ALLOWED)) ||
 		((flags & MPOL_F_NODE) && !(flags & MPOL_F_ADDR) &&
 		 vm->numa_mem_policy == MPOL_INTERLEAVE)) {
 		return -EINVAL;
 	}
 
+	/*
+	 * XXX: man page claims the correct error code is EINVAL,
+	 * but LTP tests for EFAULT.
+	 */
+	if ((flags & MPOL_F_ADDR) && !addr) {
+		return -EFAULT;
+	}
+
 	if (maxnode) {
 		if (maxnode < ihk_mc_get_nr_numa_nodes()) {
 			return -EINVAL;
@@ -7247,18 +7590,62 @@ SYSCALL_DECLARE(get_mempolicy)
 		}
 	}
 
+	/* Special case of MPOL_F_MEMS_ALLOWED */
+	if (flags == MPOL_F_MEMS_ALLOWED) {
+		if (nodemask) {
+			error = copy_to_user(nodemask,
+					cpu_local_var(current)->vm->numa_mask,
+					(nodemask_bits >> 3));
+			if (error) {
+				error = -EFAULT;
+			}
+		}
+
+		goto out;
+	}
+
+	/* Address range specific? */
+	if (flags & MPOL_F_ADDR) {
+		struct vm_range_numa_policy *range_policy_iter;
+		struct vm_range *range;
+
+		ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
+		range = lookup_process_memory_range(vm, addr, addr + 1);
+		if (!range) {
+			dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
+			error = -EFAULT;
+			ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
+			goto out;
+		}
+
+		list_for_each_entry(range_policy_iter,
+				&vm->vm_range_numa_policy_list, list) {
+			if (range_policy_iter->start > addr ||
+					range_policy_iter->end <= addr) {
+				continue;
+			}
+
+			range_policy = range_policy_iter;
+			break;
+		}
+		ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
+	}
+
+	/* Return policy */
+	policy = range_policy ? range_policy->numa_mem_policy :
+		vm->numa_mem_policy;
+
 	if (mode) {
-		error = copy_to_user(mode,
-			&cpu_local_var(current)->vm->numa_mem_policy,
-			sizeof(int));
+		error = copy_to_user(mode, &policy, sizeof(int));
 		if (error) {
 			error = -EFAULT;
 			goto out;
 		}
 	}
 
-	if (nodemask) {
+	if (nodemask && (policy != MPOL_DEFAULT)) {
 		error = copy_to_user(nodemask,
+				range_policy ? range_policy->numa_mask :
 				cpu_local_var(current)->vm->numa_mask,
 				(nodemask_bits >> 3));
 		if (error) {
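
For reference, the following is a minimal userspace sketch (not part of the patch) of the behavior the hunks above implement: it mbind()s one anonymous page to NUMA node 0, then queries the policy of that address back via get_mempolicy() with MPOL_F_ADDR, which after this change is answered from the per-range vm_range_numa_policy_list rather than the process-wide policy. It assumes a Linux-compatible ABI with libnuma's <numaif.h> wrappers (link with -lnuma); the node number and variable names are illustrative only.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <numaif.h>	/* mbind(), get_mempolicy(), MPOL_* -- link with -lnuma */

int main(void)
{
	unsigned long nodemask = 1UL;		/* bit 0 = NUMA node 0 */
	unsigned long maxnode = 8 * sizeof(nodemask);
	long pagesize = sysconf(_SC_PAGESIZE);
	int policy = -1;
	void *p;

	p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Set a range policy: bind [p, p + pagesize) to node 0 */
	if (mbind(p, pagesize, MPOL_BIND, &nodemask, maxnode, 0) != 0) {
		perror("mbind");
		return 1;
	}

	/* Read the policy of that address back; expect MPOL_BIND */
	if (get_mempolicy(&policy, NULL, 0, p, MPOL_F_ADDR) != 0) {
		perror("get_mempolicy");
		return 1;
	}

	printf("policy at %p: %d (MPOL_BIND == %d)\n", p, policy, MPOL_BIND);
	return 0;
}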