diff --git a/kernel/include/process.h b/kernel/include/process.h
index 9132af03..d62ea257 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -168,6 +168,13 @@
 
 #define PROCESS_NUMA_MASK_BITS 64
 
+/*
+ * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
+ * passed by the user to either set_mempolicy() or mbind() in an 'int' argument.
+ * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
+ */
+
+/* Policies */
 enum {
 	MPOL_DEFAULT,
 	MPOL_PREFERRED,
@@ -177,6 +184,51 @@ enum {
 	MPOL_MAX,	/* always last member of enum */
 };
 
+enum mpol_rebind_step {
+	MPOL_REBIND_ONCE,	/* do the rebind work at once (not in two steps) */
+	MPOL_REBIND_STEP1,	/* first step (set all the newly allowed nodes) */
+	MPOL_REBIND_STEP2,	/* second step (clear all the disallowed nodes) */
+	MPOL_REBIND_NSTEP,
+};
+
+/* Flags for set_mempolicy */
+#define MPOL_F_STATIC_NODES	(1 << 15)
+#define MPOL_F_RELATIVE_NODES	(1 << 14)
+
+/*
+ * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
+ * either set_mempolicy() or mbind().
+ */
+#define MPOL_MODE_FLAGS	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+
+/* Flags for get_mempolicy */
+#define MPOL_F_NODE		(1<<0)	/* return next IL mode instead of node mask */
+#define MPOL_F_ADDR		(1<<1)	/* look up vma using address */
+#define MPOL_F_MEMS_ALLOWED	(1<<2)	/* return allowed memories */
+
+/* Flags for mbind */
+#define MPOL_MF_STRICT		(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE		(1<<1)	/* Move pages owned by this process to conform
+					   to policy */
+#define MPOL_MF_MOVE_ALL	(1<<2)	/* Move every page to conform to policy */
+#define MPOL_MF_LAZY		(1<<3)	/* Modifies '_MOVE: lazy migrate on fault */
+#define MPOL_MF_INTERNAL	(1<<4)	/* Internal flags start here */
+
+#define MPOL_MF_VALID	(MPOL_MF_STRICT |	\
+			 MPOL_MF_MOVE |		\
+			 MPOL_MF_MOVE_ALL)
+
+/*
+ * Internal flags that share the struct mempolicy flags word with
+ * "mode flags".  These flags are allocated from bit 0 up, as they
+ * are never OR'ed into the mode in mempolicy API arguments.
+ */
+#define MPOL_F_SHARED		(1 << 0)	/* identify shared policies */
+#define MPOL_F_LOCAL		(1 << 1)	/* preferred local allocation */
+#define MPOL_F_REBINDING	(1 << 2)	/* identify policies in rebinding */
+#define MPOL_F_MOF		(1 << 3)	/* this policy wants migrate on fault */
+#define MPOL_F_MORON		(1 << 4)	/* Migrate On pte_numa Reference On Node */
+
 #include
 #include
diff --git a/kernel/syscall.c b/kernel/syscall.c
index fb537c4e..f7096a88 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -52,6 +52,8 @@
 #include
 #include
 #include
+#include
+#include
 
 /* Headers taken from kitten LWK */
 #include
@@ -7084,7 +7086,132 @@ SYSCALL_DECLARE(mbind)
 
 SYSCALL_DECLARE(set_mempolicy)
 {
-	return -ENOSYS;
+	int mode = ihk_mc_syscall_arg0(ctx);
+	unsigned long *nodemask =
+		(unsigned long *)ihk_mc_syscall_arg1(ctx);
+	unsigned long maxnode = ihk_mc_syscall_arg2(ctx);
+	unsigned long nodemask_bits = 0;
+	struct process_vm *vm = cpu_local_var(current)->vm;
+	int error = 0;
+	int bit, valid_mask;
+	DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
+
+	memset(numa_mask, 0, sizeof(numa_mask));
+
+	if (maxnode) {
+		nodemask_bits = ALIGN(maxnode, 8);
+		if (maxnode > (PAGE_SIZE << 3)) {
+			dkprintf("%s: ERROR: maxnode is bigger than PAGE_SIZE bits\n",
+				__FUNCTION__);
+			error = -EINVAL;
+			goto out;
+		}
+
+		if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
+			kprintf("%s: WARNING: process NUMA mask bits are insufficient\n",
+				__FUNCTION__);
+			nodemask_bits = PROCESS_NUMA_MASK_BITS;
+		}
+	}
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		if (nodemask && nodemask_bits) {
+			error = copy_from_user(numa_mask, nodemask,
+					(nodemask_bits >> 3));
+			if (error) {
+				error = -EFAULT;
+				goto out;
+			}
+
+			if (!bitmap_empty(numa_mask, nodemask_bits)) {
+				dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
+					__FUNCTION__);
+				error = -EINVAL;
+				goto out;
+			}
+		}
+
+		memset(vm->numa_mask, 0, sizeof(numa_mask));
+		for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
+			set_bit(bit, vm->numa_mask);
+		}
+
+		/* TODO: delete all mbind() specified regions */
+
+		vm->numa_mem_policy = mode;
+		error = 0;
+		break;
+
+	case MPOL_BIND:
+	case MPOL_INTERLEAVE:
+	case MPOL_PREFERRED:
+		/* Special case for MPOL_PREFERRED with empty nodemask */
+		if (mode == MPOL_PREFERRED && !nodemask) {
+			memset(vm->numa_mask, 0, sizeof(numa_mask));
+			for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
+				set_bit(bit, vm->numa_mask);
+			}
+
+			vm->numa_mem_policy = mode;
+			error = 0;
+			break;
+		}
+
+		if (!nodemask) {
+			dkprintf("%s: ERROR: nodemask not specified\n",
+				__FUNCTION__);
+			error = -EINVAL;
+			goto out;
+		}
+
+		error = copy_from_user(numa_mask, nodemask,
+				(nodemask_bits >> 3));
+		if (error) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		/* Verify NUMA mask */
+		valid_mask = 0;
+		for_each_set_bit(bit, numa_mask, maxnode) {
+			if (bit >= ihk_mc_get_nr_numa_nodes()) {
+				dkprintf("%s: node %d is bigger than # of NUMA nodes\n",
+					__FUNCTION__, bit);
+				error = -EINVAL;
+				goto out;
+			}
+
+			/* Is at least one of the requested nodes
+			 * allowed in the current mask? */
+			if (test_bit(bit, vm->numa_mask)) {
+				valid_mask = 1;
+			}
+		}
+
+		if (!valid_mask) {
+			dkprintf("%s: ERROR: invalid nodemask\n", __FUNCTION__);
+			error = -EINVAL;
+			goto out;
+		}
+
+		/* Update current mask by clearing non-requested nodes */
+		for_each_set_bit(bit, vm->numa_mask, maxnode) {
+			if (!test_bit(bit, numa_mask)) {
+				clear_bit(bit, vm->numa_mask);
+			}
+		}
+
+		vm->numa_mem_policy = mode;
+		error = 0;
+		break;
+
+	default:
+		error = -EINVAL;
+	}
+
+out:
+	return error;
 }	/* sys_set_mempolicy() */
 
 SYSCALL_DECLARE(get_mempolicy)
@@ -7092,21 +7219,40 @@
 	int *mode = (int *)ihk_mc_syscall_arg0(ctx);
 	unsigned long *nodemask =
 		(unsigned long *)ihk_mc_syscall_arg1(ctx);
+	unsigned long nodemask_bits = 0;
 	unsigned long maxnode = ihk_mc_syscall_arg2(ctx);
 	unsigned long addr = ihk_mc_syscall_arg3(ctx);
 	unsigned long flags = ihk_mc_syscall_arg4(ctx);
+	struct process_vm *vm = cpu_local_var(current)->vm;
 	int error;
 
-	if (flags || addr) {
+	if (((flags & MPOL_F_ADDR) && !addr) ||
+	    (!(flags & MPOL_F_ADDR) && addr) ||
+	    (flags & ~(MPOL_F_ADDR | MPOL_F_NODE | MPOL_F_MEMS_ALLOWED)) ||
+	    ((flags & MPOL_F_NODE) && !(flags & MPOL_F_ADDR) &&
+	     vm->numa_mem_policy == MPOL_INTERLEAVE)) {
 		return -EINVAL;
 	}
 
+	if (maxnode) {
+		if (maxnode < ihk_mc_get_nr_numa_nodes()) {
+			return -EINVAL;
+		}
+
+		nodemask_bits = ALIGN(maxnode, 8);
+		if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
+			dkprintf("%s: WARNING: process NUMA mask bits are insufficient\n",
+				__FUNCTION__);
+			nodemask_bits = PROCESS_NUMA_MASK_BITS;
+		}
+	}
+
 	if (mode) {
 		error = copy_to_user(mode,
 				&cpu_local_var(current)->vm->numa_mem_policy,
 				sizeof(int));
 		if (error) {
-			error = -EINVAL;
+			error = -EFAULT;
 			goto out;
 		}
 	}
@@ -7114,10 +7260,9 @@ SYSCALL_DECLARE(get_mempolicy)
 	if (nodemask) {
 		error = copy_to_user(nodemask,
 				cpu_local_var(current)->vm->numa_mask,
-				maxnode < (PROCESS_NUMA_MASK_BITS >> 3) ?
-				maxnode : (PROCESS_NUMA_MASK_BITS >> 3));
+				(nodemask_bits >> 3));
 		if (error) {
-			error = -EINVAL;
+			error = -EFAULT;
 			goto out;
 		}
 	}
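
Not part of the patch itself: a minimal user-space sketch of how the set_mempolicy()/get_mempolicy() paths implemented above could be exercised. It assumes a Linux-compatible libnuma environment (<numaif.h>, link with -lnuma); the MASK_BITS constant mirrors PROCESS_NUMA_MASK_BITS from the patch, and the choice of NUMA node 0 is purely illustrative.

/* Hypothetical test program, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <numaif.h>

#define MASK_BITS 64	/* mirrors PROCESS_NUMA_MASK_BITS in process.h */

int main(void)
{
	unsigned long mask[MASK_BITS / (8 * sizeof(unsigned long))];
	int mode = -1;

	/* Bind all future allocations of this thread to NUMA node 0. */
	memset(mask, 0, sizeof(mask));
	mask[0] = 1UL << 0;
	if (set_mempolicy(MPOL_BIND, mask, MASK_BITS) != 0) {
		perror("set_mempolicy");
		return 1;
	}

	/* Read the policy back; with flags == 0 the task policy and its
	 * node mask are returned. */
	memset(mask, 0, sizeof(mask));
	if (get_mempolicy(&mode, mask, MASK_BITS, NULL, 0) != 0) {
		perror("get_mempolicy");
		return 1;
	}
	printf("mode=%d nodemask[0]=0x%lx\n", mode, mask[0]);

	return 0;
}

Note that the patched get_mempolicy() rejects maxnode values smaller than the number of NUMA nodes, which is why the example passes the full 64-bit mask width rather than a minimal maxnode.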