diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index 6c71effd..315cf3ba 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -6,6 +6,7 @@ #include #include #include +#include static char *last_page; extern char _head[], _end[]; @@ -638,109 +639,57 @@ int ihk_mc_pt_clear_large_page(page_table_t pt, void *virt) return __clear_pt_page(pt, virt, 1); } -static int clear_range_l1(struct page_table *pt, uint64_t base, uint64_t start, uint64_t end) -{ - int six; - int eix; - int ret; - int i; - - six = (start <= base)? 0: (start - base) >> PTL1_SHIFT; - eix = ((base + PTL2_SIZE) <= end)? PT_ENTRIES - : ((end - base) + (PTL1_SIZE - 1)) >> PTL1_SHIFT; - - ret = -ENOENT; - for (i = six; i < eix; ++i) { - if (!(pt->entry[i] & PFL1_PRESENT)) { - continue; - } - - pt->entry[i] = 0; - ret = 0; - } - - return ret; -} - -static int clear_range_l2(struct page_table *pt, uint64_t base, uint64_t start, uint64_t end) +typedef int walk_pte_fn_t(void *args, pte_t *ptep, uint64_t base, + uint64_t start, uint64_t end); + +static int walk_pte_l1(struct page_table *pt, uint64_t base, uint64_t start, + uint64_t end, walk_pte_fn_t *funcp, void *args) { int six; int eix; int ret; int i; + int error; uint64_t off; - struct page_table *q; - int error; - six = (start <= base)? 0: (start - base) >> PTL2_SHIFT; - eix = ((base + PTL3_SIZE) <= end)? PT_ENTRIES - : ((end - base) + (PTL2_SIZE - 1)) >> PTL2_SHIFT; + six = (start <= base)? 0: ((start - base) >> PTL1_SHIFT); + eix = ((end == 0) || ((base + PTL2_SIZE) <= end))? 
PT_ENTRIES + : (((end - base) + (PTL1_SIZE - 1)) >> PTL1_SHIFT); ret = -ENOENT; for (i = six; i < eix; ++i) { - if (!(pt->entry[i] & PFL2_PRESENT)) { - continue; + off = i * PTL1_SIZE; + error = (*funcp)(args, &pt->entry[i], base+off, start, end); + if (!error) { + ret = 0; } + else if (error != -ENOENT) { + ret = error; + break; + } + } + return ret; +} + +static int walk_pte_l2(struct page_table *pt, uint64_t base, uint64_t start, + uint64_t end, walk_pte_fn_t *funcp, void *args) +{ + int six; + int eix; + int ret; + int i; + int error; + uint64_t off; + + six = (start <= base)? 0: ((start - base) >> PTL2_SHIFT); + eix = ((end == 0) || ((base + PTL3_SIZE) <= end))? PT_ENTRIES + : (((end - base) + (PTL2_SIZE - 1)) >> PTL2_SHIFT); + + ret = -ENOENT; + for (i = six; i < eix; ++i) { off = i * PTL2_SIZE; - - if (pt->entry[i] & PFL2_SIZE) { - if (((base + off) < start) || (end < (base + off + PTL2_SIZE))) { - kprintf("clear_range_l2(%p,%lx,%lx,%lx):" - "not a 2MiB page boundary\n", - pt, base, start, end); - ret = -ERANGE; - break; - } - - pt->entry[i] = 0; - ret = 0; - continue; - } - - q = phys_to_virt(pt->entry[i] & PT_PHYSMASK); - - if ((start <= (base + off)) && ((base + off + PTL2_SIZE) <= end)) { - pt->entry[i] = 0; - ret = 0; - arch_free_page(q); - } - else { - error = clear_range_l1(q, base+off, start, end); - if (!error) { - ret = 0; - } - else if (error != -ENOENT) { - ret = error; - break; - } - } - } - - return ret; -} - -static int clear_range_l3(struct page_table *pt, uint64_t base, uint64_t start, uint64_t end) -{ - int six; - int eix; - int ret; - int i; - int error; - struct page_table *q; - - six = (start <= base)? 0: (start - base) >> PTL3_SHIFT; - eix = ((base + PTL4_SIZE) <= end)? 
PT_ENTRIES - : ((end - base) + (PTL3_SIZE - 1)) >> PTL3_SHIFT; - - ret = -ENOENT; - for (i = six; i < eix; ++i) { - if (!(pt->entry[i] & PFL3_PRESENT)) { - continue; - } - - q = phys_to_virt(pt->entry[i] & PT_PHYSMASK); - error = clear_range_l2(q, base+(i*PTL3_SIZE), start, end); + error = (*funcp)(args, &pt->entry[i], base+off, start, end); if (!error) { ret = 0; } @@ -753,29 +702,24 @@ static int clear_range_l3(struct page_table *pt, uint64_t base, uint64_t start, return ret; } -static int clear_range_l4(struct page_table *pt, uint64_t base, uint64_t start, uint64_t end) +static int walk_pte_l3(struct page_table *pt, uint64_t base, uint64_t start, + uint64_t end, walk_pte_fn_t *funcp, void *args) { int six; int eix; int ret; int i; int error; - struct page_table *q; + uint64_t off; - six = (start <= base)? 0: (start - base) >> PTL4_SHIFT; - eix = ((end - base) + (PTL4_SIZE - 1)) >> PTL4_SHIFT; - if ((eix <= 0) || (PT_ENTRIES < eix)) { - eix = PT_ENTRIES; - } + six = (start <= base)? 0: ((start - base) >> PTL3_SHIFT); + eix = ((end == 0) || ((base + PTL4_SIZE) <= end))? PT_ENTRIES + : (((end - base) + (PTL3_SIZE - 1)) >> PTL3_SHIFT); ret = -ENOENT; for (i = six; i < eix; ++i) { - if (!(pt->entry[i] & PFL4_PRESENT)) { - continue; - } - - q = phys_to_virt(pt->entry[i] & PT_PHYSMASK); - error = clear_range_l3(q, base+(i*PTL4_SIZE), start, end); + off = i * PTL3_SIZE; + error = (*funcp)(args, &pt->entry[i], base+off, start, end); if (!error) { ret = 0; } @@ -788,6 +732,127 @@ static int clear_range_l4(struct page_table *pt, uint64_t base, uint64_t start, return ret; } +static int walk_pte_l4(struct page_table *pt, uint64_t base, uint64_t start, + uint64_t end, walk_pte_fn_t *funcp, void *args) +{ + int six; + int eix; + int ret; + int i; + int error; + uint64_t off; + + six = (start <= base)? 0: ((start - base) >> PTL4_SHIFT); + eix = (end == 0)? 
PT_ENTRIES + :(((end - base) + (PTL4_SIZE - 1)) >> PTL4_SHIFT); + + ret = -ENOENT; + for (i = six; i < eix; ++i) { + off = i * PTL4_SIZE; + error = (*funcp)(args, &pt->entry[i], base+off, start, end); + if (!error) { + ret = 0; + } + else if (error != -ENOENT) { + ret = error; + break; + } + } + + return ret; +} + +struct clear_range_args { + int free_physical; +}; + +static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base, uint64_t start, uint64_t end) +{ + struct clear_range_args *args = args0; + uint64_t phys; + + if (!(*ptep & PFL1_PRESENT)) { + return -ENOENT; + } + + phys = *ptep & PT_PHYSMASK; + *ptep = 0; + + if (args->free_physical) { + ihk_mc_free_pages(phys_to_virt(phys), 1); + } + + return 0; +} + +static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, uint64_t start, uint64_t end) +{ + struct clear_range_args *args = args0; + uint64_t phys; + struct page_table *pt; + int error; + + if (!(*ptep & PFL2_PRESENT)) { + return -ENOENT; + } + + if (*ptep & PFL2_SIZE) { + if ((base < start) || (end < (base + PTL2_SIZE))) { + kprintf("clear_range_l2(%p,%p,%lx,%lx,%lx):" + "not a 2MiB page boundary\n", + args0, ptep, base, start, end); + return -ERANGE; + } + + phys = *ptep & PT_PHYSMASK; + *ptep = 0; + + if (args->free_physical) { + ihk_mc_free_pages(phys_to_virt(phys), + LARGE_PAGE_SIZE/PAGE_SIZE); + } + + return 0; + } + + pt = phys_to_virt(*ptep & PT_PHYSMASK); + error = walk_pte_l1(pt, base, start, end, &clear_range_l1, args0); + if (error && (error != -ENOENT)) { + return error; + } + + if ((start <= base) && ((base + PTL2_SIZE) <= end)) { + *ptep = 0; + arch_free_page(pt); + } + + return 0; +} + +static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base, uint64_t start, uint64_t end) +{ + struct page_table *pt; + + if (!(*ptep & PFL3_PRESENT)) { + return -ENOENT; + } + + pt = phys_to_virt(*ptep & PT_PHYSMASK); + return walk_pte_l2(pt, base, start, end, &clear_range_l2, args0); +} + +static int clear_range_l4(void *args0, 
pte_t *ptep, uint64_t base, uint64_t start, uint64_t end) +{ + struct page_table *pt; + + if (!(*ptep & PFL4_PRESENT)) { + return -ENOENT; + } + + pt = phys_to_virt(*ptep & PT_PHYSMASK); + return walk_pte_l3(pt, base, start, end, &clear_range_l3, args0); +} + static int lookup_pte(struct page_table *pt, void *virt, pte_t **ptep, void **pgbasep, uint64_t *pgsizep) { int l4idx, l3idx, l2idx, l1idx; @@ -819,53 +884,91 @@ static int lookup_pte(struct page_table *pt, void *virt, pte_t **ptep, void **pg return 0; } -static int is_middle_of_the_page(struct page_table *pt, void *virt) +#ifdef USE_LARGE_PAGES +static int split_large_page(struct page_table *pt, intptr_t virt) { int error; - pte_t *pte; + pte_t *ptep; void *pgbase; uint64_t pgsize; + struct page_table *q; + uint64_t phys; + pte_t attr; + int i; - error = lookup_pte(pt, virt, &pte, &pgbase, &pgsize); - if (error) { + error = lookup_pte(pt, (void *)virt, &ptep, &pgbase, &pgsize); + if (error || !(*ptep & PF_PRESENT) || (pgsize == PAGE_SIZE)) { return 0; } - if (!(*pte & PF_PRESENT)) { - return 0; + q = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (q == NULL) { + kprintf("split_large_page:__alloc_new_pt failed\n"); + return -ENOMEM; } - return pgbase != virt; + phys = *ptep & PT_PHYSMASK; + attr = *ptep & (PFL2_PRESENT | PFL2_WRITABLE | PFL2_USER | PFL2_PWT | PFL2_PCD); + + for (i = 0; i < PT_ENTRIES; ++i) { + q->entry[i] = (phys + (i * PTL1_SIZE)) | attr; + } + + *ptep = (virt_to_phys(q) & PT_PHYSMASK) | PFL2_PDIR_ATTR; + return 0; } +#endif /* USE_LARGE_PAGES */ -int ihk_mc_pt_clear_range(page_table_t pt, void *start0, void *end0) +static int clear_range(page_table_t pt, void *start0, void *end0, int free_physical) { const uint64_t start = (uint64_t)start0; const uint64_t end = (uint64_t)end0; int error; + struct clear_range_args args; if ((USER_END <= start) || (USER_END < end) || (end <= start)) { - kprintf("ihk_mc_pt_clear_range(%p,%p,%p):invalid start and/or end.\n", - pt, start0, end0); + 
kprintf("clear_range(%p,%p,%p,%x):invalid start and/or end.\n", + pt, start0, end0, free_physical); return -EINVAL; } - if (((start % LARGE_PAGE_SIZE) != 0) && is_middle_of_the_page(pt, start0)) { - kprintf("ihk_mc_pt_clear_range(%p,%p,%p):start0 is not a page boundary\n", - pt, start0, end0); - return -EINVAL; +#ifdef USE_LARGE_PAGES + if (start & (LARGE_PAGE_SIZE - 1)) { + error = split_large_page(pt, start); + if (error) { + kprintf("clear_range(%p,%p,%p,%x):split_large_page(%lx) failed. %d\n", + pt, start0, end0, free_physical, start, error); + return error; + } } - if (((end % LARGE_PAGE_SIZE) != 0) && is_middle_of_the_page(pt, end0)) { - kprintf("ihk_mc_pt_clear_range(%p,%p,%p):end0 is not a page boundary\n", - pt, start0, end0); - return -EINVAL; + if (end & (LARGE_PAGE_SIZE - 1)) { + error = split_large_page(pt, end); + if (error) { + kprintf("clear_range(%p,%p,%p,%x):split_large_page(%lx) failed. %d\n", + pt, start0, end0, free_physical, end, error); + return error; + } } +#endif /* USE_LARGE_PAGES */ - error = clear_range_l4(pt, 0, start, end); + args.free_physical = free_physical; + error = walk_pte_l4(pt, 0, start, end, &clear_range_l4, &args); return error; } +int ihk_mc_pt_clear_range(page_table_t pt, void *start0, void *end0) +{ +#define KEEP_PHYSICAL 0 + return clear_range(pt, start0, end0, KEEP_PHYSICAL); +} + +int ihk_mc_pt_free_range(page_table_t pt, void *start0, void *end0) +{ +#define FREE_PHYSICAL 1 + return clear_range(pt, start0, end0, FREE_PHYSICAL); +} + void load_page_table(struct page_table *pt) { unsigned long pt_addr; diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h index 4b1fad07..67c2fff1 100644 --- a/executer/include/uprotocol.h +++ b/executer/include/uprotocol.h @@ -22,7 +22,11 @@ struct program_image_section { unsigned long len; unsigned long remote_pa; unsigned long filesz, offset; + int prot; + int padding; +#if 0 void *source; +#endif }; struct program_load_desc { diff --git a/executer/user/mcexec.c 
b/executer/user/mcexec.c index 93b520af..57987759 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -124,12 +124,18 @@ struct program_load_desc *load_elf(FILE *fp) desc->sections[j].offset = phdr.p_offset; desc->sections[j].len = phdr.p_memsz; - __dprintf("%d: (%s) %lx, %lx, %lx, %lx\n", - j, (phdr.p_type == PT_LOAD ? "PT_LOAD" : "PT_TLS"), - desc->sections[j].vaddr, + desc->sections[j].prot = PROT_NONE; + desc->sections[j].prot |= (phdr.p_flags & PF_R)? PROT_READ: 0; + desc->sections[j].prot |= (phdr.p_flags & PF_W)? PROT_WRITE: 0; + desc->sections[j].prot |= (phdr.p_flags & PF_X)? PROT_EXEC: 0; + + __dprintf("%d: (%s) %lx, %lx, %lx, %lx, %x\n", + j, (phdr.p_type == PT_LOAD ? "PT_LOAD" : "PT_TLS"), + desc->sections[j].vaddr, desc->sections[j].filesz, desc->sections[j].offset, - desc->sections[j].len); + desc->sections[j].len, + desc->sections[j].prot); j++; if (!load_addr_set) { diff --git a/kernel/host.c b/kernel/host.c index cd9e9b9d..08ea5809 100644 --- a/kernel/host.c +++ b/kernel/host.c @@ -9,6 +9,7 @@ #include #include #include +#include //#define DEBUG_PRINT_HOST @@ -47,6 +48,7 @@ static int process_msg_prepare_process(unsigned long rphys) char **env; int range_npages; void *up_v; + unsigned long flags; sz = sizeof(struct program_load_desc) + sizeof(struct program_image_section) * 16; @@ -88,12 +90,14 @@ static int process_msg_prepare_process(unsigned long rphys) e = (pn->sections[i].vaddr + pn->sections[i].len + PAGE_SIZE - 1) & PAGE_MASK; range_npages = (e - s) >> PAGE_SHIFT; + flags = VR_NONE; + flags |= PROT_TO_VR_FLAG(pn->sections[i].prot); if((up_v = ihk_mc_alloc_pages(range_npages, IHK_MC_AP_NOWAIT)) == NULL){ goto err; } up = virt_to_phys(up_v); - if(add_process_memory_range(proc, s, e, up, VR_NONE) != 0){ + if(add_process_memory_range(proc, s, e, up, flags) != 0){ ihk_mc_free_pages(up_v, range_npages); goto err; } @@ -163,25 +167,26 @@ static int process_msg_prepare_process(unsigned long rphys) (USER_END / 3) & LARGE_PAGE_MASK; /* 
Map system call stuffs */ + flags = VR_RESERVED | VR_PROT_READ | VR_PROT_WRITE; addr = proc->vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT; e = addr + PAGE_SIZE * DOORBELL_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).doorbell_pa, - VR_REMOTE | VR_RESERVED) != 0){ + VR_REMOTE | flags) != 0){ goto err; } addr = e; e = addr + PAGE_SIZE * REQUEST_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).request_pa, - VR_REMOTE | VR_RESERVED) != 0){ + VR_REMOTE | flags) != 0){ goto err; } addr = e; e = addr + PAGE_SIZE * RESPONSE_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).response_pa, - VR_RESERVED) != 0){ + flags) != 0){ goto err; } @@ -194,7 +199,8 @@ static int process_msg_prepare_process(unsigned long rphys) } args_envs_p = virt_to_phys(args_envs); - if(add_process_memory_range(proc, addr, e, args_envs_p, VR_NONE) != 0){ + if(add_process_memory_range(proc, addr, e, args_envs_p, + VR_PROT_READ|VR_PROT_WRITE) != 0){ ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT); goto err; } diff --git a/kernel/include/mman.h b/kernel/include/mman.h new file mode 100644 index 00000000..5f5d5546 --- /dev/null +++ b/kernel/include/mman.h @@ -0,0 +1,34 @@ +#ifndef HEADER_MMAN_H +#define HEADER_MMAN_H + +/* + * memory protection + */ +#define PROT_NONE 0 +#define PROT_READ 0x01 +#define PROT_WRITE 0x02 +#define PROT_EXEC 0x04 + +/* for mprotect */ +#define PROT_GROWSDOWN 0x01000000 +#define PROT_GROWSUP 0x02000000 + +/* + * mapping flags + */ +#define MAP_SHARED 0x01 +#define MAP_PRIVATE 0x02 +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_32BIT 0x40 +#define MAP_GROWSDOWN 0x0100 +#define MAP_DENYWRITE 0x0800 +#define MAP_EXECUTABLE 0x1000 +#define MAP_LOCKED 0x2000 +#define MAP_NORESERVE 0x4000 +#define MAP_POPULATE 0x8000 +#define MAP_NONBLOCK 0x00010000 +#define MAP_STACK 0x00020000 +#define MAP_HUGETLB 0x00040000 + +#endif /* HEADER_MMAN_H */ diff --git a/kernel/include/process.h 
b/kernel/include/process.h index 1a4b2690..3a8058b3 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -13,6 +13,13 @@ #define VR_IO_NOCACHE 0x100 #define VR_REMOTE 0x200 #define VR_DEMAND_PAGING 0x1000 +#define VR_PROT_NONE 0x00000000 +#define VR_PROT_READ 0x00010000 +#define VR_PROT_WRITE 0x00020000 +#define VR_PROT_EXEC 0x00040000 +#define VR_PROT_MASK 0x00070000 + +#define PROT_TO_VR_FLAG(prot) (((unsigned long)(prot) << 16) & VR_PROT_MASK) #define PS_RUNNING 0x1 #define PS_INTERRUPTIBLE 0x2 @@ -32,7 +39,6 @@ struct vm_range { struct list_head list; unsigned long start, end; - unsigned long phys; unsigned long flag; }; @@ -101,6 +107,8 @@ int add_process_memory_range(struct process *process, unsigned long phys, unsigned long flag); int remove_process_memory_range( struct process *process, unsigned long start, unsigned long end); +struct vm_range *lookup_process_memory_range( + struct process *proc, uintptr_t start, uintptr_t end); int remove_process_region(struct process *proc, unsigned long start, unsigned long end); struct program_load_desc; diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h index 057fc71d..2701489c 100644 --- a/kernel/include/syscall.h +++ b/kernel/include/syscall.h @@ -82,7 +82,11 @@ struct program_image_section { unsigned long len; unsigned long remote_pa; unsigned long filesz, offset; + int prot; + int padding; +#if 0 void *source; +#endif }; struct program_load_desc { diff --git a/kernel/process.c b/kernel/process.c index 8a4110d7..74e1e196 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include //#define DEBUG_PRINT_PROCESS @@ -104,13 +105,16 @@ extern void __host_update_process_range(struct process *process, struct vm_range *range); int update_process_page_table(struct process *process, - struct vm_range *range, enum ihk_mc_pt_attribute flag) + struct vm_range *range, uint64_t phys, + enum ihk_mc_pt_attribute flag) { - unsigned long p, pa = 
range->phys; + unsigned long p, pa = phys; unsigned long pp; unsigned long flags = ihk_mc_spinlock_lock(&process->vm->page_table_lock); - const enum ihk_mc_pt_attribute attr - = flag | PTATTR_WRITABLE | PTATTR_USER | PTATTR_FOR_USER; + enum ihk_mc_pt_attribute attr; + + attr = flag | PTATTR_USER | PTATTR_FOR_USER; + attr |= (range->flag & VR_PROT_WRITE)? PTATTR_WRITABLE: 0; p = range->start; while (p < range->end) { @@ -149,7 +153,7 @@ int update_process_page_table(struct process *process, err: pp = range->start; - pa = range->phys; + pa = phys; while(pp < p){ #ifdef USE_LARGE_PAGES if ((p & (LARGE_PAGE_SIZE - 1)) == 0 && @@ -180,7 +184,6 @@ int remove_process_memory_range(struct process *process, unsigned long start, un struct vm_range *next; int error; unsigned long freestart; - unsigned long freephys; unsigned long freesize; struct vm_range *freerange; struct vm_range *newrange; @@ -198,7 +201,6 @@ int remove_process_memory_range(struct process *process, unsigned long start, un if (start <= range->start) { /* partial or whole delete from range->start */ freestart = range->start; - freephys = range->phys; freesize = end - range->start; if (freesize >= (range->end - range->start)) { @@ -208,13 +210,11 @@ int remove_process_memory_range(struct process *process, unsigned long start, un } else { range->start += freesize; - range->phys += freesize; } } else if (range->end <= end) { /* partial delete up to range->end */ freestart = start; - freephys = range->phys + (start - range->start); freesize = range->end - start; range->end = start; @@ -222,7 +222,6 @@ int remove_process_memory_range(struct process *process, unsigned long start, un else { /* delete the middle part of the 'range' */ freestart = start; - freephys = range->phys + (start - range->start); freesize = end - start; newrange = kmalloc(sizeof(struct vm_range), IHK_MC_AP_NOWAIT); @@ -232,27 +231,36 @@ int remove_process_memory_range(struct process *process, unsigned long start, un } newrange->start = end; 
newrange->end = range->end; - newrange->phys = range->phys + (end - range->start); newrange->flag = range->flag; list_add_tail(&newrange->list, &vm->vm_range_list); range->end = start; } - /* FIXME: traverse page table entries and release physical memory area if present - then mark the page table entry non-present */ - if (freesize > 0 && !(range->flag & VR_DEMAND_PAGING)) { - dkprintf("remove_process_memory_range,remove_process_region\n"); - error = remove_process_region(process, freestart, (freestart + freesize)); - if (error) { - kprintf("remove_process_memory_range:remove_process_region failed: %d\n", error); - /* through */ + if (freesize > 0) { + if (!(range->flag & (VR_REMOTE | VR_IO_NOCACHE | VR_RESERVED))) { + /* clear page table and free physical pages */ + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + error = ihk_mc_pt_free_range(vm->page_table, + (void *)start, (void *)end); + ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + if (error && (error != -ENOENT)) { + kprintf("remove_process_memory_range:" + "ihk_mc_pt_free_range failed: %d\n", + error); + /* through */ + } } - - if (!(range->flag & (VR_REMOTE | VR_IO_NOCACHE | VR_RESERVED)) && !(range->flag & VR_DEMAND_PAGING)) { - dkprintf("remove_process_memory_range,ihk_mc_free_pages\n"); - // XXX: need TLB shootdown? 
- ihk_mc_free_pages(phys_to_virt(freephys), freesize>>PAGE_SHIFT); + else { + /* clear page table */ + error = remove_process_region(process, freestart, + (freestart + freesize)); + if (error) { + kprintf("remove_process_memory_range:" + "remove_process_region failed: %d\n", + error); + /* through */ + } } } if (freerange != NULL) { @@ -288,28 +296,29 @@ int add_process_memory_range(struct process *process, INIT_LIST_HEAD(&range->list); range->start = start; range->end = end; - range->phys = phys; range->flag = flag; if(range->flag & VR_DEMAND_PAGING) { - dkprintf("range: 0x%lX - 0x%lX => physicall memory area is allocated on demand (%ld)\n", - range->start, range->end, range->end - range->start); + dkprintf("range: 0x%lX - 0x%lX => physicall memory area is allocated on demand (%ld) [%lx]\n", + range->start, range->end, range->end - range->start, + range->flag); } else { - dkprintf("range: 0x%lX - 0x%lX => 0x%lX - 0x%lX (%ld)\n", + dkprintf("range: 0x%lX - 0x%lX => 0x%lX - 0x%lX (%ld) [%lx]\n", range->start, range->end, range->phys, range->phys + - range->end - range->start, range->end - range->start); + range->end - range->start, range->end - range->start, + range->flag); } if (flag & VR_REMOTE) { - rc = update_process_page_table(process, range, IHK_PTA_REMOTE); + rc = update_process_page_table(process, range, phys, IHK_PTA_REMOTE); } else if (flag & VR_IO_NOCACHE) { - rc = update_process_page_table(process, range, PTATTR_UNCACHABLE); + rc = update_process_page_table(process, range, phys, PTATTR_UNCACHABLE); } else if(flag & VR_DEMAND_PAGING){ //demand paging no need to update process table now kprintf("demand paging do not update process page table\n"); rc = 0; } else { - rc = update_process_page_table(process, range, 0); + rc = update_process_page_table(process, range, phys, 0); } if(rc != 0){ kfree(range); @@ -325,13 +334,30 @@ int add_process_memory_range(struct process *process, list_add_tail(&range->list, &process->vm->vm_range_list); /* Clear content! 
*/ - if (!(flag & VR_REMOTE) && !(flag & VR_DEMAND_PAGING)) - memset((void*)phys_to_virt(range->phys), 0, end - start); + if (!(flag & (VR_REMOTE | VR_DEMAND_PAGING)) + && ((flag & VR_PROT_MASK) != VR_PROT_NONE)) { + memset((void*)phys_to_virt(phys), 0, end - start); + } return 0; } +struct vm_range *lookup_process_memory_range(struct process *proc, uintptr_t start, uintptr_t end) +{ + struct vm_range *range; + if (end <= start) { + return NULL; + } + + list_for_each_entry(range, &proc->vm->vm_range_list, list) { + if ((start < range->end) && (range->start < end)) { + return range; + } + } + + return NULL; +} int init_process_stack(struct process *process, struct program_load_desc *pn, int argc, char **argv, @@ -351,7 +377,8 @@ int init_process_stack(struct process *process, struct program_load_desc *pn, memset(stack, 0, size); - if((rc = add_process_memory_range(process, start, end, virt_to_phys(stack), VR_STACK)) != 0){ + if ((rc = add_process_memory_range(process, start, end, virt_to_phys(stack), + VR_STACK|VR_PROT_READ|VR_PROT_WRITE)) != 0) { ihk_mc_free_pages(stack, USER_STACK_NR_PAGES); return rc; } @@ -423,7 +450,7 @@ unsigned long extend_process_region(struct process *proc, return end; } if((rc = add_process_memory_range(proc, old_aligned_end, - aligned_end, virt_to_phys(p), VR_NONE)) != 0){ + aligned_end, virt_to_phys(p), flag)) != 0){ free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT); return end; } @@ -509,6 +536,7 @@ void free_process_memory(struct process *proc) { struct vm_range *range, *next; struct process_vm *vm = proc->vm; + int error; if (vm == NULL) { return; @@ -519,17 +547,22 @@ void free_process_memory(struct process *proc) return; } - list_for_each_entry_safe(range, next, &vm->vm_range_list, - list) { - if (!(range->flag & (VR_REMOTE | VR_IO_NOCACHE | VR_RESERVED - | VR_DEMAND_PAGING))) { - ihk_mc_free_pages(phys_to_virt(range->phys), - (range->end - range->start) - >> PAGE_SHIFT); + 
ihk_mc_spinlock_lock_noirq(&proc->vm->page_table_lock); + list_for_each_entry_safe(range, next, &vm->vm_range_list, list) { + if (!(range->flag & (VR_REMOTE | VR_IO_NOCACHE | VR_RESERVED))) { + error = ihk_mc_pt_free_range(vm->page_table, + (void *)range->start, (void *)range->end); + if (error && (error != -ENOENT)) { + kprintf("free_process_memory:" + "ihk_mc_pt_free_range(%lx,%lx) failed. %d\n", + range->start, range->end, error); + /* through */ + } } list_del(&range->list); ihk_mc_free(range); } + ihk_mc_spinlock_unlock_noirq(&proc->vm->page_table_lock); ihk_mc_pt_destroy(vm->page_table); free_process(vm->owner_process); diff --git a/kernel/syscall.c b/kernel/syscall.c index d238b339..2b0fcca2 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Headers taken from kitten LWK */ #include @@ -125,6 +126,14 @@ int do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx) return res->ret; } +long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx) +{ + SYSCALL_HEADER; + dkprintf("syscall_generic_forwarding(%d)\n", n); + SYSCALL_ARGS_6(D,D,D,D,D,D); + SYSCALL_FOOTER; +} + SYSCALL_DECLARE(open) { @@ -165,6 +174,7 @@ SYSCALL_DECLARE(mmap) { struct vm_regions *region = &cpu_local_var(current)->vm->region; void *va; + const unsigned long prot_flags = VR_PROT_READ | VR_PROT_WRITE | VR_PROT_EXEC; dkprintf("syscall.c,mmap,addr=%lx,len=%lx,prot=%lx,flags=%x,fd=%x,offset=%lx\n", ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx), @@ -219,7 +229,7 @@ SYSCALL_DECLARE(mmap) // add range, mapping if(add_process_memory_range(cpu_local_var(current), s_orig, e, - virt_to_phys((void *)(p_aligned - head_space)), VR_NONE) != 0){ + virt_to_phys((void *)(p_aligned - head_space)), prot_flags) != 0){ ihk_mc_free_pages((void *)p, range_npages); return -ENOMEM; } @@ -237,7 +247,7 @@ SYSCALL_DECLARE(mmap) pa = virt_to_phys(va); // add page_table, add memory-range - 
if(add_process_memory_range(cpu_local_var(current), s, e, pa, VR_NONE) != 0){ + if(add_process_memory_range(cpu_local_var(current), s, e, pa, prot_flags) != 0){ ihk_mc_free_pages(va, range_npages); return -ENOMEM; } @@ -277,6 +287,8 @@ SYSCALL_DECLARE(mmap) dkprintf("syscall.c,mmap,len=%lx\n", len); unsigned long flag = 0; /* eager paging */ + + flag |= VR_PROT_READ|VR_PROT_WRITE|VR_PROT_EXEC; #if 1 /* Intel OpenMP hack: it requests 128MB and munmap tail and head to create 64MB-aligned 64MB memory area @@ -295,7 +307,7 @@ SYSCALL_DECLARE(mmap) dkprintf("syscall.c,mmap,nocache,len=%lx\n", len); region->map_end = extend_process_region( cpu_local_var(current), region->map_start, map_end_aligned, - s + len, VR_IO_NOCACHE); + s + len, VR_IO_NOCACHE|(flag & ~VR_DEMAND_PAGING)); } else #endif @@ -334,12 +346,13 @@ SYSCALL_DECLARE(mmap) unsigned long amt_align = 0x100000; /* takagi */ unsigned long s = ((region->map_end + amt_align - 1) & ~(amt_align - 1)); unsigned long len = (ihk_mc_syscall_arg1(ctx) + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long flag = VR_PROT_READ|VR_PROT_WRITE|VR_PROT_EXEC; dkprintf("(%d),syscall.c,!MAP_FIXED,!MAP_ANONYMOUS,amt_align=%lx,s=%lx,len=%lx\n", ihk_mc_get_processor_id(), amt_align, s, len); region->map_end = extend_process_region(cpu_local_var(current), region->map_start, s, - s + len, 0); + s + len, flag); #else unsigned long s = (region->map_end + PAGE_SIZE - 1) & PAGE_MASK; unsigned long len = (ihk_mc_syscall_arg1(ctx) + PAGE_SIZE - 1) & PAGE_MASK; @@ -347,7 +360,7 @@ SYSCALL_DECLARE(mmap) extend_process_region(cpu_local_var(current), region->map_start, region->map_end, - s + len, 0); + s + len, flag); #endif ihk_mc_spinlock_unlock_noirq(&cpu_local_var(current)->vm->memory_range_lock); if (region->map_end < s + len) { return -EINVAL; } @@ -375,16 +388,266 @@ SYSCALL_DECLARE(mmap) while(1); } +static int do_munmap(void *addr, size_t len) +{ + return remove_process_memory_range( + cpu_local_var(current), (intptr_t)addr, (intptr_t)addr+len); 
+}
+
+/*
+ * Look for len bytes of unused user address space, starting at hint.
+ *
+ * The candidate range [addr, addr+len) is probed with
+ * lookup_process_memory_range(); each time a range is reported, the search
+ * restarts at that range's end.  With USE_LARGE_PAGES, candidates for
+ * requests of at least LARGE_PAGE_SIZE are first rounded up using
+ * LARGE_PAGE_MASK.
+ *
+ * Returns 0 and stores the chosen address in *addrp, or -ENOMEM when the
+ * candidate no longer fits below region->user_end.
+ *
+ * new_mmap() calls this with proc->vm->memory_range_lock held.
+ */
+static int search_free_space(size_t len, intptr_t hint, intptr_t *addrp)
+{
+	struct process *proc = cpu_local_var(current);
+	struct vm_regions *region = &proc->vm->region;
+	intptr_t addr;
+	struct vm_range *range;
+
+	addr = hint;
+	for (;;) {
+#ifdef USE_LARGE_PAGES
+		if (len >= LARGE_PAGE_SIZE) {
+			addr = (addr + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK;
+		}
+#endif /* USE_LARGE_PAGES */
+
+		/* written as (user_end - len) < addr so addr+len is never formed unchecked */
+		if ((region->user_end <= addr)
+				|| ((region->user_end - len) < addr)) {
+			kprintf("search_free_space:no virtual: %lx %lx %lx\n",
+					addr, len, region->user_end);
+			return -ENOMEM;
+		}
+
+		range = lookup_process_memory_range(proc, addr, addr+len);
+		if (range == NULL) {
+			break;
+		}
+		addr = range->end;
+	}
+
+	*addrp = addr;
+	return 0;
+}
+
+/*
+ * mmap() system call handler.
+ *
+ * Arguments are read from ctx in the order addr, len, prot, flags, fd, off.
+ *
+ * Only MAP_PRIVATE mappings are accepted: MAP_SHARED and the other flags in
+ * error_flags, as well as any flag that appears in none of the three tables
+ * below, are rejected with -EINVAL.  Flags in ignored_flags are accepted;
+ * of those only MAP_32BIT has an effect, and only when USE_NOCACHE_MMAP is
+ * defined (it then selects VR_IO_NOCACHE for anonymous mappings).
+ *
+ * With MAP_FIXED the requested range is first unmapped with do_munmap();
+ * otherwise an address is chosen by search_free_space() starting at
+ * region->map_end.  Backing pages are allocated up front unless the range
+ * is marked VR_DEMAND_PAGING.  Non-anonymous mappings are then filled by
+ * forwarding pread64(fd, addr, len, off) via syscall_generic_forwarding().
+ *
+ * Returns the mapped address on success, or a negative error value.
+ */
+SYSCALL_DECLARE(new_mmap)
+{
+	const int supported_flags = 0
+		| MAP_PRIVATE		// 02
+		| MAP_FIXED		// 10
+		| MAP_ANONYMOUS		// 20
+		;
+	const int ignored_flags = 0
+#ifdef USE_NOCACHE_MMAP
+		| MAP_32BIT		// 40
+#endif /* USE_NOCACHE_MMAP */
+		| MAP_DENYWRITE		// 0800
+		| MAP_NORESERVE		// 4000
+		| MAP_STACK		// 00020000
+		;
+	const int error_flags = 0
+		| MAP_SHARED		// 01
+#ifndef USE_NOCACHE_MMAP
+		| MAP_32BIT		// 40
+#endif /* ndef USE_NOCACHE_MMAP */
+		| MAP_GROWSDOWN		// 0100
+		| MAP_EXECUTABLE	// 1000
+		| MAP_LOCKED		// 2000
+		| MAP_POPULATE		// 8000
+		| MAP_NONBLOCK		// 00010000
+		| MAP_HUGETLB		// 00040000
+		;
+
+	const intptr_t addr0 = ihk_mc_syscall_arg0(ctx);
+	const size_t len0 = ihk_mc_syscall_arg1(ctx);
+	const int prot = ihk_mc_syscall_arg2(ctx);
+	const int flags = ihk_mc_syscall_arg3(ctx);
+	const int fd = ihk_mc_syscall_arg4(ctx);
+	const off_t off = ihk_mc_syscall_arg5(ctx);
+
+	struct process *proc = cpu_local_var(current);
+	struct vm_regions *region = &proc->vm->region;
+	intptr_t addr;
+	size_t len;
+	int error;
+	intptr_t npages;
+	int p2align;
+	void *p;
+	int vrflags;
+	intptr_t phys;
+
+	dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n",
+			ihk_mc_get_processor_id(),
+			addr0, len0, prot, flags, fd, off);
+
+	/* check constants for flags: the three tables must be pairwise disjoint */
+	if (1) {
+		int dup_flags;
+
+		dup_flags = (supported_flags & ignored_flags);
+		dup_flags |= (ignored_flags & error_flags);
+		dup_flags |= (error_flags & supported_flags);
+
+		if (dup_flags) {
+			kprintf("sys_mmap:duplicate flags: %x\n", dup_flags);
+			kprintf("s-flags: %08x\n", supported_flags);
+			kprintf("i-flags: %08x\n", ignored_flags);
+			kprintf("e-flags: %08x\n", error_flags);
+			panic("sys_mmap:duplicate flags\n");
+			/* no return */
+		}
+	}
+
+	/* check arguments */
+	/* without MAP_FIXED the hint addr0 is not used; validate a known-good address instead */
+#define VALID_DUMMY_ADDR (region->user_start)
+	addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR;
+	len = (len0 + PAGE_SIZE - 1) & PAGE_MASK;
+	if ((addr & (PAGE_SIZE - 1))
+			|| (addr < region->user_start)
+			|| (region->user_end <= addr)
+			|| (len == 0)
+			|| (len > (region->user_end - region->user_start))
+			|| ((region->user_end - len) < addr)
+			|| !(flags & (MAP_SHARED | MAP_PRIVATE))
+			|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
+			|| (off & (PAGE_SIZE - 1))) {
+		kprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
+				addr0, len0, prot, flags, fd, off);
+		error = -EINVAL;
+		goto out;
+	}
+
+	/* check not supported requests */
+	if ((flags & error_flags)
+			|| (flags & ~(supported_flags | ignored_flags))) {
+		kprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):unknown flags %x\n",
+				addr0, len0, prot, flags, fd, off,
+				(flags & ~(supported_flags | ignored_flags)));
+		error = -EINVAL;
+		goto out;
+	}
+
+	ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
+
+	if (flags & MAP_FIXED) {
+		/* clear specified address range */
+		error = do_munmap((void *)addr, len);
+		if (error) {
+			kprintf("sys_mmap:do_munmap(%lx,%lx) failed. %d\n",
+					addr, len, error);
+			ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
+			goto out;
+		}
+	}
+	else {
+		/* choose mapping address */
+		error = search_free_space(len, region->map_end, &addr);
+		if (error) {
+			kprintf("sys_mmap:search_free_space(%lx,%lx) failed. %d\n",
+					len, region->map_end, error);
+			ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
+			goto out;
+		}
+		region->map_end = addr + len;
+	}
+
+	/* do the map */
+	vrflags = VR_NONE;
+	vrflags |= (prot & PROT_READ)? VR_PROT_READ: 0;
+	vrflags |= (prot & PROT_WRITE)? VR_PROT_WRITE: 0;
+	vrflags |= (prot & PROT_EXEC)? VR_PROT_EXEC: 0;
+	if (flags & MAP_ANONYMOUS) {
+		/* empty head of the chain so every real case can be an "else if" */
+		if (0) {
+			/* dummy */
+		}
+#ifdef USE_NOCACHE_MMAP
+#define X_MAP_NOCACHE MAP_32BIT
+		else if (flags & X_MAP_NOCACHE) {
+			vrflags |= VR_IO_NOCACHE;
+		}
+#endif
+		/* only anonymous mappings of exactly 64 MiB or 128 MiB are demand paged */
+		else if ((len == 64*1024*1024) || (len == 128*1024*1024)) {
+			vrflags |= VR_DEMAND_PAGING;
+		}
+	}
+
+	p = NULL;
+	phys = 0;
+	if (!(vrflags & VR_DEMAND_PAGING)) {
+		npages = len >> PAGE_SHIFT;
+		p2align = PAGE_P2ALIGN;
+#ifdef USE_LARGE_PAGES
+		if ((len >= LARGE_PAGE_SIZE)
+				&& ((addr & (LARGE_PAGE_SIZE - 1)) == 0)) {
+			p2align = LARGE_PAGE_P2ALIGN;
+		}
+#endif /* USE_LARGE_PAGES */
+		p = ihk_mc_alloc_aligned_pages(npages, p2align, IHK_MC_AP_NOWAIT);
+		if (p == NULL) {
+			kprintf("sys_mmap:allocate_pages(%ld,%d) failed.\n",
+					(long)npages, p2align);
+			ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
+			error = -ENOMEM;
+			goto out;
+		}
+		phys = virt_to_phys(p);
+	}
+
+	error = add_process_memory_range(proc, addr, addr+len, phys, vrflags);
+	if (error) {
+		/* log phys, the value actually passed; p is NULL for demand paging */
+		kprintf("sys_mmap:add_process_memory_range"
+				"(%p,%lx,%lx,%lx,%x) failed %d\n",
+				proc, addr, addr+len,
+				phys, vrflags, error);
+		ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
+		/* npages is only set when pages were allocated, i.e. when p != NULL */
+		if (p != NULL) {
+			ihk_mc_free_pages(p, npages);
+		}
+		goto out;
+	}
+
+	ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
+
+	/* read page with pread64() */
+	/* NOTE(review): a short read (0 <= ss < len) is accepted as is - confirm intended */
+	if (!(flags & MAP_ANONYMOUS)) {
+		ihk_mc_user_context_t ctx2;
+		ssize_t ss;
+
+		ihk_mc_syscall_arg0(&ctx2) = fd;
+		ihk_mc_syscall_arg1(&ctx2) = addr;
+		ihk_mc_syscall_arg2(&ctx2) = len;
+		ihk_mc_syscall_arg3(&ctx2) = off;
+
+		ss = syscall_generic_forwarding(__NR_pread64, &ctx2);
+		if (ss < 0) {
+			kprintf("sys_mmap:pread(%d,%lx,%lx,%lx) failed %ld\n",
+					fd, addr, len, off, (long)ss);
+			/*
+			 * The MAP_FIXED path above and munmap() both call
+			 * do_munmap() with memory_range_lock held, so retake
+			 * the lock before undoing the mapping.
+			 */
+			ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
+			error = do_munmap((void *)addr, len);
+			ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
+			if (error) {
+				kprintf("sys_mmap:do_munmap(%lx,%lx) failed. %d\n",
+						addr, len, error);
+				/* through */
+			}
+			/* report the pread error, not the cleanup result */
+			error = ss;
+			goto out;
+		}
+	}
+
+	error = 0;
+out:
+	if (error) {
+		kprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx): %d %lx\n",
+				ihk_mc_get_processor_id(),
+				addr0, len0, prot, flags, fd, off, error, addr);
+	}
+	return (!error)? addr: error;
+}
+
 SYSCALL_DECLARE(munmap)
 {
-	unsigned long address, len;
+	void * const addr = (void *)ihk_mc_syscall_arg0(ctx);
+	const size_t len = ihk_mc_syscall_arg1(ctx);
 	int error;
 
-	address = ihk_mc_syscall_arg0(ctx);
-	len = ihk_mc_syscall_arg1(ctx);
-
 	ihk_mc_spinlock_lock_noirq(&cpu_local_var(current)->vm->memory_range_lock);
-	error = remove_process_memory_range(cpu_local_var(current), address, address+len);
+	error = do_munmap(addr, len);
 	ihk_mc_spinlock_unlock_noirq(&cpu_local_var(current)->vm->memory_range_lock);
 
 	return error;
@@ -420,7 +683,8 @@ SYSCALL_DECLARE(brk)
 	/* try to extend memory region */
 	ihk_mc_spinlock_lock_noirq(&cpu_local_var(current)->vm->memory_range_lock);
 	region->brk_end = extend_process_region(cpu_local_var(current),
-			region->brk_start, region->brk_end, address, 0);
+			region->brk_start, region->brk_end, address,
+			VR_PROT_READ|VR_PROT_WRITE);
 	ihk_mc_spinlock_unlock_noirq(&cpu_local_var(current)->vm->memory_range_lock);
 	dkprintf("SC(%d)[sys_brk] brk_end set to %lx\n",
 			ihk_mc_get_processor_id(), region->brk_end);
@@ -859,14 +1123,6 @@ SYSCALL_DECLARE(pmc_reset)
 	return ihk_mc_perfctr_reset(counter);
 }
 
-long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx)
-{
-	SYSCALL_HEADER;
-	dkprintf("syscall_generic_forwarding(%d)\n", n);
-	SYSCALL_ARGS_6(D,D,D,D,D,D);
-	SYSCALL_FOOTER;
-}
-
 long syscall(int num, ihk_mc_user_context_t *ctx)
 {
 	long l;
@@ -918,6 +1174,7 @@ long syscall(int num, ihk_mc_user_context_t *ctx)
 	return l;
 }
 
+#if 0
 void __host_update_process_range(struct process *process,
 		struct vm_range *range)
 {
@@ -948,4 +1205,4 @@ void __host_update_process_range(struct process *process,
 	}
 	cpu_enable_interrupt();
 }
-
+#endif
diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h
index 14e547d6..a3f53335 100644
--- a/lib/include/ihk/mm.h
+++ b/lib/include/ihk/mm.h
@@ -94,6 +94,7 @@ int ihk_mc_pt_change_page(page_table_t pt, void *virt,
 int ihk_mc_pt_clear_page(page_table_t pt, void *virt);
 int ihk_mc_pt_clear_large_page(page_table_t pt, void *virt);
 int ihk_mc_pt_clear_range(page_table_t pt, void *start, void *end);
+int ihk_mc_pt_free_range(page_table_t pt, void *start, void *end);
 int ihk_mc_pt_prepare_map(page_table_t pt, void *virt,
 		unsigned long size,
 		enum ihk_mc_pt_prepare_flag);