From 16af976a7105a2b627af8221f65ff9242ae5c391 Mon Sep 17 00:00:00 2001
From: NAKAMURA Gou
Date: Wed, 9 Jul 2014 14:19:26 +0900
Subject: [PATCH] support msync() system call. refs #382

Msync(2) of this version writes only the pages which the calling process
modified. Modifications of the other processes are not written.
---
 arch/x86/kernel/include/arch-memory.h  |  49 +++++++
 arch/x86/kernel/include/syscall_list.h |   1 +
 kernel/fileobj.c                       |  38 ++++++
 kernel/include/memobj.h                |  12 ++
 kernel/include/mman.h                  |   7 +
 kernel/include/process.h               |   4 +
 kernel/process.c                       | 173 +++++++++++++++++++++++++
 kernel/syscall.c                       | 125 ++++++++++++++++++
 8 files changed, 409 insertions(+)

diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h
index 6a7ee477..7b8f4f26 100644
--- a/arch/x86/kernel/include/arch-memory.h
+++ b/arch/x86/kernel/include/arch-memory.h
@@ -204,6 +204,13 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize)
 	return (off_t)(*ptep & PAGE_MASK);
 }
 
+/* Overwrite the entry at @ptep with PTE_NULL; @pgsize is unused. */
+static inline void pte_make_null(pte_t *ptep, size_t pgsize)
+{
+	*ptep = PTE_NULL;
+	return;
+}
+
 static inline void pte_make_fileoff(off_t off,
 		enum ihk_mc_pt_attribute ptattr, size_t pgsize, pte_t *ptep)
 {
@@ -235,6 +242,48 @@ static inline void pte_xchg(pte_t *ptep, pte_t *valp)
 #define pte_xchg(p,vp) do { *(vp) = xchg((p), *(vp)); } while (0)
 #endif
 
+/*
+ * Atomically clear the dirty flag of the entry at @ptep.  @pgsize picks
+ * the PFLn_DIRTY mask; any other size falls back to the PTL1 one.
+ * The entry is an in/out ("+m") asm operand because the locked
+ * instruction both reads and writes it.
+ */
+static inline void pte_clear_dirty(pte_t *ptep, size_t pgsize)
+{
+	uint64_t mask;
+
+	switch (pgsize) {
+	default: /* through */
+	case PTL1_SIZE: mask = ~PFL1_DIRTY; break;
+	case PTL2_SIZE: mask = ~PFL2_DIRTY; break;
+	case PTL3_SIZE: mask = ~PFL3_DIRTY; break;
+	}
+
+	asm volatile ("lock andq %1,%0" : "+m"(*ptep) : "r"(mask) : "memory");
+	return;
+}
+
+/*
+ * Atomically set the dirty flag of the entry at @ptep.  @pgsize picks
+ * the PFLn_DIRTY mask; any other size falls back to the PTL1 one.
+ * The entry is an in/out ("+m") asm operand because the locked
+ * instruction both reads and writes it.
+ */
+static inline void pte_set_dirty(pte_t *ptep, size_t pgsize)
+{
+	uint64_t mask;
+
+	switch (pgsize) {
+	default: /* through */
+	case PTL1_SIZE: mask = PFL1_DIRTY; break;
+	case PTL2_SIZE: mask = PFL2_DIRTY; break;
+	case PTL3_SIZE: mask = PFL3_DIRTY; break;
+	}
+
+	asm volatile ("lock orq %1,%0" : "+m"(*ptep) : "r"(mask) : "memory");
+	return;
+}
+
 struct page_table;
 void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
 pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h
index 5a71c59d..23dd6aa8 100644
--- a/arch/x86/kernel/include/syscall_list.h
+++ b/arch/x86/kernel/include/syscall_list.h
@@ -41,6 +41,7 @@ SYSCALL_DELEGATED(20, writev)
 SYSCALL_DELEGATED(21, access)
 SYSCALL_HANDLED(24, sched_yield)
 SYSCALL_HANDLED(25, mremap)
+SYSCALL_HANDLED(26, msync)
 SYSCALL_HANDLED(28, madvise)
 SYSCALL_HANDLED(34, pause)
 SYSCALL_HANDLED(39, getpid)
diff --git a/kernel/fileobj.c b/kernel/fileobj.c
index 6404627e..6a89790e 100644
--- a/kernel/fileobj.c
+++ b/kernel/fileobj.c
@@ -46,6 +46,7 @@ static memobj_ref_func_t fileobj_ref;
 static memobj_get_page_func_t fileobj_get_page;
 static memobj_copy_page_func_t fileobj_copy_page;
 static memobj_flush_page_func_t fileobj_flush_page;
+static memobj_invalidate_page_func_t fileobj_invalidate_page;
 
 static struct memobj_ops fileobj_ops = {
 	.release = &fileobj_release,
@@ -53,6 +54,7 @@ static struct memobj_ops fileobj_ops = {
 	.get_page = &fileobj_get_page,
 	.copy_page = &fileobj_copy_page,
 	.flush_page = &fileobj_flush_page,
+	.invalidate_page = &fileobj_invalidate_page,
 };
 
 static struct fileobj *to_fileobj(struct memobj *memobj)
@@ -577,3 +579,39 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
 	memobj_lock(&obj->memobj);
 	return 0;
 }
+
+/*
+ * invalidate_page op for file objects.  When @phys has a struct page,
+ * page_list_lookup() finds its offset in this object and the page count
+ * is 1, the page is unmapped and freed if page_unmap() returns non-zero.
+ * Always returns 0.
+ */
+static int fileobj_invalidate_page(struct memobj *memobj, uintptr_t phys,
+		size_t pgsize)
+{
+	struct fileobj *obj = to_fileobj(memobj);
+	int error;
+	struct page *page;
+
+	dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx)\n",
+			memobj, phys, pgsize);
+
+	if (!(page = phys_to_page(phys))
+			|| !(page = page_list_lookup(obj, page->offset))) {
+		error = 0;
+		goto out;
+	}
+
+	if (ihk_atomic_read(&page->count) == 1) {
+		if (page_unmap(page)) {
+			ihk_mc_free_pages(phys_to_virt(phys),
+					pgsize/PAGE_SIZE);
+		}
+	}
+
+	error = 0;
+out:
+	dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx):%d\n",
+			memobj, phys, pgsize, error);
+	return error;
+}
diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h
index 65d8ed2f..1545f94b 100644
--- a/kernel/include/memobj.h
+++ b/kernel/include/memobj.h
@@ -37,6 +37,7 @@ typedef void memobj_ref_func_t(struct memobj *obj);
 typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp);
 typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align);
 typedef int memobj_flush_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
+typedef int memobj_invalidate_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
 
 struct memobj_ops {
 	memobj_release_func_t * release;
@@ -44,6 +45,7 @@ struct memobj_ops {
 	memobj_get_page_func_t * get_page;
 	memobj_copy_page_func_t * copy_page;
 	memobj_flush_page_func_t * flush_page;
+	memobj_invalidate_page_func_t * invalidate_page;
 };
 
 static inline void memobj_release(struct memobj *obj)
@@ -86,6 +88,16 @@ static inline int memobj_flush_page(struct memobj *obj, uintptr_t phys, size_t p
 	return 0;
 }
 
+/* Call the object's invalidate_page op if it has one; otherwise return 0. */
+static inline int memobj_invalidate_page(struct memobj *obj, uintptr_t phys,
+		size_t pgsize)
+{
+	if (obj->ops->invalidate_page) {
+		return (*obj->ops->invalidate_page)(obj, phys, pgsize);
+	}
+	return 0;
+}
+
 static inline void memobj_lock(struct memobj *obj)
 {
 	ihk_mc_spinlock_lock_noirq(&obj->lock);
diff --git a/kernel/include/mman.h b/kernel/include/mman.h
index b7555eef..f63bb24b 100644
--- a/kernel/include/mman.h
+++ b/kernel/include/mman.h
@@ -69,4 +69,11 @@
 #define MREMAP_MAYMOVE 0x01
 #define MREMAP_FIXED 0x02
 
+/*
+ * for msync()
+ */
+#define MS_ASYNC 0x01
+#define MS_INVALIDATE 0x02
+#define MS_SYNC 0x04
+
 #endif /* HEADER_MMAN_H */
diff --git a/kernel/include/process.h b/kernel/include/process.h
index fa294771..718c21ca 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -405,6 +405,10 @@ int change_prot_process_memory_range(
 		unsigned long newflag);
 int remap_process_memory_range(struct process_vm *vm, struct vm_range *range,
 		uintptr_t start, uintptr_t end, off_t off);
+int sync_process_memory_range(struct process_vm *vm, struct vm_range *range,
+		uintptr_t start, uintptr_t end);
+int invalidate_process_memory_range(struct process_vm *vm,
+		struct vm_range *range, uintptr_t start, uintptr_t end);
 struct vm_range *lookup_process_memory_range(
 		struct process_vm *vm, uintptr_t start, uintptr_t end);
 struct vm_range *next_process_memory_range(
diff --git a/kernel/process.c b/kernel/process.c
index c05536e5..8355ad1d 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -1210,6 +1210,179 @@ out:
 	return error;
 }
 
+/* Argument block handed to sync_one_page() through visit_pte_range(). */
+struct sync_args {
+	struct memobj *memobj;
+};
+
+/*
+ * visit_pte_range() callback: write back one dirty page.
+ * Null, file-offset and clean entries are skipped.  The dirty flag is
+ * cleared before memobj_flush_page() is called and set again if the
+ * flush fails.  Returns 0 or the flush error.
+ */
+static int sync_one_page(void *arg0, page_table_t pt, pte_t *ptep,
+		void *pgaddr, size_t pgsize)
+{
+	struct sync_args *args = arg0;
+	int error;
+	uintptr_t phys;
+
+	dkprintf("sync_one_page(%p,%p,%p %#lx,%p,%#lx)\n",
+			arg0, pt, ptep, *ptep, pgaddr, pgsize);
+	if (pte_is_null(ptep) || pte_is_fileoff(ptep, pgsize)
+			|| !pte_is_dirty(ptep, pgsize)) {
+		error = 0;
+		goto out;
+	}
+
+	pte_clear_dirty(ptep, pgsize);
+	flush_tlb_single((uintptr_t)pgaddr); /* XXX: TLB flush */
+
+	phys = pte_get_phys(ptep);
+	error = memobj_flush_page(args->memobj, phys, pgsize);
+	if (error) {
+		ekprintf("sync_one_page(%p,%p,%p %#lx,%p,%#lx):"
+				"flush failed. %d\n",
+				arg0, pt, ptep, *ptep, pgaddr, pgsize, error);
+		pte_set_dirty(ptep, pgsize);
+		goto out;
+	}
+
+	error = 0;
+out:
+	dkprintf("sync_one_page(%p,%p,%p %#lx,%p,%#lx):%d\n",
+			arg0, pt, ptep, *ptep, pgaddr, pgsize, error);
+	return error;
+}
+
+/*
+ * Write back the dirty pages mapped between @start and @end of @range.
+ * Holds vm->page_table_lock and the memobj lock during the page table
+ * walk.  Returns 0 or the error from visit_pte_range().
+ */
+int sync_process_memory_range(struct process_vm *vm, struct vm_range *range,
+		uintptr_t start, uintptr_t end)
+{
+	int error;
+	struct sync_args args;
+
+	dkprintf("sync_process_memory_range(%p,%p,%#lx,%#lx)\n",
+			vm, range, start, end);
+	args.memobj = range->memobj;
+
+	ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
+	memobj_lock(range->memobj);
+	error = visit_pte_range(vm->page_table, (void *)start, (void *)end,
+			VPTEF_SKIP_NULL, &sync_one_page, &args);
+	memobj_unlock(range->memobj);
+	ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
+	if (error) {
+		ekprintf("sync_process_memory_range(%p,%p,%#lx,%#lx):"
+				"visit failed%d\n",
+				vm, range, start, end, error);
+		goto out;
+	}
+out:
+	dkprintf("sync_process_memory_range(%p,%p,%#lx,%#lx):%d\n",
+			vm, range, start, end, error);
+	return error;
+}
+
+/* Argument block handed to invalidate_one_page() through visit_pte_range(). */
+struct invalidate_args {
+	struct vm_range *range;
+};
+
+/*
+ * visit_pte_range() callback: drop one mapped page.
+ * The entry becomes a null entry when the page sits at the range's
+ * linear file offset or has no struct page (there is then no offset to
+ * record), and a file-offset entry otherwise.  The page is then passed
+ * to memobj_invalidate_page().  Returns 0 or that call's error.
+ */
+static int invalidate_one_page(void *arg0, page_table_t pt, pte_t *ptep,
+		void *pgaddr, size_t pgsize)
+{
+	struct invalidate_args *args = arg0;
+	struct vm_range *range = args->range;
+	int error;
+	uintptr_t phys;
+	struct page *page;
+	off_t linear_off;
+	pte_t apte;
+
+	dkprintf("invalidate_one_page(%p,%p,%p %#lx,%p,%#lx)\n",
+			arg0, pt, ptep, *ptep, pgaddr, pgsize);
+	if (pte_is_null(ptep) || pte_is_fileoff(ptep, pgsize)) {
+		error = 0;
+		goto out;
+	}
+
+	phys = pte_get_phys(ptep);
+	page = phys_to_page(phys);
+	linear_off = range->objoff + ((uintptr_t)pgaddr - range->start);
+	if (!page || (page->offset == linear_off)) {
+		pte_make_null(&apte, pgsize);
+	}
+	else {
+		pte_make_fileoff(page->offset, 0, pgsize, &apte);
+	}
+	pte_xchg(ptep, &apte);
+	flush_tlb_single((uintptr_t)pgaddr); /* XXX: TLB flush */
+
+	if (page && page_unmap(page)) {
+		panic("invalidate_one_page");
+	}
+
+	error = memobj_invalidate_page(range->memobj, phys, pgsize);
+	if (error) {
+		ekprintf("invalidate_one_page(%p,%p,%p %#lx,%p,%#lx):"
+				"invalidate failed. %d\n",
+				arg0, pt, ptep, *ptep, pgaddr, pgsize, error);
+		goto out;
+	}
+
+	error = 0;
+out:
+	dkprintf("invalidate_one_page(%p,%p,%p %#lx,%p,%#lx):%d\n",
+			arg0, pt, ptep, *ptep, pgaddr, pgsize, error);
+	return error;
+}
+
+/*
+ * Invalidate the pages mapped between @start and @end of @range.
+ * Holds vm->page_table_lock and the memobj lock during the page table
+ * walk.  Returns 0 or the error from visit_pte_range().
+ */
+int invalidate_process_memory_range(struct process_vm *vm,
+		struct vm_range *range, uintptr_t start, uintptr_t end)
+{
+	int error;
+	struct invalidate_args args;
+
+	dkprintf("invalidate_process_memory_range(%p,%p,%#lx,%#lx)\n",
+			vm, range, start, end);
+	args.range = range;
+
+	ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
+	memobj_lock(range->memobj);
+	error = visit_pte_range(vm->page_table, (void *)start, (void *)end,
+			VPTEF_SKIP_NULL, &invalidate_one_page, &args);
+	memobj_unlock(range->memobj);
+	ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
+	if (error) {
+		ekprintf("invalidate_process_memory_range(%p,%p,%#lx,%#lx):"
+				"visit failed%d\n",
+				vm, range, start, end, error);
+		goto out;
+	}
+out:
+	dkprintf("invalidate_process_memory_range(%p,%p,%#lx,%#lx):%d\n",
+			vm, range, start, end, error);
+	return error;
+}
+
 static int page_fault_process_memory_range(struct process_vm *vm, struct vm_range *range, uintptr_t fault_addr, uint64_t reason)
 {
 	int error;
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 642dbfe6..722a38f2 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -3935,6 +3935,131 @@ out:
 	return ret;
 }
 
+/*
+ * msync(start, len, flags)
+ * Under vm->memory_range_lock: validate the arguments, check that the
+ * request is covered by VM ranges without gaps, then sync (MS_ASYNC or
+ * MS_SYNC) and/or invalidate (MS_INVALIDATE) every covered range that
+ * is not VR_PRIVATE and whose memobj has a pager.
+ * Returns 0, -EINVAL, -ENOMEM, -EBUSY, or the error of a failed step.
+ */
+SYSCALL_DECLARE(msync)
+{
+	const uintptr_t start0 = ihk_mc_syscall_arg0(ctx);
+	const size_t len0 = ihk_mc_syscall_arg1(ctx);
+	const int flags = ihk_mc_syscall_arg2(ctx);
+	const size_t len = (len0 + PAGE_SIZE - 1) & PAGE_MASK;
+	const uintptr_t start = start0;
+	const uintptr_t end = start + len;
+	struct process *proc = cpu_local_var(current);
+	struct process_vm *vm = proc->vm;
+	int error = 0; /* printed on the skip path before any assignment */
+	uintptr_t addr;
+	struct vm_range *range;
+	uintptr_t s;
+	uintptr_t e;
+
+	dkprintf("sys_msync(%#lx,%#lx,%#x)\n", start0, len0, flags);
+	ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
+
+	if ((start0 & ~PAGE_MASK)
+			|| (flags & ~(MS_ASYNC|MS_INVALIDATE|MS_SYNC))
+			|| ((flags & MS_ASYNC) && (flags & MS_SYNC))) {
+		error = -EINVAL;
+		ekprintf("sys_msync(%#lx,%#lx,%#x):invalid args. %d\n",
+				start0, len0, flags, error);
+		goto out;
+	}
+	if (end < start) {
+		error = -ENOMEM;
+		ekprintf("sys_msync(%#lx,%#lx,%#x):invalid args. %d\n",
+				start0, len0, flags, error);
+		goto out;
+	}
+
+	/* check ranges */
+	range = NULL;
+	for (addr = start; addr < end; addr = range->end) {
+		if (!range) {
+			range = lookup_process_memory_range(vm, addr,
+					addr+PAGE_SIZE);
+		}
+		else {
+			range = next_process_memory_range(vm, range);
+		}
+
+		if (!range || (addr < range->start)) {
+			error = -ENOMEM;
+			ekprintf("sys_msync(%#lx,%#lx,%#x):"
+					"invalid VMR %d %#lx-%#lx %#lx\n",
+					start0, len0, flags, error,
+					range?range->start:0,
+					range?range->end:0,
+					range?range->flag:0);
+			goto out;
+		}
+		if ((flags & MS_INVALIDATE) && (range->flag & VR_LOCKED)) {
+			error = -EBUSY;
+			ekprintf("sys_msync(%#lx,%#lx,%#x):"
+					"locked VMR %d %#lx-%#lx %#lx\n",
+					start0, len0, flags, error,
+					range->start, range->end, range->flag);
+			goto out;
+		}
+	}
+
+	/* do the sync */
+	range = NULL;
+	for (addr = start; addr < end; addr = range->end) {
+		if (!range) {
+			range = lookup_process_memory_range(vm, addr,
+					addr+PAGE_SIZE);
+		}
+		else {
+			range = next_process_memory_range(vm, range);
+		}
+
+		if ((range->flag & VR_PRIVATE) || !range->memobj
+				|| !memobj_has_pager(range->memobj)) {
+			dkprintf("sys_msync(%#lx,%#lx,%#x):"
+					"unsyncable VMR %d %#lx-%#lx %#lx\n",
+					start0, len0, flags, error,
+					range->start, range->end, range->flag);
+			/* nothing to do */
+			continue;
+		}
+
+		s = addr;
+		e = (range->end < end)? range->end: end;
+
+		if (flags & (MS_ASYNC | MS_SYNC)) {
+			error = sync_process_memory_range(vm, range, s, e);
+			if (error) {
+				ekprintf("sys_msync(%#lx,%#lx,%#x):sync failed. %d\n",
+						start0, len0, flags, error);
+				goto out;
+			}
+		}
+
+		if (flags & MS_INVALIDATE) {
+			error = invalidate_process_memory_range(
+					vm, range, s, e);
+			if (error) {
+				ekprintf("sys_msync(%#lx,%#lx,%#x):"
+						"invalidate failed. %d\n",
+						start0, len0, flags, error);
+				goto out;
+			}
+		}
+	}
+
+	error = 0;
+out:
+	ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
+	dkprintf("sys_msync(%#lx,%#lx,%#x):%d\n", start0, len0, flags, error);
+	return error;
+} /* sys_msync() */
+
 SYSCALL_DECLARE(getcpu)
 {
 	const uintptr_t cpup = ihk_mc_syscall_arg0(ctx);