From 27172ad4134b91e883599f4c49f769c358389910 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Fri, 9 Aug 2013 20:55:17 +0900 Subject: [PATCH] support private mapped file --- arch/x86/kernel/cpu.c | 16 +- arch/x86/kernel/include/arch-memory.h | 18 +- arch/x86/kernel/include/registers.h | 2 + arch/x86/kernel/memory.c | 16 +- executer/kernel/syscall.c | 357 +++++++++++++++-- kernel/Makefile.build | 2 +- kernel/Makefile.build.dcfa | 2 +- kernel/fileobj.c | 530 ++++++++++++++++++++++++++ kernel/host.c | 14 +- kernel/include/memobj.h | 60 +++ kernel/include/page.h | 22 +- kernel/include/pager.h | 27 ++ kernel/include/process.h | 19 +- kernel/include/syscall.h | 1 + kernel/mem.c | 52 ++- kernel/process.c | 297 +++++++++++++-- kernel/syscall.c | 90 +++-- 17 files changed, 1392 insertions(+), 133 deletions(-) create mode 100644 kernel/fileobj.c create mode 100644 kernel/include/memobj.h create mode 100644 kernel/include/pager.h diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c index 6a29d057..4855b215 100644 --- a/arch/x86/kernel/cpu.c +++ b/arch/x86/kernel/cpu.c @@ -288,8 +288,22 @@ void init_syscall(void) wrmsr(MSR_LSTAR, (unsigned long)x86_syscall); } +static void enable_page_protection_fault(void) +{ + asm volatile ( + "pushf ;" + "cli ;" + "mov %%cr0,%%rax;" + "or $0x10000,%%rax;" + "mov %%rax,%%cr0;" + "popf" + ::: "%rax"); + return; +} + void init_cpu(void) { + enable_page_protection_fault(); init_fpu(); init_lapic(); init_syscall(); @@ -465,7 +479,7 @@ void cpu_restore_interrupt(unsigned long flags) void cpu_pause(void) { - asm volatile("pause"); + asm volatile("pause" ::: "memory"); } unsigned long cpu_disable_interrupt_save(void) diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 1f65567f..24e8df6a 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -1,6 +1,8 @@ #ifndef __HEADER_X86_COMMON_ARCH_MEMORY_H #define __HEADER_X86_COMMON_ARCH_MEMORY_H +#include + #define KERNEL_CS_ENTRY 4 #define KERNEL_DS_ENTRY 5 #define USER_CS_ENTRY 6 @@ -103,9 +105,23 @@ enum ihk_mc_pt_attribute { PTATTR_FOR_USER = 0x20000, }; +#define PTE_NULL ((pte_t)0) typedef unsigned long pte_t; -#define PTE_NULL ((pte_t)0) +static inline int pte_is_null(pte_t *ptep) +{ + return (*ptep == PTE_NULL); +} + +static inline int pte_is_present(pte_t *ptep) +{ + return !!(*ptep & PF_PRESENT); +} + +static inline uintptr_t pte_get_phys(pte_t *ptep) +{ + return (*ptep & PT_PHYSMASK); +} struct page_table; void set_pte(pte_t *ppte, unsigned long phys, int attr); diff --git a/arch/x86/kernel/include/registers.h b/arch/x86/kernel/include/registers.h index 80716dcb..d06e4bed 100644 --- a/arch/x86/kernel/include/registers.h +++ b/arch/x86/kernel/include/registers.h @@ -128,6 +128,8 @@ struct x86_regs { unsigned long error, rip, cs, rflags, rsp, ss; }; +#define REGS_GET_STACK_POINTER(regs) (((struct x86_regs *)regs)->rsp) + /* * Page fault error code bits: * diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index 15abf3c8..9d75267d 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -7,6 +7,7 @@ #include #include #include +#include #define dkprintf(...) #define ekprintf(...) kprintf(__VA_ARGS__) @@ -820,16 +821,20 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base, { struct clear_range_args *args = args0; uint64_t phys; + struct page *page; if (*ptep == PTE_NULL) { return -ENOENT; } phys = *ptep & PT_PHYSMASK; - *ptep = 0; + *ptep = PTE_NULL; if (args->free_physical) { - ihk_mc_free_pages(phys_to_virt(phys), 1); + page = phys_to_page(phys); + if (page && page_unmap(page)) { + ihk_mc_free_pages(phys_to_virt(phys), 1); + } } return 0; @@ -842,6 +847,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, uint64_t phys; struct page_table *pt; int error; + struct page *page; if (*ptep == PTE_NULL) { return -ENOENT; @@ -866,8 +872,10 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, *ptep = PTE_NULL; if (args->free_physical) { - ihk_mc_free_pages(phys_to_virt(phys), - LARGE_PAGE_SIZE/PAGE_SIZE); + page = phys_to_page(phys); + if (page && page_unmap(page)) { + ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE); + } } return 0; diff --git a/executer/kernel/syscall.c b/executer/kernel/syscall.c index ca024817..55a6de80 100644 --- a/executer/kernel/syscall.c +++ b/executer/kernel/syscall.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -25,6 +27,8 @@ #define dprintk(...) #endif +static long pager_call(ihk_os_t os, struct syscall_request *req); + #ifdef SC_DEBUG //static struct ihk_dma_request last_request; @@ -203,40 +207,47 @@ static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, u resp->fault_address = (unsigned long)fault_addr; resp->fault_reason = reason; +#define STATUS_PAGER_COMPLETED 1 #define STATUS_PAGE_FAULT 3 req->valid = 0; mb(); resp->status = STATUS_PAGE_FAULT; - /* wait for response */ - error = wait_event_interruptible(channel->wq_syscall, channel->req); - if (error) { - printk("remote_page_fault:interrupted. %d\n", error); - goto out; - } - channel->req = 0; - if (!req->valid) { - printk("remote_page_fault:not valid\n"); - } - req->valid = 0; + for (;;) { + /* wait for response */ + error = wait_event_interruptible(channel->wq_syscall, channel->req); + if (error) { + printk("remote_page_fault:interrupted. %d\n", error); + goto out; + } + channel->req = 0; + if (!req->valid) { + printk("remote_page_fault:not valid\n"); + } + req->valid = 0; - /* check result */ - if (req->number != __NR_mmap) { - printk("remote_page_fault:unexpected response. %lx %lx\n", - req->number, req->args[0]); - error = -EIO; - goto out; - } - else if (req->args[0] != 0x0101) { - printk("remote_page_fault:unexpected response. %lx %lx\n", - req->number, req->args[0]); - error = -EIO; - goto out; - } - else if (req->args[1] != 0) { - error = req->args[1]; - printk("remote_page_fault:response %d\n", error); - goto out; + /* check result */ + if (req->number != __NR_mmap) { + printk("remote_page_fault:unexpected response. %lx %lx\n", + req->number, req->args[0]); + error = -EIO; + goto out; + } +#define PAGER_REQ_RESUME 0x0101 + else if (req->args[0] != PAGER_REQ_RESUME) { + resp->ret = pager_call(usrdata->os, (void *)req); + mb(); + resp->status = STATUS_PAGER_COMPLETED; + continue; + } + else { + error = req->args[1]; + if (error) { + printk("remote_page_fault:response %d\n", error); + goto out; + } + } + break; } error = 0; @@ -600,6 +611,286 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, #endif #endif /* !DO_USER_MODE */ +struct pager { + struct list_head list; + struct inode * inode; + int ref; + struct file * rofile; + struct file * rwfile; +}; + +/* + * for linux v2.6.35 or prior + */ +#ifndef DEFINE_SEMAPHORE +#define DEFINE_SEMAPHORE(...) DECLARE_MUTEX(__VA_ARGS__) +#endif + +static DEFINE_SEMAPHORE(pager_sem); +static struct list_head pager_list = LIST_HEAD_INIT(pager_list); + +struct pager_create_result { + uintptr_t handle; + int maxprot; +}; + +static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa) +{ + ihk_device_t dev = ihk_os_to_dev(os); + int error; + struct pager_create_result *resp; + int maxprot = -1; + struct file *file = NULL; + struct inode *inode; + struct pager *pager = NULL; + struct pager *newpager = NULL; + uintptr_t phys; + + dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa); + + file = fget(fd); + if (!file) { + error = -EBADF; + printk("pager_req_create(%d,%lx):file not found. %d\n", fd, (long)result_pa, error); + goto out; + } + + inode = file->f_path.dentry->d_inode; + if (!inode) { + error = -EBADF; + printk("pager_req_create(%d,%lx):inode not found. %d\n", fd, (long)result_pa, error); + goto out; + } + + maxprot = 0; + if ((file->f_mode & FMODE_READ) && (file->f_mode & FMODE_PREAD)) { + maxprot |= PROT_READ; + } + if ((file->f_mode & FMODE_WRITE) && (file->f_mode & FMODE_PWRITE)) { + maxprot |= PROT_WRITE; + } + if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { + maxprot |= PROT_EXEC; + } + if (!(maxprot & PROT_READ)) { + error = -EACCES; + printk("pager_req_create(%d,%lx):cannot read file. %d\n", fd, (long)result_pa, error); + goto out; + } + + for (;;) { + error = down_interruptible(&pager_sem); + if (error) { + error = -EINTR; + printk("pager_req_create(%d,%lx):signaled. %d\n", fd, (long)result_pa, error); + goto out; + } + + list_for_each_entry(pager, &pager_list, list) { + if (pager->inode == inode) { + goto found; + } + } + + if (newpager) { + newpager->inode = inode; + newpager->ref = 0; + list_add(&newpager->list, &pager_list); + pager = newpager; + newpager = NULL; + break; + } + + up(&pager_sem); + + newpager = kzalloc(sizeof(*newpager), GFP_KERNEL); + if (!newpager) { + error = -ENOMEM; + printk("pager_req_create(%d,%lx):kzalloc failed. %d\n", fd, (long)result_pa, error); + goto out; + } + } + +found: + ++pager->ref; + if (!pager->rwfile && (maxprot & PROT_WRITE)) { + get_file(file); + pager->rwfile = file; + } + else if (!pager->rofile && !(maxprot & PROT_WRITE)) { + get_file(file); + pager->rofile = file; + } + up(&pager_sem); + + phys = ihk_device_map_memory(dev, result_pa, sizeof(*resp)); + resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0); + resp->handle = (uintptr_t)pager; + resp->maxprot = maxprot; + ihk_device_unmap_virtual(dev, resp, sizeof(*resp)); + ihk_device_unmap_memory(dev, phys, sizeof(*resp)); + + error = 0; +out: + if (newpager) { + kfree(newpager); + } + if (file) { + fput(file); + } + dprintk("pager_req_create(%d,%lx): %d %p %x\n", + fd, (long)result_pa, error, pager, maxprot); + return error; +} + +static int pager_req_release(ihk_os_t os, uintptr_t handle, int unref) +{ + int error; + struct pager *p; + struct pager *free_pager = NULL; + + dprintk("pager_req_relase(%p,%lx,%d)\n", os, handle, unref); + + error = down_interruptible(&pager_sem); + if (error) { + printk("pager_req_relase(%p,%lx,%d):signaled. %d\n", os, handle, unref, error); + goto out; + } + + error = -EBADF; + list_for_each_entry(p, &pager_list, list) { + if ((uintptr_t)p == handle) { + error = 0; + p->ref -= unref; + if (p->ref <= 0) { + list_del(&p->list); + free_pager = p; + } + break; + } + } + + up(&pager_sem); + + if (error) { + printk("pager_req_relase(%p,%lx,%d):pager not found. %d\n", os, handle, unref, error); + goto out; + } + + if (free_pager) { + if (free_pager->rofile) { + fput(free_pager->rofile); + } + if (free_pager->rwfile) { + fput(free_pager->rwfile); + } + kfree(free_pager); + } + + error = 0; +out: + dprintk("pager_req_relase(%p,%lx,%d): %d\n", os, handle, unref, error); + return error; +} + +static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size, uintptr_t rpa) +{ + ssize_t ss; + struct pager *pager; + struct file *file = NULL; + uintptr_t phys = -1; + ihk_device_t dev = ihk_os_to_dev(os); + void *buf = NULL; + mm_segment_t fs; + loff_t pos; + + dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa); + + ss = down_interruptible(&pager_sem); + if (ss) { + printk("pager_req_read(%lx,%lx,%lx,%lx): signaled. %ld\n", handle, off, size, rpa, ss); + goto out; + } + + list_for_each_entry(pager, &pager_list, list) { + if ((uintptr_t)pager == handle) { + file = (pager->rofile)? pager->rofile: pager->rwfile; + get_file(file); + break; + } + } + up(&pager_sem); + + if (!file) { + ss = -EBADF; + printk("pager_req_read(%lx,%lx,%lx,%lx):pager not found. %ld\n", handle, off, size, rpa, ss); + goto out; + } + + phys = ihk_device_map_memory(dev, rpa, size); + buf = ihk_device_map_virtual(dev, phys, size, NULL, 0); + fs = get_fs(); + set_fs(KERNEL_DS); + pos = off; + ss = vfs_read(file, buf, size, &pos); + if ((ss != size) && (ss > 0)) { + if (clear_user(buf+ss, size-ss) == 0) { + ss = size; + } + else { + ss = -EFAULT; + } + } + set_fs(fs); + if (ss < 0) { + printk("pager_req_read(%lx,%lx,%lx,%lx):pread failed. %ld\n", handle, off, size, rpa, ss); + goto out; + } + +out: + if (buf) { + ihk_device_unmap_virtual(dev, buf, size); + } + if (phys != (uintptr_t)-1) { + ihk_device_unmap_memory(dev, phys, size); + } + if (file) { + fput(file); + } + dprintk("pager_req_read(%lx,%lx,%lx,%lx): %ld\n", handle, off, size, rpa, ss); + return ss; +} + +static long pager_call(ihk_os_t os, struct syscall_request *req) +{ + long ret; + + dprintk("pager_call(%#lx)\n", req->args[0]); + switch (req->args[0]) { +#define PAGER_REQ_CREATE 0x0001 +#define PAGER_REQ_RELEASE 0x0002 +#define PAGER_REQ_READ 0x0003 + case PAGER_REQ_CREATE: + ret = pager_req_create(os, req->args[1], req->args[2]); + break; + + case PAGER_REQ_RELEASE: + ret = pager_req_release(os, req->args[1], req->args[2]); + break; + + case PAGER_REQ_READ: + ret = pager_req_read(os, req->args[1], req->args[2], req->args[3], req->args[4]); + break; + + default: + ret = -ENOSYS; + break; + } + + dprintk("pager_call(%#lx): %ld\n", req->args[0], ret); + return ret; +} + static void __return_syscall(struct mcctrl_channel *c, int ret) { c->param.response_va->ret = ret; @@ -642,10 +933,14 @@ static void clear_pte_range(uintptr_t addr, uintptr_t len) int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc) { int error; - long ret; + long ret = -1; - dprintk("__do_in_kernel_syscall(%p,%p,%p %ld)\n", os, c, sc, sc->number); + dprintk("__do_in_kernel_syscall(%p,%p,%ld %lx)\n", os, c, sc->number, sc->args[0]); switch (sc->number) { + case __NR_mmap: + ret = pager_call(os, sc); + break; + case __NR_munmap: clear_pte_range(sc->args[0], sc->args[1]); ret = 0; @@ -661,6 +956,6 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall error = 0; out: - dprintk("__do_in_kernel_syscall(%p,%p,%p %ld): %d\n", os, c, sc, sc->number, error); + dprintk("__do_in_kernel_syscall(%p,%p,%ld %lx): %d %ld\n", os, c, sc->number, sc->args[0], error, ret); return error; } diff --git a/kernel/Makefile.build b/kernel/Makefile.build index 36bf10b2..1a7f6438 100644 --- a/kernel/Makefile.build +++ b/kernel/Makefile.build @@ -1,6 +1,6 @@ IHKDIR=$(IHKBASE)/$(TARGETDIR) OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o -OBJS += process.o copy.o waitq.o futex.o timer.o plist.o +OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o DEPSRCS=$(wildcard $(SRC)/*.c) CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ diff --git a/kernel/Makefile.build.dcfa b/kernel/Makefile.build.dcfa index 580eeb89..cf0a2719 100644 --- a/kernel/Makefile.build.dcfa +++ b/kernel/Makefile.build.dcfa @@ -1,6 +1,6 @@ IHKDIR=$(IHKBASE)/$(TARGETDIR) OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o -OBJS += process.o copy.o waitq.o futex.o timer.o plist.o +OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o DEPSRCS=$(wildcard $(SRC)/*.c) CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ diff --git a/kernel/fileobj.c b/kernel/fileobj.c new file mode 100644 index 00000000..9f906d2a --- /dev/null +++ b/kernel/fileobj.c @@ -0,0 +1,530 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define dkprintf(...) +#define ekprintf(...) kprintf(__VA_ARGS__) + +static ihk_spinlock_t fileobj_list_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(fileobj_list); + +struct fileobj { + struct memobj memobj; /* must be first */ + long sref; + long cref; + uintptr_t handle; + struct list_head page_list; + struct list_head list; +}; + +static memobj_release_func_t fileobj_release; +static memobj_ref_func_t fileobj_ref; +static memobj_get_page_func_t fileobj_get_page; +static memobj_copy_page_func_t fileobj_copy_page; + +static struct memobj_ops fileobj_ops = { + .release = &fileobj_release, + .ref = &fileobj_ref, + .get_page = &fileobj_get_page, + .copy_page = &fileobj_copy_page, +}; + +static struct fileobj *to_fileobj(struct memobj *memobj) +{ + return (struct fileobj *)memobj; +} + +static struct memobj *to_memobj(struct fileobj *fileobj) +{ + return &fileobj->memobj; +} + +/*********************************************************************** + * page_list + */ +static void page_list_init(struct fileobj *obj) +{ + INIT_LIST_HEAD(&obj->page_list); + return; +} + +static void page_list_insert(struct fileobj *obj, struct page *page) +{ + list_add(&page->list, &obj->page_list); + return; +} + +static void page_list_remove(struct fileobj *obj, struct page *page) +{ + list_del(&page->list); +} + +static struct page *page_list_lookup(struct fileobj *obj, off_t off) +{ + struct page *page; + + list_for_each_entry(page, &obj->page_list, list) { + if ((page->mode != PM_WILL_PAGEIO) + && (page->mode != PM_PAGEIO) + && (page->mode != PM_DONE_PAGEIO) + && (page->mode != PM_PAGEIO_EOF) + && (page->mode != PM_PAGEIO_ERROR) + && (page->mode != PM_MAPPED)) { + kprintf("page_list_lookup(%p,%lx): mode %x\n", + obj, off, page->mode); + panic("page_list_lookup:invalid obj page"); + } + if (page->offset == off) { + goto out; + } + } + page = NULL; + +out: + return page; +} + +static struct page *page_list_first(struct fileobj *obj) +{ + if (list_empty(&obj->page_list)) { + return NULL; + } + + return list_first_entry(&obj->page_list, struct page, list); +} + +/*********************************************************************** + * obj_list + */ +static void obj_list_insert(struct fileobj *obj) +{ + list_add(&obj->list, &fileobj_list); +} + +static void obj_list_remove(struct fileobj *obj) +{ + list_del(&obj->list); +} + +/* return NULL or locked fileobj */ +static struct fileobj *obj_list_lookup(uintptr_t handle) +{ + struct fileobj *obj; + struct fileobj *p; + + obj = NULL; + list_for_each_entry(p, &fileobj_list, list) { + if (p->handle == handle) { + memobj_lock(&p->memobj); + if (p->cref > 0) { + obj = p; + break; + } + memobj_unlock(&p->memobj); + } + } + + return obj; +} + +/*********************************************************************** + * fileobj + */ +int fileobj_create(int fd, struct memobj **objp, int *maxprotp) +{ + ihk_mc_user_context_t ctx; + struct pager_create_result result; // XXX: assumes contiguous physical + int error; + struct fileobj *newobj = NULL; + struct fileobj *obj; + + dkprintf("fileobj_create(%d)\n", fd); + newobj = kmalloc(sizeof(*newobj), IHK_MC_AP_NOWAIT); + if (!newobj) { + error = -ENOMEM; + kprintf("fileobj_create(%d):kmalloc failed. %d\n", fd, error); + goto out; + } + + ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_CREATE; + ihk_mc_syscall_arg1(&ctx) = fd; + ihk_mc_syscall_arg2(&ctx) = virt_to_phys(&result); + + error = syscall_generic_forwarding(__NR_mmap, &ctx); + if (error) { + kprintf("fileobj_create(%d):create failed. %d\n", fd, error); + goto out; + } + + memset(newobj, 0, sizeof(*newobj)); + newobj->memobj.ops = &fileobj_ops; + newobj->handle = result.handle; + newobj->sref = 1; + newobj->cref = 1; + page_list_init(newobj); + ihk_mc_spinlock_init(&newobj->memobj.lock); + + ihk_mc_spinlock_lock_noirq(&fileobj_list_lock); + obj = obj_list_lookup(result.handle); + if (!obj) { + obj_list_insert(newobj); + obj = newobj; + newobj = NULL; + } + else { + ++obj->sref; + ++obj->cref; + memobj_unlock(&obj->memobj); /* locked by obj_list_lookup() */ + } + + ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock); + + error = 0; + *objp = to_memobj(obj); + *maxprotp = result.maxprot; + +out: + if (newobj) { + kfree(newobj); + } + dkprintf("fileobj_create(%d):%d %p %x\n", fd, error, *objp, *maxprotp); + return error; +} + +static void fileobj_ref(struct memobj *memobj) +{ + struct fileobj *obj = to_fileobj(memobj); + + dkprintf("fileobj_ref(%p %lx):\n", obj, obj->handle); + memobj_lock(&obj->memobj); + ++obj->cref; + memobj_unlock(&obj->memobj); + return; +} + +static void fileobj_release(struct memobj *memobj) +{ + struct fileobj *obj = to_fileobj(memobj); + long free_sref = 0; + uintptr_t free_handle; + struct fileobj *free_obj = NULL; + + dkprintf("fileobj_release(%p %lx)\n", obj, obj->handle); + + memobj_lock(&obj->memobj); + --obj->cref; + free_sref = obj->sref - 1; /* surplus sref */ + if (obj->cref <= 0) { + free_sref = obj->sref; + free_obj = obj; + } + obj->sref -= free_sref; + free_handle = obj->handle; + memobj_unlock(&obj->memobj); + + if (free_obj) { + ihk_mc_spinlock_lock_noirq(&fileobj_list_lock); + /* zap page_list */ + for (;;) { + struct page *page; + + page = page_list_first(obj); + if (!page) { + break; + } + page_list_remove(obj, page); + + if (!((page->mode == PM_WILL_PAGEIO) + || (page->mode == PM_DONE_PAGEIO) + || (page->mode == PM_PAGEIO_EOF) + || (page->mode == PM_PAGEIO_ERROR) + || ((page->mode == PM_MAPPED) + && (page->count <= 0)))) { + kprintf("fileobj_release(%p %lx): " + "mode %x, count %d, off %lx\n", + obj, obj->handle, page->mode, + page->count, page->offset); + panic("fileobj_release"); + } + + page->mode = PM_NONE; + free_pages(phys_to_virt(page_to_phys(page)), 1); + } + obj_list_remove(free_obj); + ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock); + kfree(free_obj); + } + + if (free_sref) { + int error; + ihk_mc_user_context_t ctx; + + ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_RELEASE; + ihk_mc_syscall_arg1(&ctx) = free_handle; + ihk_mc_syscall_arg2(&ctx) = free_sref; + + error = syscall_generic_forwarding(__NR_mmap, &ctx); + if (error) { + kprintf("fileobj_release(%p %lx):" + "release %ld failed. %d\n", + obj, free_handle, free_sref, error); + /* through */ + } + } + + dkprintf("fileobj_release(%p %lx):free %ld %p\n", + obj, free_handle, free_sref, free_obj); + return; +} + +struct pageio_args { + struct fileobj * fileobj; + off_t objoff; + size_t pgsize; +}; + +/* + * fileobj_do_pageio(): + * - args0 will be freed with kfree() + * - args0->fileobj will be released + */ +static void fileobj_do_pageio(void *args0) +{ + struct pageio_args *args = args0; + struct fileobj *obj = args->fileobj; + off_t off = args->objoff; + size_t pgsize = args->pgsize; + struct page *page; + ihk_mc_user_context_t ctx; + ssize_t ss; + + memobj_lock(&obj->memobj); + page = page_list_lookup(obj, off); + if (!page) { + goto out; + } + + while (page->mode == PM_PAGEIO) { + memobj_unlock(&obj->memobj); + cpu_pause(); + memobj_lock(&obj->memobj); + } + + if (page->mode == PM_WILL_PAGEIO) { + page->mode = PM_PAGEIO; + memobj_unlock(&obj->memobj); + + ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_READ; + ihk_mc_syscall_arg1(&ctx) = obj->handle; + ihk_mc_syscall_arg2(&ctx) = off; + ihk_mc_syscall_arg3(&ctx) = pgsize; + ihk_mc_syscall_arg4(&ctx) = page_to_phys(page); + + ss = syscall_generic_forwarding(__NR_mmap, &ctx); + + memobj_lock(&obj->memobj); + if (page->mode != PM_PAGEIO) { + kprintf("fileobj_do_pageio(%p,%lx,%lx):" + "invalid mode %x\n", + obj, off, pgsize, page->mode); + panic("fileobj_do_pageio:invalid page mode"); + } + + if (ss == 0) { + dkprintf("fileobj_do_pageio(%p,%lx,%lx):EOF? %ld\n", + obj, off, pgsize, ss); + page->mode = PM_PAGEIO_EOF; + goto out; + } + else if (ss != pgsize) { + kprintf("fileobj_do_pageio(%p,%lx,%lx):" + "read failed. %ld\n", + obj, off, pgsize, ss); + page->mode = PM_PAGEIO_ERROR; + goto out; + } + + page->mode = PM_DONE_PAGEIO; + } +out: + memobj_unlock(&obj->memobj); + fileobj_release(&obj->memobj); /* got fileobj_get_page() */ + kfree(args0); + dkprintf("fileobj_do_pageio(%p,%lx,%lx):\n", obj, off, pgsize); + return; +} + +static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp) +{ + struct process *proc = cpu_local_var(current); + struct fileobj *obj = to_fileobj(memobj); + int error; + void *virt = NULL; + int npages; + uintptr_t phys = -1; + struct page *page; + struct pageio_args *args = NULL; + + dkprintf("fileobj_get_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp); + + memobj_lock(&obj->memobj); + if (p2align != PAGE_P2ALIGN) { + error = -ENOMEM; + goto out; + } + + page = page_list_lookup(obj, off); + if (!page || (page->mode == PM_WILL_PAGEIO) + || (page->mode == PM_PAGEIO)) { + args = kmalloc(sizeof(*args), IHK_MC_AP_NOWAIT); + if (!args) { + error = -ENOMEM; + kprintf("fileobj_get_page(%p,%lx,%x,%p):" + "kmalloc failed. %d\n", + obj, off, p2align, physp, error); + goto out; + } + + if (!page) { + npages = 1 << p2align; + virt = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT); + if (!virt) { + error = -ENOMEM; + kprintf("fileobj_get_page(%p,%lx,%x,%p):" + "alloc failed. %d\n", + obj, off, p2align, physp, + error); + goto out; + } + phys = virt_to_phys(virt); + page = phys_to_page(phys); + if (page->mode != PM_NONE) { + panic("fileobj_get_page:invalid new page"); + } + page->mode = PM_WILL_PAGEIO; + page->offset = off; + page_list_insert(obj, page); + } + + ++obj->cref; /* for fileobj_do_pageio() */ + + args->fileobj = obj; + args->objoff = off; + args->pgsize = PAGE_SIZE << p2align; + + proc->pgio_fp = &fileobj_do_pageio; + proc->pgio_arg = args; + + error = -ERESTART; + virt = NULL; + args = NULL; + goto out; + } + else if (page->mode == PM_DONE_PAGEIO) { + page->mode = PM_MAPPED; + page->count = 0; + } + else if (page->mode == PM_PAGEIO_EOF) { + error = -ERANGE; + goto out; + } + else if (page->mode == PM_PAGEIO_ERROR) { + error = -EIO; + goto out; + } + + ++page->count; + + error = 0; + *physp = page_to_phys(page); + virt = NULL; +out: + memobj_unlock(&obj->memobj); + if (virt) { + ihk_mc_free_pages(virt, npages); + } + if (args) { + kfree(args); + } + dkprintf("fileobj_get_page(%p,%lx,%x,%p): %d %lx\n", + obj, off, p2align, physp, error, phys); + return error; +} + +static uintptr_t fileobj_copy_page( + struct memobj *memobj, uintptr_t orgpa, int p2align) +{ + struct page *orgpage = phys_to_page(orgpa); + size_t pgsize = PAGE_SIZE << p2align; + int npages = 1 << p2align; + void *newkva = NULL; + uintptr_t newpa = -1; + void *orgkva; + + dkprintf("fileobj_copy_page(%p,%lx,%d)\n", memobj, orgpa, p2align); + if (p2align != PAGE_P2ALIGN) { + panic("p2align"); + } + + memobj_lock(memobj); + for (;;) { + if (orgpage->mode != PM_MAPPED) { + kprintf("fileobj_copy_page(%p,%lx,%d):" + "invalid cow page. %x\n", + memobj, orgpa, p2align, orgpage->mode); + panic("fileobj_copy_page:invalid cow page"); + } + if (orgpage->count == 1) { // XXX: private only + list_del(&orgpage->list); + orgpage->mode = PM_NONE; + newpa = orgpa; + break; + } + if (orgpage->count <= 0) { + kprintf("fileobj_copy_page(%p,%lx,%d):" + "orgpage count corrupted. %x\n", + memobj, orgpa, p2align, orgpage->count); + panic("fileobj_copy_page:orgpage count corrupted"); + } + if (newkva) { + orgkva = phys_to_virt(orgpa); + memcpy(newkva, orgkva, pgsize); + --orgpage->count; + newpa = virt_to_phys(newkva); + newkva = NULL; /* avoid ihk_mc_free_pages() */ + break; + } + + memobj_unlock(memobj); + newkva = ihk_mc_alloc_aligned_pages(npages, p2align, + IHK_MC_AP_NOWAIT); + if (!newkva) { + kprintf("fileobj_copy_page(%p,%lx,%d):" + "alloc page failed\n", + memobj, orgpa, p2align); + goto out; + } + memobj_lock(memobj); + } + memobj_unlock(memobj); + +out: + if (newkva) { + ihk_mc_free_pages(newkva, npages); + } + dkprintf("fileobj_copy_page(%p,%lx,%d): %lx\n", + memobj, orgpa, p2align, newpa); + return newpa; +} diff --git a/kernel/host.c b/kernel/host.c index 216a43b9..beec8bd9 100644 --- a/kernel/host.c +++ b/kernel/host.c @@ -94,12 +94,13 @@ static int process_msg_prepare_process(unsigned long rphys) range_npages = (e - s) >> PAGE_SHIFT; flags = VR_NONE; flags |= PROT_TO_VR_FLAG(pn->sections[i].prot); + flags |= VRFLAG_PROT_TO_MAXPROT(flags); if((up_v = ihk_mc_alloc_pages(range_npages, IHK_MC_AP_NOWAIT)) == NULL){ goto err; } up = virt_to_phys(up_v); - if(add_process_memory_range(proc, s, e, up, flags) != 0){ + if(add_process_memory_range(proc, s, e, up, flags, NULL, 0) != 0){ ihk_mc_free_pages(up_v, range_npages); goto err; } @@ -170,29 +171,32 @@ static int process_msg_prepare_process(unsigned long rphys) /* Map system call stuffs */ flags = VR_RESERVED | VR_PROT_READ | VR_PROT_WRITE; + flags |= VRFLAG_PROT_TO_MAXPROT(flags); addr = proc->vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT; e = addr + PAGE_SIZE * DOORBELL_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).doorbell_pa, - VR_REMOTE | flags) != 0){ + VR_REMOTE | flags, NULL, 0) != 0){ goto err; } addr = e; e = addr + PAGE_SIZE * REQUEST_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).request_pa, - VR_REMOTE | flags) != 0){ + VR_REMOTE | flags, NULL, 0) != 0){ goto err; } addr = e; e = addr + PAGE_SIZE * RESPONSE_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).response_pa, - flags) != 0){ + flags, NULL, 0) != 0){ goto err; } /* Map, copy and update args and envs */ + flags = VR_PROT_READ | VR_PROT_WRITE; + flags |= VRFLAG_PROT_TO_MAXPROT(flags); addr = e; e = addr + PAGE_SIZE * ARGENV_PAGE_COUNT; @@ -202,7 +206,7 @@ static int process_msg_prepare_process(unsigned long rphys) args_envs_p = virt_to_phys(args_envs); if(add_process_memory_range(proc, addr, e, args_envs_p, - VR_PROT_READ|VR_PROT_WRITE) != 0){ + flags, NULL, 0) != 0){ ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT); goto err; } diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h new file mode 100644 index 00000000..eab0f7f9 --- /dev/null +++ b/kernel/include/memobj.h @@ -0,0 +1,60 @@ +#ifndef HEADER_MEMOBJ_H +#define HEADER_MEMOBJ_H + +#include +#include +#include +#include + +struct memobj { + struct memobj_ops * ops; + ihk_spinlock_t lock; +}; + +typedef void memobj_release_func_t(struct memobj *obj); +typedef void memobj_ref_func_t(struct memobj *obj); +typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp); +typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align); + +struct memobj_ops { + memobj_release_func_t * release; + memobj_ref_func_t * ref; + memobj_get_page_func_t * get_page; + memobj_copy_page_func_t * copy_page; +}; + +static inline void memobj_release(struct memobj *obj) +{ + (*obj->ops->release)(obj); +} + +static inline void memobj_ref(struct memobj *obj) +{ + (*obj->ops->ref)(obj); +} + +static inline int memobj_get_page(struct memobj *obj, off_t off, + int p2align, uintptr_t *physp) +{ + return (*obj->ops->get_page)(obj, off, p2align, physp); +} + +static inline uintptr_t memobj_copy_page(struct memobj *obj, + uintptr_t orgphys, int p2align) +{ + return (*obj->ops->copy_page)(obj, orgphys, p2align); +} + +static inline void memobj_lock(struct memobj *obj) +{ + ihk_mc_spinlock_lock_noirq(&obj->lock); +} + +static inline void memobj_unlock(struct memobj *obj) +{ + ihk_mc_spinlock_unlock_noirq(&obj->lock); +} + +int fileobj_create(int fd, struct memobj **objp, int *maxprotp); + +#endif /* HEADER_MEMOBJ_H */ diff --git a/kernel/include/page.h b/kernel/include/page.h index 6d5aaaaa..1a7b78f9 100644 --- a/kernel/include/page.h +++ b/kernel/include/page.h @@ -2,16 +2,28 @@ #define __HEADER_PAGE_H struct page { - struct list_head list; - uint64_t flags; - int64_t count; + struct list_head list; + uint8_t mode; + uint8_t padding[3]; + int32_t count; + off_t offset; }; -/* flags */ -#define PAGE_IN_LIST 0x0001UL +/* mode */ +enum page_mode { + PM_NONE = 0x00, + PM_PENDING_FREE = 0x01, + PM_WILL_PAGEIO = 0x02, + PM_PAGEIO = 0x03, + PM_DONE_PAGEIO = 0x04, + PM_PAGEIO_EOF = 0x05, + PM_PAGEIO_ERROR = 0x06, + PM_MAPPED = 0x07, +}; struct page *phys_to_page(uintptr_t phys); uintptr_t page_to_phys(struct page *page); +int page_unmap(struct page *page); void *allocate_pages(int npages, enum ihk_mc_ap_flag flag); void free_pages(void *va, int npages); diff --git a/kernel/include/pager.h b/kernel/include/pager.h new file mode 100644 index 00000000..d595dc84 --- /dev/null +++ b/kernel/include/pager.h @@ -0,0 +1,27 @@ +#ifndef HEADER_PAGER_H +#define HEADER_PAGER_H + +#include + +enum pager_op { + PAGER_REQ_CREATE = 0x0001, + PAGER_REQ_RELEASE = 0x0002, + PAGER_REQ_READ = 0x0003, +}; + +/* + * int pager_req_create(int fd, int flags, int prot, uintptr_t result_rpa); + */ +struct pager_create_result { + uintptr_t handle; + int maxprot; + int8_t padding[4]; +}; + +/* + * int pager_req_release(uintptr_t handle); + */ +/* + * int pager_req_read(uintptr_t handle, off_t off, size_t size, uintptr_t buf_rpa); + */ +#endif /* HEADER_PAGER_H */ diff --git a/kernel/include/process.h b/kernel/include/process.h index ec66cfad..006c67f5 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VR_NONE 0x0 #define VR_STACK 0x1 @@ -14,13 +15,21 @@ #define VR_IO_NOCACHE 0x100 #define VR_REMOTE 0x200 #define VR_DEMAND_PAGING 0x1000 +#define VR_PRIVATE 0x2000 #define VR_PROT_NONE 0x00000000 #define VR_PROT_READ 0x00010000 #define VR_PROT_WRITE 0x00020000 #define VR_PROT_EXEC 0x00040000 #define VR_PROT_MASK 0x00070000 +#define VR_MAXPROT_NONE 0x00000000 +#define VR_MAXPROT_READ 0x00100000 +#define VR_MAXPROT_WRITE 0x00200000 +#define VR_MAXPROT_EXEC 0x00400000 +#define VR_MAXPROT_MASK 0x00700000 #define PROT_TO_VR_FLAG(prot) (((unsigned long)(prot) << 16) & VR_PROT_MASK) +#define VRFLAG_PROT_TO_MAXPROT(vrflag) (((vrflag) & VR_PROT_MASK) << 4) +#define VRFLAG_MAXPROT_TO_PROT(vrflag) (((vrflag) & VR_MAXPROT_MASK) >> 4) #define PS_RUNNING 0x1 #define PS_INTERRUPTIBLE 0x2 @@ -42,6 +51,8 @@ struct vm_range { struct list_head list; unsigned long start, end; unsigned long flag; + struct memobj *memobj; + off_t objoff; }; struct vm_regions { @@ -61,6 +72,8 @@ struct sig_handler { struct k_sigaction action[_NSIG]; }; +typedef void pgio_func_t(void *arg); + struct process { int pid; int status; @@ -92,6 +105,8 @@ struct process { // TODO: backup FR and MMX regs unsigned long sigrc; // return code of rt_sigreturn (x86_64: rax reg.) struct rlimit rlimit_stack; + pgio_func_t *pgio_fp; + void *pgio_arg; }; struct process_vm { @@ -118,11 +133,13 @@ struct process *clone_process(struct process *org, void destroy_process(struct process *proc); void hold_process(struct process *proc); void free_process(struct process *proc); +void flush_process_memory(struct process *proc); void free_process_memory(struct process *proc); int add_process_memory_range(struct process *process, unsigned long start, unsigned long end, - unsigned long phys, unsigned long flag); + unsigned long phys, unsigned long flag, + struct memobj *memobj, off_t objoff); int remove_process_memory_range( struct process *process, unsigned long start, unsigned long end); int split_process_memory_range(struct process *process, diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h index 536ee8ff..744c6d04 100644 --- a/kernel/include/syscall.h +++ b/kernel/include/syscall.h @@ -194,6 +194,7 @@ struct syscall_params { extern int do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx); extern int obtain_clone_cpuid(); +extern long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx); #define DECLARATOR(number,name) __NR_##name = number, #define SYSCALL_HANDLED(number,name) DECLARATOR(number,name) diff --git a/kernel/mem.c b/kernel/mem.c index 00bb7194..c1a7dd58 100644 --- a/kernel/mem.c +++ b/kernel/mem.c @@ -68,12 +68,15 @@ void free_pages(void *va, int npages) struct list_head *pendings = &cpu_local_var(pending_free_pages); struct page *page; + page = phys_to_page(virt_to_phys(va)); + if (!page) { + panic("free_pages:struct page not found"); + } + if (page->mode != PM_NONE) { + panic("free_pages:not PM_NONE"); + } if (pendings->next != NULL) { - page = phys_to_page(virt_to_phys(va)); - if (page->flags & PAGE_IN_LIST) { - panic("free_pages"); - } - page->flags |= PAGE_IN_LIST; + page->mode = PM_PENDING_FREE; page->count = npages; list_add_tail(&page->list, pendings); return; @@ -103,10 +106,10 @@ void finish_free_pages_pending(void) } list_for_each_entry_safe(page, next, pendings, list) { - if (!(page->flags & PAGE_IN_LIST)) { - panic("free_pending_pages"); + if (page->mode != PM_PENDING_FREE) { + panic("free_pending_pages:not PM_PENDING_FREE"); } - page->flags &= ~PAGE_IN_LIST; + page->mode = PM_NONE; list_del(&page->list); ihk_pagealloc_free(pa_allocator, page_to_phys(page), page->count); } @@ -192,11 +195,6 @@ static void unhandled_page_fault(struct process *proc, void *fault_addr, void *r } #endif -#if 0 - panic("mem fault"); -#endif - set_signal(SIGSEGV, regs); - check_signal(0, regs); return; } @@ -215,6 +213,13 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs) ihk_mc_get_processor_id(), fault_addr, reason, regs, error); unhandled_page_fault(proc, fault_addr, regs); + if (error == -ERANGE) { + set_signal(SIGBUS, regs); + } + else { + set_signal(SIGSEGV, regs); + } + check_signal(0, regs); goto out; } @@ -311,6 +316,27 @@ uintptr_t page_to_phys(struct page *page) return phys; } +int page_unmap(struct page *page) +{ + dkprintf("page_unmap(%p %x %d)\n", page, page->mode, page->count); + if (page->mode != PM_MAPPED) { + return 1; + } + + if (--page->count > 0) { + /* other mapping exist */ + dkprintf("page_unmap(%p %x %d): 0\n", + page, page->mode, page->count); + return 0; + } + + /* no mapping exist */ + list_del(&page->list); + page->mode = PM_NONE; + dkprintf("page_unmap(%p %x %d): 1\n", page, page->mode, page->count); + return 1; +} + static void page_init(void) { size_t npages; diff --git a/kernel/process.c b/kernel/process.c index b642d532..e9c140b3 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -212,6 +212,16 @@ int split_process_memory_range(struct process *proc, struct vm_range *range, newrange->end = range->end; newrange->flag = range->flag; + if (range->memobj) { + memobj_ref(range->memobj); + newrange->memobj = range->memobj; + newrange->objoff = range->objoff + (addr - range->start); + } + else { + newrange->memobj = NULL; + newrange->objoff = 0; + } + range->end = addr; list_add(&newrange->list, &range->list); @@ -239,13 +249,27 @@ int join_process_memory_range(struct process *proc, merging->start, merging->end); if ((surviving->end != merging->start) - || (surviving->flag != merging->flag)) { + || (surviving->flag != merging->flag) + || (surviving->memobj != merging->memobj)) { error = -EINVAL; goto out; } + if (surviving->memobj != NULL) { + size_t len; + off_t endoff; + + len = surviving->end - surviving->start; + endoff = surviving->objoff + len; + if (endoff != merging->objoff) { + return -EINVAL; + } + } surviving->end = merging->end; + if (merging->memobj) { + memobj_release(merging->memobj); + } list_del(&merging->list); ihk_mc_free(merging); @@ -296,8 +320,14 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) #endif /* USE_LARGE_PAGES */ ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + if (range->memobj) { + memobj_lock(range->memobj); + } error = ihk_mc_pt_free_range(vm->page_table, (void *)start, (void *)end); + if (range->memobj) { + memobj_unlock(range->memobj); + } ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); if (error && (error != -ENOENT)) { ekprintf("free_process_memory_range(%p,%lx-%lx):" @@ -319,6 +349,9 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) } } + if (range->memobj) { + memobj_release(range->memobj); + } list_del(&range->list); ihk_mc_free(range); @@ -434,7 +467,8 @@ enum ihk_mc_pt_attribute vrflag_to_ptattr(unsigned long flag) int add_process_memory_range(struct process *process, unsigned long start, unsigned long end, - unsigned long phys, unsigned long flag) + unsigned long phys, unsigned long flag, + struct memobj *memobj, off_t offset) { struct vm_range *range; int rc; @@ -459,6 +493,8 @@ int add_process_memory_range(struct process *process, range->start = start; range->end = end; range->flag = flag; + range->memobj = memobj; + range->objoff = offset; if(range->flag & VR_DEMAND_PAGING) { dkprintf("range: 0x%lX - 0x%lX => physicall memory area is allocated on demand (%ld) [%lx]\n", @@ -477,7 +513,7 @@ int add_process_memory_range(struct process *process, rc = update_process_page_table(process, range, phys, PTATTR_UNCACHABLE); } else if(flag & VR_DEMAND_PAGING){ //demand paging no need to update process table now - kprintf("demand paging do not update process page table\n"); + dkprintf("demand paging do not update process page table\n"); rc = 0; } else if ((range->flag & VR_PROT_MASK) == VR_PROT_NONE) { rc = 0; @@ -652,6 +688,8 @@ static int page_fault_process_memory_range(struct process_vm *vm, uintptr_t phys; enum ihk_mc_pt_attribute attr; pte_t *ptep; + off_t off; + struct page *page = NULL; dkprintf("[%d]page_fault_process_memory_range(%p,%lx-%lx %lx,%lx)\n", ihk_mc_get_processor_id(), vm, range->start, @@ -662,8 +700,8 @@ static int page_fault_process_memory_range(struct process_vm *vm, /* (1) check PTE */ ptep = ihk_mc_pt_lookup_pte(vm->page_table, (void *)fault_addr, &ptepgaddr, &ptepgsize, &ptep2align); - if (ptep && (*ptep != PTE_NULL)) { - if (!(*ptep & PF_PRESENT)) { + if (ptep && !pte_is_null(ptep)) { + if (!pte_is_present(ptep)) { error = -EFAULT; kprintf("[%d]page_fault_process_memory_range" "(%p,%lx-%lx %lx,%lx):" @@ -714,12 +752,34 @@ static int page_fault_process_memory_range(struct process_vm *vm, if ((range->start <= (uintptr_t)pgaddr) && (((uintptr_t)pgaddr + pgsize) <= range->end)) { npages = pgsize / PAGE_SIZE; - virt = ihk_mc_alloc_aligned_pages(npages, p2align, - IHK_MC_AP_NOWAIT); - if (virt) { - phys = virt_to_phys(virt); - memset(virt, 0, pgsize); - break; + if (range->memobj) { + off = range->objoff + ((uintptr_t)pgaddr - range->start); + error = memobj_get_page(range->memobj, off, p2align, &phys); + if (!error) { + page = phys_to_page(phys); + break; + } + else if (error == -ERESTART) { + goto out; + } + else if (error != -ENOMEM) { + kprintf("[%d]page_fault_process_memory_range" + "(%p,%lx-%lx %lx,%lx):" + "get page failed. %d\n", + ihk_mc_get_processor_id(), vm, + range->start, range->end, + range->flag, fault_addr, error); + goto out; + } + } + else { + virt = ihk_mc_alloc_aligned_pages(npages, + p2align, IHK_MC_AP_NOWAIT); + if (virt) { + phys = virt_to_phys(virt); + memset(virt, 0, pgsize); + break; + } } } @@ -740,6 +800,10 @@ static int page_fault_process_memory_range(struct process_vm *vm, /* (5) mapping */ attr = vrflag_to_ptattr(range->flag); + if (range->memobj && (range->flag & VR_PRIVATE) && (range->flag & VR_PROT_WRITE)) { + /* for copy-on-write */ + attr &= ~PTATTR_WRITABLE; + } if (ptep) { error = ihk_mc_pt_set_pte(vm->page_table, ptep, pgsize, phys, attr); if (error) { @@ -765,11 +829,23 @@ static int page_fault_process_memory_range(struct process_vm *vm, goto out; } } - virt = NULL; + virt = NULL; /* avoid ihk_mc_free_pages() */ + page = NULL; /* avoid page_unmap() */ error = 0; out: ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + if (page) { + int need_free; + + memobj_lock(range->memobj); + need_free = page_unmap(page); + memobj_unlock(range->memobj); + + if (need_free) { + ihk_mc_free_pages(phys_to_virt(phys), npages); + } + } if (virt != NULL) { ihk_mc_free_pages(virt, npages); } @@ -779,32 +855,113 @@ out: return error; } -int page_fault_process(struct process *proc, void *fault_addr0, uint64_t reason) +static int protection_fault_process_memory_range(struct process_vm *vm, struct vm_range *range, uintptr_t fault_addr) +{ + int error; + pte_t *ptep; + void *pgaddr; + size_t pgsize; + int pgp2align; + int npages; + uintptr_t oldpa; + void *oldkva; + void *newkva; + uintptr_t newpa; + struct page *oldpage; + enum ihk_mc_pt_attribute attr; + + dkprintf("protection_fault_process_memory_range(%p,%lx-%lx %lx,%lx)\n", + vm, range->start, range->end, range->flag, fault_addr); + + if (!range->memobj) { + error = -EFAULT; + kprintf("protection_fault_process_memory_range" + "(%p,%lx-%lx %lx,%lx):no memobj. %d\n", + vm, range->start, range->end, range->flag, + fault_addr, error); + goto out; + } + + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + ptep = ihk_mc_pt_lookup_pte(vm->page_table, (void *)fault_addr, &pgaddr, &pgsize, &pgp2align); + if (!ptep || !pte_is_present(ptep)) { + error = 0; + kprintf("protection_fault_process_memory_range" + "(%p,%lx-%lx %lx,%lx):page not found. %d\n", + vm, range->start, range->end, range->flag, + fault_addr, error); + flush_tlb(); + goto out; + } + if (pgsize != PAGE_SIZE) { + panic("protection_fault_process_memory_range:NYI:cow large page"); + } + npages = 1 << pgp2align; + + oldpa = pte_get_phys(ptep); + oldkva = phys_to_virt(oldpa); + oldpage = phys_to_page(oldpa); + + if (oldpage) { + newpa = memobj_copy_page(range->memobj, oldpa, pgp2align); + newkva = phys_to_virt(newpa); + } + else { + newkva = ihk_mc_alloc_aligned_pages(npages, pgp2align, + IHK_MC_AP_NOWAIT); + if (!newkva) { + error = -ENOMEM; + kprintf("protection_fault_process_memory_range" + "(%p,%lx-%lx %lx,%lx):" + "alloc page failed. %d\n", + vm, range->start, range->end, + range->flag, fault_addr, error); + goto out; + } + + memcpy(newkva, oldkva, pgsize); + newpa = virt_to_phys(newkva); + } + + attr = vrflag_to_ptattr(range->flag); + error = ihk_mc_pt_set_pte(vm->page_table, ptep, pgsize, newpa, attr); + if (error) { + kprintf("protection_fault_process_memory_range" + "(%p,%lx-%lx %lx,%lx):set pte failed. %d\n", + vm, range->start, range->end, range->flag, + fault_addr, error); + panic("protection_fault_process_memory_range:ihk_mc_pt_set_pte failed."); + ihk_mc_free_pages(newkva, npages); + goto out; + } + flush_tlb_single(fault_addr); + + error = 0; +out: + ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + dkprintf("protection_fault_process_memory_range" + "(%p,%lx-%lx %lx,%lx): %d\n", + vm, range->start, range->end, range->flag, + fault_addr, error); + return error; +} + +static int do_page_fault_process(struct process *proc, void *fault_addr0, uint64_t reason) { struct process_vm *vm = proc->vm; int error; const uintptr_t fault_addr = (uintptr_t)fault_addr0; struct vm_range *range; - dkprintf("[%d]page_fault_process(%p,%lx,%lx)\n", + dkprintf("[%d]do_page_fault_process(%p,%lx,%lx)\n", ihk_mc_get_processor_id(), proc, fault_addr0, reason); ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); - /* NYI: page proctection fault */ - if (reason & PF_PROT) { - error = -EFAULT; - kprintf("[%d]page_fault_process(%p,%lx,%lx):" - "protection fault. %d\n", - ihk_mc_get_processor_id(), proc, - fault_addr0, reason, error); - goto out; - } - range = lookup_process_memory_range(vm, fault_addr, fault_addr+1); if (range == NULL) { error = -EFAULT; - kprintf("[%d]page_fault_process(%p,%lx,%lx):" + kprintf("[%d]do_page_fault_process(%p,%lx,%lx):" "out of range. %d\n", ihk_mc_get_processor_id(), proc, fault_addr0, reason, error); @@ -815,31 +972,69 @@ int page_fault_process(struct process *proc, void *fault_addr0, uint64_t reason) || ((reason & PF_WRITE) && !(range->flag & VR_PROT_WRITE))) { error = -EFAULT; - kprintf("[%d]page_fault_process(%p,%lx,%lx):" + kprintf("[%d]do_page_fault_process(%p,%lx,%lx):" "access denied. %d\n", ihk_mc_get_processor_id(), proc, fault_addr0, reason, error); goto out; } - error = page_fault_process_memory_range(vm, range, fault_addr); - if (error) { - kprintf("[%d]page_fault_process(%p,%lx,%lx):" - "fault range failed. %d\n", - ihk_mc_get_processor_id(), proc, - fault_addr0, reason, error); - goto out; + if (reason & PF_PROT) { + error = protection_fault_process_memory_range(vm, range, fault_addr); + if (error) { + kprintf("[%d]do_page_fault_process(%p,%lx,%lx):" + "protection range failed. %d\n", + ihk_mc_get_processor_id(), proc, + fault_addr0, reason, error); + goto out; + } + } + else { + error = page_fault_process_memory_range(vm, range, fault_addr); + if (error == -ERESTART) { + goto out; + } + if (error) { + kprintf("[%d]do_page_fault_process(%p,%lx,%lx):" + "fault range failed. %d\n", + ihk_mc_get_processor_id(), proc, + fault_addr0, reason, error); + goto out; + } } error = 0; out: ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); - dkprintf("[%d]page_fault_process(%p,%lx,%lx): %d\n", + dkprintf("[%d]do_page_fault_process(%p,%lx,%lx): %d\n", ihk_mc_get_processor_id(), proc, fault_addr0, reason, error); return error; } +int page_fault_process(struct process *proc, void *fault_addr, uint64_t reason) +{ + int error; + + if (proc != cpu_local_var(current)) { + panic("page_fault_process: other process"); + } + + for (;;) { + error = do_page_fault_process(proc, fault_addr, reason); + if (error != -ERESTART) { + break; + } + + if (proc->pgio_fp) { + (*proc->pgio_fp)(proc->pgio_arg); + proc->pgio_fp = NULL; + } + } + + return error; +} + int init_process_stack(struct process *process, struct program_load_desc *pn, int argc, char **argv, int envc, char **env) @@ -852,14 +1047,18 @@ int init_process_stack(struct process *process, struct program_load_desc *pn, unsigned long end = process->vm->region.user_end; unsigned long start = end - size; int rc; + unsigned long vrflag; if(stack == NULL) return -ENOMEM; memset(stack, 0, size); + vrflag = VR_STACK; + vrflag |= VR_PROT_READ | VR_PROT_WRITE | VR_PROT_EXEC; + vrflag |= VRFLAG_PROT_TO_MAXPROT(vrflag); if ((rc = add_process_memory_range(process, start, end, virt_to_phys(stack), - VR_STACK|VR_PROT_READ|VR_PROT_WRITE)) != 0) { + vrflag, NULL, 0)) != 0) { ihk_mc_free_pages(stack, USER_STACK_NR_PAGES); return rc; } @@ -989,7 +1188,7 @@ unsigned long extend_process_region(struct process *proc, } } if((rc = add_process_memory_range(proc, aligned_end, aligned_new_end, - (p==0?0:virt_to_phys(p)), flag)) != 0){ + (p==0?0:virt_to_phys(p)), flag, NULL, 0)) != 0){ free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT); return end; } @@ -1014,6 +1213,32 @@ int remove_process_region(struct process *proc, return 0; } +void flush_process_memory(struct process *proc) +{ + struct process_vm *vm = proc->vm; + struct vm_range *range; + struct vm_range *next; + int error; + + dkprintf("flush_process_memory(%p)\n", proc); + ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); + list_for_each_entry_safe(range, next, &vm->vm_range_list, list) { + if (range->memobj) { + // XXX: temporary of temporary + error = free_process_memory_range(vm, range); + if (error) { + ekprintf("flush_process_memory(%p):" + "free range failed. %lx-%lx %d\n", + proc, range->start, range->end, error); + /* through */ + } + } + } + ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); + dkprintf("flush_process_memory(%p):\n", proc); + return; +} + void free_process_memory(struct process *proc) { struct vm_range *range, *next; diff --git a/kernel/syscall.c b/kernel/syscall.c index bb654917..32eeb08a 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include /* Headers taken from kitten LWK */ #include @@ -210,6 +212,7 @@ terminate(int rc, int sig, ihk_mc_user_context_t *ctx) /* XXX: send SIGKILL to all threads in this process */ + flush_process_memory(proc); /* temporary hack */ do_syscall(&request, ctx); #define IS_DETACHED_PROCESS(proc) (1) /* should be implemented in the future */ @@ -370,10 +373,13 @@ SYSCALL_DECLARE(mmap) int error; intptr_t npages; int p2align; - void *p; + void *p = NULL; int vrflags; intptr_t phys; int unmapped = 0; + struct memobj *memobj = NULL; + int maxprot; + int denied; dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n", ihk_mc_get_processor_id(), @@ -462,6 +468,7 @@ SYSCALL_DECLARE(mmap) /* do the map */ vrflags = VR_NONE; vrflags |= PROT_TO_VR_FLAG(prot); + vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0; if (flags & MAP_ANONYMOUS) { if (0) { /* dummy */ @@ -476,10 +483,21 @@ SYSCALL_DECLARE(mmap) vrflags |= VR_DEMAND_PAGING; } } + else { + vrflags |= VR_DEMAND_PAGING; + } - p = NULL; phys = 0; - if (!(vrflags & VR_DEMAND_PAGING) + maxprot = PROT_READ | PROT_WRITE | PROT_EXEC; + if (!(flags & MAP_ANONYMOUS)) { + error = fileobj_create(fd, &memobj, &maxprot); + if (error) { + ekprintf("sys_mmap:fileobj_create failed. %d\n", error); + ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + goto out; + } + } + else if (!(vrflags & VR_DEMAND_PAGING) && ((vrflags & VR_PROT_MASK) != VR_PROT_NONE)) { npages = len >> PAGE_SHIFT; p2align = PAGE_P2ALIGN; @@ -500,48 +518,41 @@ SYSCALL_DECLARE(mmap) phys = virt_to_phys(p); } - error = add_process_memory_range(proc, addr, addr+len, phys, vrflags); + if ((flags & MAP_PRIVATE) && (maxprot & PROT_READ)) { + maxprot |= PROT_WRITE; + } + denied = prot & ~maxprot; + if (denied) { + ekprintf("sys_mmap:denied %x. %x %x\n", denied, prot, maxprot); + ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + error = (denied == PROT_EXEC)? -EPERM: -EACCES; + goto out; + } + vrflags |= VRFLAG_PROT_TO_MAXPROT(PROT_TO_VR_FLAG(maxprot)); + + error = add_process_memory_range(proc, addr, addr+len, phys, vrflags, memobj, off); if (error) { ekprintf("sys_mmap:add_process_memory_range" "(%p,%lx,%lx,%lx,%lx) failed %d\n", proc, addr, addr+len, virt_to_phys(p), vrflags, error); ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); - if (p != NULL) { - ihk_mc_free_pages(p, npages); - } goto out; } ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); - /* read page with pread64() */ - if (!(flags & MAP_ANONYMOUS)) { - ihk_mc_user_context_t ctx2; - ssize_t ss; - - ihk_mc_syscall_arg0(&ctx2) = fd; - ihk_mc_syscall_arg1(&ctx2) = addr; - ihk_mc_syscall_arg2(&ctx2) = len; - ihk_mc_syscall_arg3(&ctx2) = off; - - ss = syscall_generic_forwarding(__NR_pread64, &ctx2); - if (ss < 0) { - ekprintf("sys_mmap:pread(%d,%lx,%lx,%lx) failed %ld\n", - fd, addr, len, off, (long)ss); - error = do_munmap((void *)addr, len); - if (error) { - ekprintf("sys_mmap:do_munmap(%lx,%lx) failed. %d\n", - addr, len, error); - /* through */ - } - error = ss; - goto out; - } - } - error = 0; + p = NULL; + memobj = NULL; + out: + if (p) { + ihk_mc_free_pages(p, npages); + } + if (memobj) { + memobj_release(memobj); + } if (unmapped) { clear_host_pte(addr, len); } @@ -600,6 +611,7 @@ SYSCALL_DECLARE(mprotect) int error; struct vm_range *changed; const unsigned long protflags = PROT_TO_VR_FLAG(prot); + unsigned long denied; dkprintf("[%d]sys_mprotect(%lx,%lx,%x)\n", ihk_mc_get_processor_id(), start, len0, prot); @@ -675,6 +687,14 @@ SYSCALL_DECLARE(mprotect) goto out; } + denied = protflags & ~VRFLAG_MAXPROT_TO_PROT(range->flag); + if (denied) { + ekprintf("sys_mprotect(%lx,%lx,%x):denied %lx. %lx %lx\n", + start, len0, prot, denied, protflags, range->flag); + error = -EACCES; + goto out; + } + if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) { ekprintf("sys_mprotect(%lx,%lx,%x):cannot change\n", start, len0, prot); @@ -736,6 +756,7 @@ SYSCALL_DECLARE(brk) unsigned long address = ihk_mc_syscall_arg0(ctx); struct vm_regions *region = &cpu_local_var(current)->vm->region; unsigned long r; + unsigned long vrflag; dkprintf("SC(%d)[sys_brk] brk_start=%lx,end=%lx\n", ihk_mc_get_processor_id(), region->brk_start, region->brk_end); @@ -753,10 +774,11 @@ SYSCALL_DECLARE(brk) } /* try to extend memory region */ + vrflag = VR_PROT_READ | VR_PROT_WRITE; + vrflag |= VRFLAG_PROT_TO_MAXPROT(vrflag); ihk_mc_spinlock_lock_noirq(&cpu_local_var(current)->vm->memory_range_lock); region->brk_end = extend_process_region(cpu_local_var(current), - region->brk_start, region->brk_end, address, - VR_PROT_READ|VR_PROT_WRITE); + region->brk_start, region->brk_end, address, vrflag); ihk_mc_spinlock_unlock_noirq(&cpu_local_var(current)->vm->memory_range_lock); dkprintf("SC(%d)[sys_brk] brk_end set to %lx\n", ihk_mc_get_processor_id(), region->brk_end);