diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 9fda8e41..dc75673a 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -185,6 +185,28 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize) return (off_t)(*ptep & PAGE_MASK); } +static inline void pte_make_fileoff(off_t off, + enum ihk_mc_pt_attribute ptattr, size_t pgsize, pte_t *ptep) +{ /* Stores into *ptep an entry made of (off & PAGE_MASK) combined with attribute bits: the ptattr bits outside PAGE_MASK and a FILEOFF flag chosen by pgsize. Returns nothing; *ptep is overwritten unconditionally. */ + uint64_t attr; + /* keep only the ptattr bits outside PAGE_MASK, so they cannot overlap the offset bits stored below */ + attr = ptattr & ~PAGE_MASK; + /* choose the FILEOFF flag by page size; the PTL2 and PTL3 sizes also get the matching SIZE flag */ + switch (pgsize) { + case PTL1_SIZE: attr |= PFL1_FILEOFF; break; + case PTL2_SIZE: attr |= PFL2_FILEOFF | PFL2_SIZE; break; + case PTL3_SIZE: attr |= PFL3_FILEOFF | PFL3_SIZE; break; + default: /* pgsize matches none of the PTLn_SIZE values: panic() is compiled out below, so PTATTR_FILEOFF is set instead */ +#if 0 /* XXX: workaround. cannot use panic() here */ + panic("pte_make_fileoff"); +#else + attr |= PTATTR_FILEOFF; +#endif + break; + } + *ptep = (off & PAGE_MASK) | attr; +} + #if 0 /* XXX: workaround. cannot use panic() here */ static inline void pte_xchg(pte_t *ptep, pte_t *valp) { diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h index 77f8b6ec..cb246049 100644 --- a/arch/x86/kernel/include/syscall_list.h +++ b/arch/x86/kernel/include/syscall_list.h @@ -75,6 +75,7 @@ SYSCALL_DELEGATED(201, time) SYSCALL_HANDLED(202, futex) SYSCALL_HANDLED(203, sched_setaffinity) SYSCALL_HANDLED(204, sched_getaffinity) +SYSCALL_HANDLED(216, remap_file_pages) /* entry for syscall number 216; its body is SYSCALL_DECLARE(remap_file_pages) in kernel/syscall.c */ SYSCALL_DELEGATED(217, getdents64) SYSCALL_HANDLED(218, set_tid_address) SYSCALL_HANDLED(231, exit_group) diff --git a/kernel/include/process.h b/kernel/include/process.h index 5bc99fea..2688c887 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -228,6 +228,8 @@ int join_process_memory_range(struct process *process, struct vm_range *survivin int change_prot_process_memory_range( struct process *process, struct vm_range *range, unsigned long newflag); +int remap_process_memory_range(struct process_vm *vm, struct vm_range *range, + uintptr_t start, uintptr_t end, off_t off); /* defined in kernel/process.c; returns 0 or the error reported by visit_pte_range() */ struct 
vm_range *lookup_process_memory_range( struct process_vm *vm, uintptr_t start, uintptr_t end); struct vm_range *next_process_memory_range( diff --git a/kernel/process.c b/kernel/process.c index 373d3b71..174a1181 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -997,6 +997,94 @@ out: return error; } +struct rfp_args { /* context handed to remap_one_page() as arg0 by remap_process_memory_range() */ + off_t off; /* offset that corresponds to the address held in start */ + uintptr_t start; /* address that corresponds to off */ + struct memobj *memobj; /* passed to memobj_flush_page() for dirty pages */ +}; + +static int remap_one_page(void *arg0, page_table_t pt, pte_t *ptep, + void *pgaddr, size_t pgsize) +{ /* Callback given to visit_pte_range(): exchanges *ptep for a file-offset entry describing pgaddr, then disposes of whatever the old entry referred to. Returns 0 on success, or -E2BIG when pgsize is not PAGE_SIZE. NOTE(review): on -E2BIG the entries replaced by earlier calls in the same walk are not restored here. */ + struct rfp_args * const args = arg0; + int error; + off_t off; + pte_t apte; + uintptr_t phys; + struct page *page; + + dkprintf("remap_one_page(%p,%p,%p %#lx,%p,%#lx)\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize); + + /* XXX: NYI: large pages */ + if (pgsize != PAGE_SIZE) { + error = -E2BIG; + ekprintf("remap_one_page(%p,%p,%p %#lx,%p,%#lx):%d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + goto out; + } + /* offset for this page: args->off plus the distance of pgaddr from args->start */ + off = args->off + ((uintptr_t)pgaddr - args->start); + pte_make_fileoff(off, 0, pgsize, &apte); + /* exchange the new entry with *ptep; from here on apte is examined as the previous entry (presumably what pte_xchg() leaves in it; confirm against its definition) */ + pte_xchg(ptep, &apte); + flush_tlb_single((uintptr_t)pgaddr); /* XXX: TLB flush */ + /* previous entry was null or already a file-offset entry: no physical page to dispose of */ + if (pte_is_null(&apte) || pte_is_fileoff(&apte, pgsize)) { + error = 0; + goto out; + } + phys = pte_get_phys(&apte); + /* a dirty previous entry is handed to memobj_flush_page() before the page is dropped */ + if (pte_is_dirty(&apte, pgsize)) { + memobj_flush_page(args->memobj, phys, pgsize); /* XXX: in lock period */ + } + /* free the physical page only when phys_to_page() finds a struct page and page_unmap() returns non-zero */ + page = phys_to_page(phys); + if (page && page_unmap(page)) { + ihk_mc_free_pages(phys_to_virt(phys), pgsize/PAGE_SIZE); + } + + error = 0; +out: + dkprintf("remap_one_page(%p,%p,%p %#lx,%p,%#lx): %d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + return error; +} + +int remap_process_memory_range(struct process_vm *vm, struct vm_range *range, + uintptr_t start, uintptr_t end, off_t off) +{ /* Runs remap_one_page() over the PTEs that visit_pte_range() visits between start and end in vm->page_table, with off as the offset for the page at start. Holds vm->page_table_lock and the lock of range->memobj for the whole walk. Returns 0, or the non-zero value returned by visit_pte_range(). */ + struct rfp_args args; + int error; + + dkprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx)\n", + vm, range, start, end, off); + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + memobj_lock(range->memobj); + /* fill in the context that the callback receives as arg0 */ + args.start = start; + args.off = off; 
+ args.memobj = range->memobj; + /* walk the page table from start to end, calling remap_one_page() with &args */ + error = visit_pte_range(vm->page_table, (void *)start, + (void *)end, VPTEF_DEFAULT, &remap_one_page, &args); + if (error) { + ekprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx):" + "visit pte failed %d\n", + vm, range, start, end, off, error); + goto out; + } + + error = 0; +out: /* both paths arrive here; the locks are released in the reverse order of acquisition */ + memobj_unlock(range->memobj); + ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + dkprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx):%d\n", + vm, range, start, end, off, error); + return error; +} + static int page_fault_process_memory_range(struct process_vm *vm, struct vm_range *range, uintptr_t fault_addr, uint64_t reason) { int error; diff --git a/kernel/syscall.c b/kernel/syscall.c index f2ccc583..6d86ff3d 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -2271,6 +2271,80 @@ out2: return error; } +SYSCALL_DECLARE(remap_file_pages) +{ /* Handler for remap_file_pages; syscall arguments 0 to 4 are read as start, size, prot, pgoff and flags. It validates the arguments, requires one non-private range with a memobj that contains the whole request, rewrites the PTEs through remap_process_memory_range(), and populates afterwards when the range has VR_LOCKED. Returns 0, -EINVAL, or the error from remap_process_memory_range(); a populate failure is logged and does not change the return value. */ + const uintptr_t start0 = ihk_mc_syscall_arg0(ctx); + const size_t size = ihk_mc_syscall_arg1(ctx); + const int prot = ihk_mc_syscall_arg2(ctx); + const size_t pgoff = ihk_mc_syscall_arg3(ctx); /* NOTE(review): size and pgoff are size_t, so the (pgoff < 0) test below can never be true and (size <= 0) only catches size == 0 */ + const int flags = ihk_mc_syscall_arg4(ctx); /* only printed in the log messages; not otherwise used in this function */ + int error; + const uintptr_t start = start0 & PAGE_MASK; /* start0 masked with PAGE_MASK; no error is raised when start0 differs from start */ + const uintptr_t end = start + size; /* may wrap around; checked in the argument test below */ + const off_t off = (off_t)pgoff << PAGE_SHIFT; /* pgoff shifted left by PAGE_SHIFT, i.e. page count to byte offset */ + struct process * const proc = cpu_local_var(current); + struct vm_range *range; + int er; + int need_populate = 0; + + dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x)\n", + start0, size, prot, pgoff, flags); + ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); +#define PGOFF_LIMIT ((off_t)1 << ((8*sizeof(off_t) - 1) - PAGE_SHIFT)) /* smallest pgoff for which pgoff << PAGE_SHIFT no longer fits below the top bit of off_t (assuming 8 bits per byte of off_t); the macro is not undefined afterwards */ + if ((size <= 0) || (size & (PAGE_SIZE - 1)) || (prot != 0) + || (pgoff < 0) || (PGOFF_LIMIT <= pgoff) + || ((PGOFF_LIMIT - pgoff) < (size / PAGE_SIZE)) + || !((start < end) || (end == 0))) { /* rejects: zero or non-page-multiple size, non-zero prot, pgoff at or beyond PGOFF_LIMIT, a page count that would run past PGOFF_LIMIT, and an end that wrapped around (an end of exactly 0 is accepted) */ + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "invalid args\n", + start0, size, prot, pgoff, flags); + error = -EINVAL; + goto out; + } + + range = 
lookup_process_memory_range(proc->vm, start, end); + if (!range || (start < range->start) || (range->end < end) + || (range->flag & VR_PRIVATE) + || !range->memobj) { /* the request must lie entirely inside the range that was found, and that range must not be VR_PRIVATE and must have a memobj */ + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "invalid VMR:[%#lx-%#lx) %#lx %p\n", + start0, size, prot, pgoff, flags, + range?range->start:0, range?range->end:0, + range?range->flag:0, range?range->memobj:NULL); + error = -EINVAL; + goto out; + } + /* replace the PTEs between start and end with file-offset entries beginning at off */ + error = remap_process_memory_range(proc->vm, range, start, end, off); + if (error) { + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "remap failed %d\n", + start0, size, prot, pgoff, flags, error); + goto out; + } + clear_host_pte(start, size); /* XXX: workaround */ + /* remember that populate is wanted; it is performed only after memory_range_lock is released */ + if (range->flag & VR_LOCKED) { + need_populate = 1; + } + error = 0; +out: + ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + /* need_populate stays 0 on every error path, so populate runs only after a successful remap of a VR_LOCKED range; its result goes to er, not error */ + if (need_populate + && (er = populate_process_memory( + proc, (void *)start, size))) { + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "populate failed %d\n", + start0, size, prot, pgoff, flags, er); + /* ignore populate error */ + } + + dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x): %d\n", + start0, size, prot, pgoff, flags, error); + return error; +} + #ifdef DCFA_KMOD #ifdef CMD_DCFA