From f0a52d4519a442c5a8968d92f8ede471bed3871d Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Fri, 24 Jan 2014 21:27:18 +0900 Subject: [PATCH] implement mmap(MAP_POPULATE) populate_process_memory() function is not efficient, because whether every small page is present is checked. --- arch/x86/kernel/include/arch-memory.h | 6 +++ arch/x86/kernel/include/registers.h | 5 +++ kernel/include/process.h | 1 + kernel/process.c | 58 +++++++++++++++++++++++++++ kernel/syscall.c | 24 ++++++++++- 5 files changed, 93 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 26ec9681..aea5691d 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -62,6 +62,7 @@ #define PT_PHYSMASK (((1UL << 52) - 1) & PAGE_MASK) #define PF_PRESENT ((pte_t)0x01) /* entry is valid */ +#define PF_WRITABLE ((pte_t)0x02) #define PF_SIZE ((pte_t)0x80) /* entry points large page */ #define PFL4_PRESENT ((pte_t)0x01) @@ -130,6 +131,11 @@ static inline int pte_is_present(pte_t *ptep) return !!(*ptep & PF_PRESENT); } +static inline int pte_is_writable(pte_t *ptep) +{ + return !!(*ptep & PF_WRITABLE); +} + static inline uintptr_t pte_get_phys(pte_t *ptep) { return (*ptep & PT_PHYSMASK); diff --git a/arch/x86/kernel/include/registers.h b/arch/x86/kernel/include/registers.h index 601e4ecc..c12925b6 100644 --- a/arch/x86/kernel/include/registers.h +++ b/arch/x86/kernel/include/registers.h @@ -151,6 +151,9 @@ struct x86_regs { * bit 2 == 0: kernel-mode access 1: user-mode access * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch + * + * internal use: + * bit 30 == 1: don't use COW page to resolve page fault. 
*/ enum x86_pf_error_code { PF_PROT = 1 << 0, @@ -158,6 +161,8 @@ enum x86_pf_error_code { PF_USER = 1 << 2, PF_RSVD = 1 << 3, PF_INSTR = 1 << 4, + + PF_DONTCOW = 1 << 30, }; #endif diff --git a/kernel/include/process.h b/kernel/include/process.h index 1db107eb..c3ffd457 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -163,6 +163,7 @@ void hold_process(struct process *proc); void free_process(struct process *proc); void flush_process_memory(struct process *proc); void free_process_memory(struct process *proc); +int populate_process_memory(struct process *proc, void *start, size_t len); int add_process_memory_range(struct process *process, unsigned long start, unsigned long end, diff --git a/kernel/process.c b/kernel/process.c index 5afc144b..06d9631d 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -1037,6 +1037,41 @@ static int do_page_fault_process(struct process *proc, void *fault_addr0, uint64 goto out; } } + else if (reason & PF_DONTCOW) { + pte_t *ptep; + void *ptepgaddr; + size_t ptepgsize; + int ptep2align; + + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + ptep = ihk_mc_pt_lookup_pte(vm->page_table, fault_addr0, + &ptepgaddr, &ptepgsize, &ptep2align); + ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + + if (!ptep || pte_is_null(ptep)) { + error = page_fault_process_memory_range(vm, range, fault_addr); + if (error == -ERESTART) { + goto out; + } + else if (error) { + kprintf("[%d]do_page_fault_process(%p,%lx,%lx):" + "fault range failed. %d\n", + ihk_mc_get_processor_id(), proc, + fault_addr0, reason, error); + goto out; + } + } + else if (!pte_is_writable(ptep) && (range->flag & VR_PROT_WRITE)) { + error = protection_fault_process_memory_range(vm, range, fault_addr); + if (error) { + kprintf("[%d]do_page_fault_process(%p,%lx,%lx):" + "protection range failed. 
%d\n", + ihk_mc_get_processor_id(), proc, + fault_addr0, reason, error); + goto out; + } + } + } else { error = page_fault_process_memory_range(vm, range, fault_addr); if (error == -ERESTART) { @@ -1345,6 +1380,29 @@ void free_process_memory(struct process *proc) free_process(vm->owner_process); } +int populate_process_memory(struct process *proc, void *start, size_t len) +{ + int error; + const int reason = PF_USER | PF_DONTCOW; + uintptr_t end; + uintptr_t addr; + + end = (uintptr_t)start + len; + for (addr = (uintptr_t)start; addr < end; addr += PAGE_SIZE) { + error = page_fault_process(proc, (void *)addr, reason); + if (error) { + ekprintf("populate_process_range:page_fault_process" + "(%p,%lx,%lx) failed %d\n", + proc, addr, reason, error); + goto out; + } + } + + error = 0; +out: + return error; +} + void hold_process(struct process *proc) { if (proc->status & (PS_ZOMBIE | PS_EXITED)) { diff --git a/kernel/syscall.c b/kernel/syscall.c index 60c95959..7c38a89b 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -404,6 +404,7 @@ SYSCALL_DECLARE(mmap) | MAP_PRIVATE // 02 | MAP_FIXED // 10 | MAP_ANONYMOUS // 20 + | MAP_POPULATE // 8000 ; const int ignored_flags = 0 #ifdef USE_NOCACHE_MMAP @@ -420,7 +421,6 @@ SYSCALL_DECLARE(mmap) | MAP_GROWSDOWN // 0100 | MAP_EXECUTABLE // 1000 | MAP_LOCKED // 2000 - | MAP_POPULATE // 8000 | MAP_NONBLOCK // 00010000 | MAP_HUGETLB // 00040000 ; @@ -612,6 +612,28 @@ out: } ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + if (!error && (flags & MAP_POPULATE)) { + error = populate_process_memory(proc, (void *)addr, len); + if (error) { + ekprintf("sys_mmap:populate_process_memory" + "(%p,%p,%lx) failed %d\n", + proc, (void *)addr, len, error); + /* + * In this case, + * the mapping established by this call should be unmapped + * before mmap() returns with error. 
+ * + * However, the mapping cannot be unmapped simply, + * because the mapping can be modified by another thread + * because memory_range_lock has been released. + * + * For the moment, like linux-2.6.38-8, + * the physical page allocation failure is ignored. + */ + error = 0; + } + } + out2: if (p) { ihk_mc_free_pages(p, npages);