From d2e29bf598cca70553ddf54e6f9506b49f5ce420 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Mon, 23 Jun 2014 18:04:24 +0900 Subject: [PATCH 01/23] add visit_pte_range() refs #21 refs #22 --- arch/x86/kernel/memory.c | 150 +++++++++++++++++++++++++++++++++++++++ lib/include/ihk/mm.h | 11 +++ 2 files changed, 161 insertions(+) diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index dfacf1a6..97f02658 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -833,6 +833,156 @@ static int split_large_page(pte_t *ptep) return 0; } +struct visit_pte_args { + page_table_t pt; + enum visit_pte_flag flags; + int padding; + pte_visitor_t *funcp; + void *arg; +}; + +static int visit_pte_l1(void *arg0, pte_t *ptep, uintptr_t base, + uintptr_t start, uintptr_t end) +{ + struct visit_pte_args *args = arg0; + + if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) { + return 0; + } + + return (*args->funcp)(args->arg, args->pt, ptep, (void *)base, + PTL1_SIZE); +} + +static int visit_pte_l2(void *arg0, pte_t *ptep, uintptr_t base, + uintptr_t start, uintptr_t end) +{ + int error; + struct visit_pte_args *args = arg0; + struct page_table *pt; + + if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) { + return 0; + } + +#ifdef USE_LARGE_PAGES + if (((*ptep == PTE_NULL) || (*ptep & PFL2_SIZE)) + && (start <= base) + && (((base + PTL2_SIZE) <= end) + || (end == 0))) { + error = (*args->funcp)(args->arg, args->pt, ptep, + (void *)base, PTL2_SIZE); + if (error != -E2BIG) { + return error; + } + } + + if (*ptep & PFL2_SIZE) { + ekprintf("visit_pte_l2:split large page\n"); + return -ENOMEM; + } +#endif + + if (*ptep == PTE_NULL) { + pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (!pt) { + return -ENOMEM; + } + *ptep = virt_to_phys(pt) | PFL2_PDIR_ATTR; + } + else { + pt = phys_to_virt(*ptep & PT_PHYSMASK); + } + + error = walk_pte_l1(pt, base, start, end, &visit_pte_l1, arg0); + return error; +} + +static int visit_pte_l3(void *arg0, pte_t *ptep, uintptr_t base, + uintptr_t start, uintptr_t end) +{ + int error; + struct visit_pte_args *args = arg0; + struct page_table *pt; + + if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) { + return 0; + } + +#ifdef USE_LARGE_PAGES + if (((*ptep == PTE_NULL) || (*ptep & PFL3_SIZE)) + && (start <= base) + && (((base + PTL3_SIZE) <= end) + || (end == 0))) { + error = (*args->funcp)(args->arg, args->pt, ptep, + (void *)base, PTL3_SIZE); + if (error != -E2BIG) { + return error; + } + } + + if (*ptep & PFL3_SIZE) { + ekprintf("visit_pte_l3:split large page\n"); + return -ENOMEM; + } +#endif + + if (*ptep == PTE_NULL) { + pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (!pt) { + return -ENOMEM; + } + *ptep = virt_to_phys(pt) | PFL3_PDIR_ATTR; + } + else { + pt = phys_to_virt(*ptep & PT_PHYSMASK); + } + + error = walk_pte_l2(pt, base, start, end, &visit_pte_l2, arg0); + return error; +} + +static int visit_pte_l4(void *arg0, pte_t *ptep, uintptr_t base, + uintptr_t start, uintptr_t end) +{ + int error; + struct visit_pte_args *args = arg0; + struct page_table *pt; + + if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) { + return 0; + } + + if (*ptep == PTE_NULL) { + pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (!pt) { + return -ENOMEM; + } + *ptep = virt_to_phys(pt) | PFL4_PDIR_ATTR; + } + else { + pt = phys_to_virt(*ptep & PT_PHYSMASK); + } + + error = walk_pte_l3(pt, base, start, end, &visit_pte_l3, arg0); + return error; +} + +int visit_pte_range(page_table_t pt, void *start0, void *end0, + enum visit_pte_flag flags, 
pte_visitor_t *funcp, void *arg) +{ + const uintptr_t start = (uintptr_t)start0; + const uintptr_t end = (uintptr_t)end0; + struct visit_pte_args args; + + args.pt = pt; + args.flags = flags; + args.funcp = funcp; + args.arg = arg; + + return walk_pte_l4(pt, 0, start, end, &visit_pte_l4, &args); +} + struct clear_range_args { int free_physical; uint8_t padding[4]; diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h index a5bb214a..c8970f5a 100644 --- a/lib/include/ihk/mm.h +++ b/lib/include/ihk/mm.h @@ -47,6 +47,12 @@ enum ihk_mc_pt_prepare_flag { IHK_MC_PT_LAST_LEVEL, }; +enum visit_pte_flag { + VPTEF_SKIP_NULL = 0x0001, /* skip null PTEs */ + + VPTEF_DEFAULT = 0, +}; + struct ihk_mc_memory_area { unsigned long start; unsigned long size; @@ -123,6 +129,11 @@ int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize, uintptr_t phy int ihk_mc_pt_prepare_map(page_table_t pt, void *virt, unsigned long size, enum ihk_mc_pt_prepare_flag); +typedef int pte_visitor_t(void *arg, page_table_t pt, pte_t *ptep, + void *pgaddr, size_t pgsize); +int visit_pte_range(page_table_t pt, void *start, void *end, + enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg); + struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag); /* XXX: proper use of struct page_table and page_table_t is unknown */ void ihk_mc_pt_destroy(struct page_table *pt); From c395dc2410b2a0e42f81693c62a6c2fb8b75827e Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Mon, 23 Jun 2014 18:28:54 +0900 Subject: [PATCH 02/23] add pte_is_dirty() --- arch/x86/kernel/include/arch-memory.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 26752e0a..8704299a 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -138,6 +138,21 @@ static inline int pte_is_writable(pte_t *ptep) return !!(*ptep & PF_WRITABLE); } +static inline int pte_is_dirty(pte_t *ptep, size_t pgsize) +{ + switch (pgsize) { + case PTL1_SIZE: return !!(*ptep & PFL1_DIRTY); + case PTL2_SIZE: return !!(*ptep & PFL2_DIRTY); + case PTL3_SIZE: return !!(*ptep & PFL3_DIRTY); + default: +#if 0 /* XXX: workaround. cannot use panic() here */ + panic("pte_is_dirty"); +#else + return !!(*ptep & PTATTR_DIRTY); +#endif + } +} + static inline uintptr_t pte_get_phys(pte_t *ptep) { return (*ptep & PT_PHYSMASK); From 04f1b3f401a1d2ce2b357a1e7987b9707bb9698b Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Tue, 24 Jun 2014 12:10:17 +0900 Subject: [PATCH 03/23] add pte_xchg() --- arch/x86/kernel/include/arch-memory.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 8704299a..1181b0e3 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -158,6 +158,15 @@ static inline uintptr_t pte_get_phys(pte_t *ptep) return (*ptep & PT_PHYSMASK); } +#if 0 /* XXX: workaround. 
cannot use panic() here */ +static inline void pte_xchg(pte_t *ptep, pte_t *valp) +{ + *valp = xchg(ptep, *valp); +} +#else +#define pte_xchg(p,vp) do { *(vp) = xchg((p), *(vp)); } while (0) +#endif + struct page_table; void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr); pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr); From bb3756dc7403250d3adf8d26e6cd72ab2ea8b4d3 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Mon, 23 Jun 2014 18:37:19 +0900 Subject: [PATCH 04/23] add fileoff-type PTE fileoff-type PTE holds a file offset which the page data should be loaded from. refs #21 --- arch/x86/kernel/include/arch-memory.h | 33 +++++++++++++++++++-- arch/x86/kernel/memory.c | 42 ++++++++++++++++++++------- kernel/process.c | 12 ++++++-- 3 files changed, 70 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 1181b0e3..9fda8e41 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -78,6 +78,8 @@ #define PFL3_DIRTY ((pte_t)0x40) #define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */ #define PFL3_GLOBAL ((pte_t)0x100) +#define PFL3_IGNORED_11 ((pte_t)1 << 11) +#define PFL3_FILEOFF PFL3_IGNORED_11 #define PFL2_PRESENT ((pte_t)0x01) #define PFL2_WRITABLE ((pte_t)0x02) @@ -88,6 +90,8 @@ #define PFL2_DIRTY ((pte_t)0x40) #define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */ #define PFL2_GLOBAL ((pte_t)0x100) +#define PFL2_IGNORED_11 ((pte_t)1 << 11) +#define PFL2_FILEOFF PFL2_IGNORED_11 #define PFL1_PRESENT ((pte_t)0x01) #define PFL1_WRITABLE ((pte_t)0x02) @@ -96,6 +100,8 @@ #define PFL1_PCD ((pte_t)0x10) #define PFL1_ACCESSED ((pte_t)0x20) #define PFL1_DIRTY ((pte_t)0x40) +#define PFL1_IGNORED_11 ((pte_t)1 << 11) +#define PFL1_FILEOFF PFL1_IGNORED_11 /* We allow user programs to access all the memory */ #define PFL4_KERN_ATTR (PFL4_PRESENT | PFL4_WRITABLE) @@ -108,6 +114,9 @@ #define PFL3_PDIR_ATTR (PFL3_PRESENT | PFL3_WRITABLE | PFL3_USER) #define PFL2_PDIR_ATTR (PFL2_PRESENT | PFL2_WRITABLE | PFL2_USER) +#define PTE_NULL ((pte_t)0) +typedef unsigned long pte_t; + /* For easy conversion, it is better to be the same as architecture's ones */ enum ihk_mc_pt_attribute { PTATTR_ACTIVE = 0x01, @@ -115,14 +124,12 @@ enum ihk_mc_pt_attribute { PTATTR_USER = 0x04, PTATTR_DIRTY = 0x40, PTATTR_LARGEPAGE = 0x80, + PTATTR_FILEOFF = PFL2_FILEOFF, PTATTR_NO_EXECUTE = 0x8000000000000000, PTATTR_UNCACHABLE = 0x10000, PTATTR_FOR_USER = 0x20000, }; -#define PTE_NULL ((pte_t)0) -typedef unsigned long pte_t; - static inline int pte_is_null(pte_t *ptep) { return (*ptep == PTE_NULL); @@ -153,11 +160,31 @@ static inline int pte_is_dirty(pte_t *ptep, size_t pgsize) } } +static inline int pte_is_fileoff(pte_t *ptep, size_t pgsize) +{ + switch (pgsize) { + case PTL1_SIZE: return !!(*ptep & PFL1_FILEOFF); + case PTL2_SIZE: return !!(*ptep & PFL2_FILEOFF); + case PTL3_SIZE: return !!(*ptep & PFL3_FILEOFF); + default: +#if 0 /* XXX: workaround. cannot use panic() here */ + panic("pte_is_fileoff"); +#else + return !!(*ptep & PTATTR_FILEOFF); +#endif + } +} + static inline uintptr_t pte_get_phys(pte_t *ptep) { return (*ptep & PT_PHYSMASK); } +static inline off_t pte_get_off(pte_t *ptep, size_t pgsize) +{ + return (off_t)(*ptep & PAGE_MASK); +} + #if 0 /* XXX: workaround. 
cannot use panic() here */ static inline void pte_xchg(pte_t *ptep, pte_t *valp) { diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index 97f02658..d1118c00 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -219,7 +219,13 @@ static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag) * but L2 and L1 do not! */ -static enum ihk_mc_pt_attribute attr_mask = PTATTR_WRITABLE | PTATTR_USER | PTATTR_ACTIVE; +static enum ihk_mc_pt_attribute attr_mask + = 0 + | PTATTR_FILEOFF + | PTATTR_WRITABLE + | PTATTR_USER + | PTATTR_ACTIVE + | 0; #define ATTR_MASK attr_mask void enable_ptattr_no_execute(void) @@ -523,6 +529,7 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt) if (!(pt->entry[l4idx] & PFL4_PRESENT)) { __kprintf("0x%lX l4idx not present! \n", (unsigned long)virt); + __kprintf("l4 entry: 0x%lX\n", pt->entry[l4idx]); return -EFAULT; } pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK); @@ -530,6 +537,7 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt) __kprintf("l3 table: 0x%lX l3idx: %d \n", virt_to_phys(pt), l3idx); if (!(pt->entry[l3idx] & PFL3_PRESENT)) { __kprintf("0x%lX l3idx not present! \n", (unsigned long)virt); + __kprintf("l3 entry: 0x%lX\n", pt->entry[l3idx]); return -EFAULT; } pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK); @@ -537,6 +545,7 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt) __kprintf("l2 table: 0x%lX l2idx: %d \n", virt_to_phys(pt), l2idx); if (!(pt->entry[l2idx] & PFL2_PRESENT)) { __kprintf("0x%lX l2idx not present! \n", (unsigned long)virt); + __kprintf("l2 entry: 0x%lX\n", pt->entry[l2idx]); return -EFAULT; } if ((pt->entry[l2idx] & PFL2_SIZE)) { @@ -546,11 +555,12 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt) __kprintf("l1 table: 0x%lX l1idx: %d \n", virt_to_phys(pt), l1idx); if (!(pt->entry[l1idx] & PFL1_PRESENT)) { - __kprintf("0x%lX PTE (l1) not present! entry: 0x%lX\n", - (unsigned long)virt, pt->entry[l1idx]); + __kprintf("0x%lX l1idx not present! 
\n", (unsigned long)virt); + __kprintf("l1 entry: 0x%lX\n", pt->entry[l1idx]); return -EFAULT; } + __kprintf("l1 entry: 0x%lX\n", pt->entry[l1idx]); return 0; } @@ -822,8 +832,16 @@ static int split_large_page(pte_t *ptep) return -ENOMEM; } - phys = *ptep & PT_PHYSMASK; - attr = *ptep & ~PFL2_SIZE; + if (!(*ptep & PFL2_FILEOFF)) { + phys = *ptep & PT_PHYSMASK; + attr = *ptep & ~PT_PHYSMASK; + attr &= ~PFL2_SIZE; + } + else { + phys = *ptep & PAGE_MASK; /* file offset */ + attr = *ptep & ~PAGE_MASK; + attr &= ~PFL2_SIZE; + } for (i = 0; i < PT_ENTRIES; ++i) { pt->entry[i] = (phys + (i * PTL1_SIZE)) | attr; @@ -1008,7 +1026,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base, memobj_flush_page(args->memobj, phys, PTL1_SIZE); } - if (args->free_physical) { + if (!(old & PFL1_FILEOFF) && args->free_physical) { page = phys_to_page(phys); if (page && page_unmap(page)) { ihk_mc_free_pages(phys_to_virt(phys), 1); @@ -1054,7 +1072,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, memobj_flush_page(args->memobj, phys, PTL2_SIZE); } - if (args->free_physical) { + if (!(old & PFL2_FILEOFF) && args->free_physical) { page = phys_to_page(phys); if (page && page_unmap(page)) { ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE); @@ -1148,7 +1166,7 @@ static int change_attr_range_l1(void *arg0, pte_t *ptep, uint64_t base, { struct change_attr_args *args = arg0; - if (*ptep == PTE_NULL) { + if ((*ptep == PTE_NULL) || (*ptep & PFL1_FILEOFF)) { return -ENOENT; } @@ -1163,7 +1181,7 @@ static int change_attr_range_l2(void *arg0, pte_t *ptep, uint64_t base, int error; struct page_table *pt; - if (*ptep == PTE_NULL) { + if ((*ptep == PTE_NULL) || (*ptep & PFL2_FILEOFF)) { return -ENOENT; } @@ -1182,7 +1200,9 @@ static int change_attr_range_l2(void *arg0, pte_t *ptep, uint64_t base, } if (*ptep & PFL2_SIZE) { - *ptep = (*ptep & ~args->clrpte) | args->setpte; + if (!(*ptep & PFL2_FILEOFF)) { + *ptep = (*ptep & ~args->clrpte) | args->setpte; + } return 0; } @@ -1195,7 +1215,7 @@ static int change_attr_range_l3(void *arg0, pte_t *ptep, uint64_t base, { struct page_table *pt; - if (*ptep == PTE_NULL) { + if ((*ptep == PTE_NULL) || (*ptep & PFL3_FILEOFF)) { return -ENOENT; } diff --git a/kernel/process.c b/kernel/process.c index 1c6f75b2..373d3b71 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -1012,7 +1012,8 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); /*****/ ptep = ihk_mc_pt_lookup_pte(vm->page_table, (void *)fault_addr, &pgaddr, &pgsize, &p2align); - if (!(reason & PF_PROT) && ptep && !pte_is_null(ptep)) { + if (!(reason & PF_PROT) && ptep && !pte_is_null(ptep) + && !pte_is_fileoff(ptep, pgsize)) { if (!pte_is_present(ptep)) { error = -EFAULT; kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):PROT_NONE. 
%d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); @@ -1034,11 +1035,16 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang } attr = arch_vrflag_to_ptattr(range->flag, reason, ptep); pgaddr = (void *)(fault_addr & ~(pgsize - 1)); - if (!ptep || pte_is_null(ptep)) { + if (!ptep || pte_is_null(ptep) || pte_is_fileoff(ptep, pgsize)) { if (range->memobj) { off_t off; - off = range->objoff + ((uintptr_t)pgaddr - range->start); + if (!ptep || !pte_is_fileoff(ptep, pgsize)) { + off = range->objoff + ((uintptr_t)pgaddr - range->start); + } + else { + off = pte_get_off(ptep, pgsize); + } error = memobj_get_page(range->memobj, off, p2align, &phys); if (error) { if (error != -ERESTART) { From 90aaf9dc9c59a6f0aedaf6b70b16e781ff8bb4df Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Mon, 23 Jun 2014 18:43:16 +0900 Subject: [PATCH 05/23] support remap_file_pages(2) fixes #21 --- arch/x86/kernel/include/arch-memory.h | 22 +++++++ arch/x86/kernel/include/syscall_list.h | 1 + kernel/include/process.h | 2 + kernel/process.c | 88 ++++++++++++++++++++++++++ kernel/syscall.c | 74 ++++++++++++++++++++++ 5 files changed, 187 insertions(+) diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 9fda8e41..dc75673a 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -185,6 +185,28 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize) return (off_t)(*ptep & PAGE_MASK); } +static inline void pte_make_fileoff(off_t off, + enum ihk_mc_pt_attribute ptattr, size_t pgsize, pte_t *ptep) +{ + uint64_t attr; + + attr = ptattr & ~PAGE_MASK; + + switch (pgsize) { + case PTL1_SIZE: attr |= PFL1_FILEOFF; break; + case PTL2_SIZE: attr |= PFL2_FILEOFF | PFL2_SIZE; break; + case PTL3_SIZE: attr |= PFL3_FILEOFF | PFL3_SIZE; break; + default: +#if 0 /* XXX: workaround. cannot use panic() here */ + panic("pte_make_fileoff"); +#else + attr |= PTATTR_FILEOFF; +#endif + break; + } + *ptep = (off & PAGE_MASK) | attr; +} + #if 0 /* XXX: workaround. 
cannot use panic() here */ static inline void pte_xchg(pte_t *ptep, pte_t *valp) { diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h index 77f8b6ec..cb246049 100644 --- a/arch/x86/kernel/include/syscall_list.h +++ b/arch/x86/kernel/include/syscall_list.h @@ -75,6 +75,7 @@ SYSCALL_DELEGATED(201, time) SYSCALL_HANDLED(202, futex) SYSCALL_HANDLED(203, sched_setaffinity) SYSCALL_HANDLED(204, sched_getaffinity) +SYSCALL_HANDLED(216, remap_file_pages) SYSCALL_DELEGATED(217, getdents64) SYSCALL_HANDLED(218, set_tid_address) SYSCALL_HANDLED(231, exit_group) diff --git a/kernel/include/process.h b/kernel/include/process.h index 5bc99fea..2688c887 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -228,6 +228,8 @@ int join_process_memory_range(struct process *process, struct vm_range *survivin int change_prot_process_memory_range( struct process *process, struct vm_range *range, unsigned long newflag); +int remap_process_memory_range(struct process_vm *vm, struct vm_range *range, + uintptr_t start, uintptr_t end, off_t off); struct vm_range *lookup_process_memory_range( struct process_vm *vm, uintptr_t start, uintptr_t end); struct vm_range *next_process_memory_range( diff --git a/kernel/process.c b/kernel/process.c index 373d3b71..174a1181 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -997,6 +997,94 @@ out: return error; } +struct rfp_args { + off_t off; + uintptr_t start; + struct memobj *memobj; +}; + +static int remap_one_page(void *arg0, page_table_t pt, pte_t *ptep, + void *pgaddr, size_t pgsize) +{ + struct rfp_args * const args = arg0; + int error; + off_t off; + pte_t apte; + uintptr_t phys; + struct page *page; + + dkprintf("remap_one_page(%p,%p,%p %#lx,%p,%#lx)\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize); + + /* XXX: NYI: large pages */ + if (pgsize != PAGE_SIZE) { + error = -E2BIG; + ekprintf("remap_one_page(%p,%p,%p %#lx,%p,%#lx):%d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + goto out; + } + + off = args->off + ((uintptr_t)pgaddr - args->start); + pte_make_fileoff(off, 0, pgsize, &apte); + + pte_xchg(ptep, &apte); + flush_tlb_single((uintptr_t)pgaddr); /* XXX: TLB flush */ + + if (pte_is_null(&apte) || pte_is_fileoff(&apte, pgsize)) { + error = 0; + goto out; + } + phys = pte_get_phys(&apte); + + if (pte_is_dirty(&apte, pgsize)) { + memobj_flush_page(args->memobj, phys, pgsize); /* XXX: in lock period */ + } + + page = phys_to_page(phys); + if (page && page_unmap(page)) { + ihk_mc_free_pages(phys_to_virt(phys), pgsize/PAGE_SIZE); + } + + error = 0; +out: + dkprintf("remap_one_page(%p,%p,%p %#lx,%p,%#lx): %d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + return error; +} + +int remap_process_memory_range(struct process_vm *vm, struct vm_range *range, + uintptr_t start, uintptr_t end, off_t off) +{ + struct rfp_args args; + int error; + + dkprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx)\n", + vm, range, start, end, off); + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + memobj_lock(range->memobj); + + args.start = start; + args.off = off; + args.memobj = range->memobj; + + error = visit_pte_range(vm->page_table, (void *)start, + (void *)end, VPTEF_DEFAULT, &remap_one_page, &args); + if (error) { + ekprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx):" + "visit pte failed %d\n", + vm, range, start, end, off, error); + goto out; + } + + error = 0; +out: + memobj_unlock(range->memobj); + ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + 
dkprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx):%d\n", + vm, range, start, end, off, error); + return error; +} + static int page_fault_process_memory_range(struct process_vm *vm, struct vm_range *range, uintptr_t fault_addr, uint64_t reason) { int error; diff --git a/kernel/syscall.c b/kernel/syscall.c index f2ccc583..6d86ff3d 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -2271,6 +2271,80 @@ out2: return error; } +SYSCALL_DECLARE(remap_file_pages) +{ + const uintptr_t start0 = ihk_mc_syscall_arg0(ctx); + const size_t size = ihk_mc_syscall_arg1(ctx); + const int prot = ihk_mc_syscall_arg2(ctx); + const size_t pgoff = ihk_mc_syscall_arg3(ctx); + const int flags = ihk_mc_syscall_arg4(ctx); + int error; + const uintptr_t start = start0 & PAGE_MASK; + const uintptr_t end = start + size; + const off_t off = (off_t)pgoff << PAGE_SHIFT; + struct process * const proc = cpu_local_var(current); + struct vm_range *range; + int er; + int need_populate = 0; + + dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x)\n", + start0, size, prot, pgoff, flags); + ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); +#define PGOFF_LIMIT ((off_t)1 << ((8*sizeof(off_t) - 1) - PAGE_SHIFT)) + if ((size <= 0) || (size & (PAGE_SIZE - 1)) || (prot != 0) + || (pgoff < 0) || (PGOFF_LIMIT <= pgoff) + || ((PGOFF_LIMIT - pgoff) < (size / PAGE_SIZE)) + || !((start < end) || (end == 0))) { + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "invalid args\n", + start0, size, prot, pgoff, flags); + error = -EINVAL; + goto out; + } + + range = lookup_process_memory_range(proc->vm, start, end); + if (!range || (start < range->start) || (range->end < end) + || (range->flag & VR_PRIVATE) + || !range->memobj) { + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "invalid VMR:[%#lx-%#lx) %#lx %p\n", + start0, size, prot, pgoff, flags, + range?range->start:0, range?range->end:0, + range?range->flag:0, range?range->memobj:NULL); + error = -EINVAL; + goto out; + } + + error = remap_process_memory_range(proc->vm, range, start, end, off); + if (error) { + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "remap failed %d\n", + start0, size, prot, pgoff, flags, error); + goto out; + } + clear_host_pte(start, size); /* XXX: workaround */ + + if (range->flag & VR_LOCKED) { + need_populate = 1; + } + error = 0; +out: + ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + + if (need_populate + && (er = populate_process_memory( + proc, (void *)start, size))) { + ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" + "populate failed %d\n", + start0, size, prot, pgoff, flags, er); + /* ignore populate error */ + } + + dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x): %d\n", + start0, size, prot, pgoff, flags, error); + return error; +} + #ifdef DCFA_KMOD #ifdef CMD_DCFA From 11bb334bd4661478647db58799baa1e3ffb05048 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Tue, 1 Jul 2014 18:52:31 +0900 Subject: [PATCH 06/23] support mremap(2) fixes #22 --- arch/x86/kernel/include/syscall_list.h | 1 + arch/x86/kernel/memory.c | 70 ++++++++ kernel/include/mman.h | 6 + kernel/include/process.h | 3 + kernel/process.c | 33 ++++ kernel/syscall.c | 221 +++++++++++++++++++++++++ lib/include/ihk/mm.h | 1 + 7 files changed, 335 insertions(+) diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h index cb246049..23209802 100644 --- a/arch/x86/kernel/include/syscall_list.h +++ b/arch/x86/kernel/include/syscall_list.h @@ -40,6 +40,7 @@ 
SYSCALL_DELEGATED(18, pwrite64) SYSCALL_DELEGATED(20, writev) SYSCALL_DELEGATED(21, access) SYSCALL_HANDLED(24, sched_yield) +SYSCALL_HANDLED(25, mremap) SYSCALL_HANDLED(28, madvise) SYSCALL_HANDLED(34, pause) SYSCALL_HANDLED(39, getpid) diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index d1118c00..ac5cb12a 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -1802,6 +1802,76 @@ enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag, uint64_t faul return attr; } +struct move_args { + uintptr_t src; + uintptr_t dest; +}; + +static int move_one_page(void *arg0, page_table_t pt, pte_t *ptep, void *pgaddr, size_t pgsize) +{ + int error; + struct move_args *args = arg0; + uintptr_t dest; + pte_t apte; + uintptr_t phys; + enum ihk_mc_pt_attribute attr; + + dkprintf("move_one_page(%p,%p,%p %#lx,%p,%#lx)\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize); + if (pte_is_fileoff(ptep, pgsize)) { + error = -ENOTSUPP; + kprintf("move_one_page(%p,%p,%p %#lx,%p,%#lx):fileoff. %d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + goto out; + } + + dest = args->dest + ((uintptr_t)pgaddr - args->src); + + apte = PTE_NULL; + pte_xchg(ptep, &apte); + + phys = apte & PT_PHYSMASK; + attr = apte & ~PT_PHYSMASK; + + error = ihk_mc_pt_set_range(pt, (void *)dest, + (void *)(dest + pgsize), phys, attr); + if (error) { + kprintf("move_one_page(%p,%p,%p %#lx,%p,%#lx):" + "set failed. %d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + goto out; + } + + error = 0; +out: + dkprintf("move_one_page(%p,%p,%p %#lx,%p,%#lx):%d\n", + arg0, pt, ptep, *ptep, pgaddr, pgsize, error); + return error; +} + +int move_pte_range(page_table_t pt, void *src, void *dest, size_t size) +{ + int error; + struct move_args args; + + dkprintf("move_pte_range(%p,%p,%p,%#lx)\n", pt, src, dest, size); + args.src = (uintptr_t)src; + args.dest = (uintptr_t)dest; + + error = visit_pte_range(pt, src, src+size, VPTEF_SKIP_NULL, + &move_one_page, &args); + flush_tlb(); /* XXX: TLB flush */ + if (error) { + goto out; + } + + error = 0; +out: + dkprintf("move_pte_range(%p,%p,%p,%#lx):%d\n", + pt, src, dest, size, error); + return error; +} + void load_page_table(struct page_table *pt) { unsigned long pt_addr; diff --git a/kernel/include/mman.h b/kernel/include/mman.h index c29ef8da..b7555eef 100644 --- a/kernel/include/mman.h +++ b/kernel/include/mman.h @@ -63,4 +63,10 @@ #define MADV_HWPOISON 100 #define MADV_SOFT_OFFLINE 101 +/* + * for mremap() + */ +#define MREMAP_MAYMOVE 0x01 +#define MREMAP_FIXED 0x02 + #endif /* HEADER_MMAN_H */ diff --git a/kernel/include/process.h b/kernel/include/process.h index 2688c887..0011a68e 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -29,6 +29,7 @@ #define VR_DEMAND_PAGING 0x1000 #define VR_PRIVATE 0x2000 #define VR_LOCKED 0x4000 +#define VR_FILEOFF 0x8000 /* remap_file_pages()ed range */ #define VR_PROT_NONE 0x00000000 #define VR_PROT_READ 0x00010000 #define VR_PROT_WRITE 0x00020000 @@ -236,6 +237,8 @@ struct vm_range *next_process_memory_range( struct process_vm *vm, struct vm_range *range); struct vm_range *previous_process_memory_range( struct process_vm *vm, struct vm_range *range); +int extend_up_process_memory_range(struct process_vm *vm, + struct vm_range *range, uintptr_t newend); int page_fault_process(struct process *proc, void *fault_addr, uint64_t reason); int remove_process_region(struct process *proc, diff --git a/kernel/process.c b/kernel/process.c index 174a1181..18ce28d9 100644 --- a/kernel/process.c +++ b/kernel/process.c 
@@ -936,6 +936,39 @@ struct vm_range *previous_process_memory_range( return prev; } +int extend_up_process_memory_range(struct process_vm *vm, + struct vm_range *range, uintptr_t newend) +{ + int error; + struct vm_range *next; + + dkprintf("exntend_up_process_memory_range(%p,%p %#lx-%#lx,%#lx)\n", + vm, range, range->start, range->end, newend); + if (newend <= range->end) { + error = -EINVAL; + goto out; + } + + if (vm->region.user_end < newend) { + error = -EPERM; + goto out; + } + + next = next_process_memory_range(vm ,range); + if (next && (next->start < newend)) { + error = -ENOMEM; + goto out; + } + + error = 0; + range->end = newend; + +out: + dkprintf("exntend_up_process_memory_range(%p,%p %#lx-%#lx,%#lx):%d\n", + vm, range, range->start, range->end, newend, error); + return error; +} + int change_prot_process_memory_range(struct process *proc, struct vm_range *range, unsigned long protflag) { diff --git a/kernel/syscall.c b/kernel/syscall.c index 6d86ff3d..5fbf73ba 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -2315,6 +2315,7 @@ SYSCALL_DECLARE(remap_file_pages) goto out; } + range->flag |= VR_FILEOFF; error = remap_process_memory_range(proc->vm, range, start, end, off); if (error) { ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" @@ -2345,6 +2346,226 @@ out: return error; } +SYSCALL_DECLARE(mremap) +{ + const uintptr_t oldaddr = ihk_mc_syscall_arg0(ctx); + const size_t oldsize0 = ihk_mc_syscall_arg1(ctx); + const size_t newsize0 = ihk_mc_syscall_arg2(ctx); + const int flags = ihk_mc_syscall_arg3(ctx); + const uintptr_t newaddr = ihk_mc_syscall_arg4(ctx); + const ssize_t oldsize = (oldsize0 + PAGE_SIZE - 1) & PAGE_MASK; + const ssize_t newsize = (newsize0 + PAGE_SIZE - 1) & PAGE_MASK; + const uintptr_t oldstart = oldaddr; + const uintptr_t oldend = oldstart + oldsize; + struct process *proc = cpu_local_var(current); + struct process_vm *vm = proc->vm; + int error; + struct vm_range *range; + int need_relocate; + uintptr_t newstart; + uintptr_t newend; + size_t size; + uintptr_t ret; + uintptr_t lckstart = -1; + uintptr_t lckend = -1; + + dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx)\n", + oldaddr, oldsize0, newsize0, flags, newaddr); + ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); + + if ((oldaddr & ~PAGE_MASK) + || (oldsize < 0) + || (newsize <= 0) + || (flags & ~(MREMAP_MAYMOVE | MREMAP_FIXED)) + || ((flags & MREMAP_FIXED) + && !(flags & MREMAP_MAYMOVE)) + || ((flags & MREMAP_FIXED) + && (newaddr & ~PAGE_MASK))) { + error = -EINVAL; + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):invalid. %d\n", + oldaddr, oldsize0, newsize0, flags, newaddr, + error); + goto out; + } + + /* check original mapping */ + range = lookup_process_memory_range(vm, oldstart, oldstart+PAGE_SIZE); + if (!range || (oldstart < range->start) || (range->end < oldend) + || (range->flag & (VR_FILEOFF)) + || (range->flag & (VR_REMOTE|VR_IO_NOCACHE|VR_RESERVED))) { + error = -EFAULT; + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "lookup failed. %d %p %#lx-%#lx %#lx\n", + oldaddr, oldsize0, newsize0, flags, newaddr, + error, range, range?range->start:0, + range?range->end:0, range?range->flag:0); + goto out; + } + + if (oldend < oldstart) { + error = -EINVAL; + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "old range overflow. 
%d\n", + oldaddr, oldsize0, newsize0, flags, newaddr, + error); + goto out; + } + + /* determine new mapping range */ + need_relocate = 0; + if (flags & MREMAP_FIXED) { + need_relocate = 1; + newstart = newaddr; + newend = newstart + newsize; + if (newstart < vm->region.user_start) { + error = -EPERM; + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "mmap_min_addr %#lx. %d\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, vm->region.user_start, + error); + goto out; + } + if ((newstart < oldend) && (oldstart < newend)) { + error = -EINVAL; + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "fixed:overlapped. %d\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, error); + goto out; + } + } + else if (!(flags & MREMAP_FIXED) && (oldsize < newsize)) { + if (oldend == range->end) { + newstart = oldstart; + newend = newstart + newsize; + error = extend_up_process_memory_range(vm, range, + newend); + if (!error) { + if (range->flag & VR_LOCKED) { + lckstart = oldend; + lckend = newend; + } + goto out; + } + } + if (!(flags & MREMAP_MAYMOVE)) { + error = -ENOMEM; + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "cannot relocate. %d\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, error); + goto out; + } + need_relocate = 1; + error = search_free_space(newsize, vm->region.map_end, + (intptr_t *)&newstart); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "search failed. %d\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, error); + goto out; + } + newend = newstart + newsize; + } + else { + newstart = oldstart; + newend = newstart + newsize; + } + + /* do the remap */ + if (need_relocate) { + if (flags & MREMAP_FIXED) { + error = do_munmap((void *)newstart, newsize); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "fixed:munmap failed. %d\n", + oldaddr, oldsize0, newsize0, + flags, newaddr, error); + goto out; + } + } + if (range->memobj) { + memobj_ref(range->memobj); + } + error = add_process_memory_range(proc, newstart, newend, -1, + range->flag, range->memobj, + range->objoff + (oldstart - range->start)); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "add failed. %d\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, error); + if (range->memobj) { + memobj_release(range->memobj); + } + goto out; + } + if (range->flag & VR_LOCKED) { + lckstart = newstart; + lckend = newend; + } + + if (oldsize > 0) { + size = (oldsize < newsize)? oldsize: newsize; + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); + error = move_pte_range(vm->page_table, + (void *)oldstart, (void *)newstart, + size); + ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "move failed. %d\n", + oldaddr, oldsize0, newsize0, + flags, newaddr, error); + goto out; + } + + error = do_munmap((void *)oldstart, oldsize); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "relocate:munmap failed. %d\n", + oldaddr, oldsize0, newsize0, + flags, newaddr, error); + goto out; + } + } + } + else if (newsize < oldsize) { + error = do_munmap((void *)newend, (oldend - newend)); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "shrink:munmap failed. 
%d\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, error); + goto out; + } + } + else { + /* nothing to do */ + } + + error = 0; +out: + ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); + if (!error && (lckstart < lckend)) { + error = populate_process_memory(proc, (void *)lckstart, (lckend - lckstart)); + if (error) { + ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" + "populate failed. %d %#lx-%#lx\n", + oldaddr, oldsize0, newsize0, flags, + newaddr, error, lckstart, lckend); + error = 0; /* ignore error */ + } + } + ret = (error)? error: newstart; + dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):%d %#lx\n", + oldaddr, oldsize0, newsize0, flags, newaddr, error, + ret); + return ret; +} + #ifdef DCFA_KMOD #ifdef CMD_DCFA diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h index c8970f5a..8f3e57a0 100644 --- a/lib/include/ihk/mm.h +++ b/lib/include/ihk/mm.h @@ -133,6 +133,7 @@ typedef int pte_visitor_t(void *arg, page_table_t pt, pte_t *ptep, void *pgaddr, size_t pgsize); int visit_pte_range(page_table_t pt, void *start, void *end, enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg); +int move_pte_range(page_table_t pt, void *src, void *dest, size_t size); struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag); /* XXX: proper use of struct page_table and page_table_t is unknown */ From b2ca24d013417b3cee7b6641d4e013f8c1ee5b7a Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Wed, 2 Jul 2014 15:17:21 +0900 Subject: [PATCH 07/23] remap_file_pages: reject special ranges special ranges are: - VR_REMOTE - VR_IO_NOCACHE - VR_RESERVED --- kernel/syscall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/syscall.c b/kernel/syscall.c index 5fbf73ba..5fb5cc13 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -2305,6 +2305,7 @@ SYSCALL_DECLARE(remap_file_pages) range = lookup_process_memory_range(proc->vm, start, end); if (!range || (start < range->start) || (range->end < end) || (range->flag & VR_PRIVATE) + || (range->flag & (VR_REMOTE|VR_IO_NOCACHE|VR_RESERVED)) || !range->memobj) { ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" "invalid VMR:[%#lx-%#lx) %#lx %p\n", From a8a226a443419329516a55ce688619f7a64731e1 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Wed, 2 Jul 2014 19:58:13 +0900 Subject: [PATCH 08/23] use PF_POPULATE for resolving delegated page faults Since a host side PTE does not follow McKernel's copying a COW page, COW pages cannot be used for resolving delegated page faults. Therefore, to copy pages eagerly, PF_POPULATE should be used. 
--- kernel/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/syscall.c b/kernel/syscall.c index 5fb5cc13..ad813f62 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -209,7 +209,7 @@ long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, cpu_local_var(current)->pid); error = page_fault_process(get_cpu_local_var(cpu)->current, (void *)res->fault_address, - res->fault_reason); + res->fault_reason|PF_POPULATE); /* send result */ req2.number = __NR_mmap; From d59628e131cebfaeb517c7de390539269c8e14ee Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Wed, 2 Jul 2014 20:01:05 +0900 Subject: [PATCH 09/23] fix debug prints to avoid NULL dereferences --- kernel/syscall.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/syscall.c b/kernel/syscall.c index ad813f62..6856ab12 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -1702,8 +1702,8 @@ SYSCALL_DECLARE(madvise) dkprintf("[%d]sys_madvise(%lx,%lx,%x):not contig " "%lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start, - len0, advice, addr, range->start, - range->end); + len0, advice, addr, range?range->start:0, + range?range->end:0); error = -ENOMEM; goto out; } @@ -2035,7 +2035,8 @@ SYSCALL_DECLARE(mlock) dkprintf("[%d]sys_mlock(%lx,%lx):not contiguous." " %lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start0, - len0, addr, range->start, range->end); + len0, addr, range?range->start:0, + range?range->end:0); error = -ENOMEM; goto out; } @@ -2209,7 +2210,8 @@ SYSCALL_DECLARE(munlock) dkprintf("[%d]sys_munlock(%lx,%lx):not contiguous." " %lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start0, - len0, addr, range->start, range->end); + len0, addr, range?range->start:0, + range?range->end:0); error = -ENOMEM; goto out; } From 9efb5e4fc5be2300acc88f4c0225110e165362b3 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Thu, 3 Jul 2014 12:06:29 +0900 Subject: [PATCH 10/23] add memobj_has_pager() --- kernel/fileobj.c | 1 + kernel/include/memobj.h | 12 ++++++++++++ kernel/syscall.c | 5 ++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/fileobj.c b/kernel/fileobj.c index 18a27046..df83646d 100644 --- a/kernel/fileobj.c +++ b/kernel/fileobj.c @@ -184,6 +184,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp) memset(newobj, 0, sizeof(*newobj)); newobj->memobj.ops = &fileobj_ops; + newobj->memobj.flags = MF_HAS_PAGER; newobj->handle = result.handle; newobj->sref = 1; newobj->cref = 1; diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h index c846ad09..7a7e99c3 100644 --- a/kernel/include/memobj.h +++ b/kernel/include/memobj.h @@ -18,8 +18,15 @@ #include #include +enum { + /* for memobj.flags */ + MF_HAS_PAGER = 0x0001, +}; + struct memobj { struct memobj_ops * ops; + uint32_t flags; + int8_t padding[4]; ihk_spinlock_t lock; }; @@ -74,6 +81,11 @@ static inline void memobj_unlock(struct memobj *obj) ihk_mc_spinlock_unlock_noirq(&obj->lock); } +static inline int memobj_has_pager(struct memobj *obj) +{ + return !!(obj->flags & MF_HAS_PAGER); +} + int fileobj_create(int fd, struct memobj **objp, int *maxprotp); #endif /* HEADER_MEMOBJ_H */ diff --git a/kernel/syscall.c b/kernel/syscall.c index 6856ab12..8fb35986 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -1708,9 +1708,8 @@ SYSCALL_DECLARE(madvise) goto out; } -#define MEMOBJ_IS_FILEOBJ(obj) ((obj) != NULL) - if (!MEMOBJ_IS_FILEOBJ(range->memobj)) { - dkprintf("[%d]sys_madvise(%lx,%lx,%x):not fileobj " + if (!range->memobj || !memobj_has_pager(range->memobj)) { + 
dkprintf("[%d]sys_madvise(%lx,%lx,%x):has not pager" "[%lx-%lx) %lx\n", ihk_mc_get_processor_id(), start, len0, advice, range->start, From 9057268f0eaa9a3787fba39f2d6525d3e65c33c0 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Thu, 3 Jul 2014 12:11:02 +0900 Subject: [PATCH 11/23] add memobj's default action --- kernel/include/memobj.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h index 7a7e99c3..35350c60 100644 --- a/kernel/include/memobj.h +++ b/kernel/include/memobj.h @@ -16,6 +16,7 @@ #include #include #include +#include #include enum { @@ -46,29 +47,42 @@ struct memobj_ops { static inline void memobj_release(struct memobj *obj) { - (*obj->ops->release)(obj); + if (obj->ops->release) { + (*obj->ops->release)(obj); + } } static inline void memobj_ref(struct memobj *obj) { - (*obj->ops->ref)(obj); + if (obj->ops->ref) { + (*obj->ops->ref)(obj); + } } static inline int memobj_get_page(struct memobj *obj, off_t off, int p2align, uintptr_t *physp) { - return (*obj->ops->get_page)(obj, off, p2align, physp); + if (obj->ops->get_page) { + return (*obj->ops->get_page)(obj, off, p2align, physp); + } + return -ENXIO; } static inline uintptr_t memobj_copy_page(struct memobj *obj, uintptr_t orgphys, int p2align) { - return (*obj->ops->copy_page)(obj, orgphys, p2align); + if (obj->ops->copy_page) { + return (*obj->ops->copy_page)(obj, orgphys, p2align); + } + return -ENXIO; } static inline int memobj_flush_page(struct memobj *obj, uintptr_t phys, size_t pgsize) { - return (*obj->ops->flush_page)(obj, phys, pgsize); + if (obj->ops->flush_page) { + return (*obj->ops->flush_page)(obj, phys, pgsize); + } + return 0; } static inline void memobj_lock(struct memobj *obj) From 380fcbda7368689d70c78ed1650e7330546c6f68 Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Thu, 3 Jul 2014 13:55:26 +0900 Subject: [PATCH 12/23] add shmobj for shared anonymous mappings --- kernel/Makefile.build | 2 +- kernel/include/memobj.h | 2 + kernel/include/shm.h | 49 +++++++ kernel/shmobj.c | 287 ++++++++++++++++++++++++++++++++++++++++ kernel/syscall.c | 26 +++- 5 files changed, 359 insertions(+), 7 deletions(-) create mode 100644 kernel/include/shm.h create mode 100644 kernel/shmobj.c diff --git a/kernel/Makefile.build b/kernel/Makefile.build index 1a7f6438..ded7f58d 100644 --- a/kernel/Makefile.build +++ b/kernel/Makefile.build @@ -1,6 +1,6 @@ IHKDIR=$(IHKBASE)/$(TARGETDIR) OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o -OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o +OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o DEPSRCS=$(wildcard $(SRC)/*.c) CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h index 35350c60..0ba9c838 100644 --- a/kernel/include/memobj.h +++ b/kernel/include/memobj.h @@ -18,6 +18,7 @@ #include #include #include +#include enum { /* for memobj.flags */ @@ -101,5 +102,6 @@ static inline int memobj_has_pager(struct memobj *obj) } int fileobj_create(int fd, struct memobj **objp, int *maxprotp); +int shmobj_create(struct shmid_ds *ds, struct memobj **objp); #endif /* HEADER_MEMOBJ_H */ diff --git a/kernel/include/shm.h b/kernel/include/shm.h new file mode 100644 index 00000000..3117aba5 --- /dev/null +++ b/kernel/include/shm.h @@ -0,0 +1,49 @@ +/** + * \file shm.h + * License details are found in the file LICENSE. 
+ * \brief + * header file for System V shared memory + * \author Gou Nakamura + */ +/* + * HISTORY: + */ + +#ifndef HEADER_SHM_H +#define HEADER_SHM_H + +/* begin types.h */ +typedef int32_t key_t; +typedef uint32_t uid_t; +typedef uint32_t gid_t; +typedef int64_t time_t; +typedef int32_t pid_t; +/* end types.h */ + +typedef uint64_t shmatt_t; + +struct ipc_perm { + key_t key; + uid_t uid; + gid_t gid; + uid_t cuid; + gid_t cgid; + uint16_t mode; + uint8_t padding[2]; + uint16_t seq; + uint8_t padding2[22]; +}; + +struct shmid_ds { + struct ipc_perm shm_perm; + size_t shm_segsz; + time_t shm_atime; + time_t shm_dtime; + time_t shm_ctime; + pid_t shm_cpid; + pid_t shm_lpid; + shmatt_t shm_nattch; + uint8_t padding[16]; +}; + +#endif /* HEADER_SHM_H */ diff --git a/kernel/shmobj.c b/kernel/shmobj.c new file mode 100644 index 00000000..b0c03942 --- /dev/null +++ b/kernel/shmobj.c @@ -0,0 +1,287 @@ +/** + * \file shmobj.c + * License details are found in the file LICENSE. + * \brief + * shared memory object + * \author Gou Nakamura + */ +/* + * HISTORY: + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0) +#define ekprintf(...) kprintf(__VA_ARGS__) +#define fkprintf(...) kprintf(__VA_ARGS__) + +struct shmobj { + struct memobj memobj; /* must be first */ + long ref; + struct shmid_ds ds; + struct list_head page_list; +}; + +static memobj_release_func_t shmobj_release; +static memobj_ref_func_t shmobj_ref; +static memobj_get_page_func_t shmobj_get_page; + +static struct memobj_ops shmobj_ops = { + .release = &shmobj_release, + .ref = &shmobj_ref, + .get_page = &shmobj_get_page, +}; + +static struct shmobj *to_shmobj(struct memobj *memobj) +{ + return (struct shmobj *)memobj; +} + +static struct memobj *to_memobj(struct shmobj *shmobj) +{ + return &shmobj->memobj; +} + +/*********************************************************************** + * page_list + */ +static void page_list_init(struct shmobj *obj) +{ + INIT_LIST_HEAD(&obj->page_list); + return; +} + +static void page_list_insert(struct shmobj *obj, struct page *page) +{ + list_add(&page->list, &obj->page_list); + return; +} + +static void page_list_remove(struct shmobj *obj, struct page *page) +{ + list_del(&page->list); + return; +} + +static struct page *page_list_lookup(struct shmobj *obj, off_t off) +{ + struct page *page; + + list_for_each_entry(page, &obj->page_list, list) { + if (page->offset == off) { + goto out; + } + } + page = NULL; + +out: + return page; +} + +static struct page *page_list_first(struct shmobj *obj) +{ + if (list_empty(&obj->page_list)) { + return NULL; + } + + return list_first_entry(&obj->page_list, struct page, list); +} + +int shmobj_create(struct shmid_ds *ds, struct memobj **objp) +{ + struct shmobj *obj = NULL; + int error; + + dkprintf("shmobj_create(%p %#lx,%p)\n", ds, ds->shm_segsz, objp); + obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT); + if (!obj) { + error = -ENOMEM; + ekprintf("shmobj_create(%p %#lx,%p):kmalloc failed. 
%d\n", + ds, ds->shm_segsz, objp, error); + goto out; + } + + memset(obj, 0, sizeof(*obj)); + obj->memobj.ops = &shmobj_ops; + obj->ref = 1; + obj->ds = *ds; + page_list_init(obj); + ihk_mc_spinlock_init(&obj->memobj.lock); + + error = 0; + *objp = to_memobj(obj); + obj = NULL; + +out: + if (obj) { + kfree(obj); + } + dkprintf("shmobj_create(%p %#lx,%p):%d %p\n", + ds, ds->shm_segsz, objp, error, *objp); + return error; +} + +static void shmobj_release(struct memobj *memobj) +{ + struct shmobj *obj = to_shmobj(memobj); + struct shmobj *freeobj = NULL; + + dkprintf("shmobj_release(%p)\n", memobj); + memobj_lock(&obj->memobj); + --obj->ref; + if (obj->ref <= 0) { + if (obj->ref < 0) { + fkprintf("shmobj_release(%p):ref %ld\n", + memobj, obj->ref); + panic("shmobj_release:freeing free shmobj"); + } + freeobj = obj; + } + memobj_unlock(&obj->memobj); + + if (freeobj) { + /* zap page_list */ + for (;;) { + struct page *page; + int count; + + page = page_list_first(obj); + if (!page) { + break; + } + page_list_remove(obj, page); + + dkprintf("shmobj_release(%p):" + "release page. %p %#lx %d %d", + memobj, page, page_to_phys(page), + page->mode, page->count); + count = ihk_atomic_sub_return(1, &page->count); + if (!((page->mode == PM_MAPPED) && (count == 0))) { + fkprintf("shmobj_release(%p): " + "page %p phys %#lx mode %#x" + " count %d off %#lx\n", + memobj, page, + page_to_phys(page), + page->mode, count, + page->offset); + panic("shmobj_release"); + } + + /* XXX:NYI: large pages */ + page->mode = PM_NONE; + free_pages(phys_to_virt(page_to_phys(page)), 1); + } + dkprintf("shmobj_release(%p):free shmobj", memobj); + kfree(freeobj); + } + dkprintf("shmobj_release(%p):\n", memobj); + return; +} + +static void shmobj_ref(struct memobj *memobj) +{ + struct shmobj *obj = to_shmobj(memobj); + long newref; + + dkprintf("shmobj_ref(%p)\n", memobj); + memobj_lock(&obj->memobj); + newref = ++obj->ref; + memobj_unlock(&obj->memobj); + dkprintf("shmobj_ref(%p): newref %ld\n", memobj, newref); + return; +} + +static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align, + uintptr_t *physp) +{ + struct shmobj *obj = to_shmobj(memobj); + int error; + struct page *page; + int npages; + void *virt = NULL; + uintptr_t phys = -1; + + dkprintf("shmobj_get_page(%p,%#lx,%d,%p)\n", + memobj, off, p2align, physp); + memobj_lock(&obj->memobj); + if (off & ~PAGE_MASK) { + error = -EINVAL; + ekprintf("shmobj_get_page(%p,%#lx,%d,%p):invalid argument. %d\n", + memobj, off, p2align, physp, error); + goto out; + } + if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ + error = -ENOMEM; + ekprintf("shmobj_get_page(%p,%#lx,%d,%p):large page. %d\n", + memobj, off, p2align, physp, error); + goto out; + } + if (obj->ds.shm_segsz <= off) { + error = -ERANGE; + ekprintf("shmobj_get_page(%p,%#lx,%d,%p):beyond the end. %d\n", + memobj, off, p2align, physp, error); + goto out; + } + if ((obj->ds.shm_segsz - off) < (PAGE_SIZE << p2align)) { + error = -ENOSPC; + ekprintf("shmobj_get_page(%p,%#lx,%d,%p):too large. %d\n", + memobj, off, p2align, physp, error); + goto out; + } + + page = page_list_lookup(obj, off); + if (!page) { + npages = 1 << p2align; + virt = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT); + if (!virt) { + error = -ENOMEM; + ekprintf("shmobj_get_page(%p,%#lx,%d,%p):" + "alloc failed. 
%d\n", + memobj, off, p2align, physp, error); + goto out; + } + phys = virt_to_phys(virt); + page = phys_to_page(phys); + if (page->mode != PM_NONE) { + fkprintf("shmobj_get_page(%p,%#lx,%d,%p):" + "page %p %#lx %d %d %#lx\n", + memobj, off, p2align, physp, + page, page_to_phys(page), page->mode, + page->count, page->offset); + panic("shmobj_get_page()"); + } + memset(virt, 0, npages*PAGE_SIZE); + page->mode = PM_MAPPED; + page->offset = off; + ihk_atomic_set(&page->count, 1); + page_list_insert(obj, page); + virt = NULL; + dkprintf("shmobj_get_page(%p,%#lx,%d,%p):alloc page. %p %#lx\n", + memobj, off, p2align, physp, page, phys); + } + + ihk_atomic_inc(&page->count); + + error = 0; + *physp = page_to_phys(page); + +out: + memobj_unlock(&obj->memobj); + if (virt) { + ihk_mc_free_pages(virt, npages); + } + dkprintf("shmobj_get_page(%p,%#lx,%d,%p):%d\n", + memobj, off, p2align, physp, error); + return error; +} diff --git a/kernel/syscall.c b/kernel/syscall.c index 8fb35986..3187f35e 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -45,6 +45,7 @@ #include #include #include +#include /* Headers taken from kitten LWK */ #include @@ -630,12 +631,13 @@ SYSCALL_DECLARE(mmap) const int prot = ihk_mc_syscall_arg2(ctx); const int flags = ihk_mc_syscall_arg3(ctx); const int fd = ihk_mc_syscall_arg4(ctx); - const off_t off = ihk_mc_syscall_arg5(ctx); + const off_t off0 = ihk_mc_syscall_arg5(ctx); struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; intptr_t addr; size_t len; + off_t off; int error; intptr_t npages; int p2align; @@ -646,10 +648,11 @@ SYSCALL_DECLARE(mmap) int maxprot; int denied; int ro_vma_mapped = 0; + struct shmid_ds ads; dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n", ihk_mc_get_processor_id(), - addr0, len0, prot, flags, fd, off); + addr0, len0, prot, flags, fd, off0); /* check constants for flags */ if (1) { @@ -681,9 +684,9 @@ SYSCALL_DECLARE(mmap) || ((region->user_end - len) < addr) || !(flags & (MAP_SHARED | MAP_PRIVATE)) || ((flags & MAP_SHARED) && (flags & MAP_PRIVATE)) - || (off & (PAGE_SIZE - 1))) { + || (off0 & (PAGE_SIZE - 1))) { ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n", - addr0, len0, prot, flags, fd, off); + addr0, len0, prot, flags, fd, off0); error = -EINVAL; goto out2; } @@ -692,7 +695,7 @@ SYSCALL_DECLARE(mmap) if ((flags & error_flags) || (flags & ~(supported_flags | ignored_flags))) { ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):unknown flags %x\n", - addr0, len0, prot, flags, fd, off, + addr0, len0, prot, flags, fd, off0, (flags & ~(supported_flags | ignored_flags))); error = -EINVAL; goto out2; @@ -754,8 +757,10 @@ SYSCALL_DECLARE(mmap) } phys = 0; + off = 0; maxprot = PROT_READ | PROT_WRITE | PROT_EXEC; if (!(flags & MAP_ANONYMOUS)) { + off = off0; error = fileobj_create(fd, &memobj, &maxprot); if (error) { ekprintf("sys_mmap:fileobj_create failed. %d\n", error); @@ -781,6 +786,15 @@ SYSCALL_DECLARE(mmap) } phys = virt_to_phys(p); } + else if (flags & MAP_SHARED) { + memset(&ads, 0, sizeof(ads)); + ads.shm_segsz = len; + error = shmobj_create(&ads, &memobj); + if (error) { + ekprintf("sys_mmap:shmobj_create failed. %d\n", error); + goto out; + } + } if ((flags & MAP_PRIVATE) && (maxprot & PROT_READ)) { maxprot |= PROT_WRITE; @@ -844,7 +858,7 @@ out2: } dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n", ihk_mc_get_processor_id(), - addr0, len0, prot, flags, fd, off, error, addr); + addr0, len0, prot, flags, fd, off0, error, addr); return (!error)? 
addr: error; } From 36cff84e05620271fa1b5d56d3c7ccf17017f12b Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Thu, 3 Jul 2014 13:58:05 +0900 Subject: [PATCH 13/23] add zeroobj for private anonymous mappings --- kernel/Makefile.build | 1 + kernel/include/memobj.h | 1 + kernel/syscall.c | 7 ++ kernel/zeroobj.c | 206 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 kernel/zeroobj.c diff --git a/kernel/Makefile.build b/kernel/Makefile.build index ded7f58d..49e40193 100644 --- a/kernel/Makefile.build +++ b/kernel/Makefile.build @@ -1,6 +1,7 @@ IHKDIR=$(IHKBASE)/$(TARGETDIR) OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o +OBJS += zeroobj.o DEPSRCS=$(wildcard $(SRC)/*.c) CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h index 0ba9c838..6bfd2a0a 100644 --- a/kernel/include/memobj.h +++ b/kernel/include/memobj.h @@ -103,5 +103,6 @@ static inline int memobj_has_pager(struct memobj *obj) int fileobj_create(int fd, struct memobj **objp, int *maxprotp); int shmobj_create(struct shmid_ds *ds, struct memobj **objp); +int zeroobj_create(struct memobj **objp); #endif /* HEADER_MEMOBJ_H */ diff --git a/kernel/syscall.c b/kernel/syscall.c index 3187f35e..b0d86109 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -795,6 +795,13 @@ SYSCALL_DECLARE(mmap) goto out; } } + else { + error = zeroobj_create(&memobj); + if (error) { + ekprintf("sys_mmap:zeroobj_create failed. %d\n", error); + goto out; + } + } if ((flags & MAP_PRIVATE) && (maxprot & PROT_READ)) { maxprot |= PROT_WRITE; diff --git a/kernel/zeroobj.c b/kernel/zeroobj.c new file mode 100644 index 00000000..2305ea6a --- /dev/null +++ b/kernel/zeroobj.c @@ -0,0 +1,206 @@ +/** + * \file zeroobj.c + * License details are found in the file LICENSE. + * \brief + * read-only zeroed page object + * \author Gou Nakamura + */ +/* + * HISTORY: + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0) +#define ekprintf(...) kprintf(__VA_ARGS__) +#define fkprintf(...) 
kprintf(__VA_ARGS__) + +struct zeroobj { + struct memobj memobj; /* must be first */ + struct list_head page_list; +}; + +static ihk_spinlock_t the_zeroobj_lock = SPIN_LOCK_UNLOCKED; +static struct zeroobj *the_zeroobj = NULL; /* singleton */ + +static memobj_get_page_func_t zeroobj_get_page; + +static struct memobj_ops zeroobj_ops = { + .get_page = &zeroobj_get_page, +}; + +static struct zeroobj *to_zeroobj(struct memobj *memobj) +{ + return (struct zeroobj *)memobj; +} + +static struct memobj *to_memobj(struct zeroobj *zeroobj) +{ + return &zeroobj->memobj; +} + +/*********************************************************************** + * page_list + */ +static void page_list_init(struct zeroobj *obj) +{ + INIT_LIST_HEAD(&obj->page_list); + return; +} + +static void page_list_insert(struct zeroobj *obj, struct page *page) +{ + list_add(&page->list, &obj->page_list); + return; +} + +static struct page *page_list_first(struct zeroobj *obj) +{ + if (list_empty(&obj->page_list)) { + return NULL; + } + + return list_first_entry(&obj->page_list, struct page, list); +} + +/*********************************************************************** + * zeroobj + */ +static int alloc_zeroobj(void) +{ + int error; + struct zeroobj *obj = NULL; + void *virt = NULL; + uintptr_t phys; + struct page *page; + + dkprintf("alloc_zeroobj()\n"); + ihk_mc_spinlock_lock_noirq(&the_zeroobj_lock); + if (the_zeroobj) { + error = 0; + dkprintf("alloc_zeroobj():already. %d\n", error); + goto out; + } + + obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT); + if (!obj) { + error = -ENOMEM; + ekprintf("alloc_zeroobj():kmalloc failed. %d\n", error); + goto out; + } + + memset(obj, 0, sizeof(*obj)); + obj->memobj.ops = &zeroobj_ops; + page_list_init(obj); + ihk_mc_spinlock_init(&obj->memobj.lock); + + virt = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); /* XXX:NYI:large page */ + if (!virt) { + error = -ENOMEM; + ekprintf("alloc_zeroobj():alloc pages failed. %d\n", error); + goto out; + } + phys = virt_to_phys(virt); + page = phys_to_page(phys); + + if (page->mode != PM_NONE) { + fkprintf("alloc_zeroobj():" + "page %p %#lx %d %d %#lx\n", + page, page_to_phys(page), page->mode, + page->count, page->offset); + panic("alloc_zeroobj:dup alloc"); + } + + memset(virt, 0, PAGE_SIZE); + page->mode = PM_MAPPED; + page->offset = 0; + ihk_atomic_set(&page->count, 1); + page_list_insert(obj, page); + virt = NULL; + + error = 0; + the_zeroobj = obj; + obj = NULL; + +out: + ihk_mc_spinlock_unlock_noirq(&the_zeroobj_lock); + if (virt) { + ihk_mc_free_pages(virt, 1); + } + if (obj) { + kfree(obj); + } + dkprintf("alloc_zeroobj():%d %p\n", error, the_zeroobj); + return error; +} + +int zeroobj_create(struct memobj **objp) +{ + int error; + + dkprintf("zeroobj_create(%p)\n", objp); + if (!the_zeroobj) { + error = alloc_zeroobj(); + if (error) { + goto out; + } + } + + error = 0; + *objp = to_memobj(the_zeroobj); + +out: + dkprintf("zeroobj_create(%p):%d %p\n", objp, error, *objp); + return error; +} + +static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align, + uintptr_t *physp) +{ + int error; + struct zeroobj *obj = to_zeroobj(memobj); + struct page *page; + + dkprintf("zeroobj_get_page(%p,%#lx,%d,%p)\n", + memobj, off, p2align, physp); + if (off & ~PAGE_MASK) { + error = -EINVAL; + ekprintf("zeroobj_get_page(%p,%#lx,%d,%p):invalid argument. 
%d\n", + memobj, off, p2align, physp, error); + goto out; + } + if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ + error = -ENOMEM; + ekprintf("zeroobj_get_page(%p,%#lx,%d,%p):large page. %d\n", + memobj, off, p2align, physp, error); + goto out; + } + + page = page_list_first(obj); + if (!page) { + error = -ENOMEM; + ekprintf("zeroobj_get_page(%p,%#lx,%d,%p):page not found. %d\n", + memobj, off, p2align, physp, error); + goto out; + } + + ihk_atomic_inc(&page->count); + + error = 0; + *physp = page_to_phys(page); + +out: + dkprintf("zeroobj_get_page(%p,%#lx,%d,%p):%d\n", + memobj, off, p2align, physp, error); + return error; +} From 31a605f94b9ed20ff849214730697f9fab00fe02 Mon Sep 17 00:00:00 2001 From: "Balazs Gerofi bgerofi@riken.jp" Date: Thu, 10 Jul 2014 13:53:12 +0900 Subject: [PATCH 14/23] push/pop r15 when entering/leaving kernel space (fix for bug #53: r15 wasn't propagated during fork()) --- arch/x86/kernel/include/registers.h | 2 +- arch/x86/kernel/interrupt.S | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/include/registers.h b/arch/x86/kernel/include/registers.h index 97889b6e..0e9dca07 100644 --- a/arch/x86/kernel/include/registers.h +++ b/arch/x86/kernel/include/registers.h @@ -136,7 +136,7 @@ struct tss64 { } __attribute__((packed)); struct x86_regs { - unsigned long r11, r10, r9, r8; + unsigned long r15, r11, r10, r9, r8; unsigned long rdi, rsi, rdx, rcx, rbx, rax, rbp; unsigned long error, rip, cs, rflags, rsp, ss; }; diff --git a/arch/x86/kernel/interrupt.S b/arch/x86/kernel/interrupt.S index e814afa4..d0a0838b 100644 --- a/arch/x86/kernel/interrupt.S +++ b/arch/x86/kernel/interrupt.S @@ -35,8 +35,10 @@ pushq %r8; \ pushq %r9; \ pushq %r10; \ - pushq %r11; + pushq %r11; \ + pushq %r15; #define POP_ALL_REGS \ + popq %r15; \ popq %r11; \ popq %r10; \ popq %r9; \ @@ -67,7 +69,7 @@ vector=vector+1 common_interrupt: PUSH_ALL_REGS - movq 88(%rsp), %rdi + movq 96(%rsp), %rdi movq %rsp, %rsi call handle_interrupt /* Enter C code */ POP_ALL_REGS @@ -83,7 +85,7 @@ page_fault: cld PUSH_ALL_REGS movq %cr2, %rdi - movq 88(%rsp),%rsi + movq 96(%rsp),%rsi movq %rsp,%rdx movq __page_fault_handler_address(%rip), %rax andq %rax, %rax @@ -120,13 +122,13 @@ x86_syscall: movq %gs:24, %rcx movq %rcx, 32(%rsp) PUSH_ALL_REGS - movq 72(%rsp), %rdi + movq 80(%rsp), %rdi movw %ss, %ax movw %ax, %ds movq %rsp, %rsi callq *__x86_syscall_handler(%rip) 1: - movq %rax, 72(%rsp) + movq %rax, 80(%rsp) POP_ALL_REGS #ifdef USE_SYSRET movq 8(%rsp), %rcx From 292b34fe2152d6a6a4c74746c76ceac549f8ce39 Mon Sep 17 00:00:00 2001 From: Tomoki Shirasawa Date: Sun, 13 Jul 2014 12:49:30 +0900 Subject: [PATCH 15/23] signal handler is not passed to a child process(redmine#62) --- kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/process.c b/kernel/process.c index 18ce28d9..cd447d49 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -253,7 +253,7 @@ struct process *clone_process(struct process *org, unsigned long pc, goto err_free_sighandler; } - memset(proc->sighandler, '\0', sizeof(struct sig_handler)); + memcpy(proc->sighandler, org->sighandler, sizeof(struct sig_handler)); ihk_atomic_set(&proc->sighandler->use, 1); ihk_mc_spinlock_init(&proc->sighandler->lock); ihk_atomic_set(&proc->sigshared->use, 1); From 5e6ed852cbf51b9fba1c84bf85a0230b4a5923a4 Mon Sep 17 00:00:00 2001 From: Tomoki Shirasawa Date: Sun, 13 Jul 2014 12:51:28 +0900 Subject: [PATCH 16/23] Kill child threads when receiving terminating 
signals(redmine#63) Create process table (child is missing when sending a signal to a child process just after forking it)(redmine#61) --- arch/x86/kernel/syscall.c | 55 +++++++++++++++++++++++++-------------- kernel/init.c | 3 +++ kernel/syscall.c | 13 ++++++++- 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/syscall.c b/arch/x86/kernel/syscall.c index 21259603..38c1ea42 100644 --- a/arch/x86/kernel/syscall.c +++ b/arch/x86/kernel/syscall.c @@ -288,14 +288,16 @@ check_signal(unsigned long rc, void *regs0) unsigned long do_kill(int pid, int tid, int sig) { + struct cpu_local_var *v; + struct process *p; struct process *proc = cpu_local_var(current); struct process *tproc = NULL; int i; __sigset_t mask; struct sig_pending *pending; struct list_head *head; - int irqstate; int rc; + unsigned long irqstate; if(proc == NULL || proc->pid == 0){ return -ESRCH; @@ -314,37 +316,52 @@ do_kill(int pid, int tid, int sig) } else{ for(i = 0; i < num_processors; i++){ - if(get_cpu_local_var(i)->current && - get_cpu_local_var(i)->current->pid == pid){ - tproc = get_cpu_local_var(i)->current; - break; + v = get_cpu_local_var(i); + irqstate = ihk_mc_spinlock_lock(&(v->runq_lock)); + list_for_each_entry(p, &(v->runq), sched_list){ + if(p->pid == pid){ + tproc = p; + break; + } } + ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate); } } } else if(pid == -1){ - for(i = 0; i < num_processors; i++) - if(get_cpu_local_var(i)->current && - get_cpu_local_var(i)->current->pid > 0 && - get_cpu_local_var(i)->current->tid == tid){ - tproc = get_cpu_local_var(i)->current; - break; + for(i = 0; i < num_processors; i++){ + v = get_cpu_local_var(i); + irqstate = ihk_mc_spinlock_lock(&(v->runq_lock)); + list_for_each_entry(p, &(v->runq), sched_list){ + if(p->pid > 0 && + p->tid == tid){ + tproc = p; + break; + } } + ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate); + } } else{ if(pid == 0) return -ESRCH; - for(i = 0; i < num_processors; i++) - if(get_cpu_local_var(i)->current && - get_cpu_local_var(i)->current->pid == pid && - get_cpu_local_var(i)->current->tid == tid){ - tproc = get_cpu_local_var(i)->current; - break; + for(i = 0; i < num_processors; i++){ + v = get_cpu_local_var(i); + irqstate = ihk_mc_spinlock_lock(&(v->runq_lock)); + list_for_each_entry(p, &(v->runq), sched_list){ + if(p->pid == pid && + p->tid == tid){ + tproc = p; + break; + } } + ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate); + } } - if(!tproc) + if(!tproc){ return -ESRCH; + } if(sig == 0) return 0; @@ -375,7 +392,7 @@ do_kill(int pid, int tid, int sig) } else{ list_add_tail(&pending->list, head); - proc->sigevent = 1; + tproc->sigevent = 1; } } if(tid == -1){ diff --git a/kernel/init.c b/kernel/init.c index 39a54993..6f31113c 100644 --- a/kernel/init.c +++ b/kernel/init.c @@ -211,8 +211,11 @@ static void post_init(void) } if (find_command_line("hidos")) { + extern ihk_spinlock_t syscall_lock; + init_host_syscall_channel(); init_host_syscall_channel2(); + ihk_mc_spinlock_init(&syscall_lock); } ap_start(); } diff --git a/kernel/syscall.c b/kernel/syscall.c index 5fb5cc13..7b27e667 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -168,6 +168,7 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid) #endif } +ihk_spinlock_t syscall_lock; long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, int cpu, int pid) @@ -176,6 +177,9 @@ long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, struct syscall_request req2 IHK_DMA_ALIGN; struct syscall_params *scp; int 
error; + long rc; + int islock = 0; + unsigned long irqstate; dkprintf("SC(%d)[%3d] sending syscall\n", ihk_mc_get_processor_id(), @@ -184,6 +188,8 @@ long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, if(req->number == __NR_exit_group || req->number == __NR_kill){ // interrupt syscall scp = &get_cpu_local_var(0)->scp2; + islock = 1; + irqstate = ihk_mc_spinlock_lock(&syscall_lock); } else{ scp = &get_cpu_local_var(cpu)->scp; @@ -225,7 +231,12 @@ long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, ihk_mc_get_processor_id(), req->number, res->ret); - return res->ret; + rc = res->ret; + if(islock){ + ihk_mc_spinlock_unlock(&syscall_lock, irqstate); + } + + return rc; } long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx) From f923e03565eda9ee2a454e9498de35d9f4d38ee5 Mon Sep 17 00:00:00 2001 From: YOSHIDA Masanori Date: Wed, 25 Jun 2014 16:40:05 +0900 Subject: [PATCH 17/23] fix to make CPU idle when len(runq) != 0 but all queued are sleeping --- kernel/process.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/process.c b/kernel/process.c index cd447d49..ae824878 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -1742,10 +1742,6 @@ void schedule(void) list_add_tail(&prev->sched_list, &(v->runq)); ++v->runq_len; } - - if (!v->runq_len) { - v->status = CPU_STATUS_IDLE; - } } /* Pick a new running process */ @@ -1759,6 +1755,7 @@ void schedule(void) /* No process? Run idle.. */ if (!next) { next = &cpu_local_var(idle); + v->status = CPU_STATUS_IDLE; } if (prev != next) { From 1889d10e3a5b7bb7c3e4f314988dc9cfd8a37f4e Mon Sep 17 00:00:00 2001 From: YOSHIDA Masanori Date: Fri, 23 May 2014 17:52:54 +0900 Subject: [PATCH 18/23] add cpu_set and getter/setter of that (incomplete sched_***affinity) --- kernel/include/process.h | 3 + kernel/syscall.c | 115 ++++++++++++++++++++++++++------------- 2 files changed, 81 insertions(+), 37 deletions(-) diff --git a/kernel/include/process.h b/kernel/include/process.h index 0011a68e..2b7d6bac 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -20,6 +20,7 @@ #include #include #include +#include #define VR_NONE 0x0 #define VR_STACK 0x1 @@ -185,6 +186,8 @@ struct process { void *pgio_arg; struct fork_tree_node *ftn; + + cpu_set_t cpu_set; }; struct process_vm { diff --git a/kernel/syscall.c b/kernel/syscall.c index 6dd4ec5d..01e3839e 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -1919,56 +1919,97 @@ SYSCALL_DECLARE(ptrace) return -ENOSYS; } +#define MIN2(x,y) (x) < (y) ? 
(x) : (y) SYSCALL_DECLARE(sched_setaffinity) { -#if 0 - int pid = (int)ihk_mc_syscall_arg0(ctx); - unsigned int len = (unsigned int)ihk_mc_syscall_arg1(ctx); -#endif - cpu_set_t *mask = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); - unsigned long __phys; -#if 0 - int i; -#endif - /* TODO: check mask is in user's page table */ - if(!mask) { return -EFAULT; } - if (ihk_mc_pt_virt_to_phys(cpu_local_var(current)->vm->page_table, - (void *)mask, - &__phys)) { + int tid = (int)ihk_mc_syscall_arg0(ctx); + size_t len = (size_t)ihk_mc_syscall_arg1(ctx); + cpu_set_t *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); + + cpu_set_t cpu_set; + struct process *thread; + int i; + + if (sizeof(cpu_set_t) > len) { + kprintf("%s %d\n", __FILE__, __LINE__); + return -EINVAL; + } + len = MIN2(len, sizeof(cpu_set_t)); + + if (copy_from_user(cpu_local_var(current), &cpu_set, u_cpu_set, len)) { + kprintf("%s %d\n", __FILE__, __LINE__); return -EFAULT; } -#if 0 - dkprintf("sched_setaffinity,\n"); - for(i = 0; i < len/sizeof(__cpu_mask); i++) { - dkprintf("mask[%d]=%lx,", i, mask->__bits[i]); - } -#endif + + thread = NULL; + extern int num_processors; + for (i = 0; i < num_processors; i++) { + struct process *tmp_proc; + ihk_mc_spinlock_lock_noirq(&get_cpu_local_var(i)->runq_lock); + list_for_each_entry(tmp_proc, &get_cpu_local_var(i)->runq, sched_list) { + if (tmp_proc && tmp_proc->pid && tmp_proc->tid == tid) { + thread = tmp_proc; + hold_process(thread); + break; + } + } + ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(i)->runq_lock); + if (thread) + break; + } + if (!thread) { + kprintf("%s %d\n", __FILE__, __LINE__); + return -ESRCH; + } + memcpy(&thread->cpu_set, &cpu_set, sizeof(cpu_set)); + release_process(thread); + kprintf("%s %d\n", __FILE__, __LINE__); return 0; } -#define MIN2(x,y) (x) < (y) ? 
(x) : (y) -#define MIN3(x,y,z) MIN2(MIN2((x),(y)),MIN2((y),(z))) // see linux-2.6.34.13/kernel/sched.c SYSCALL_DECLARE(sched_getaffinity) { - //int pid = (int)ihk_mc_syscall_arg0(ctx); - unsigned int len = (int)ihk_mc_syscall_arg1(ctx); - //int cpu_id; - cpu_set_t *mask = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); - struct ihk_mc_cpu_info *cpu_info = ihk_mc_get_cpu_info(); - if(len*8 < cpu_info->ncpus) { return -EINVAL; } - if(len & (sizeof(unsigned long)-1)) { return -EINVAL; } - int min_len = MIN2(len, sizeof(cpu_set_t)); - //int min_ncpus = MIN2(min_len*8, cpu_info->ncpus); + int tid = (int)ihk_mc_syscall_arg0(ctx); + size_t len = (size_t)ihk_mc_syscall_arg1(ctx); + cpu_set_t *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); - CPU_ZERO_S(min_len, mask); - CPU_SET_S(ihk_mc_get_hardware_processor_id(), min_len, mask); - //for (cpu_id = 0; cpu_id < min_ncpus; ++cpu_id) - // CPU_SET_S(cpu_info->hw_ids[cpu_id], min_len, mask); + int ret; + struct process *thread; + int i; - // dkprintf("sched_getaffinity returns full mask\n"); + if (sizeof(cpu_set_t) > len) { + kprintf("%s %d\n", __FILE__, __LINE__); + return -EINVAL; + } + len = MIN2(len, sizeof(cpu_set_t)); - return min_len; + thread = NULL; + extern int num_processors; + for (i = 0; i < num_processors; i++) { + struct process *tmp_proc; + ihk_mc_spinlock_lock_noirq(&get_cpu_local_var(i)->runq_lock); + list_for_each_entry(tmp_proc, &get_cpu_local_var(i)->runq, sched_list) { + if (tmp_proc && tmp_proc->pid && tmp_proc->tid == tid) { + thread = tmp_proc; + hold_process(thread); + break; + } + } + ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(i)->runq_lock); + if (thread) + break; + } + if (!thread) { + kprintf("%s %d\n", __FILE__, __LINE__); + return -ESRCH; + } + ret = copy_to_user(cpu_local_var(current), u_cpu_set, &thread->cpu_set, len); + release_process(thread); + kprintf("%s %d %d\n", __FILE__, __LINE__, ret); + if (ret < 0) + return ret; + return len; } SYSCALL_DECLARE(sched_yield) From 170a54d976595678972dbda638cc214746ba15eb Mon Sep 17 00:00:00 2001 From: YOSHIDA Masanori Date: Wed, 25 Jun 2014 17:02:02 +0900 Subject: [PATCH 19/23] add CPU affinity function - syscall support -- sched_setaffinity -- sched_getaffinity - migration in scheduler - resched at return of syscalls/interrupts --- arch/x86/kernel/cpu.c | 2 + kernel/include/cls.h | 8 +++ kernel/mem.c | 1 + kernel/process.c | 154 +++++++++++++++++++++++++++++++++++++++--- kernel/syscall.c | 95 +++++++++++++------------- 5 files changed, 203 insertions(+), 57 deletions(-) diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c index 82ee53f7..cd15aae3 100644 --- a/arch/x86/kernel/cpu.c +++ b/arch/x86/kernel/cpu.c @@ -438,6 +438,7 @@ void handle_interrupt(int vector, struct x86_regs *regs) } check_signal(0, regs); + check_need_resched(); } void gpe_handler(struct x86_regs *regs) @@ -447,6 +448,7 @@ void gpe_handler(struct x86_regs *regs) arch_show_interrupt_context(regs); set_signal(SIGILL, regs); check_signal(0, regs); + check_need_resched(); // panic("GPF"); } diff --git a/kernel/include/cls.h b/kernel/include/cls.h index 77170294..b8d7f52e 100644 --- a/kernel/include/cls.h +++ b/kernel/include/cls.h @@ -30,6 +30,9 @@ struct malloc_header { #define CPU_STATUS_RUNNING (2) extern ihk_spinlock_t cpu_status_lock; +#define CPU_FLAG_NEED_RESCHED 0x1U +#define CPU_FLAG_NEED_MIGRATE 0x2U + struct cpu_local_var { /* malloc */ struct malloc_header free_list; @@ -54,6 +57,11 @@ struct cpu_local_var { int fs; struct list_head pending_free_pages; + + unsigned int flags; + + ihk_spinlock_t 
migq_lock; + struct list_head migq; } __attribute__((aligned(64))); diff --git a/kernel/mem.c b/kernel/mem.c index 90d492c7..d84e043c 100644 --- a/kernel/mem.c +++ b/kernel/mem.c @@ -248,6 +248,7 @@ out: dkprintf("[%d]page_fault_handler(%p,%lx,%p): (%d)\n", ihk_mc_get_processor_id(), fault_addr, reason, regs, error); + check_need_resched(); return; } diff --git a/kernel/process.c b/kernel/process.c index ae824878..a3af4a4c 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -1711,6 +1711,9 @@ void sched_init(void) cpu_local_var(runq_len) = 0; ihk_mc_spinlock_init(&cpu_local_var(runq_lock)); + INIT_LIST_HEAD(&cpu_local_var(migq)); + ihk_mc_spinlock_init(&cpu_local_var(migq_lock)); + #ifdef TIMER_CPU_ID if (ihk_mc_get_processor_id() == TIMER_CPU_ID) { init_timers(); @@ -1719,6 +1722,72 @@ void sched_init(void) #endif } +static void double_rq_lock(struct cpu_local_var *v1, struct cpu_local_var *v2) +{ + if (v1 < v2) { + ihk_mc_spinlock_lock_noirq(&v1->runq_lock); + ihk_mc_spinlock_lock_noirq(&v2->runq_lock); + } else { + ihk_mc_spinlock_lock_noirq(&v2->runq_lock); + ihk_mc_spinlock_lock_noirq(&v1->runq_lock); + } +} + +static void double_rq_unlock(struct cpu_local_var *v1, struct cpu_local_var *v2) +{ + ihk_mc_spinlock_unlock_noirq(&v1->runq_lock); + ihk_mc_spinlock_unlock_noirq(&v2->runq_lock); +} + +struct migrate_request { + struct list_head list; + struct process *proc; + struct waitq wq; +}; + +static void do_migrate(void) +{ + int cur_cpu_id = ihk_mc_get_processor_id(); + struct cpu_local_var *cur_v = get_cpu_local_var(cur_cpu_id); + struct migrate_request *req, *tmp; + + ihk_mc_spinlock_lock_noirq(&cur_v->migq_lock); + list_for_each_entry_safe(req, tmp, &cur_v->migq, list) { + int cpu_id; + struct cpu_local_var *v; + + /* 0. check if migration is necessary */ + list_del(&req->list); + if (req->proc->cpu_id != cur_cpu_id) /* already not here */ + goto ack; + if (CPU_ISSET(cur_cpu_id, &req->proc->cpu_set)) /* good affinity */ + goto ack; + + /* 1. select CPU */ + for (cpu_id = 0; cpu_id < CPU_SETSIZE; cpu_id++) + if (CPU_ISSET(cpu_id, &req->proc->cpu_set)) + break; + if (CPU_SETSIZE == cpu_id) /* empty affinity (bug?) */ + goto ack; + + /* 2. migrate thread */ + v = get_cpu_local_var(cpu_id); + double_rq_lock(cur_v, v); + list_del(&req->proc->sched_list); + cur_v->runq_len -= 1; + req->proc->cpu_id = cpu_id; + list_add_tail(&req->proc->sched_list, &v->runq); + v->runq_len += 1; + if (v->runq_len == 1) + ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1); + double_rq_unlock(cur_v, v); + +ack: + waitq_wakeup(&req->wq); + } + ihk_mc_spinlock_unlock_noirq(&cur_v->migq_lock); +} + void schedule(void) { struct cpu_local_var *v = get_this_cpu_local_var(); @@ -1727,6 +1796,7 @@ void schedule(void) unsigned long irqstate; struct process *last; +redo: irqstate = ihk_mc_spinlock_lock(&(v->runq_lock)); next = NULL; @@ -1744,18 +1814,22 @@ void schedule(void) } } - /* Pick a new running process */ - list_for_each_entry_safe(proc, tmp, &(v->runq), sched_list) { - if (proc->status == PS_RUNNING) { - next = proc; - break; - } - } - - /* No process? Run idle.. */ - if (!next) { + if (v->flags & CPU_FLAG_NEED_MIGRATE) { next = &cpu_local_var(idle); - v->status = CPU_STATUS_IDLE; + } else { + /* Pick a new running process */ + list_for_each_entry_safe(proc, tmp, &(v->runq), sched_list) { + if (proc->status == PS_RUNNING) { + next = proc; + break; + } + } + + /* No process? Run idle.. 
*/ + if (!next) { + next = &cpu_local_var(idle); + v->status = CPU_STATUS_IDLE; + } } if (prev != next) { @@ -1793,6 +1867,21 @@ void schedule(void) else { ihk_mc_spinlock_unlock(&(v->runq_lock), irqstate); } + + if (v->flags & CPU_FLAG_NEED_MIGRATE) { + v->flags &= ~CPU_FLAG_NEED_MIGRATE; + do_migrate(); + goto redo; + } +} + +void check_need_resched(void) +{ + struct cpu_local_var *v = get_this_cpu_local_var(); + if (v->flags & CPU_FLAG_NEED_RESCHED) { + v->flags &= ~CPU_FLAG_NEED_RESCHED; + schedule(); + } } @@ -1837,6 +1926,49 @@ int sched_wakeup_process(struct process *proc, int valid_states) return status; } +/* + * 1. Add current process to waitq + * 2. Queue migration request into the target CPU's queue + * 3. Kick migration on the CPU + * 4. Wait for completion of the migration + * + * struct migrate_request { + * list //migq, + * wq, + * proc + * } + * + * [expected processing of the target CPU] + * 1. Interrupted by IPI + * 2. call schedule() via check_resched() + * 3. Do migration + * 4. Wake up this thread + */ +void sched_request_migrate(int cpu_id, struct process *proc) +{ + struct cpu_local_var *v = get_cpu_local_var(cpu_id); + struct migrate_request req = { .proc = proc }; + unsigned long irqstate; + DECLARE_WAITQ_ENTRY(entry, cpu_local_var(current)); + + waitq_init(&req.wq); + waitq_prepare_to_wait(&req.wq, &entry, PS_UNINTERRUPTIBLE); + + irqstate = ihk_mc_spinlock_lock(&v->migq_lock); + list_add_tail(&req.list, &v->migq); + ihk_mc_spinlock_unlock(&v->migq_lock, irqstate); + + v->flags |= CPU_FLAG_NEED_RESCHED | CPU_FLAG_NEED_MIGRATE; + v->status = CPU_STATUS_RUNNING; + + if (cpu_id != ihk_mc_get_processor_id()) + ihk_mc_interrupt_cpu(/* Kick scheduler */ + get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1); + + schedule(); + waitq_finish_wait(&req.wq, &entry); +} + /* Runq lock must be held here */ diff --git a/kernel/syscall.c b/kernel/syscall.c index 01e3839e..fd2764b6 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -1926,45 +1926,51 @@ SYSCALL_DECLARE(sched_setaffinity) size_t len = (size_t)ihk_mc_syscall_arg1(ctx); cpu_set_t *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); - cpu_set_t cpu_set; + cpu_set_t k_cpu_set, cpu_set; struct process *thread; - int i; + int cpu_id; - if (sizeof(cpu_set_t) > len) { - kprintf("%s %d\n", __FILE__, __LINE__); + if (sizeof(k_cpu_set) > len) { + kprintf("%s:%d\n Too small buffer.", __FILE__, __LINE__); return -EINVAL; } - len = MIN2(len, sizeof(cpu_set_t)); + len = MIN2(len, sizeof(k_cpu_set)); - if (copy_from_user(cpu_local_var(current), &cpu_set, u_cpu_set, len)) { - kprintf("%s %d\n", __FILE__, __LINE__); + if (copy_from_user(cpu_local_var(current), &k_cpu_set, u_cpu_set, len)) { + kprintf("%s:%d copy_from_user failed.\n", __FILE__, __LINE__); return -EFAULT; } - thread = NULL; + // XXX: We should build something like cpu_available_mask in advance + CPU_ZERO(&cpu_set); extern int num_processors; - for (i = 0; i < num_processors; i++) { - struct process *tmp_proc; - ihk_mc_spinlock_lock_noirq(&get_cpu_local_var(i)->runq_lock); - list_for_each_entry(tmp_proc, &get_cpu_local_var(i)->runq, sched_list) { - if (tmp_proc && tmp_proc->pid && tmp_proc->tid == tid) { - thread = tmp_proc; - hold_process(thread); - break; - } - } - ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(i)->runq_lock); - if (thread) - break; - } - if (!thread) { - kprintf("%s %d\n", __FILE__, __LINE__); - return -ESRCH; + for (cpu_id = 0; cpu_id < num_processors; cpu_id++) + if (CPU_ISSET(cpu_id, &k_cpu_set)) + CPU_SET(cpu_id, &cpu_set); + + for (cpu_id = 
0; cpu_id < num_processors; cpu_id++) { + ihk_mc_spinlock_lock_noirq(&get_cpu_local_var(cpu_id)->runq_lock); + list_for_each_entry(thread, &get_cpu_local_var(cpu_id)->runq, sched_list) + if (thread->pid && thread->tid == tid) + goto found; /* without unlocking runq_lock */ + ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(cpu_id)->runq_lock); } + kprintf("%s:%d Thread not found.\n", __FILE__, __LINE__); + return -ESRCH; + +found: memcpy(&thread->cpu_set, &cpu_set, sizeof(cpu_set)); - release_process(thread); - kprintf("%s %d\n", __FILE__, __LINE__); - return 0; + + if (!CPU_ISSET(cpu_id, &thread->cpu_set)) { + hold_process(thread); + ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(cpu_id)->runq_lock); + sched_request_migrate(cpu_id, thread); + release_process(thread); + return 0; + } else { + ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(cpu_id)->runq_lock); + return 0; + } } // see linux-2.6.34.13/kernel/sched.c @@ -1972,40 +1978,36 @@ SYSCALL_DECLARE(sched_getaffinity) { int tid = (int)ihk_mc_syscall_arg0(ctx); size_t len = (size_t)ihk_mc_syscall_arg1(ctx); - cpu_set_t *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); + cpu_set_t k_cpu_set, *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); int ret; - struct process *thread; + int found = 0; int i; - if (sizeof(cpu_set_t) > len) { - kprintf("%s %d\n", __FILE__, __LINE__); + if (sizeof(k_cpu_set) > len) { + kprintf("%s:%d Too small buffer.\n", __FILE__, __LINE__); return -EINVAL; } - len = MIN2(len, sizeof(cpu_set_t)); + len = MIN2(len, sizeof(k_cpu_set)); - thread = NULL; extern int num_processors; - for (i = 0; i < num_processors; i++) { - struct process *tmp_proc; + for (i = 0; i < num_processors && !found; i++) { + struct process *thread; ihk_mc_spinlock_lock_noirq(&get_cpu_local_var(i)->runq_lock); - list_for_each_entry(tmp_proc, &get_cpu_local_var(i)->runq, sched_list) { - if (tmp_proc && tmp_proc->pid && tmp_proc->tid == tid) { - thread = tmp_proc; - hold_process(thread); + list_for_each_entry(thread, &get_cpu_local_var(i)->runq, sched_list) { + if (thread->pid && thread->tid == tid) { + found = 1; + memcpy(&k_cpu_set, &thread->cpu_set, sizeof(k_cpu_set)); break; } } ihk_mc_spinlock_unlock_noirq(&get_cpu_local_var(i)->runq_lock); - if (thread) - break; } - if (!thread) { - kprintf("%s %d\n", __FILE__, __LINE__); + if (!found) { + kprintf("%s:%d Thread not found.\n", __FILE__, __LINE__); return -ESRCH; } - ret = copy_to_user(cpu_local_var(current), u_cpu_set, &thread->cpu_set, len); - release_process(thread); + ret = copy_to_user(cpu_local_var(current), u_cpu_set, &k_cpu_set, len); kprintf("%s %d %d\n", __FILE__, __LINE__, ret); if (ret < 0) return ret; @@ -2777,6 +2779,7 @@ long syscall(int num, ihk_mc_user_context_t *ctx) } check_signal(l, NULL); + check_need_resched(); return l; } From 3751fa176627ff84c526198551fa19ba849fa1f9 Mon Sep 17 00:00:00 2001 From: YOSHIDA Masanori Date: Wed, 25 Jun 2014 17:25:47 +0900 Subject: [PATCH 20/23] add some functions related to migration into kernel/process.h ... 
and also add include guard to arch/x86/kernel/include/signal.h --- arch/x86/kernel/cpu.c | 1 + arch/x86/kernel/include/signal.h | 5 +++++ kernel/include/process.h | 3 +++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c index cd15aae3..7b289961 100644 --- a/arch/x86/kernel/cpu.c +++ b/arch/x86/kernel/cpu.c @@ -21,6 +21,7 @@ #include #include #include +#include #define LAPIC_ID 0x020 #define LAPIC_TIMER 0x320 diff --git a/arch/x86/kernel/include/signal.h b/arch/x86/kernel/include/signal.h index 23735056..33820ce9 100644 --- a/arch/x86/kernel/include/signal.h +++ b/arch/x86/kernel/include/signal.h @@ -11,6 +11,9 @@ * 2012/02/11 bgerofi what kind of new features have been added */ +#ifndef __HEADER_X86_COMMON_SIGNAL_H +#define __HEADER_X86_COMMON_SIGNAL_H + #define _NSIG 64 #define _NSIG_BPW 64 #define _NSIG_WORDS (_NSIG / _NSIG_BPW) @@ -149,3 +152,5 @@ typedef struct siginfo { #define SIGSYS 31 #define SIGUNUSED 31 #define SIGRTMIN 32 + +#endif /*__HEADER_X86_COMMON_SIGNAL_H*/ diff --git a/kernel/include/process.h b/kernel/include/process.h index 2b7d6bac..5072e196 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -261,4 +261,7 @@ void runq_add_proc(struct process *proc, int cpu_id); void runq_del_proc(struct process *proc, int cpu_id); int sched_wakeup_process(struct process *proc, int valid_states); +void sched_request_migrate(int cpu_id, struct process *proc); +void check_need_resched(void); + #endif From 99931179e10fc4d81115baa2d40cf5f0339b7884 Mon Sep 17 00:00:00 2001 From: YOSHIDA Masanori Date: Wed, 25 Jun 2014 17:29:02 +0900 Subject: [PATCH 21/23] add get_cpu_id syscall with No.700 for testing --- arch/x86/kernel/include/syscall_list.h | 1 + kernel/syscall.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h index 23209802..af7fb177 100644 --- a/arch/x86/kernel/include/syscall_list.h +++ b/arch/x86/kernel/include/syscall_list.h @@ -89,5 +89,6 @@ SYSCALL_HANDLED(601, pmc_init) SYSCALL_HANDLED(602, pmc_start) SYSCALL_HANDLED(603, pmc_stop) SYSCALL_HANDLED(604, pmc_reset) +SYSCALL_HANDLED(700, get_cpu_id) /**** End of File ****/ diff --git a/kernel/syscall.c b/kernel/syscall.c index fd2764b6..90b34d32 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -2014,6 +2014,11 @@ SYSCALL_DECLARE(sched_getaffinity) return len; } +SYSCALL_DECLARE(get_cpu_id) +{ + return ihk_mc_get_processor_id(); +} + SYSCALL_DECLARE(sched_yield) { return -ENOSYS; From 35b87169668f33fc7aa08703e091dd07292c0b53 Mon Sep 17 00:00:00 2001 From: "bgerofi@riken.jp" Date: Tue, 15 Jul 2014 17:31:16 +0900 Subject: [PATCH 22/23] push/pop r12,r13 and r14 as well when entering/leaving kernel space --- arch/x86/kernel/include/registers.h | 2 +- arch/x86/kernel/interrupt.S | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/include/registers.h b/arch/x86/kernel/include/registers.h index 0e9dca07..2b3d7bb8 100644 --- a/arch/x86/kernel/include/registers.h +++ b/arch/x86/kernel/include/registers.h @@ -136,7 +136,7 @@ struct tss64 { } __attribute__((packed)); struct x86_regs { - unsigned long r15, r11, r10, r9, r8; + unsigned long r15, r14, r13, r12, r11, r10, r9, r8; unsigned long rdi, rsi, rdx, rcx, rbx, rax, rbp; unsigned long error, rip, cs, rflags, rsp, ss; }; diff --git a/arch/x86/kernel/interrupt.S b/arch/x86/kernel/interrupt.S index d0a0838b..04dbb85f 100644 --- a/arch/x86/kernel/interrupt.S +++ b/arch/x86/kernel/interrupt.S 
@@ -36,9 +36,15 @@ pushq %r9; \ pushq %r10; \ pushq %r11; \ + pushq %r12; \ + pushq %r13; \ + pushq %r14; \ pushq %r15; #define POP_ALL_REGS \ popq %r15; \ + popq %r14; \ + popq %r13; \ + popq %r12; \ popq %r11; \ popq %r10; \ popq %r9; \ @@ -69,7 +75,7 @@ vector=vector+1 common_interrupt: PUSH_ALL_REGS - movq 96(%rsp), %rdi + movq 120(%rsp), %rdi movq %rsp, %rsi call handle_interrupt /* Enter C code */ POP_ALL_REGS @@ -85,7 +91,7 @@ page_fault: cld PUSH_ALL_REGS movq %cr2, %rdi - movq 96(%rsp),%rsi + movq 120(%rsp),%rsi movq %rsp,%rdx movq __page_fault_handler_address(%rip), %rax andq %rax, %rax @@ -122,13 +128,13 @@ x86_syscall: movq %gs:24, %rcx movq %rcx, 32(%rsp) PUSH_ALL_REGS - movq 80(%rsp), %rdi + movq 104(%rsp), %rdi movw %ss, %ax movw %ax, %ds movq %rsp, %rsi callq *__x86_syscall_handler(%rip) 1: - movq %rax, 80(%rsp) + movq %rax, 104(%rsp) POP_ALL_REGS #ifdef USE_SYSRET movq 8(%rsp), %rcx From a5b36e2b5154d918c3833d5ef2b09196eab55d35 Mon Sep 17 00:00:00 2001 From: "Balazs Gerofi bgerofi@riken.jp" Date: Thu, 17 Jul 2014 12:28:57 +0900 Subject: [PATCH 23/23] dkprintf() support in kernel/listeners.c --- kernel/listeners.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/listeners.c b/kernel/listeners.c index e99e9ed9..f30ab889 100644 --- a/kernel/listeners.c +++ b/kernel/listeners.c @@ -18,6 +18,16 @@ #include #include +//#define DEBUG_LISTENERS + +#ifdef DEBUG_LISTENERS +#define dkprintf(...) kprintf(__VA_ARGS__) +#define ekprintf(...) kprintf(__VA_ARGS__) +#else +#define dkprintf(...) +#define ekprintf(...) kprintf(__VA_ARGS__) +#endif + static unsigned long read_tsc(void) { unsigned int low, high; @@ -103,5 +113,5 @@ static struct ihk_ikc_listen_param test_listen_param = { void mc_ikc_test_init(void) { ihk_ikc_listen_port(NULL, &test_listen_param); - kprintf("Listener registered port %d\n", 500); + dkprintf("Listener registered port %d\n", 500); }
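
Editor's note (not part of the patch series): the affinity and migration changes in PATCH 18-21 can be exercised from userspace with a small test. The sketch below is an assumption-laden illustration only: the file name affinity_migrate_test.c is hypothetical, CPU 1 is assumed to exist and to be included in the LWK CPU set, syscall number 700 is assumed to return the current logical CPU id as added by SYSCALL_HANDLED(700, get_cpu_id), and the thread id is passed explicitly because the sched_setaffinity handler shown above looks the target up by tid in the run queues rather than treating 0 as "current".

/* affinity_migrate_test.c -- minimal sketch, assuming the patched kernel above */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_get_cpu_id 700	/* SYSCALL_HANDLED(700, get_cpu_id) in PATCH 21 */

int main(void)
{
	cpu_set_t set;
	int target = 1;		/* assumed to be a valid, online CPU */
	pid_t tid = (pid_t)syscall(SYS_gettid);
	long before, after;

	before = syscall(__NR_get_cpu_id);

	/* Restrict this thread to CPU 1 only; when the current CPU drops out
	 * of the mask, the handler calls sched_request_migrate() and the
	 * thread is moved by do_migrate() on the target CPU. */
	CPU_ZERO(&set);
	CPU_SET(target, &set);
	if (sched_setaffinity(tid, sizeof(set), &set) != 0) {
		perror("sched_setaffinity");
		return EXIT_FAILURE;
	}

	after = syscall(__NR_get_cpu_id);
	printf("cpu before=%ld after=%ld\n", before, after);

	return (after == target) ? EXIT_SUCCESS : EXIT_FAILURE;
}

If migration works, the program should report after=1 regardless of the CPU it started on; an -EINVAL return from sched_setaffinity would indicate the buffer-size check (len must be at least sizeof(cpu_set_t)) rather than a migration failure.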