From 101a0f6e4a887d382c0231578b5534d949895141 Mon Sep 17 00:00:00 2001
From: "Balazs Gerofi bgerofi@riken.jp"
Date: Tue, 22 Jul 2014 12:24:07 +0900
Subject: [PATCH] remote TLB invalidation code for multi-threaded applications (e.g., during munmap())

Clearing page-table entries now invalidates the TLB on every other CPU
that runs a thread of the same address space:

- struct process_vm records the CPUs using it (cpu_set, guarded by
  cpu_set_lock); create_process() and clone() set the bit,
  destroy_process() clears it.
- clear_range_l1()/clear_range_l2() call remote_flush_tlb_cpumask()
  before returning success, so the vm is passed through
  ihk_mc_pt_{clear,free,set}_range() and move_pte_range().
- remote_flush_tlb_cpumask() picks one of IHK_TLB_FLUSH_IRQ_VECTOR_SIZE
  flush entries (by page number, or by TSC when addr == 0, which denotes
  a full flush), interrupts every other CPU in the set on the matching
  vector and spins until the entry's pending counter is back to zero.
- tlb_flush_handler() flushes first and decrements pending last. The
  initiator unlocks the flush entry as soon as pending reaches zero, so
  acknowledging before the flush would let it return while this CPU
  still holds the stale translation, and would let the next initiator
  overwrite flush_entry->addr before the handler has read it.
- bitops.h gains ffs/__ffs/ffz, set_bit/clear_bit and the find_*_bit
  helpers; BITS_PER_LONG moves from hash.h to ihk/types.h.
---
 arch/x86/kernel/cpu.c               |  12 +-
 arch/x86/kernel/include/bitops.h    | 213 ++++++++++++++++++++++++++++
 arch/x86/kernel/include/ihk/types.h |   4 +
 arch/x86/kernel/memory.c            |  49 ++++---
 kernel/include/hash.h               |   2 -
 kernel/include/process.h            |   6 +
 kernel/mem.c                        | 104 ++++++++++++++
 kernel/process.c                    |  39 ++++-
 kernel/syscall.c                    |   4 +-
 lib/include/ihk/cpu.h               |   4 +
 lib/include/ihk/mm.h                |  31 +++-
 11 files changed, 435 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c
index 7b289961..d7f2ec55 100644
--- a/arch/x86/kernel/cpu.c
+++ b/arch/x86/kernel/cpu.c
@@ -407,6 +407,7 @@ void setup_x86_ap(void (*next_func)(void))
 void arch_show_interrupt_context(const void *reg);
 void set_signal(int sig, void *regs);
 void check_signal(unsigned long rc, void *regs);
+extern void tlb_flush_handler(int vector);
 
 void handle_interrupt(int vector, struct x86_regs *regs)
 {
@@ -419,7 +420,8 @@ void handle_interrupt(int vector, struct x86_regs *regs)
 
 	if (vector < 0 || vector > 255) {
 		panic("Invalid interrupt vector.");
-	} else if (vector < 32) {
+	}
+	else if (vector < 32) {
 		if (vector == 8 ||
 		    (vector >= 10 && vector <= 15) || vector == 17) {
 			kprintf("Exception %d, rflags: 0x%lX CS: 0x%lX, RIP: 0x%lX\n",
@@ -430,7 +432,13 @@ void handle_interrupt(int vector, struct x86_regs *regs)
 		}
 		arch_show_interrupt_context(regs);
 		panic("Unhandled exception");
-	} else {
+	}
+	else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START &&
+			vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) {
+
+		tlb_flush_handler(vector);
+	}
+	else {
 		list_for_each_entry(h, &handlers[vector - 32], list) {
 			if (h->func) {
 				h->func(h->priv);
diff --git a/arch/x86/kernel/include/bitops.h b/arch/x86/kernel/include/bitops.h
index 39a23058..72aca0ca 100644
--- a/arch/x86/kernel/include/bitops.h
+++ b/arch/x86/kernel/include/bitops.h
@@ -24,4 +24,217 @@ static inline int fls(int x)
 	return r + 1;
 }
 
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+	int r;
+	asm("bsfl %1,%0\n\t"
+	    "jnz 1f\n\t"
+	    "movl $-1,%0\n"
+	    "1:" : "=r" (r) : "rm" (x));
+	return r + 1;
+}
+
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+	asm("bsf %1,%0"
+		: "=r" (word)
+		: "rm" (word));
+	return word;
+}
+
+/**
+ * ffz - find first zero bit in word
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+	asm("bsf %1,%0"
+		: "=r" (word)
+		: "r" (~word));
+	return word;
+}
+
+
+#define ADDR (*(volatile long *)addr)
+
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile("lock; btsl %1,%0"
+		     : "+m" (ADDR)
+		     : "Ir" (nr)
+		     : "memory");
+}
+
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile("lock; btrl %1,%0"
+		     : "+m" (ADDR)
+		     : "Ir" (nr)
+		     : "memory");
+}
+
+#define for_each_set_bit(bit, addr, size) \
+	for ((bit) = find_first_bit((addr), (size)); \
+	     (bit) < (size); \
+	     (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/*
+ * Find the next set bit in a memory region.
+ */
+static unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
+
+/*
+ * This implementation of find_{first,next}_zero_bit was stolen from
+ * Linus' asm-alpha/bitops.h.
+ */
+static unsigned long find_next_zero_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp |= ~0UL >> (BITS_PER_LONG - offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (~tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {
+		if (~(tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp |= ~0UL << size;
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + ffz(tmp);
+}
+
+/*
+ * Find the first set bit in a memory region.
+ */
+static unsigned long find_first_bit(const unsigned long *addr,
+		unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found:
+	return result + __ffs(tmp);
+}
+
+/*
+ * Find the first cleared bit in a memory region.
+ */
+static unsigned long find_first_zero_bit(const unsigned long *addr,
+		unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if (~(tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) | (~0UL << size);
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found:
+	return result + ffz(tmp);
+}
+
 #endif
diff --git a/arch/x86/kernel/include/ihk/types.h b/arch/x86/kernel/include/ihk/types.h
index 9f000a96..2d44b196 100644
--- a/arch/x86/kernel/include/ihk/types.h
+++ b/arch/x86/kernel/include/ihk/types.h
@@ -31,5 +31,9 @@ typedef int64_t off_t;
 
 #define NULL ((void *)0)
 
+#define BITS_PER_LONG_SHIFT 6
+#define BITS_PER_LONG (1 << BITS_PER_LONG_SHIFT)
+
+
 #endif
 
diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c
index ac5cb12a..ffc4f35a 100644
--- a/arch/x86/kernel/memory.c
+++ b/arch/x86/kernel/memory.c
@@ -1005,6 +1005,7 @@ struct clear_range_args {
 	int free_physical;
 	uint8_t padding[4];
 	struct memobj *memobj;
+	struct process_vm *vm;
 };
 
 static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
@@ -1032,6 +1033,8 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
 			ihk_mc_free_pages(phys_to_virt(phys), 1);
 		}
 	}
+
+	remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id());
 
 	return 0;
 }
@@ -1079,6 +1082,8 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
 		}
 	}
 
+	remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id());
+
 	return 0;
 }
 
@@ -1122,8 +1127,9 @@ static int clear_range_l4(void *args0, pte_t *ptep, uint64_t base,
 	return walk_pte_l3(pt, base, start, end, &clear_range_l3, args0);
 }
 
-static int clear_range(struct page_table *pt, uintptr_t start, uintptr_t end,
-		int free_physical, struct memobj *memobj)
+static int clear_range(struct page_table *pt, struct process_vm *vm,
+		uintptr_t start, uintptr_t end, int free_physical,
+		struct memobj *memobj)
 {
 	int error;
 	struct clear_range_args args;
@@ -1137,22 +1143,25 @@ static int clear_range(struct page_table *pt, uintptr_t start, uintptr_t end,
 
 	args.free_physical = free_physical;
 	args.memobj = memobj;
+	args.vm = vm;
 
 	error = walk_pte_l4(pt, 0, start, end, &clear_range_l4, &args);
 	return error;
 }
 
-int ihk_mc_pt_clear_range(page_table_t pt, void *start, void *end)
+int ihk_mc_pt_clear_range(page_table_t pt, struct process_vm *vm,
+		void *start, void *end)
 {
 #define KEEP_PHYSICAL 0
-	return clear_range(pt, (uintptr_t)start, (uintptr_t)end,
+	return clear_range(pt, vm, (uintptr_t)start, (uintptr_t)end,
 			KEEP_PHYSICAL, NULL);
 }
 
-int ihk_mc_pt_free_range(page_table_t pt, void *start, void *end, struct memobj *memobj)
+int ihk_mc_pt_free_range(page_table_t pt, struct process_vm *vm,
+		void *start, void *end, struct memobj *memobj)
 {
 #define FREE_PHYSICAL 1
-	return clear_range(pt, (uintptr_t)start, (uintptr_t)end,
+	return clear_range(pt, vm, (uintptr_t)start, (uintptr_t)end,
 			FREE_PHYSICAL, memobj);
 }
 
@@ -1474,6 +1483,7 @@ struct set_range_args {
 	enum ihk_mc_pt_attribute attr;
 	int padding;
 	uintptr_t diff;
+	struct process_vm *vm;
 };
 
 int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
@@ -1489,7 +1499,7 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
 		error = -EBUSY;
 		ekprintf("set_range_l1(%lx,%lx,%lx):page exists. %d %lx\n",
 				base, start, end, error, *ptep);
-		(void)clear_range(args->pt, start, base, KEEP_PHYSICAL, NULL);
+		(void)clear_range(args->pt, args->vm, start, base, KEEP_PHYSICAL, NULL);
 		goto out;
 	}
 
@@ -1536,7 +1546,7 @@ int set_range_l2(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
 			ekprintf("set_range_l2(%lx,%lx,%lx):"
 					"__alloc_new_pt failed. %d %lx\n",
 					base, start, end, error, *ptep);
-			(void)clear_range(args->pt, start, base,
+			(void)clear_range(args->pt, args->vm, start, base,
 					KEEP_PHYSICAL, NULL);
 			goto out;
 		}
@@ -1548,7 +1558,7 @@ int set_range_l2(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
 		ekprintf("set_range_l2(%lx,%lx,%lx):"
 				"page exists. %d %lx\n",
 				base, start, end, error, *ptep);
-		(void)clear_range(args->pt, start, base, KEEP_PHYSICAL, NULL);
+		(void)clear_range(args->pt, args->vm, start, base, KEEP_PHYSICAL, NULL);
 		goto out;
 	}
 	else {
@@ -1604,7 +1614,7 @@ int set_range_l3(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
 			ekprintf("set_range_l3(%lx,%lx,%lx):"
 					"__alloc_new_pt failed. %d %lx\n",
 					base, start, end, error, *ptep);
-			(void)clear_range(args->pt, start, base,
+			(void)clear_range(args->pt, args->vm, start, base,
 					KEEP_PHYSICAL, NULL);
 			goto out;
 		}
@@ -1615,7 +1625,7 @@ int set_range_l3(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
 		ekprintf("set_range_l3(%lx,%lx,%lx):"
 				"page exists. %d %lx\n",
 				base, start, end, error, *ptep);
-		(void)clear_range(args->pt, start, base, KEEP_PHYSICAL, NULL);
+		(void)clear_range(args->pt, args->vm, start, base, KEEP_PHYSICAL, NULL);
 		goto out;
 	}
 	else {
@@ -1653,7 +1663,7 @@ int set_range_l4(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
 			ekprintf("set_range_l4(%lx,%lx,%lx):"
 					"__alloc_new_pt failed. %d %lx\n",
 					base, start, end, error, *ptep);
-			(void)clear_range(args->pt, start, base,
+			(void)clear_range(args->pt, args->vm, start, base,
 					KEEP_PHYSICAL, NULL);
 			goto out;
 		}
@@ -1678,8 +1688,8 @@ out:
 	return error;
 }
 
-int ihk_mc_pt_set_range(page_table_t pt, void *start, void *end,
-		uintptr_t phys, enum ihk_mc_pt_attribute attr)
+int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
+		void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr)
 {
 	int error;
 	struct set_range_args args;
@@ -1691,6 +1701,7 @@ int ihk_mc_pt_set_range(page_table_t pt, void *start, void *end,
 	args.phys = phys;
 	args.attr = attr;
 	args.diff = (uintptr_t)start ^ phys;
+	args.vm = vm;
 
 	error = walk_pte_l4(pt, 0, (uintptr_t)start, (uintptr_t)end,
 			&set_range_l4, &args);
@@ -1805,9 +1816,11 @@ enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag, uint64_t faul
 struct move_args {
 	uintptr_t src;
 	uintptr_t dest;
+	struct process_vm *vm;
 };
 
-static int move_one_page(void *arg0, page_table_t pt, pte_t *ptep, void *pgaddr, size_t pgsize)
+static int move_one_page(void *arg0, page_table_t pt, pte_t *ptep,
+		void *pgaddr, size_t pgsize)
 {
 	int error;
 	struct move_args *args = arg0;
@@ -1833,7 +1846,7 @@ static int move_one_page(void *arg0, page_table_t pt, pte_t *ptep, void *pgaddr,
 	phys = apte & PT_PHYSMASK;
 	attr = apte & ~PT_PHYSMASK;
 
-	error = ihk_mc_pt_set_range(pt, (void *)dest,
+	error = ihk_mc_pt_set_range(pt, args->vm, (void *)dest,
 			(void *)(dest + pgsize), phys, attr);
 	if (error) {
 		kprintf("move_one_page(%p,%p,%p %#lx,%p,%#lx):"
@@ -1849,7 +1862,8 @@ out:
 	return error;
 }
 
-int move_pte_range(page_table_t pt, void *src, void *dest, size_t size)
+int move_pte_range(page_table_t pt, struct process_vm *vm,
+		void *src, void *dest, size_t size)
 {
 	int error;
 	struct move_args args;
@@ -1857,6 +1871,7 @@ int move_pte_range(page_table_t pt, void *src, void *dest, size_t size)
 	dkprintf("move_pte_range(%p,%p,%p,%#lx)\n", pt, src, dest, size);
 	args.src = (uintptr_t)src;
 	args.dest = (uintptr_t)dest;
+	args.vm = vm;
 
 	error = visit_pte_range(pt, src, src+size, VPTEF_SKIP_NULL,
 			&move_one_page, &args);
diff --git a/kernel/include/hash.h b/kernel/include/hash.h
index 5ff075df..43e5ac0e 100644
--- a/kernel/include/hash.h
+++ b/kernel/include/hash.h
@@ -14,8 +14,6 @@
  * machines where multiplications are slow.
  */
 
-#define BITS_PER_LONG 64
-
 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
 #define GOLDEN_RATIO_PRIME_32 0x9e370001UL
 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
diff --git a/kernel/include/process.h b/kernel/include/process.h
index 5072e196..adf74ecf 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -205,6 +205,9 @@ struct process_vm {
 	// 2. addition of process page table (allocate_pages, update_process_page_table)
 	// note that physical memory allocator (ihk_mc_alloc_pages, ihk_pagealloc_alloc)
 	// is protected by its own lock (see ihk/manycore/generic/page_alloc.c)
+
+	cpu_set_t cpu_set;
+	ihk_spinlock_t cpu_set_lock;
 };
 
 
@@ -264,4 +267,7 @@ int sched_wakeup_process(struct process *proc, int valid_states);
 void sched_request_migrate(int cpu_id, struct process *proc);
 void check_need_resched(void);
 
+void cpu_set(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock);
+void cpu_clear(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock);
+
 #endif
diff --git a/kernel/mem.c b/kernel/mem.c
index d84e043c..2216ad88 100644
--- a/kernel/mem.c
+++ b/kernel/mem.c
@@ -33,6 +33,8 @@
 #endif
 #include
 #include
+#include
+#include
 
 //#define DEBUG_PRINT_MEM
 
@@ -50,6 +52,8 @@ static struct page *pa_pages;
 
 extern int ihk_mc_pt_print_pte(struct page_table *pt, void *virt);
 
+struct tlb_flush_entry tlb_flush_vector[IHK_TLB_FLUSH_IRQ_VECTOR_SIZE];
+
 static void reserve_pages(unsigned long start, unsigned long end, int type)
 {
 	if (start < pa_start) {
@@ -218,6 +222,106 @@ static void unhandled_page_fault(struct process *proc, void *fault_addr, void *r
 	return;
 }
 
+void remote_flush_tlb_cpumask(struct process_vm *vm,
+		unsigned long addr, int cpu_id)
+{
+	unsigned long cpu;
+	int flush_ind;
+	struct tlb_flush_entry *flush_entry;
+	cpu_set_t _cpu_set;
+
+	if (addr) {
+		flush_ind = (addr >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
+	}
+	/* Zero address denotes full TLB flush */
+	else {
+		/* Random.. */
+		flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
+	}
+
+	flush_entry = &tlb_flush_vector[flush_ind];
+
+	/* Take a copy of the cpu set so that we don't hold the lock
+	 * all the way while interrupting other cores */
+	ihk_mc_spinlock_lock_noirq(&vm->cpu_set_lock);
+	memcpy(&_cpu_set, &vm->cpu_set, sizeof(cpu_set_t));
+	ihk_mc_spinlock_unlock_noirq(&vm->cpu_set_lock);
+
+	dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);
+
+	ihk_mc_spinlock_lock_noirq(&flush_entry->lock);
+
+	flush_entry->vm = vm;
+	flush_entry->addr = addr;
+	ihk_atomic_set(&flush_entry->pending, 0);
+
+	dkprintf("lock aquired, iterating cpu mask.. flush_ind: %d\n", flush_ind);
+
+	/* Loop through CPUs in this address space and interrupt them for
+	 * TLB flush on the specified address */
+	for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
+
+		if (ihk_mc_get_processor_id() == cpu)
+			continue;
+
+		ihk_atomic_inc(&flush_entry->pending);
+		dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %lu\n",
+				flush_ind, addr, cpu);
+
+		ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu)->apic_id,
+				flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START);
+	}
+
+#ifdef DEBUG_IC_TLB
+	{
+		unsigned long tsc;
+		tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
+#endif
+
+		/* Wait for all cores */
+		while (ihk_atomic_read(&flush_entry->pending) != 0) {
+			cpu_pause();
+
+#ifdef DEBUG_IC_TLB
+			if (rdtsc() > tsc) {
+				kprintf("waited 10 secs for remote TLB!! -> panic_all()\n");
+				panic_all_cores("waited 10 secs for remote TLB!!\n");
+			}
+#endif
+		}
+#ifdef DEBUG_IC_TLB
+	}
+#endif
+
+	ihk_mc_spinlock_unlock_noirq(&flush_entry->lock);
+}
+
+void tlb_flush_handler(int vector)
+{
+	int flags = cpu_disable_interrupt_save();
+
+	struct tlb_flush_entry *flush_entry = &tlb_flush_vector[vector -
+		IHK_TLB_FLUSH_IRQ_VECTOR_START];
+
+	dkprintf("flusing TLB for addr: 0x%lX\n", flush_entry->addr);
+
+	if (flush_entry->addr) {
+		flush_tlb_single(flush_entry->addr & PAGE_MASK);
+	}
+	/* Zero address denotes full TLB flush */
+	else {
+		flush_tlb();
+	}
+
+	dkprintf("decreasing pending cnt for %d\n",
+			vector - IHK_TLB_FLUSH_IRQ_VECTOR_START);
+
+	/* Acknowledge last: the initiator reuses the entry once pending is 0 */
+	ihk_atomic_dec(&flush_entry->pending);
+
+	cpu_restore_interrupt(flags);
+}
+
 static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
 {
 	struct process *proc = cpu_local_var(current);
diff --git a/kernel/process.c b/kernel/process.c
index a3af4a4c..78f52d2b 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -116,6 +116,8 @@ static int init_process_vm(struct process *owner, struct process_vm *vm)
 	vm->page_table = pt;
 	hold_process(owner);
 	vm->owner_process = owner;
+	memset(&vm->cpu_set, 0, sizeof(cpu_set_t));
+	ihk_mc_spinlock_init(&vm->cpu_set_lock);
 
 	return 0;
 }
@@ -169,6 +171,9 @@ struct process *create_process(unsigned long user_pc)
 		goto err_free_sigshared;
 	}
 
+	cpu_set(ihk_mc_get_processor_id(), &proc->vm->cpu_set,
+			&proc->vm->cpu_set_lock);
+
 	ihk_mc_spinlock_init(&proc->spin_sleep_lock);
 	proc->spin_sleep = 0;
 
@@ -389,7 +394,7 @@ static int copy_user_ranges(struct process *proc, struct process *org)
 
 			/* Set up new PTE */
 			attr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
-			if (ihk_mc_pt_set_range(proc->vm->page_table, vaddr,
+			if (ihk_mc_pt_set_range(proc->vm->page_table, proc->vm, vaddr,
 			                        vaddr + pgsize, virt_to_phys(pg_vaddr), attr)) {
 				kprintf("ERROR: copy_user_ranges() "
 				        "(%p,%lx-%lx %lx,%lx):"
@@ -634,7 +639,7 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
 		if (range->memobj) {
 			memobj_lock(range->memobj);
 		}
-		error = ihk_mc_pt_free_range(vm->page_table,
+		error = ihk_mc_pt_free_range(vm->page_table, vm,
 				(void *)start, (void *)end,
 				(range->flag & VR_PRIVATE)? NULL: range->memobj);
 		if (range->memobj) {
@@ -650,7 +655,7 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
 	}
 	else {
 		ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
-		error = ihk_mc_pt_clear_range(vm->page_table,
+		error = ihk_mc_pt_clear_range(vm->page_table, vm,
 				(void *)start, (void *)end);
 		ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
 		if (error && (error != -ENOENT)) {
@@ -1227,7 +1232,8 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang
 		}
 	}
 	else {
-		error = ihk_mc_pt_set_range(vm->page_table, pgaddr, pgaddr+pgsize, phys, attr);
+		error = ihk_mc_pt_set_range(vm->page_table, vm, pgaddr, pgaddr + pgsize,
+		                            phys, attr);
 		if (error) {
 			kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):set_range failed. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
 			goto out;
@@ -1366,7 +1372,7 @@ int init_process_stack(struct process *process, struct program_load_desc *pn,
 		return -ENOMEM;
 	}
 	memset(stack, 0, minsz);
-	error = ihk_mc_pt_set_range(process->vm->page_table,
+	error = ihk_mc_pt_set_range(process->vm->page_table, process->vm,
 			(void *)(end-minsz), (void *)end,
 			virt_to_phys(stack),
 			arch_vrflag_to_ptattr(vrflag, PF_POPULATE, NULL));
@@ -1526,7 +1532,8 @@ int remove_process_region(struct process *proc,
 	ihk_mc_spinlock_lock_noirq(&proc->vm->page_table_lock);
 	/* We defer freeing to the time of exit */
 	// XXX: check error
-	ihk_mc_pt_clear_range(proc->vm->page_table, (void *)start, (void *)end);
+	ihk_mc_pt_clear_range(proc->vm->page_table, proc->vm,
+			(void *)start, (void *)end);
 	ihk_mc_spinlock_unlock_noirq(&proc->vm->page_table_lock);
 
 	return 0;
@@ -1650,6 +1657,10 @@ void destroy_process(struct process *proc)
 	struct sig_pending *pending;
 	struct sig_pending *next;
 
+	if (proc->vm) {
+		cpu_clear(proc->cpu_id, &proc->vm->cpu_set, &proc->vm->cpu_set_lock);
+	}
+
 	free_process_memory(proc);
 
 	if(ihk_atomic_dec_and_test(&proc->sighandler->use)){
@@ -1679,6 +1690,22 @@ void release_process(struct process *proc)
 	destroy_process(proc);
 }
 
+void cpu_set(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock)
+{
+	unsigned int flags;
+	flags = ihk_mc_spinlock_lock(lock);
+	CPU_SET(cpu, cpu_set);
+	ihk_mc_spinlock_unlock(lock, flags);
+}
+
+void cpu_clear(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock)
+{
+	unsigned int flags;
+	flags = ihk_mc_spinlock_lock(lock);
+	CPU_CLR(cpu, cpu_set);
+	ihk_mc_spinlock_unlock(lock, flags);
+}
+
 static void idle(void)
 {
 	cpu_local_var(status) = CPU_STATUS_IDLE;
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 90b34d32..67280657 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -1278,6 +1278,8 @@ SYSCALL_DECLARE(clone)
 		return -ENOMEM;
 	}
 
+	cpu_set(cpuid, &new->vm->cpu_set, &new->vm->cpu_set_lock);
+
 	if (clone_flags & CLONE_VM) {
 		new->pid = cpu_local_var(current)->pid;
 
@@ -2592,7 +2594,7 @@ SYSCALL_DECLARE(mremap)
 		if (oldsize > 0) {
 			size = (oldsize < newsize)? oldsize: newsize;
 			ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
-			error = move_pte_range(vm->page_table,
+			error = move_pte_range(vm->page_table, vm,
 					(void *)oldstart, (void *)newstart,
 					size);
 			ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
diff --git a/lib/include/ihk/cpu.h b/lib/include/ihk/cpu.h
index b0b4a936..d52bb7d0 100644
--- a/lib/include/ihk/cpu.h
+++ b/lib/include/ihk/cpu.h
@@ -92,6 +92,10 @@ enum ihk_asr_type {
 	IHK_ASR_X86_GS,
 };
 
+#define IHK_TLB_FLUSH_IRQ_VECTOR_START 68
+#define IHK_TLB_FLUSH_IRQ_VECTOR_SIZE 64
+#define IHK_TLB_FLUSH_IRQ_VECTOR_END (IHK_TLB_FLUSH_IRQ_VECTOR_START + IHK_TLB_FLUSH_IRQ_VECTOR_SIZE)
+
 int ihk_mc_arch_set_special_register(enum ihk_asr_type, unsigned long value);
 int ihk_mc_arch_get_special_register(enum ihk_asr_type, unsigned long *value);
 
diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h
index 8f3e57a0..f8952ae3 100644
--- a/lib/include/ihk/mm.h
+++ b/lib/include/ihk/mm.h
@@ -8,6 +8,7 @@
  */
 /*
  * HISTORY
+ * 2014/07: bgerofi: remote TLB flush handler
  */
 
 #ifndef __HEADER_GENERIC_IHK_MM_H
@@ -15,8 +16,11 @@
 
 #include
 #include
+#include
+#include
 
 struct memobj;
+struct process_vm;
 
 enum ihk_mc_gma_type {
 	IHK_MC_GMA_MAP_START,
@@ -115,16 +119,18 @@ int ihk_mc_pt_change_page(page_table_t pt, void *virt,
                           enum ihk_mc_pt_attribute);
 int ihk_mc_pt_clear_page(page_table_t pt, void *virt);
 int ihk_mc_pt_clear_large_page(page_table_t pt, void *virt);
-int ihk_mc_pt_clear_range(page_table_t pt, void *start, void *end);
-int ihk_mc_pt_free_range(page_table_t pt, void *start, void *end, struct memobj *memobj);
+int ihk_mc_pt_clear_range(page_table_t pt, struct process_vm *vm,
+		void *start, void *end);
+int ihk_mc_pt_free_range(page_table_t pt, struct process_vm *vm,
+		void *start, void *end, struct memobj *memobj);
 int ihk_mc_pt_change_attr_range(page_table_t pt, void *start, void *end,
 		enum ihk_mc_pt_attribute clrattr,
 		enum ihk_mc_pt_attribute setattr);
 int ihk_mc_pt_alloc_range(page_table_t pt, void *start, void *end,
 		enum ihk_mc_pt_attribute attr);
 pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, void **pgbasep, size_t *pgsizep, int *p2alignp);
-int ihk_mc_pt_set_range(page_table_t pt, void *start, void *end,
-		uintptr_t phys, enum ihk_mc_pt_attribute attr);
+int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
+		void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr);
 int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize, uintptr_t phys, enum ihk_mc_pt_attribute attr);
 int ihk_mc_pt_prepare_map(page_table_t pt, void *virt, unsigned long size,
                           enum ihk_mc_pt_prepare_flag);
@@ -133,7 +139,8 @@ typedef int pte_visitor_t(void *arg, page_table_t pt, pte_t *ptep,
 		void *pgaddr, size_t pgsize);
 int visit_pte_range(page_table_t pt, void *start, void *end,
 		enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg);
-int move_pte_range(page_table_t pt, void *src, void *dest, size_t size);
+int move_pte_range(page_table_t pt, struct process_vm *vm,
+		void *src, void *dest, size_t size);
 
 struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag);
 /* XXX: proper use of struct page_table and page_table_t is unknown */
@@ -142,4 +149,18 @@ void ihk_mc_load_page_table(struct page_table *pt);
 int ihk_mc_pt_virt_to_phys(struct page_table *pt,
                            void *virt, unsigned long *phys);
 
+void remote_flush_tlb_cpumask(struct process_vm *vm,
+		unsigned long addr, int cpu_id);
+
+extern void (*__tlb_flush_handler)(int vector);
+
+struct tlb_flush_entry {
+	struct process_vm *vm;
+	unsigned long addr;
+	ihk_atomic_t pending;
+	ihk_spinlock_t lock;
+} __attribute__((aligned(64)));
+
+extern struct tlb_flush_entry tlb_flush_vector[IHK_TLB_FLUSH_IRQ_VECTOR_SIZE];
+
 #endif