/** * \file syscall.c * License details are found in the file LICENSE. * \brief * system call handlers * \author Taku Shimosawa \par * Copyright (C) 2011 - 2012 Taku Shimosawa * \author Balazs Gerofi \par * Copyright (C) 2012 RIKEN AICS * \author Masamichi Takagi \par * Copyright (C) 2012 - 2013 NEC Corporation * \author Min Si \par * Copyright (C) 2012 Min Si * \author Balazs Gerofi \par * Copyright (C) 2013 The University of Tokyo * \author Gou Nakamura \par * Copyright (C) 2013 Hitachi, Ltd. * \author Tomoki Shirasawa \par * Copyright (C) 2013 Hitachi, Ltd. */ /* * HISTORY: */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Headers taken from kitten LWK */ #include #include #define SYSCALL_BY_IKC //#define DEBUG_PRINT_SC #ifdef DEBUG_PRINT_SC #define dkprintf(...) kprintf(__VA_ARGS__) #define ekprintf(...) kprintf(__VA_ARGS__) #else #define dkprintf(...) #define ekprintf(...) kprintf(__VA_ARGS__) #endif //static ihk_atomic_t pid_cnt = IHK_ATOMIC_INIT(1024); /* generate system call handler's prototypes */ #define SYSCALL_HANDLED(number,name) extern long sys_##name(int n, ihk_mc_user_context_t *ctx); #define SYSCALL_DELEGATED(number,name) #include #undef SYSCALL_HANDLED #undef SYSCALL_DELEGATED /* generate syscall_table[] */ static long (*syscall_table[])(int, ihk_mc_user_context_t *) = { #define SYSCALL_HANDLED(number,name) [number] = &sys_##name, #define SYSCALL_DELEGATED(number,name) #include #undef SYSCALL_HANDLED #undef SYSCALL_DELEGATED }; /* generate syscall_name[] */ #define MCKERNEL_UNUSED __attribute__ ((unused)) static char *syscall_name[] MCKERNEL_UNUSED = { #define DECLARATOR(number,name) [number] = #name, #define SYSCALL_HANDLED(number,name) DECLARATOR(number,sys_##name) #define SYSCALL_DELEGATED(number,name) DECLARATOR(number,sys_##name) #include #undef DECLARATOR #undef SYSCALL_HANDLED #undef SYSCALL_DELEGATED }; void check_signal(unsigned long rc, void *regs); void do_signal(long rc, void *regs, struct process *proc, struct sig_pending *pending); extern unsigned long do_kill(int pid, int tid, int sig); int copy_from_user(struct process *, void *, const void *, size_t); int copy_to_user(struct process *, void *, const void *, size_t); void do_setpgid(int, int); int prepare_process_ranges_args_envs(struct process *proc, struct program_load_desc *pn, struct program_load_desc *p, enum ihk_mc_pt_attribute attr, char *args, int args_len, char *envs, int envs_len); #ifdef DCFA_KMOD static void do_mod_exit(int status); #endif static void send_syscall(struct syscall_request *req, int cpu, int pid) { struct ikc_scd_packet packet; struct syscall_response *res; #ifdef USE_DMA unsigned long fin; #endif struct syscall_params *scp; struct ihk_ikc_channel_desc *syscall_channel; int ret; if(req->number == __NR_exit_group || req->number == __NR_gettid || req->number == __NR_kill){ // interrupt syscall extern int num_processors; scp = &get_cpu_local_var(0)->scp2; syscall_channel = get_cpu_local_var(0)->syscall_channel2; /* XXX: is this really going to work if multiple processes * exit/receive signals at the same time?? */ cpu = num_processors; if(req->number == __NR_kill) pid = req->args[0]; if(req->number == __NR_gettid) pid = req->args[1]; } else{ scp = &get_cpu_local_var(cpu)->scp; syscall_channel = get_cpu_local_var(cpu)->syscall_channel; } res = scp->response_va; res->status = 0; req->valid = 0; #ifdef USE_DMA memcpy_async(scp->request_pa, virt_to_phys(req), sizeof(*req), 0, &fin); memcpy_async_wait(&scp->post_fin); scp->post_va->v[0] = scp->post_idx; memcpy_async_wait(&fin); #else memcpy(scp->request_va, req, sizeof(*req)); #endif barrier(); scp->request_va->valid = 1; *(unsigned int *)scp->doorbell_va = cpu + 1; #ifdef SYSCALL_BY_IKC packet.msg = SCD_MSG_SYSCALL_ONESIDE; packet.ref = cpu; packet.pid = pid ? pid : cpu_local_var(current)->pid; packet.arg = scp->request_rpa; dkprintf("send syscall, nr: %d, pid: %d\n", req->number, packet.pid); ret = ihk_ikc_send(syscall_channel, &packet, 0); if (ret < 0) { kprintf("ERROR: sending IKC msg, ret: %d\n", ret); } #endif } ihk_spinlock_t syscall_lock; long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, int cpu, int pid) { struct syscall_response *res; struct syscall_request req2 IHK_DMA_ALIGN; struct syscall_params *scp; int error; long rc; int islock = 0; unsigned long irqstate; dkprintf("SC(%d)[%3d] sending syscall\n", ihk_mc_get_processor_id(), req->number); if(req->number == __NR_exit_group || req->number == __NR_gettid || req->number == __NR_kill){ // interrupt syscall scp = &get_cpu_local_var(0)->scp2; islock = 1; irqstate = ihk_mc_spinlock_lock(&syscall_lock); } else{ scp = &get_cpu_local_var(cpu)->scp; } res = scp->response_va; send_syscall(req, cpu, pid); dkprintf("SC(%d)[%3d] waiting for host.. \n", ihk_mc_get_processor_id(), req->number); #define STATUS_IN_PROGRESS 0 #define STATUS_COMPLETED 1 #define STATUS_PAGE_FAULT 3 while (res->status != STATUS_COMPLETED) { while (res->status == STATUS_IN_PROGRESS) { cpu_pause(); } if (res->status == STATUS_PAGE_FAULT) { dkprintf("STATUS_PAGE_FAULT in syscall, pid: %d\n", cpu_local_var(current)->pid); error = page_fault_process(get_cpu_local_var(cpu)->current, (void *)res->fault_address, res->fault_reason|PF_POPULATE); /* send result */ req2.number = __NR_mmap; #define PAGER_RESUME_PAGE_FAULT 0x0101 req2.args[0] = PAGER_RESUME_PAGE_FAULT; req2.args[1] = error; send_syscall(&req2, cpu, pid); } } dkprintf("SC(%d)[%3d] got host reply: %d \n", ihk_mc_get_processor_id(), req->number, res->ret); rc = res->ret; if(islock){ ihk_mc_spinlock_unlock(&syscall_lock, irqstate); } return rc; } long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx) { SYSCALL_HEADER; dkprintf("syscall_generic_forwarding(%d)\n", n); SYSCALL_ARGS_6(D,D,D,D,D,D); SYSCALL_FOOTER; } #if 0 void sigchld_parent(struct process *parent, int status) { struct process *proc = cpu_local_var(current); int irqstate; struct sig_pending *pending; struct list_head *head; __sigset_t mask; mask = __sigmask(SIGCHLD); head = &parent->sigpending; irqstate = ihk_mc_spinlock_lock(&parent->sigpendinglock); list_for_each_entry(pending, head, list) { if (pending->sigmask.__val[0] == mask) break; } if (&pending->list == head) { pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT); if (!pending) { /* TODO: what to do here?? */ panic("ERROR: not enough memory for signaling parent process!"); } pending->sigmask.__val[0] = mask; pending->info.si_signo = SIGCHLD; pending->info._sifields._sigchld.si_pid = proc->pid; pending->info._sifields._sigchld.si_status = status; list_add_tail(&pending->list, head); proc->sigevent = 1; } /* TODO: There was a SIGCHLD pending */ else { } ihk_mc_spinlock_unlock(&parent->sigpendinglock, irqstate); } #endif /* * From glibc: INLINE_SYSCALL (wait4, 4, pid, stat_loc, options, NULL); */ SYSCALL_DECLARE(wait4) { struct process *proc = cpu_local_var(current); struct fork_tree_node *child, *child_iter; int pid = (int)ihk_mc_syscall_arg0(ctx); int pgid = proc->pgid; int *status = (int *)ihk_mc_syscall_arg1(ctx); int options = (int)ihk_mc_syscall_arg2(ctx); int ret; struct waitq_entry waitpid_wqe; int empty = 1; if (options & ~(WNOHANG | WUNTRACED | WCONTINUED)) { return -EINVAL; } rescan: child = NULL; pid = (int)ihk_mc_syscall_arg0(ctx); ihk_mc_spinlock_lock_noirq(&proc->ftn->lock); list_for_each_entry(child_iter, &proc->ftn->children, siblings_list) { empty = 0; ihk_mc_spinlock_lock_noirq(&child_iter->lock); if ((pid < 0 && -pid == child_iter->pgid) || pid == -1 || (pid == 0 && pgid == child_iter->pgid) || (pid > 0 && pid == child_iter->pid)) { child = child_iter; break; } ihk_mc_spinlock_unlock_noirq(&child_iter->lock); } if (empty || (!child && pid != -1)) { ihk_mc_spinlock_unlock_noirq(&proc->ftn->lock); return -ECHILD; } /* If child is valid we are still holding its ftn->lock */ if (child) { if (child->status == PS_ZOMBIE) { struct syscall_request request IHK_DMA_ALIGN; ihk_mc_spinlock_unlock_noirq(&child->lock); dkprintf("wait: found PS_ZOMBIE process: %d\n", child->pid); list_del(&child->siblings_list); ihk_mc_spinlock_unlock_noirq(&proc->ftn->lock); if (status) { *status = child->exit_status; } pid = child->pid; release_fork_tree_node(child); /* Ask host to clean up exited child */ request.number = __NR_wait4; request.args[0] = pid; request.args[1] = 0; ret = do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0); if (ret != pid) kprintf("WARNING: host waitpid failed?\n"); goto exit; } else if(child->status == PS_STOPPED) { ihk_mc_spinlock_unlock_noirq(&child->lock); ihk_mc_spinlock_unlock_noirq(&proc->ftn->lock); /* exit_status is created in do_signal */ if (status) { *status = child->exit_status; } pid = child->pid; dkprintf("wait4,PS_STOPPED,pid=%d,status=%08x\n", pid, *status); goto exit; } ihk_mc_spinlock_unlock_noirq(&child->lock); } /* Don't sleep if WNOHANG requested */ if (options & WNOHANG) { ihk_mc_spinlock_unlock_noirq(&proc->ftn->lock); *status = 0; pid = 0; goto exit; } /* Sleep */ waitq_init_entry(&waitpid_wqe, proc); waitq_prepare_to_wait(&proc->ftn->waitpid_q, &waitpid_wqe, PS_INTERRUPTIBLE); ihk_mc_spinlock_unlock_noirq(&proc->ftn->lock); schedule(); dkprintf("wait4(): woken up\n"); waitq_finish_wait(&proc->ftn->waitpid_q, &waitpid_wqe); goto rescan; exit: return pid; } void terminate(int rc, int sig, ihk_mc_user_context_t *ctx) { struct syscall_request request IHK_DMA_ALIGN; struct process *proc = cpu_local_var(current); struct fork_tree_node *ftn = proc->ftn; struct fork_tree_node *child, *next; request.number = __NR_exit_group; request.args[0] = ((rc & 0x00ff) << 8) | (sig & 0xff); if (1) { extern void query_free_mem_interrupt_handler(void *); query_free_mem_interrupt_handler(NULL); } #ifdef DCFA_KMOD do_mod_exit(rc); #endif /* XXX: send SIGKILL to all threads in this process */ flush_process_memory(proc); /* temporary hack */ do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0); #define IS_DETACHED_PROCESS(proc) (1) /* should be implemented in the future */ /* Do a "wait" on all children and detach owner process */ ihk_mc_spinlock_lock_noirq(&ftn->lock); list_for_each_entry_safe(child, next, &ftn->children, siblings_list) { list_del(&child->siblings_list); release_fork_tree_node(child); } ftn->owner = NULL; ihk_mc_spinlock_unlock_noirq(&ftn->lock); /* Send SIGCHILD to parent */ if (ftn->parent) { ihk_mc_spinlock_lock_noirq(&ftn->lock); ftn->pid = proc->pid; ftn->exit_status = ((rc & 0x00ff) << 8) | (sig & 0xff); ftn->status = PS_ZOMBIE; ihk_mc_spinlock_unlock_noirq(&ftn->lock); /* Signal parent if still attached */ ihk_mc_spinlock_lock_noirq(&ftn->parent->lock); if (ftn->parent->owner) { do_kill(ftn->parent->owner->pid, -1, SIGCHLD); /* sigchld_parent(ftn->parent->owner, 0); */ } ihk_mc_spinlock_unlock_noirq(&ftn->parent->lock); /* Wake parent (if sleeping in wait4()) */ waitq_wakeup(&ftn->parent->waitpid_q); release_fork_tree_node(ftn->parent); } release_fork_tree_node(ftn); proc->status = PS_EXITED; release_process(proc); schedule(); } void interrupt_syscall(int pid, int cpuid) { ihk_mc_user_context_t ctx; long lerror; ihk_mc_syscall_arg0(&ctx) = pid; ihk_mc_syscall_arg1(&ctx) = cpuid; lerror = syscall_generic_forwarding(__NR_kill, &ctx); if (lerror) { kprintf("clear_host_pte failed. %ld\n", lerror); } return; } SYSCALL_DECLARE(exit_group) { #if 0 SYSCALL_HEADER; #endif terminate((int)ihk_mc_syscall_arg0(ctx), 0, ctx); #if 0 struct process *proc = cpu_local_var(current); #ifdef DCFA_KMOD do_mod_exit((int)ihk_mc_syscall_arg0(ctx)); #endif /* XXX: send SIGKILL to all threads in this process */ do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0); #define IS_DETACHED_PROCESS(proc) (1) /* should be implemented in the future */ proc->status = PS_ZOMBIE; if (IS_DETACHED_PROCESS(proc)) { /* release a reference for wait(2) */ proc->status = PS_EXITED; free_process(proc); } schedule(); #endif return 0; } static void clear_host_pte(uintptr_t addr, size_t len) { ihk_mc_user_context_t ctx; long lerror; ihk_mc_syscall_arg0(&ctx) = addr; ihk_mc_syscall_arg1(&ctx) = len; /* NOTE: 3rd parameter denotes new rpgtable of host process (if not zero) */ ihk_mc_syscall_arg2(&ctx) = 0; lerror = syscall_generic_forwarding(__NR_munmap, &ctx); if (lerror) { kprintf("clear_host_pte failed. %ld\n", lerror); } return; } static int set_host_vma(uintptr_t addr, size_t len, int prot) { ihk_mc_user_context_t ctx; long lerror; ihk_mc_syscall_arg0(&ctx) = addr; ihk_mc_syscall_arg1(&ctx) = len; ihk_mc_syscall_arg2(&ctx) = prot; lerror = syscall_generic_forwarding(__NR_mprotect, &ctx); if (lerror) { kprintf("set_host_vma(%lx,%lx,%x) failed. %ld\n", addr, len, prot, lerror); goto out; } lerror = 0; out: return (int)lerror; } static int do_munmap(void *addr, size_t len) { int error; int ro_freed; begin_free_pages_pending(); error = remove_process_memory_range(cpu_local_var(current), (intptr_t)addr, (intptr_t)addr+len, &ro_freed); // XXX: TLB flush flush_tlb(); if (error || !ro_freed) { clear_host_pte((uintptr_t)addr, len); } else { error = set_host_vma((uintptr_t)addr, len, PROT_READ|PROT_WRITE); if (error) { kprintf("sys_munmap:set_host_vma failed. %d\n", error); /* through */ } } finish_free_pages_pending(); return error; } static int search_free_space(size_t len, intptr_t hint, intptr_t *addrp) { struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; intptr_t addr; int error; struct vm_range *range; dkprintf("search_free_space(%lx,%lx,%p)\n", len, hint, addrp); addr = hint; for (;;) { #ifdef USE_LARGE_PAGES if (len >= LARGE_PAGE_SIZE) { addr = (addr + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK; } #endif /* USE_LARGE_PAGES */ if ((region->user_end <= addr) || ((region->user_end - len) < addr)) { ekprintf("search_free_space(%lx,%lx,%p):" "no space. %lx %lx\n", len, hint, addrp, addr, region->user_end); error = -ENOMEM; goto out; } range = lookup_process_memory_range(proc->vm, addr, addr+len); if (range == NULL) { break; } addr = range->end; } error = 0; *addrp = addr; out: dkprintf("search_free_space(%lx,%lx,%p): %d %lx\n", len, hint, addrp, error, addr); return error; } #ifdef UNDEFINED mmap() {} #endif SYSCALL_DECLARE(mmap) { const int supported_flags = 0 | MAP_SHARED // 01 | MAP_PRIVATE // 02 | MAP_FIXED // 10 | MAP_ANONYMOUS // 20 | MAP_LOCKED // 2000 | MAP_POPULATE // 8000 ; const int ignored_flags = 0 #ifdef USE_NOCACHE_MMAP | MAP_32BIT // 40 #endif /* USE_NOCACHE_MMAP */ | MAP_DENYWRITE // 0800 | MAP_NORESERVE // 4000 | MAP_STACK // 00020000 ; const int error_flags = 0 #ifndef USE_NOCACHE_MMAP | MAP_32BIT // 40 #endif /* ndef USE_NOCACHE_MMAP */ | MAP_GROWSDOWN // 0100 | MAP_EXECUTABLE // 1000 | MAP_NONBLOCK // 00010000 | MAP_HUGETLB // 00040000 ; const intptr_t addr0 = ihk_mc_syscall_arg0(ctx); const size_t len0 = ihk_mc_syscall_arg1(ctx); const int prot = ihk_mc_syscall_arg2(ctx); const int flags = ihk_mc_syscall_arg3(ctx); const int fd = ihk_mc_syscall_arg4(ctx); const off_t off0 = ihk_mc_syscall_arg5(ctx); struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; intptr_t addr; size_t len; off_t off; int error; intptr_t npages; int p2align; void *p = NULL; int vrflags; intptr_t phys; struct memobj *memobj = NULL; int maxprot; int denied; int ro_vma_mapped = 0; struct shmid_ds ads; dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n", ihk_mc_get_processor_id(), addr0, len0, prot, flags, fd, off0); /* check constants for flags */ if (1) { int dup_flags; dup_flags = (supported_flags & ignored_flags); dup_flags |= (ignored_flags & error_flags); dup_flags |= (error_flags & supported_flags); if (dup_flags) { ekprintf("sys_mmap:duplicate flags: %lx\n", dup_flags); ekprintf("s-flags: %08x\n", supported_flags); ekprintf("i-flags: %08x\n", ignored_flags); ekprintf("e-flags: %08x\n", error_flags); panic("sys_mmap:duplicate flags\n"); /* no return */ } } /* check arguments */ #define VALID_DUMMY_ADDR (region->user_start) addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR; len = (len0 + PAGE_SIZE - 1) & PAGE_MASK; if ((addr & (PAGE_SIZE - 1)) || (addr < region->user_start) || (region->user_end <= addr) || (len == 0) || (len > (region->user_end - region->user_start)) || ((region->user_end - len) < addr) || !(flags & (MAP_SHARED | MAP_PRIVATE)) || ((flags & MAP_SHARED) && (flags & MAP_PRIVATE)) || (off0 & (PAGE_SIZE - 1))) { ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n", addr0, len0, prot, flags, fd, off0); error = -EINVAL; goto out2; } /* check not supported requests */ if ((flags & error_flags) || (flags & ~(supported_flags | ignored_flags))) { ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):unknown flags %x\n", addr0, len0, prot, flags, fd, off0, (flags & ~(supported_flags | ignored_flags))); error = -EINVAL; goto out2; } ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); if (flags & MAP_FIXED) { /* clear specified address range */ error = do_munmap((void *)addr, len); if (error) { ekprintf("sys_mmap:do_munmap(%lx,%lx) failed. %d\n", addr, len, error); goto out; } } else { /* choose mapping address */ error = search_free_space(len, region->map_end, &addr); if (error) { ekprintf("sys_mmap:search_free_space(%lx,%lx) failed. %d\n", len, region->map_end, error); goto out; } region->map_end = addr + len; } /* do the map */ vrflags = VR_NONE; vrflags |= PROT_TO_VR_FLAG(prot); vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0; vrflags |= (flags & MAP_LOCKED)? VR_LOCKED: 0; if (flags & MAP_ANONYMOUS) { if (0) { /* dummy */ } #ifdef USE_NOCACHE_MMAP #define X_MAP_NOCACHE MAP_32BIT else if (flags & X_MAP_NOCACHE) { vrflags |= VR_IO_NOCACHE; } #endif else { vrflags |= VR_DEMAND_PAGING; } } else { vrflags |= VR_DEMAND_PAGING; } if (!(prot & PROT_WRITE)) { error = set_host_vma(addr, len, PROT_READ); if (error) { kprintf("sys_mmap:set_host_vma failed. %d\n", error); goto out; } ro_vma_mapped = 1; } phys = 0; off = 0; maxprot = PROT_READ | PROT_WRITE | PROT_EXEC; if (!(flags & MAP_ANONYMOUS)) { off = off0; error = fileobj_create(fd, &memobj, &maxprot); if (error) { ekprintf("sys_mmap:fileobj_create failed. %d\n", error); goto out; } } else if (!(vrflags & VR_DEMAND_PAGING) && ((vrflags & VR_PROT_MASK) != VR_PROT_NONE)) { npages = len >> PAGE_SHIFT; p2align = PAGE_P2ALIGN; #ifdef USE_LARGE_PAGES if ((len >= LARGE_PAGE_SIZE) && ((addr & (LARGE_PAGE_SIZE - 1)) == 0)) { p2align = LARGE_PAGE_P2ALIGN; } #endif /* USE_LARGE_PAGES */ p = ihk_mc_alloc_aligned_pages(npages, p2align, IHK_MC_AP_NOWAIT); if (p == NULL) { ekprintf("sys_mmap:allocate_pages(%d,%d) failed.\n", npages, p2align); error = -ENOMEM; goto out; } phys = virt_to_phys(p); } else if (flags & MAP_SHARED) { memset(&ads, 0, sizeof(ads)); ads.shm_segsz = len; error = shmobj_create(&ads, &memobj); if (error) { ekprintf("sys_mmap:shmobj_create failed. %d\n", error); goto out; } } else { error = zeroobj_create(&memobj); if (error) { ekprintf("sys_mmap:zeroobj_create failed. %d\n", error); goto out; } } if ((flags & MAP_PRIVATE) && (maxprot & PROT_READ)) { maxprot |= PROT_WRITE; } denied = prot & ~maxprot; if (denied) { ekprintf("sys_mmap:denied %x. %x %x\n", denied, prot, maxprot); error = (denied == PROT_EXEC)? -EPERM: -EACCES; goto out; } vrflags |= VRFLAG_PROT_TO_MAXPROT(PROT_TO_VR_FLAG(maxprot)); error = add_process_memory_range(proc, addr, addr+len, phys, vrflags, memobj, off); if (error) { ekprintf("sys_mmap:add_process_memory_range" "(%p,%lx,%lx,%lx,%lx) failed %d\n", proc, addr, addr+len, virt_to_phys(p), vrflags, error); goto out; } error = 0; p = NULL; memobj = NULL; ro_vma_mapped = 0; out: if (ro_vma_mapped) { (void)set_host_vma(addr, len, PROT_READ|PROT_WRITE); } ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); if (!error && (flags & (MAP_POPULATE | MAP_LOCKED))) { error = populate_process_memory(proc, (void *)addr, len); if (error) { ekprintf("sys_mmap:populate_process_memory" "(%p,%p,%lx) failed %d\n", proc, (void *)addr, len, error); /* * In this case, * the mapping established by this call should be unmapped * before mmap() returns with error. * * However, the mapping cannot be unmaped simply, * because the mapping can be modified by other thread * because memory_range_lock has been released. * * For the moment, like a linux-2.6.38-8, * the physical page allocation failure is ignored. */ error = 0; } } out2: if (p) { ihk_mc_free_pages(p, npages); } if (memobj) { memobj_release(memobj); } dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n", ihk_mc_get_processor_id(), addr0, len0, prot, flags, fd, off0, error, addr); return (!error)? addr: error; } #ifdef UNDEFINED munmap() {} #endif SYSCALL_DECLARE(munmap) { const uintptr_t addr = ihk_mc_syscall_arg0(ctx); const size_t len0 = ihk_mc_syscall_arg1(ctx); struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; size_t len; int error; dkprintf("[%d]sys_munmap(%lx,%lx)\n", ihk_mc_get_processor_id(), addr, len0); len = (len0 + PAGE_SIZE - 1) & PAGE_MASK; if ((addr & (PAGE_SIZE - 1)) || (addr < region->user_start) || (region->user_end <= addr) || (len == 0) || (len > (region->user_end - region->user_start)) || ((region->user_end - len) < addr)) { error = -EINVAL; goto out; } ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); error = do_munmap((void *)addr, len); ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); out: dkprintf("[%d]sys_munmap(%lx,%lx): %d\n", ihk_mc_get_processor_id(), addr, len0, error); return error; } #ifdef UNDEFINED mprotect() {} #endif SYSCALL_DECLARE(mprotect) { const intptr_t start = ihk_mc_syscall_arg0(ctx); const size_t len0 = ihk_mc_syscall_arg1(ctx); const int prot = ihk_mc_syscall_arg2(ctx); struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; size_t len; intptr_t end; struct vm_range *first; intptr_t addr; struct vm_range *range; int error; struct vm_range *changed; const unsigned long protflags = PROT_TO_VR_FLAG(prot); unsigned long denied; int ro_changed = 0; dkprintf("[%d]sys_mprotect(%lx,%lx,%x)\n", ihk_mc_get_processor_id(), start, len0, prot); len = (len0 + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; /* check arguments */ if ((start & (PAGE_SIZE - 1)) || (start < region->user_start) || (region->user_end <= start) || (len > (region->user_end - region->user_start) || ((region->user_end - len) < start))) { ekprintf("[%d]sys_mprotect(%lx,%lx,%x): -EINVAL\n", ihk_mc_get_processor_id(), start, len0, prot); return -EINVAL; } if (len == 0) { /* nothing to do */ return 0; } ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); #if 0 /* check contiguous map */ first = NULL; for (addr = start; addr < end; addr = range->end) { if (first == NULL) { range = lookup_process_memory_range(proc->vm, start, start+PAGE_SIZE); first = range; } else { range = next_process_memory_range(proc->vm, range); } if ((range == NULL) || (addr < range->start)) { /* not contiguous */ ekprintf("sys_mprotect(%lx,%lx,%x):not contiguous\n", start, len0, prot); error = -ENOMEM; goto out; } if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) { ekprintf("sys_mprotect(%lx,%lx,%x):cannot change\n", start, len0, prot); error = -EINVAL; goto out; } } #else first = lookup_process_memory_range(proc->vm, start, start+PAGE_SIZE); #endif /* do the mprotect */ changed = NULL; for (addr = start; addr < end; addr = changed->end) { if (changed == NULL) { range = first; } else { range = next_process_memory_range(proc->vm, changed); } if ((range == NULL) || (addr < range->start)) { /* not contiguous */ ekprintf("sys_mprotect(%lx,%lx,%x):not contiguous\n", start, len0, prot); error = -ENOMEM; goto out; } denied = protflags & ~VRFLAG_MAXPROT_TO_PROT(range->flag); if (denied) { ekprintf("sys_mprotect(%lx,%lx,%x):denied %lx. %lx %lx\n", start, len0, prot, denied, protflags, range->flag); error = -EACCES; goto out; } if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) { ekprintf("sys_mprotect(%lx,%lx,%x):cannot change\n", start, len0, prot); error = -ENOMEM; goto out; } if (range->start < addr) { error = split_process_memory_range(proc, range, addr, &range); if (error) { ekprintf("sys_mprotect(%lx,%lx,%x):split failed. %d\n", start, len0, prot, error); goto out; } } if (end < range->end) { error = split_process_memory_range(proc, range, end, NULL); if (error) { ekprintf("sys_mprotect(%lx,%lx,%x):split failed. %d\n", start, len0, prot, error); goto out; } } if ((range->flag ^ protflags) & VR_PROT_WRITE) { ro_changed = 1; } error = change_prot_process_memory_range(proc, range, protflags); if (error) { ekprintf("sys_mprotect(%lx,%lx,%x):change failed. %d\n", start, len0, prot, error); goto out; } if (changed == NULL) { changed = range; } else { error = join_process_memory_range(proc, changed, range); if (error) { ekprintf("sys_mprotect(%lx,%lx,%x):join failed. %d\n", start, len0, prot, error); changed = range; /* through */ } } } error = 0; out: // XXX: TLB flush flush_tlb(); if (ro_changed && !error) { error = set_host_vma(start, len, prot & (PROT_READ|PROT_WRITE)); if (error) { kprintf("sys_mprotect:set_host_vma failed. %d\n", error); /* through */ } } ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); dkprintf("[%d]sys_mprotect(%lx,%lx,%x): %d\n", ihk_mc_get_processor_id(), start, len0, prot, error); return error; } SYSCALL_DECLARE(brk) { unsigned long address = ihk_mc_syscall_arg0(ctx); struct vm_regions *region = &cpu_local_var(current)->vm->region; unsigned long r; unsigned long vrflag; dkprintf("SC(%d)[sys_brk] brk_start=%lx,end=%lx\n", ihk_mc_get_processor_id(), region->brk_start, region->brk_end); /* brk change fail, including glibc trick brk(0) to obtain current brk */ if(address < region->brk_start) { r = region->brk_end; goto out; } /* brk change fail, because we don't shrink memory region */ if(address < region->brk_end) { r = region->brk_end; goto out; } /* try to extend memory region */ vrflag = VR_PROT_READ | VR_PROT_WRITE; vrflag |= VRFLAG_PROT_TO_MAXPROT(vrflag); ihk_mc_spinlock_lock_noirq(&cpu_local_var(current)->vm->memory_range_lock); region->brk_end = extend_process_region(cpu_local_var(current), region->brk_start, region->brk_end, address, vrflag); ihk_mc_spinlock_unlock_noirq(&cpu_local_var(current)->vm->memory_range_lock); dkprintf("SC(%d)[sys_brk] brk_end set to %lx\n", ihk_mc_get_processor_id(), region->brk_end); r = region->brk_end; out: return r; } SYSCALL_DECLARE(getpid) { return cpu_local_var(current)->pid; } void settid(struct process *proc, int mode, int newcpuid, int oldcpuid) { ihk_mc_user_context_t ctx; unsigned long rc; ihk_mc_syscall_arg0(&ctx) = mode; ihk_mc_syscall_arg1(&ctx) = proc->pid; ihk_mc_syscall_arg2(&ctx) = newcpuid; ihk_mc_syscall_arg3(&ctx) = oldcpuid; rc = syscall_generic_forwarding(__NR_gettid, &ctx); proc->tid = rc; } SYSCALL_DECLARE(gettid) { return cpu_local_var(current)->tid; } long do_arch_prctl(unsigned long code, unsigned long address) { int err = 0; enum ihk_asr_type type; switch (code) { case ARCH_SET_FS: case ARCH_GET_FS: type = IHK_ASR_X86_FS; break; case ARCH_GET_GS: type = IHK_ASR_X86_GS; break; case ARCH_SET_GS: return -ENOTSUPP; default: return -EINVAL; } switch (code) { case ARCH_SET_FS: dkprintf("[%d] arch_prctl: ARCH_SET_FS: 0x%lX\n", ihk_mc_get_processor_id(), address); cpu_local_var(current)->thread.tlsblock_base = address; err = ihk_mc_arch_set_special_register(type, address); break; case ARCH_SET_GS: err = ihk_mc_arch_set_special_register(type, address); break; case ARCH_GET_FS: case ARCH_GET_GS: err = ihk_mc_arch_get_special_register(type, (unsigned long*)address); break; default: break; } return err; } SYSCALL_DECLARE(arch_prctl) { return do_arch_prctl(ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx)); } SYSCALL_DECLARE(execve) { long ret; const char *filename = (const char *)ihk_mc_syscall_arg0(ctx); char **argv = (char **)ihk_mc_syscall_arg1(ctx); char **envp = (char **)ihk_mc_syscall_arg2(ctx); char *argv_flat = NULL; int argv_flat_len = 0; char *envp_flat = NULL; int envp_flat_len = 0; struct syscall_request request IHK_DMA_ALIGN; struct program_load_desc *desc; struct process_vm *vm = cpu_local_var(current)->vm; struct vm_range *range; ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); range = lookup_process_memory_range(vm, (unsigned long)filename, (unsigned long)filename+1); if (range == NULL || !(range->flag & VR_PROT_READ)) { ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); kprintf("execve(): ERROR: filename is bad address\n"); return -EFAULT; } ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); desc = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!desc) { kprintf("execve(): ERROR: allocating program descriptor\n"); return -ENOMEM; } memset((void*)desc, 0, PAGE_SIZE); /* Request host to open executable and load ELF section descriptions */ request.number = __NR_execve; request.args[0] = 1; /* 1st phase - get ELF desc */ request.args[1] = (unsigned long)filename; request.args[2] = virt_to_phys(desc); ret = do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0); if (ret != 0) { kprintf("execve(): ERROR: host failed to load elf header, errno: %d\n", ret); return -ret; } dkprintf("execve(): ELF desc received, num sections: %d\n", desc->num_sections); if (desc->shell_path[0]) { dkprintf("execve(): shell interpreter: %s\n", desc->shell_path); } /* Flatten argv and envp into kernel-space buffers */ argv_flat_len = flatten_strings(-1, (desc->shell_path[0] ? desc->shell_path : NULL), argv, &argv_flat); if (argv_flat_len == 0) { kprintf("ERROR: no argv for executable: %s?\n", filename); return -EINVAL; } envp_flat_len = flatten_strings(-1, NULL, envp, &envp_flat); if (envp_flat_len == 0) { kprintf("ERROR: no envp for executable: %s?\n", filename); return -EINVAL; } /* Unmap all memory areas of the process, userspace will be gone */ free_process_memory_ranges(cpu_local_var(current)); ihk_mc_init_user_process(&cpu_local_var(current)->ctx, &cpu_local_var(current)->uctx, ((char *)cpu_local_var(current)) + KERNEL_STACK_NR_PAGES * PAGE_SIZE, desc->entry, 0); /* Create virtual memory ranges and update args/envs */ if (prepare_process_ranges_args_envs(cpu_local_var(current), desc, desc, PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_FOR_USER, argv_flat, argv_flat_len, envp_flat, envp_flat_len) != 0) { kprintf("execve(): PANIC: preparing ranges, args, envs, stack\n"); panic(""); } /* Clear host user space PTEs */ request.number = __NR_munmap; request.args[0] = cpu_local_var(current)->vm->region.user_start; request.args[1] = cpu_local_var(current)->vm->region.user_end - cpu_local_var(current)->vm->region.user_start; dkprintf("execve(): requesting host PTE clear\n"); if (do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0)) { kprintf("execve(): ERROR: clearing PTEs in host process\n"); panic(""); } /* Request host to transfer ELF image */ request.number = __NR_execve; request.args[0] = 2; /* 2nd phase - transfer ELF image */ request.args[1] = virt_to_phys(desc); request.args[2] = sizeof(struct program_load_desc) + sizeof(struct program_image_section) * desc->num_sections; ret = do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0); if (ret != 0) { kprintf("execve(): PANIC: host failed to load elf image\n"); panic(""); } /* Switch to new execution context */ dkprintf("execve(): switching to new process\n"); ihk_mc_switch_context(NULL, &cpu_local_var(current)->ctx, cpu_local_var(current)); /* Never reach here */ return 0; } unsigned long do_fork(int clone_flags, unsigned long newsp, unsigned long parent_tidptr, unsigned long child_tidptr, unsigned long tlsblock_base, unsigned long curpc, unsigned long cursp) { int cpuid; struct process *new; ihk_mc_user_context_t ctx1; struct syscall_request request1 IHK_DMA_ALIGN; dkprintf("do_fork,flags=%08x,newsp=%lx,ptidptr=%lx,ctidptr=%lx,tls=%lx,curpc=%lx,cursp=%lx", clone_flags, newsp, parent_tidptr, child_tidptr, tlsblock_base, curpc, cursp); dkprintf("do_fork(): stack_pointr passed in: 0x%lX, stack pointer of caller: 0x%lx\n", newsp, cursp); cpuid = obtain_clone_cpuid(); if (cpuid == -1) { return -EAGAIN; } new = clone_process(cpu_local_var(current), curpc, newsp ? newsp : cursp, clone_flags); if (!new) { return -ENOMEM; } new->pgid = cpu_local_var(current)->pgid; cpu_set(cpuid, &new->vm->cpu_set, &new->vm->cpu_set_lock); if (clone_flags & CLONE_VM) { new->pid = cpu_local_var(current)->pid; settid(new, 1, cpuid, -1); } /* fork() a new process on the host */ else { request1.number = __NR_fork; new->pid = do_syscall(&request1, &ctx1, ihk_mc_get_processor_id(), 0); if (new->pid == -1) { kprintf("ERROR: forking host process\n"); /* TODO: clean-up new */ return -EFAULT; } /* In a single threaded process TID equals to PID */ settid(new, 0, cpuid, -1); dkprintf("fork(): new pid: %d\n", new->pid); /* clear user space PTEs and set new rpgtable so that consequent * page faults will look up the right mappings */ request1.number = __NR_munmap; request1.args[0] = new->vm->region.user_start; request1.args[1] = new->vm->region.user_end - new->vm->region.user_start; /* 3rd parameter denotes new rpgtable of host process */ request1.args[2] = virt_to_phys(new->vm->page_table); request1.args[3] = new->pid; dkprintf("fork(): requesting PTE clear and rpgtable (0x%lx) update\n", request1.args[2]); if (do_syscall(&request1, &ctx1, ihk_mc_get_processor_id(), new->pid)) { kprintf("ERROR: clearing PTEs in host process\n"); } } new->ftn->pid = new->pid; new->ftn->pgid = new->pgid; if (clone_flags & CLONE_PARENT_SETTID) { dkprintf("clone_flags & CLONE_PARENT_SETTID: 0x%lX\n", parent_tidptr); *(int*)parent_tidptr = new->pid; } if (clone_flags & CLONE_CHILD_CLEARTID) { dkprintf("clone_flags & CLONE_CHILD_CLEARTID: 0x%lX\n", child_tidptr); new->thread.clear_child_tid = (int*)child_tidptr; } if (clone_flags & CLONE_CHILD_SETTID) { unsigned long phys; dkprintf("clone_flags & CLONE_CHILD_SETTID: 0x%lX\n", child_tidptr); if (ihk_mc_pt_virt_to_phys(new->vm->page_table, (void *)child_tidptr, &phys)) { kprintf("ERROR: looking up physical addr for child process\n"); return -EFAULT; } *((int*)phys_to_virt(phys)) = new->tid; } if (clone_flags & CLONE_SETTLS) { dkprintf("clone_flags & CLONE_SETTLS: 0x%lX\n", tlsblock_base); new->thread.tlsblock_base = tlsblock_base; } else { new->thread.tlsblock_base = cpu_local_var(current)->thread.tlsblock_base; } ihk_mc_syscall_ret(new->uctx) = 0; dkprintf("clone: kicking scheduler!,cpuid=%d pid=%d tid=%d\n", cpuid, new->pid, new->tid); runq_add_proc(new, cpuid); return new->tid; } SYSCALL_DECLARE(vfork) { return do_fork(SIGCHLD, 0, 0, 0, 0, ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx)); } SYSCALL_DECLARE(clone) { return do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx), ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx)); } SYSCALL_DECLARE(set_tid_address) { cpu_local_var(current)->thread.clear_child_tid = (int*)ihk_mc_syscall_arg0(ctx); return cpu_local_var(current)->pid; } SYSCALL_DECLARE(kill) { int pid = ihk_mc_syscall_arg0(ctx); int sig = ihk_mc_syscall_arg1(ctx); return do_kill(pid, -1, sig); } // see linux-2.6.34.13/kernel/signal.c SYSCALL_DECLARE(tgkill) { int tgid = ihk_mc_syscall_arg0(ctx); int tid = ihk_mc_syscall_arg1(ctx); int sig = ihk_mc_syscall_arg2(ctx); if(tid <= 0) return -EINVAL; if(tgid <= 0 && tgid != -1) return -EINVAL; return do_kill(tgid, tid, sig); } SYSCALL_DECLARE(setpgid) { int pid = ihk_mc_syscall_arg0(ctx); int pgid = ihk_mc_syscall_arg1(ctx); long rc; rc = syscall_generic_forwarding(__NR_setpgid, ctx); if(rc == 0){ do_setpgid(pid, pgid); } return rc; } SYSCALL_DECLARE(set_robust_list) { return -ENOSYS; } int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct process *proc = cpu_local_var(current); struct k_sigaction *k; int irqstate; irqstate = ihk_mc_spinlock_lock(&proc->sighandler->lock); k = proc->sighandler->action + sig - 1; if(oact) memcpy(oact, k, sizeof(struct k_sigaction)); if(act) memcpy(k, act, sizeof(struct k_sigaction)); ihk_mc_spinlock_unlock(&proc->sighandler->lock, irqstate); return 0; } SYSCALL_DECLARE(rt_sigprocmask) { int how = ihk_mc_syscall_arg0(ctx); const sigset_t *set = (const sigset_t *)ihk_mc_syscall_arg1(ctx); sigset_t *oldset = (sigset_t *)ihk_mc_syscall_arg2(ctx); size_t sigsetsize = (size_t)ihk_mc_syscall_arg3(ctx); struct process *proc = cpu_local_var(current); int flag; __sigset_t wsig; if(sigsetsize != sizeof(sigset_t)) return -EINVAL; if(set && how != SIG_BLOCK && how != SIG_UNBLOCK && how != SIG_SETMASK) return -EINVAL; flag = ihk_mc_spinlock_lock(&proc->sighandler->lock); if(oldset){ wsig = proc->sigmask.__val[0]; if(copy_to_user(proc, oldset->__val, &wsig, sizeof wsig)) goto fault; } if(set){ if(copy_from_user(proc, &wsig, set->__val, sizeof wsig)) goto fault; switch(how){ case SIG_BLOCK: proc->sigmask.__val[0] |= wsig; break; case SIG_UNBLOCK: proc->sigmask.__val[0] &= ~wsig; break; case SIG_SETMASK: proc->sigmask.__val[0] = wsig; break; } } ihk_mc_spinlock_unlock(&proc->sighandler->lock, flag); return 0; fault: ihk_mc_spinlock_unlock(&proc->sighandler->lock, flag); return -EFAULT; } SYSCALL_DECLARE(rt_sigpending) { int flag; struct sig_pending *pending; struct list_head *head; ihk_spinlock_t *lock; __sigset_t w = 0; struct process *proc = cpu_local_var(current); sigset_t *set = (sigset_t *)ihk_mc_syscall_arg0(ctx); size_t sigsetsize = (size_t)ihk_mc_syscall_arg1(ctx); if (sigsetsize > sizeof(sigset_t)) return -EINVAL; lock = &proc->sigshared->lock; head = &proc->sigshared->sigpending; flag = ihk_mc_spinlock_lock(lock); list_for_each_entry(pending, head, list){ w |= pending->sigmask.__val[0]; } ihk_mc_spinlock_unlock(lock, flag); lock = &proc->sigpendinglock; head = &proc->sigpending; flag = ihk_mc_spinlock_lock(lock); list_for_each_entry(pending, head, list){ w |= pending->sigmask.__val[0]; } ihk_mc_spinlock_unlock(lock, flag); if(copy_to_user(proc, set->__val, &w, sizeof w)) return -EFAULT; return 0; } SYSCALL_DECLARE(rt_sigtimedwait) { struct process *proc = cpu_local_var(current); const sigset_t *set = (const sigset_t *)ihk_mc_syscall_arg0(ctx); siginfo_t *info = (siginfo_t *)ihk_mc_syscall_arg1(ctx); void *timeout = (void *)ihk_mc_syscall_arg2(ctx); size_t sigsetsize = (size_t)ihk_mc_syscall_arg3(ctx); siginfo_t winfo; __sigset_t wset; long wtimeout[2]; if (sigsetsize > sizeof(sigset_t)) return -EINVAL; memset(&winfo, '\0', sizeof winfo); if(copy_from_user(proc, &wset, set, sizeof wset)) return -EFAULT; if(copy_from_user(proc, wtimeout, timeout, sizeof wtimeout)) return -EFAULT; if(copy_to_user(proc, info, &winfo, sizeof winfo)) return -EFAULT; return -EOPNOTSUPP; } SYSCALL_DECLARE(rt_sigqueueinfo) { struct process *proc = cpu_local_var(current); int pid = (int)ihk_mc_syscall_arg0(ctx); int sig = (int)ihk_mc_syscall_arg1(ctx); siginfo_t *info = (siginfo_t *)ihk_mc_syscall_arg2(ctx); siginfo_t winfo; if (0) kprintf("sys_rt_sigqueueinfo(%d,%d,%p)\n", pid, sig, info); if(copy_from_user(proc, &winfo, info, sizeof winfo)) return -EFAULT; return -EOPNOTSUPP; } static int do_sigsuspend(struct process *proc, const sigset_t *set) { __sigset_t wset; __sigset_t bset; int flag; struct sig_pending *pending; struct list_head *head; ihk_spinlock_t *lock; wset = set->__val[0]; wset &= ~__sigmask(SIGKILL); wset &= ~__sigmask(SIGSTOP); bset = proc->sigmask.__val[0]; proc->sigmask.__val[0] = wset; for(;;){ while(proc->sigevent == 0); proc->sigevent = 0; lock = &proc->sigshared->lock; head = &proc->sigshared->sigpending; flag = ihk_mc_spinlock_lock(lock); list_for_each_entry(pending, head, list){ if(!(pending->sigmask.__val[0] & wset)) break; } if(&pending->list == head){ ihk_mc_spinlock_unlock(lock, flag); lock = &proc->sigpendinglock; head = &proc->sigpending; flag = ihk_mc_spinlock_lock(lock); list_for_each_entry(pending, head, list){ if(!(pending->sigmask.__val[0] & wset)) break; } } if(&pending->list == head){ ihk_mc_spinlock_unlock(lock, flag); continue; } list_del(&pending->list); ihk_mc_spinlock_unlock(lock, flag); proc->sigmask.__val[0] = bset; do_signal(-EINTR, NULL, proc, pending); break; } return -EINTR; } SYSCALL_DECLARE(pause) { struct process *proc = cpu_local_var(current); return do_sigsuspend(proc, &proc->sigmask); } SYSCALL_DECLARE(rt_sigsuspend) { struct process *proc = cpu_local_var(current); const sigset_t *set = (const sigset_t *)ihk_mc_syscall_arg0(ctx); size_t sigsetsize = (size_t)ihk_mc_syscall_arg1(ctx); sigset_t wset; if (sigsetsize > sizeof(sigset_t)) return -EINVAL; if(copy_from_user(proc, &wset, set, sizeof wset)) return -EFAULT; return do_sigsuspend(proc, &wset); } SYSCALL_DECLARE(sigaltstack) { struct process *proc = cpu_local_var(current); const stack_t *ss = (const stack_t *)ihk_mc_syscall_arg0(ctx); stack_t *oss = (stack_t *)ihk_mc_syscall_arg1(ctx); stack_t wss; if(oss) if(copy_to_user(proc, oss, &proc->sigstack, sizeof wss)) return -EFAULT; if(ss){ if(copy_from_user(proc, &wss, ss, sizeof wss)) return -EFAULT; if(wss.ss_flags != 0 && wss.ss_flags != SS_DISABLE) return -EINVAL; if(wss.ss_flags == SS_DISABLE){ proc->sigstack.ss_sp = NULL; proc->sigstack.ss_flags = SS_DISABLE; proc->sigstack.ss_size = 0; } else{ if(wss.ss_size < MINSIGSTKSZ) return -ENOMEM; memcpy(&proc->sigstack, &wss, sizeof wss); } } return 0; } #ifdef UNDEFINED madvise() {} #endif SYSCALL_DECLARE(madvise) { const uintptr_t start = (uintptr_t)ihk_mc_syscall_arg0(ctx); const size_t len0 = (size_t)ihk_mc_syscall_arg1(ctx); const int advice = (int)ihk_mc_syscall_arg2(ctx); size_t len; uintptr_t end; struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; struct vm_range *first; uintptr_t addr; struct vm_range *range; int error; dkprintf("[%d]sys_madvise(%lx,%lx,%x)\n", ihk_mc_get_processor_id(), start, len0, advice); len = (len0 + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; if ((start & (PAGE_SIZE - 1)) || (len < len0) || (end < start)) { error = -EINVAL; goto out2; } if ((start < region->user_start) || (region->user_end <= start) || (len > (region->user_end - region->user_start)) || ((region->user_end - len) < start)) { error = -ENOMEM; goto out2; } error = 0; switch (advice) { default: case MADV_MERGEABLE: case MADV_UNMERGEABLE: case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_DONTDUMP: case MADV_DODUMP: error = -EINVAL; break; case MADV_NORMAL: case MADV_RANDOM: case MADV_SEQUENTIAL: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTFORK: case MADV_DOFORK: break; case MADV_REMOVE: error = -EACCES; break; case MADV_HWPOISON: case MADV_SOFT_OFFLINE: error = -EPERM; break; } if (error) { goto out2; } if (start == end) { error = 0; goto out2; } ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); /* check contiguous map */ first = NULL; for (addr = start; addr < end; addr = range->end) { if (first == NULL) { range = lookup_process_memory_range(proc->vm, start, start+PAGE_SIZE); first = range; } else { range = next_process_memory_range(proc->vm, range); } if ((range == NULL) || (addr < range->start)) { /* not contiguous */ dkprintf("[%d]sys_madvise(%lx,%lx,%x):not contig " "%lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start, len0, advice, addr, range?range->start:0, range?range->end:0); error = -ENOMEM; goto out; } if (!range->memobj || !memobj_has_pager(range->memobj)) { dkprintf("[%d]sys_madvise(%lx,%lx,%x):has not pager" "[%lx-%lx) %lx\n", ihk_mc_get_processor_id(), start, len0, advice, range->start, range->end, range->memobj); error = -EBADF; goto out; } if ((advice == MADV_DONTNEED) && (range->flag & VR_LOCKED)) { dkprintf("[%d]sys_madvise(%lx,%lx,%x):locked" "[%lx-%lx) %lx\n", ihk_mc_get_processor_id(), start, len0, advice, range->start, range->end, range->flag); error = -EINVAL; goto out; } } error = 0; out: ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); out2: dkprintf("[%d]sys_madvise(%lx,%lx,%x): %d\n", ihk_mc_get_processor_id(), start, len0, advice, error); return error; } SYSCALL_DECLARE(futex) { uint64_t timeout = 0; // No timeout uint32_t val2 = 0; uint32_t *uaddr = (uint32_t *)ihk_mc_syscall_arg0(ctx); int op = (int)ihk_mc_syscall_arg1(ctx); uint32_t val = (uint32_t)ihk_mc_syscall_arg2(ctx); struct timespec *utime = (struct timespec*)ihk_mc_syscall_arg3(ctx); uint32_t *uaddr2 = (uint32_t *)ihk_mc_syscall_arg4(ctx); uint32_t val3 = (uint32_t)ihk_mc_syscall_arg5(ctx); /* Mask off the FUTEX_PRIVATE_FLAG, * assume all futexes are address space private */ op = (op & FUTEX_CMD_MASK); dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x\n", op, (op == FUTEX_WAIT) ? "FUTEX_WAIT" : (op == FUTEX_WAIT_BITSET) ? "FUTEX_WAIT_BITSET" : (op == FUTEX_WAKE) ? "FUTEX_WAKE" : (op == FUTEX_WAKE_OP) ? "FUTEX_WAKE_OP" : (op == FUTEX_WAKE_BITSET) ? "FUTEX_WAKE_BITSET" : (op == FUTEX_CMP_REQUEUE) ? "FUTEX_CMP_REQUEUE" : (op == FUTEX_REQUEUE) ? "FUTEX_REQUEUE (NOT IMPL!)" : "unknown", (unsigned long)uaddr, op, val, utime, uaddr2, val3, *uaddr); if (utime && (op == FUTEX_WAIT_BITSET || op == FUTEX_WAIT)) { struct syscall_request request IHK_DMA_ALIGN; struct timeval tv_now; request.number = n; unsigned long __phys; dkprintf("futex,utime and FUTEX_WAIT_*, uaddr=%lx, []=%x\n", (unsigned long)uaddr, *uaddr); if (ihk_mc_pt_virt_to_phys(cpu_local_var(current)->vm->page_table, (void *)&tv_now, &__phys)) { return -EFAULT; } request.args[0] = __phys; int r = do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0); if (r < 0) { return -EFAULT; } dkprintf("futex, FUTEX_WAIT_*, arg3 != NULL, pc=%lx\n", (unsigned long)ihk_mc_syscall_pc(ctx)); dkprintf("now->tv_sec=%016ld,tv_nsec=%016ld\n", tv_now.tv_sec, tv_now.tv_usec * 1000); dkprintf("utime->tv_sec=%016ld,tv_nsec=%016ld\n", utime->tv_sec, utime->tv_nsec); long nsec_now = ((long)tv_now.tv_sec * 1000000000ULL) + tv_now.tv_usec * 1000; long nsec_timeout = ((long)utime->tv_sec * 1000000000ULL) + utime->tv_nsec * 1; long diff_nsec = nsec_timeout - nsec_now; timeout = (diff_nsec / 1000) * 1100; // (usec * 1.1GHz) dkprintf("futex timeout: %lu\n", timeout); } /* Requeue parameter in 'utime' if op == FUTEX_CMP_REQUEUE. * number of waiters to wake in 'utime' if op == FUTEX_WAKE_OP. */ if (op == FUTEX_CMP_REQUEUE || op == FUTEX_WAKE_OP) val2 = (uint32_t) (unsigned long) ihk_mc_syscall_arg3(ctx); return futex(uaddr, op, val, timeout, uaddr2, val2, val3); } SYSCALL_DECLARE(exit) { struct process *proc = cpu_local_var(current); #ifdef DCFA_KMOD do_mod_exit((int)ihk_mc_syscall_arg0(ctx)); #endif /* XXX: for if all threads issued the exit(2) rather than exit_group(2), * exit(2) also should delegate. */ /* If there is a clear_child_tid address set, clear it and wake it. * This unblocks any pthread_join() waiters. */ if (proc->thread.clear_child_tid) { dkprintf("exit clear_child!\n"); *proc->thread.clear_child_tid = 0; barrier(); futex((uint32_t *)proc->thread.clear_child_tid, FUTEX_WAKE, 1, 0, NULL, 0, 0); } proc->status = PS_ZOMBIE; release_fork_tree_node(proc->ftn); release_process(proc); schedule(); return 0; } SYSCALL_DECLARE(getrlimit) { int ret; int resource = ihk_mc_syscall_arg0(ctx); struct rlimit *rlm = (struct rlimit *)ihk_mc_syscall_arg1(ctx); struct process *proc = cpu_local_var(current); switch (resource) { case RLIMIT_STACK: dkprintf("[%d] getrlimit() RLIMIT_STACK\n", ihk_mc_get_processor_id()); if(copy_to_user(proc, &rlm->rlim_cur, &proc->rlimit_stack.rlim_cur, sizeof rlm->rlim_cur)) return -EFAULT; if(copy_to_user(proc, &rlm->rlim_max, &proc->rlimit_stack.rlim_max, sizeof rlm->rlim_max)) return -EFAULT; ret = 0; break; case RLIMIT_FSIZE: case RLIMIT_LOCKS: case RLIMIT_NOFILE: /* just forward */ ret = syscall_generic_forwarding(n, ctx); break; default: return -ENOSYS; } return ret; } SYSCALL_DECLARE(ptrace) { const int request = ihk_mc_syscall_arg0(ctx); const long pid = ihk_mc_syscall_arg1(ctx); void * const addr = (void *)ihk_mc_syscall_arg2(ctx); void * const data = (void *)ihk_mc_syscall_arg3(ctx); kprintf("ptrace(%d,%ld,%p,%p): ENOSYS\n", request, pid, addr, data); return -ENOSYS; } #define MIN2(x,y) (x) < (y) ? (x) : (y) SYSCALL_DECLARE(sched_setaffinity) { int tid = (int)ihk_mc_syscall_arg0(ctx); size_t len = (size_t)ihk_mc_syscall_arg1(ctx); cpu_set_t *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); cpu_set_t k_cpu_set, cpu_set; struct process *thread; int cpu_id; unsigned long irqstate; extern int num_processors; if (sizeof(k_cpu_set) > len) { kprintf("%s:%d\n Too small buffer.", __FILE__, __LINE__); return -EINVAL; } len = MIN2(len, sizeof(k_cpu_set)); if (copy_from_user(cpu_local_var(current), &k_cpu_set, u_cpu_set, len)) { kprintf("%s:%d copy_from_user failed.\n", __FILE__, __LINE__); return -EFAULT; } // XXX: We should build something like cpu_available_mask in advance CPU_ZERO(&cpu_set); for (cpu_id = 0; cpu_id < num_processors; cpu_id++) if (CPU_ISSET(cpu_id, &k_cpu_set)) CPU_SET(cpu_id, &cpu_set); if(tid == 0) tid = cpu_local_var(current)->tid; for (cpu_id = 0; cpu_id < num_processors; cpu_id++) { irqstate = ihk_mc_spinlock_lock(&get_cpu_local_var(cpu_id)->runq_lock); list_for_each_entry(thread, &get_cpu_local_var(cpu_id)->runq, sched_list) if (thread->pid && thread->tid == tid) goto found; /* without unlocking runq_lock */ ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu_id)->runq_lock, irqstate); } kprintf("%s:%d Thread not found.\n", __FILE__, __LINE__); return -ESRCH; found: memcpy(&thread->cpu_set, &cpu_set, sizeof(cpu_set)); if (!CPU_ISSET(cpu_id, &thread->cpu_set)) { hold_process(thread); ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu_id)->runq_lock, irqstate); sched_request_migrate(cpu_id, thread); release_process(thread); return 0; } else { ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu_id)->runq_lock, irqstate); return 0; } } // see linux-2.6.34.13/kernel/sched.c SYSCALL_DECLARE(sched_getaffinity) { int tid = (int)ihk_mc_syscall_arg0(ctx); size_t len = (size_t)ihk_mc_syscall_arg1(ctx); cpu_set_t k_cpu_set, *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx); int ret; int found = 0; int i; unsigned long irqstate; extern int num_processors; if (sizeof(k_cpu_set) > len) { kprintf("%s:%d Too small buffer.\n", __FILE__, __LINE__); return -EINVAL; } len = MIN2(len, sizeof(k_cpu_set)); if(tid == 0) tid = cpu_local_var(current)->tid; for (i = 0; i < num_processors && !found; i++) { struct process *thread; irqstate = ihk_mc_spinlock_lock(&get_cpu_local_var(i)->runq_lock); list_for_each_entry(thread, &get_cpu_local_var(i)->runq, sched_list) { if (thread->pid && thread->tid == tid) { found = 1; memcpy(&k_cpu_set, &thread->cpu_set, sizeof(k_cpu_set)); break; } } ihk_mc_spinlock_unlock(&get_cpu_local_var(i)->runq_lock, irqstate); } if (!found) { kprintf("%s:%d Thread not found.\n", __FILE__, __LINE__); return -ESRCH; } ret = copy_to_user(cpu_local_var(current), u_cpu_set, &k_cpu_set, len); kprintf("%s %d %d\n", __FILE__, __LINE__, ret); if (ret < 0) return ret; return len; } SYSCALL_DECLARE(get_cpu_id) { return ihk_mc_get_processor_id(); } SYSCALL_DECLARE(sched_yield) { return -ENOSYS; } #ifdef UNDEFINED mlock() {} #endif SYSCALL_DECLARE(mlock) { const uintptr_t start0 = ihk_mc_syscall_arg0(ctx); const size_t len0 = ihk_mc_syscall_arg1(ctx); struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; uintptr_t start; size_t len; uintptr_t end; struct vm_range *first; uintptr_t addr; struct vm_range *range; int error; struct vm_range *changed; dkprintf("[%d]sys_mlock(%lx,%lx)\n", ihk_mc_get_processor_id(), start0, len0); start = start0 & PAGE_MASK; len = (start & (PAGE_SIZE - 1)) + len0; len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; if (end < start) { error = -EINVAL; goto out2; } if ((start < region->user_start) || (region->user_end <= start) || (len > (region->user_end - region->user_start)) || ((region->user_end - len) < start)) { error = -ENOMEM; goto out2; } if (start == end) { error = 0; goto out2; } ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); /* check contiguous map */ first = NULL; for (addr = start; addr < end; addr = range->end) { if (first == NULL) { range = lookup_process_memory_range(proc->vm, start, start+PAGE_SIZE); first = range; } else { range = next_process_memory_range(proc->vm, range); } if (!range || (addr < range->start)) { /* not contiguous */ dkprintf("[%d]sys_mlock(%lx,%lx):not contiguous." " %lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start0, len0, addr, range->start, range->end); error = -ENOMEM; goto out; } if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) { ekprintf("[%d]sys_mlock(%lx,%lx):cannot change." " [%lx-%lx) %lx\n", ihk_mc_get_processor_id(), start0, len0, range->start, range->end, range->flag); error = -EINVAL; goto out; } } /* do the mlock */ changed = NULL; for (addr = start; addr < end; addr = changed->end) { if (!changed) { range = first; } else { range = next_process_memory_range(proc->vm, changed); } if (!range || (addr < range->start)) { /* not contiguous */ dkprintf("[%d]sys_mlock(%lx,%lx):not contiguous." " %lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start0, len0, addr, range?range->start:0, range?range->end:0); error = -ENOMEM; goto out; } if (range->start < addr) { error = split_process_memory_range(proc, range, addr, &range); if (error) { ekprintf("[%d]sys_mlock(%lx,%lx):split failed. " " [%lx-%lx) %lx %d\n", ihk_mc_get_processor_id(), start0, len0, range->start, range->end, addr, error); goto out; } } if (end < range->end) { error = split_process_memory_range(proc, range, end, NULL); if (error) { ekprintf("[%d]sys_mlock(%lx,%lx):split failed. " " [%lx-%lx) %lx %d\n", ihk_mc_get_processor_id(), start0, len0, range->start, range->end, addr, error); goto out; } } range->flag |= VR_LOCKED; if (!changed) { changed = range; } else { error = join_process_memory_range(proc, changed, range); if (error) { dkprintf("[%d]sys_mlock(%lx,%lx):join failed. %d", ihk_mc_get_processor_id(), start0, len0, error); dkprintf("LHS: %p [%lx-%lx) %lx %p\n", changed, changed->start, changed->end, changed->flag, changed->memobj); dkprintf("RHS: %p [%lx-%lx) %lx %p\n", range, range->start, range->end, range->flag, range->memobj); changed = range; /* through */ } } } error = 0; out: ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); if (!error) { error = populate_process_memory(proc, (void *)start, len); if (error) { ekprintf("sys_mlock(%lx,%lx):populate failed. %d\n", start0, len0, error); /* * In this case, * the region locked by this call should be unlocked * before mlock() returns with error. * * However, the region cannot be unlocked simply, * because the region can be modified by other thread * because memory_range_lock has been released. * * For the time being, like a linux-2.6.38-8, * the physical page allocation failure is ignored. */ error = 0; } } out2: dkprintf("[%d]sys_mlock(%lx,%lx): %d\n", ihk_mc_get_processor_id(), start0, len0, error); return error; } #ifdef UNDEFINED munlock() {} #endif SYSCALL_DECLARE(munlock) { const uintptr_t start0 = ihk_mc_syscall_arg0(ctx); const size_t len0 = ihk_mc_syscall_arg1(ctx); struct process *proc = cpu_local_var(current); struct vm_regions *region = &proc->vm->region; uintptr_t start; size_t len; uintptr_t end; struct vm_range *first; uintptr_t addr; struct vm_range *range; int error; struct vm_range *changed; dkprintf("[%d]sys_munlock(%lx,%lx)\n", ihk_mc_get_processor_id(), start0, len0); start = start0 & PAGE_MASK; len = (start & (PAGE_SIZE - 1)) + len0; len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; if (end < start) { error = -EINVAL; goto out2; } if ((start < region->user_start) || (region->user_end <= start) || (len > (region->user_end - region->user_start)) || ((region->user_end - len) < start)) { error = -ENOMEM; goto out2; } if (start == end) { error = 0; goto out2; } ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); /* check contiguous map */ first = NULL; for (addr = start; addr < end; addr = range->end) { if (first == NULL) { range = lookup_process_memory_range(proc->vm, start, start+PAGE_SIZE); first = range; } else { range = next_process_memory_range(proc->vm, range); } if (!range || (addr < range->start)) { /* not contiguous */ dkprintf("[%d]sys_munlock(%lx,%lx):not contiguous." " %lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start0, len0, addr, range->start, range->end); error = -ENOMEM; goto out; } if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) { ekprintf("[%d]sys_munlock(%lx,%lx):cannot change." " [%lx-%lx) %lx\n", ihk_mc_get_processor_id(), start0, len0, range->start, range->end, range->flag); error = -EINVAL; goto out; } } /* do the munlock */ changed = NULL; for (addr = start; addr < end; addr = changed->end) { if (!changed) { range = first; } else { range = next_process_memory_range(proc->vm, changed); } if (!range || (addr < range->start)) { /* not contiguous */ dkprintf("[%d]sys_munlock(%lx,%lx):not contiguous." " %lx [%lx-%lx)\n", ihk_mc_get_processor_id(), start0, len0, addr, range?range->start:0, range?range->end:0); error = -ENOMEM; goto out; } if (range->start < addr) { error = split_process_memory_range(proc, range, addr, &range); if (error) { ekprintf("[%d]sys_munlock(%lx,%lx):split failed. " " [%lx-%lx) %lx %d\n", ihk_mc_get_processor_id(), start0, len0, range->start, range->end, addr, error); goto out; } } if (end < range->end) { error = split_process_memory_range(proc, range, end, NULL); if (error) { ekprintf("[%d]sys_munlock(%lx,%lx):split failed. " " [%lx-%lx) %lx %d\n", ihk_mc_get_processor_id(), start0, len0, range->start, range->end, addr, error); goto out; } } range->flag &= ~VR_LOCKED; if (!changed) { changed = range; } else { error = join_process_memory_range(proc, changed, range); if (error) { dkprintf("[%d]sys_munlock(%lx,%lx):join failed. %d", ihk_mc_get_processor_id(), start0, len0, error); dkprintf("LHS: %p [%lx-%lx) %lx %p\n", changed, changed->start, changed->end, changed->flag, changed->memobj); dkprintf("RHS: %p [%lx-%lx) %lx %p\n", range, range->start, range->end, range->flag, range->memobj); changed = range; /* through */ } } } error = 0; out: ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); out2: dkprintf("[%d]sys_munlock(%lx,%lx): %d\n", ihk_mc_get_processor_id(), start0, len0, error); return error; } #ifdef UNDEFINED remap_file_pages() {} #endif SYSCALL_DECLARE(remap_file_pages) { const uintptr_t start0 = ihk_mc_syscall_arg0(ctx); const size_t size = ihk_mc_syscall_arg1(ctx); const int prot = ihk_mc_syscall_arg2(ctx); const size_t pgoff = ihk_mc_syscall_arg3(ctx); const int flags = ihk_mc_syscall_arg4(ctx); int error; const uintptr_t start = start0 & PAGE_MASK; const uintptr_t end = start + size; const off_t off = (off_t)pgoff << PAGE_SHIFT; struct process * const proc = cpu_local_var(current); struct vm_range *range; int er; int need_populate = 0; dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x)\n", start0, size, prot, pgoff, flags); ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock); #define PGOFF_LIMIT ((off_t)1 << ((8*sizeof(off_t) - 1) - PAGE_SHIFT)) if ((size <= 0) || (size & (PAGE_SIZE - 1)) || (prot != 0) || (pgoff < 0) || (PGOFF_LIMIT <= pgoff) || ((PGOFF_LIMIT - pgoff) < (size / PAGE_SIZE)) || !((start < end) || (end == 0))) { ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" "invalid args\n", start0, size, prot, pgoff, flags); error = -EINVAL; goto out; } range = lookup_process_memory_range(proc->vm, start, end); if (!range || (start < range->start) || (range->end < end) || (range->flag & VR_PRIVATE) || (range->flag & (VR_REMOTE|VR_IO_NOCACHE|VR_RESERVED)) || !range->memobj) { ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" "invalid VMR:[%#lx-%#lx) %#lx %p\n", start0, size, prot, pgoff, flags, range?range->start:0, range?range->end:0, range?range->flag:0, range?range->memobj:NULL); error = -EINVAL; goto out; } range->flag |= VR_FILEOFF; error = remap_process_memory_range(proc->vm, range, start, end, off); if (error) { ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" "remap failed %d\n", start0, size, prot, pgoff, flags, error); goto out; } clear_host_pte(start, size); /* XXX: workaround */ if (range->flag & VR_LOCKED) { need_populate = 1; } error = 0; out: ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); if (need_populate && (er = populate_process_memory( proc, (void *)start, size))) { ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):" "populate failed %d\n", start0, size, prot, pgoff, flags, er); /* ignore populate error */ } dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x): %d\n", start0, size, prot, pgoff, flags, error); return error; } #ifdef UNDEFINED mremap() {} #endif SYSCALL_DECLARE(mremap) { const uintptr_t oldaddr = ihk_mc_syscall_arg0(ctx); const size_t oldsize0 = ihk_mc_syscall_arg1(ctx); const size_t newsize0 = ihk_mc_syscall_arg2(ctx); const int flags = ihk_mc_syscall_arg3(ctx); const uintptr_t newaddr = ihk_mc_syscall_arg4(ctx); const ssize_t oldsize = (oldsize0 + PAGE_SIZE - 1) & PAGE_MASK; const ssize_t newsize = (newsize0 + PAGE_SIZE - 1) & PAGE_MASK; const uintptr_t oldstart = oldaddr; const uintptr_t oldend = oldstart + oldsize; struct process *proc = cpu_local_var(current); struct process_vm *vm = proc->vm; int error; struct vm_range *range; int need_relocate; uintptr_t newstart; uintptr_t newend; size_t size; uintptr_t ret; uintptr_t lckstart = -1; uintptr_t lckend = -1; dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx)\n", oldaddr, oldsize0, newsize0, flags, newaddr); ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); if ((oldaddr & ~PAGE_MASK) || (oldsize < 0) || (newsize <= 0) || (flags & ~(MREMAP_MAYMOVE | MREMAP_FIXED)) || ((flags & MREMAP_FIXED) && !(flags & MREMAP_MAYMOVE)) || ((flags & MREMAP_FIXED) && (newaddr & ~PAGE_MASK))) { error = -EINVAL; ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):invalid. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } /* check original mapping */ range = lookup_process_memory_range(vm, oldstart, oldstart+PAGE_SIZE); if (!range || (oldstart < range->start) || (range->end < oldend) || (range->flag & (VR_FILEOFF)) || (range->flag & (VR_REMOTE|VR_IO_NOCACHE|VR_RESERVED))) { error = -EFAULT; ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "lookup failed. %d %p %#lx-%#lx %#lx\n", oldaddr, oldsize0, newsize0, flags, newaddr, error, range, range?range->start:0, range?range->end:0, range?range->flag:0); goto out; } if (oldend < oldstart) { error = -EINVAL; ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "old range overflow. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } /* determine new mapping range */ need_relocate = 0; if (flags & MREMAP_FIXED) { need_relocate = 1; newstart = newaddr; newend = newstart + newsize; if (newstart < vm->region.user_start) { error = -EPERM; ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "mmap_min_addr %#lx. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, vm->region.user_start, error); goto out; } if ((newstart < oldend) && (oldstart < newend)) { error = -EINVAL; ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "fixed:overlapped. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } } else if (!(flags & MREMAP_FIXED) && (oldsize < newsize)) { if (oldend == range->end) { newstart = oldstart; newend = newstart + newsize; error = extend_up_process_memory_range(vm, range, newend); if (!error) { if (range->flag & VR_LOCKED) { lckstart = oldend; lckend = newend; } goto out; } } if (!(flags & MREMAP_MAYMOVE)) { error = -ENOMEM; ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "cannot relocate. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } need_relocate = 1; error = search_free_space(newsize, vm->region.map_end, (intptr_t *)&newstart); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "search failed. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } newend = newstart + newsize; } else { newstart = oldstart; newend = newstart + newsize; } /* do the remap */ if (need_relocate) { if (flags & MREMAP_FIXED) { error = do_munmap((void *)newstart, newsize); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "fixed:munmap failed. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } } if (range->memobj) { memobj_ref(range->memobj); } error = add_process_memory_range(proc, newstart, newend, -1, range->flag, range->memobj, range->objoff + (oldstart - range->start)); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "add failed. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); if (range->memobj) { memobj_release(range->memobj); } goto out; } if (range->flag & VR_LOCKED) { lckstart = newstart; lckend = newend; } if (oldsize > 0) { size = (oldsize < newsize)? oldsize: newsize; ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); error = move_pte_range(vm->page_table, vm, (void *)oldstart, (void *)newstart, size); ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "move failed. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } error = do_munmap((void *)oldstart, oldsize); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "relocate:munmap failed. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } } } else if (newsize < oldsize) { error = do_munmap((void *)newend, (oldend - newend)); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "shrink:munmap failed. %d\n", oldaddr, oldsize0, newsize0, flags, newaddr, error); goto out; } } else { /* nothing to do */ } error = 0; out: ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); if (!error && (lckstart < lckend)) { error = populate_process_memory(proc, (void *)lckstart, (lckend - lckstart)); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "populate failed. %d %#lx-%#lx\n", oldaddr, oldsize0, newsize0, flags, newaddr, error, lckstart, lckend); error = 0; /* ignore error */ } } ret = (error)? error: newstart; dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):%d %#lx\n", oldaddr, oldsize0, newsize0, flags, newaddr, error, ret); return ret; } #ifdef DCFA_KMOD #ifdef CMD_DCFA extern int ibmic_cmd_syscall(char *uargs); extern void ibmic_cmd_exit(int status); #endif #ifdef CMD_DCFAMPI extern int dcfampi_cmd_syscall(char *uargs); #endif static int (*mod_call_table[]) (char *) = { #ifdef CMD_DCFA [1] = ibmic_cmd_syscall, #endif #ifdef CMD_DCFAMPI [2] = dcfampi_cmd_syscall, #endif }; static void (*mod_exit_table[]) (int) = { #ifdef CMD_DCFA [1] = ibmic_cmd_exit, #endif #ifdef CMD_DCFAMPI [2] = NULL, #endif }; SYSCALL_DECLARE(mod_call) { int mod_id; unsigned long long uargs; mod_id = ihk_mc_syscall_arg0(ctx); uargs = ihk_mc_syscall_arg1(ctx); dkprintf("mod_call id:%d, uargs=0x%llx, type=%s, command=%x\n", mod_id, uargs, mod_id==1?"ibmic":"dcfampi", *((uint32_t*)(((char*)uargs)+0))); if(mod_call_table[mod_id]) return mod_call_table[mod_id]((char*)uargs); kprintf("ERROR! undefined mod_call id:%d\n", mod_id); return -ENOSYS; } static void do_mod_exit(int status){ int i; for(i=1; i<=2; i++){ if(mod_exit_table[i]) mod_exit_table[i](status); } } #endif /* select counter type */ SYSCALL_DECLARE(pmc_init) { int counter = ihk_mc_syscall_arg0(ctx); enum ihk_perfctr_type type = (enum ihk_perfctr_type)ihk_mc_syscall_arg1(ctx); /* see ihk/manycore/generic/include/ihk/perfctr.h */ int mode = PERFCTR_USER_MODE; return ihk_mc_perfctr_init(counter, type, mode); } SYSCALL_DECLARE(pmc_start) { unsigned long counter = ihk_mc_syscall_arg0(ctx); return ihk_mc_perfctr_start(1 << counter); } SYSCALL_DECLARE(pmc_stop) { unsigned long counter = ihk_mc_syscall_arg0(ctx); return ihk_mc_perfctr_stop(1 << counter); } SYSCALL_DECLARE(pmc_reset) { int counter = ihk_mc_syscall_arg0(ctx); return ihk_mc_perfctr_reset(counter); } long syscall(int num, ihk_mc_user_context_t *ctx) { long l; cpu_enable_interrupt(); #if 0 if(num != 24) // if not sched_yield #endif dkprintf("SC(%d:%d)[%3d=%s](%lx, %lx,%lx, %lx, %lx, %lx)@%lx,sp:%lx", ihk_mc_get_processor_id(), ihk_mc_get_hardware_processor_id(), num, syscall_name[num], ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx), ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_arg5(ctx), ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx)); #if 1 #if 0 if(num != 24) // if not sched_yield #endif dkprintf(",*sp:%lx,*(sp+8):%lx,*(sp+16):%lx,*(sp+24):%lx", *((unsigned long*)ihk_mc_syscall_sp(ctx)), *((unsigned long*)(ihk_mc_syscall_sp(ctx)+8)), *((unsigned long*)(ihk_mc_syscall_sp(ctx)+16)), *((unsigned long*)(ihk_mc_syscall_sp(ctx)+24))); #endif #if 0 if(num != 24) // if not sched_yield #endif dkprintf("\n"); if ((0 <= num) && (num < (sizeof(syscall_table) / sizeof(syscall_table[0]))) && (syscall_table[num] != NULL)) { l = syscall_table[num](num, ctx); dkprintf("SC(%d)[%3d] ret: %d\n", ihk_mc_get_processor_id(), num, l); } else { dkprintf("USC[%3d](%lx, %lx, %lx, %lx, %lx) @ %lx | %lx\n", num, ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx), ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx)); l = syscall_generic_forwarding(num, ctx); } check_signal(l, NULL); check_need_resched(); return l; } #if 0 void __host_update_process_range(struct process *process, struct vm_range *range) { struct syscall_post *post; int idx; memcpy_async_wait(&cpu_local_var(scp).post_fin); post = &cpu_local_var(scp).post_buf; post->v[0] = 1; post->v[1] = range->start; post->v[2] = range->end; post->v[3] = range->phys; cpu_disable_interrupt(); if (cpu_local_var(scp).post_idx >= PAGE_SIZE / sizeof(struct syscall_post)) { /* XXX: Wait until it is consumed */ } else { idx = ++(cpu_local_var(scp).post_idx); cpu_local_var(scp).post_fin = 0; memcpy_async(cpu_local_var(scp).post_pa + idx * sizeof(*post), virt_to_phys(post), sizeof(*post), 0, &cpu_local_var(scp).post_fin); } cpu_enable_interrupt(); } #endif