diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h index d1722bb1..6b64db4f 100644 --- a/executer/include/uprotocol.h +++ b/executer/include/uprotocol.h @@ -62,6 +62,7 @@ #define MCEXEC_UP_TERMINATE_THREAD 0x30a02925 #define MCEXEC_UP_GET_NUM_POOL_THREADS 0x30a02926 #define MCEXEC_UP_UTI_ATTR 0x30a02927 +#define MCEXEC_UP_UNMAP_PSEUDO_FILEMAP 0x30a02928 #define MCEXEC_UP_DEBUG_LOG 0x40000000 diff --git a/executer/kernel/mcctrl/control.c b/executer/kernel/mcctrl/control.c index ac58688c..d72c4219 100644 --- a/executer/kernel/mcctrl/control.c +++ b/executer/kernel/mcctrl/control.c @@ -283,6 +283,8 @@ struct mcos_handler_info { int cpu; struct mcctrl_usrdata *ud; struct file *file; + unsigned long user_start; + unsigned long user_end; }; struct mcos_handler_info; @@ -420,6 +422,8 @@ static long mcexec_start_image(ihk_os_t os, info->pid = desc->pid; info->cpu = desc->cpu; + info->user_start = desc->user_start; + info->user_end = desc->user_end; ihk_os_register_release_handler(file, release_handler, info); ihk_os_set_mcos_private_data(file, info); @@ -1198,9 +1202,28 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet) wqhln = wqhln_iter; break; } - if (!wqhln) { - printk("%s: WARNING: no target thread found for exact request??\n", - __FUNCTION__); + /* Find the mcexec thread with the same tid as the requesting McKernel thread + and let it handle the migrate-to-Linux request */ + if (packet->req.number == __NR_sched_setaffinity && packet->req.args[0] == 0) { + list_for_each_entry(wqhln_iter, &ppd->wq_list, list) { + if (packet->req.ttid == wqhln_iter->rtid) { + if (!wqhln_iter->task) { + printk("%s: ERROR: wqhln_iter->task=%p,rtid=%d,&ppd->wq_list_lock=%p\n", __FUNCTION__, wqhln_iter->task, wqhln_iter->rtid, &ppd->wq_list_lock); + } else if(wqhln_iter->req) { + /* list_del() is called after woken-up */ + dprintk("%s: INFO: target thread is busy, wqhln_iter->req=%d,rtid=%d,&ppd->wq_list_lock=%p\n", __FUNCTION__, 
wqhln_iter->req, wqhln_iter->rtid, &ppd->wq_list_lock); + } else { + wqhln = wqhln_iter; + dprintk("%s: uti, worker with tid of %d found in wq_list\n", __FUNCTION__, packet->req.ttid); + } + break; + } + } + } else { + if (!wqhln) { + printk("%s: WARNING: no target thread (tid=%d) found for exact request??\n", + __FUNCTION__, packet->req.ttid); + } } } /* Is there any thread available? */ @@ -1225,6 +1248,12 @@ retry_alloc: wqhln = wqhln_alloc; wqhln->req = 0; wqhln->task = NULL; + /* Let the mcexec thread handle the migrate-to-Linux request in mcexec_wait_syscall() after finishing the current task */ + if (packet->req.number == __NR_sched_setaffinity && packet->req.args[0] == 0) { + wqhln->rtid = packet->req.ttid; + } else { + wqhln->rtid = 0; + } init_waitqueue_head(&wqhln->wq_syscall); list_add_tail(&wqhln->list, &ppd->wq_req_list); } @@ -1272,16 +1301,27 @@ int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req) retry: /* Prepare per-thread wait queue head or find a valid request */ irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + + /* Handle migrate-to-Linux request if any */ + list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) { + if (wqhln_iter->rtid == task_pid_vnr(current)) { + wqhln = wqhln_iter; + wqhln->task = current; + list_del(&wqhln->list); + goto found; + } + } + /* First see if there is a valid request already that is not yet taken */ list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) { - if (wqhln_iter->task == NULL && wqhln_iter->req) { + if (!wqhln_iter->rtid && wqhln_iter->task == NULL && wqhln_iter->req) { wqhln = wqhln_iter; wqhln->task = current; list_del(&wqhln->list); break; } } - + found: if (!wqhln) { retry_alloc: wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC); @@ -1293,6 +1333,8 @@ retry_alloc: wqhln->task = current; wqhln->req = 0; wqhln->packet = NULL; + /* Let mcexec_syscall() find the mcexec thread to handle migrate-to-Linux request */ + wqhln->rtid = task_pid_vnr(current); 
init_waitqueue_head(&wqhln->wq_syscall); list_add(&wqhln->list, &ppd->wq_list); @@ -2354,6 +2396,8 @@ mcexec_util_thread2(ihk_os_t os, unsigned long arg, struct file *file) void **__user param = (void **__user )arg; void *__user rctx = (void *__user)param[1]; void *__user lctx = (void *__user)param[2]; + struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); + struct mcctrl_per_proc_data *ppd; save_fs_ctx(lctx); info = ihk_os_get_mcos_private_data(file); @@ -2451,34 +2495,148 @@ err: kfree(thread); return 0; } + +long mcexec_unmap_pseudo_filemap(ihk_os_t os, struct file *file) +{ + long rc = -1; + struct mcos_handler_info *info; + info = ihk_os_get_mcos_private_data(file); + dprintk("%s: clear_pte_range %p-%p\n", __FUNCTION__, (void*)info->user_start, (void*)info->user_end); + rc = clear_pte_range(info->user_start, info->user_end - info->user_start); + return rc; +} -long -mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file) + static long (*mckernel_do_futex)(int n, unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, + unsigned long _uti_clv, + void *uti_futex_resp, + void *_linux_wait_event, + void *_linux_printk, + void *_linux_clock_gettime); + + long uti_wait_event(void *_resp, unsigned long nsec_timeout) { + struct uti_futex_resp *resp = _resp; + if (nsec_timeout) { + return wait_event_interruptible_timeout(resp->wq, resp->done, nsecs_to_jiffies(nsec_timeout)); + } else { + return wait_event_interruptible(resp->wq, resp->done); + } + } + + int uti_printk(const char *fmt, ...) 
{ + int sum = 0, nwritten; + va_list args; + va_start(args, fmt); + nwritten = vprintk(fmt, args); + sum += nwritten; + va_end(args); + return sum; + } + +int uti_clock_gettime(clockid_t clk_id, struct timespec *tp) { + int ret = 0; + struct timespec64 ts64; + dprintk("%s: clk_id=%x,REALTIME=%x,MONOTONIC=%x\n", __FUNCTION__, clk_id, CLOCK_REALTIME, CLOCK_MONOTONIC); + switch(clk_id) { + case CLOCK_REALTIME: + getnstimeofday64(&ts64); + tp->tv_sec = ts64.tv_sec; + tp->tv_nsec = ts64.tv_nsec; + dprintk("%s: CLOCK_REALTIME,%ld.%09ld\n", __FUNCTION__, tp->tv_sec, tp->tv_nsec); + break; + case CLOCK_MONOTONIC: { + /* Do not use getrawmonotonic() because it returns different value than clock_gettime() */ + ktime_get_ts64(&ts64); + tp->tv_sec = ts64.tv_sec; + tp->tv_nsec = ts64.tv_nsec; + dprintk("%s: CLOCK_MONOTONIC,%ld.%09ld\n", __FUNCTION__, tp->tv_sec, tp->tv_nsec); + break; } + default: + ret = -EINVAL; + break; + } + return ret; +} + +long mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file) { struct syscall_struct { int number; unsigned long args[6]; unsigned long ret; + unsigned long uti_clv; /* copy of a clv in McKernel */ }; struct syscall_struct param; struct syscall_struct __user *uparam = (struct syscall_struct __user *)arg; - int rc; + long rc; if (copy_from_user(¶m, uparam, sizeof param)) { return -EFAULT; } - rc = syscall_backward(ihk_host_os_get_usrdata(os), param.number, - param.args[0], param.args[1], param.args[2], - param.args[3], param.args[4], param.args[5], - ¶m.ret); +#if 1 /* debug */ + if (param.number == __NR_futex) { +#else + if (0) { +#endif + struct uti_futex_resp resp = { + .done = 0 + }; + init_waitqueue_head(&resp.wq); + + if (!mckernel_do_futex) { + if (ihk_os_get_special_address(os, IHK_SPADDR_MCKERNEL_DO_FUTEX, + (unsigned long *)&mckernel_do_futex, + NULL)) { + kprintf("%s: ihk_os_get_special_address failed\n", __FUNCTION__); + return -EINVAL; + } + dprintk("%s: mckernel_do_futex=%p\n", __FUNCTION__, 
mckernel_do_futex); + } + rc = (*mckernel_do_futex)(param.number, param.args[0], param.args[1], param.args[2], + param.args[3], param.args[4], param.args[5], param.uti_clv, (void *)&resp, (void *)uti_wait_event, (void *)uti_printk, (void *)uti_clock_gettime); + param.ret = rc; + } else { + dprintk("%s: syscall_backward, SC %d, tid %d\n", __FUNCTION__, param.number, task_tgid_vnr(current)); + rc = syscall_backward(ihk_host_os_get_usrdata(os), param.number, + param.args[0], param.args[1], param.args[2], + param.args[3], param.args[4], param.args[5], + ¶m.ret); + } if (copy_to_user(&uparam->ret, ¶m.ret, sizeof(unsigned long))) { return -EFAULT; } return rc; } +void mcctrl_futex_wake(struct ikc_scd_packet *pisp) +{ + struct uti_futex_resp *resp; + + /* Guard the access to pisp->futex.resp, which is dead out of mcexec_syscall_thread() */ + if (*pisp->futex.spin_sleep == 0) { + dprintk("%s: DEBUG: woken up by someone else\n", __FUNCTION__); + return; + } + + resp = pisp->futex.resp; + if (!resp) { + kprintf("%s: ERROR: pisp->futex.resp is NULL\n", __FUNCTION__); + return; + } + + if (*pisp->futex.spin_sleep == 0) { + kprintf("%s: ERROR: resp is dead\n", __FUNCTION__); + return; + } + + resp->done = 1; + wake_up_interruptible(&resp->wq); +} + + static struct ihk_cache_topology * cache_topo_search(struct ihk_cpu_topology *cpu_topo, int level) { @@ -2838,6 +2996,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg, case MCEXEC_UP_TERMINATE_THREAD: return mcexec_terminate_thread(os, (unsigned long *)arg, file); + case MCEXEC_UP_UNMAP_PSEUDO_FILEMAP: + return mcexec_unmap_pseudo_filemap(os, file); + case MCEXEC_UP_GET_NUM_POOL_THREADS: return mcctrl_get_num_pool_threads(os); diff --git a/executer/kernel/mcctrl/driver.c b/executer/kernel/mcctrl/driver.c index ae735c32..eb7a672d 100644 --- a/executer/kernel/mcctrl/driver.c +++ b/executer/kernel/mcctrl/driver.c @@ -90,6 +90,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = { { .request = 
MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_GET_NUM_POOL_THREADS, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_UTI_ATTR, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_UNMAP_PSEUDO_FILEMAP, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl }, { .request = IHK_OS_AUX_PERF_NUM, .func = mcctrl_ioctl }, { .request = IHK_OS_AUX_PERF_SET, .func = mcctrl_ioctl }, diff --git a/executer/kernel/mcctrl/ikc.c b/executer/kernel/mcctrl/ikc.c index a7cb0271..916b2d2e 100644 --- a/executer/kernel/mcctrl/ikc.c +++ b/executer/kernel/mcctrl/ikc.c @@ -52,6 +52,8 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c); int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet); void sig_done(unsigned long arg, int err); +void mcctrl_perf_ack(ihk_os_t os, struct ikc_scd_packet *packet); +void mcctrl_futex_wake(struct ikc_scd_packet *pisp); void mcctrl_os_read_write_cpu_response(ihk_os_t os, struct ikc_scd_packet *pisp); void mcctrl_eventfd(ihk_os_t os, struct ikc_scd_packet *pisp); @@ -221,6 +223,10 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, mcctrl_eventfd(__os, pisp); break; + case SCD_MSG_FUTEX_WAKE: + mcctrl_futex_wake(pisp); + break; + default: printk(KERN_ERR "mcctrl:syscall_packet_handler:" "unknown message (%d.%d.%d.%d.%d.%#lx)\n", diff --git a/executer/kernel/mcctrl/mcctrl.h b/executer/kernel/mcctrl/mcctrl.h index 847cfd7f..6e449239 100644 --- a/executer/kernel/mcctrl/mcctrl.h +++ b/executer/kernel/mcctrl/mcctrl.h @@ -102,6 +102,8 @@ #define SCD_MSG_CPU_RW_REG 0x52 #define SCD_MSG_CPU_RW_REG_RESP 0x53 +#define SCD_MSG_FUTEX_WAKE 0x60 + #define DMA_PIN_SHIFT 21 #define DO_USER_MODE @@ -126,6 +128,12 @@ enum mcctrl_os_cpu_operation { MCCTRL_OS_CPU_MAX_OP }; +/* Used to wake-up a Linux thread futex_wait()-ing */ +struct uti_futex_resp { + int done; + wait_queue_head_t wq; +}; + struct ikc_scd_packet { int msg; int err; @@ 
-164,6 +172,12 @@ struct ikc_scd_packet { struct { int eventfd_type; }; + + /* SCD_MSG_FUTEX_WAKE */ + struct { + void *resp; + int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */ + } futex; }; char padding[8]; }; @@ -465,6 +479,7 @@ inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data( void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet, long ret, int stid); +int clear_pte_range(uintptr_t start, uintptr_t len); int mcctrl_os_alive(void); diff --git a/executer/kernel/mcctrl/syscall.c b/executer/kernel/mcctrl/syscall.c index a9c605f7..2f38fc90 100644 --- a/executer/kernel/mcctrl/syscall.c +++ b/executer/kernel/mcctrl/syscall.c @@ -475,7 +475,7 @@ out_put_ppd: return syscall_ret; } -static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason) +int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason) { struct ikc_scd_packet *packet; struct ikc_scd_packet *free_packet = NULL; @@ -1999,7 +1999,7 @@ out: return (IS_ERR_VALUE(map))? 
(int)map: 0; } -static int clear_pte_range(uintptr_t start, uintptr_t len) +int clear_pte_range(uintptr_t start, uintptr_t len) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; diff --git a/executer/user/arch/x86_64/arch_args.h b/executer/user/arch/x86_64/arch_args.h index 8c7fa740..d9bb0a16 100644 --- a/executer/user/arch/x86_64/arch_args.h +++ b/executer/user/arch/x86_64/arch_args.h @@ -67,6 +67,12 @@ get_syscall_arg6(syscall_args *args) return args->r9; } +static inline unsigned long +get_syscall_rip(syscall_args *args) +{ + return args->rip; +} + static inline void set_syscall_number(syscall_args *args, unsigned long value) { diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index 2f5bc1d7..efba2473 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -192,6 +192,7 @@ struct syscall_struct { int number; unsigned long args[6]; unsigned long ret; + unsigned long uti_clv; /* copy of a clv in McKernel */ }; #ifdef NCCS @@ -1951,6 +1952,20 @@ static void ld_preload_init() #endif } +struct uti_desc { + void *wp; + int mck_tid; + unsigned long key; + int pid, tid; /* Used as the id of tracee when issuing MCEXEC_UP_TERMINATE_THREAD */ + unsigned long uti_clv; + sem_t arg, attach; +}; + +static int create_tracer(); +int uti_pfd[2]; +void *uti_wp; +struct uti_desc *uti_desc; + int main(int argc, char **argv) { int ret = 0; @@ -2129,6 +2144,31 @@ int main(int argc, char **argv) if (opendev() == -1) exit(EXIT_FAILURE); + /* Perform mmap() before fork() in create_tracer() */ + uti_wp = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (uti_wp == (void *)-1) { + exit(1); + } + uti_desc = mmap(NULL, sizeof(struct uti_desc), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (uti_desc == (void *)-1) { + exit(1); + } + sem_init(&uti_desc->arg, 1, 0); + sem_init(&uti_desc->attach, 1, 0); +#if 1 + /* Create tracer before any proxy VMAs are attached */ + if ((error = 
pipe(uti_pfd)) == -1) { + fprintf(stderr, "%s: pipe returned %d\n", __FUNCTION__, error); + return -1; + } + if ((error = create_tracer())) { + fprintf(stderr, "%s: create tracer returned %d\n", __FUNCTION__, error); + return error; + } +#endif + ld_preload_init(); #ifdef ADD_ENVS_OPTION @@ -2853,11 +2893,8 @@ debug_sig(int s) #endif static int -create_tracer(void *wp, int mck_tid, unsigned long key) +create_tracer() { - int pid = getpid(); - int tid = gettid(); - int pfd[2]; int tpid; int rc; int st; @@ -2868,44 +2905,42 @@ create_tracer(void *wp, int mck_tid, unsigned long key) unsigned long code = 0; int exited = 0; int mode = 0; + //struct tracer_desc desc; + unsigned long buf; - if (pipe(pfd) == -1) - return -1; tpid = fork(); if (tpid) { - struct timeval tv; - fd_set rfd; - if (tpid == -1) return -1; - close(pfd[1]); + close(uti_pfd[1]); while ((rc = waitpid(tpid, &st, 0)) == -1 && errno == EINTR); if (rc == -1 || !WIFEXITED(st) || WEXITSTATUS(st)) { fprintf(stderr, "waitpid rc=%d st=%08x\n", rc, st); return -ENOMEM; } +#if 0 + struct timeval tv; + fd_set rfd; FD_ZERO(&rfd); - FD_SET(pfd[0], &rfd); + FD_SET(uti_pfd[0], &rfd); tv.tv_sec = 1; tv.tv_usec = 0; - while ((rc = select(pfd[0] + 1, &rfd, NULL, NULL, &tv)) == -1 && + while ((rc = select(uti_pfd[0] + 1, &rfd, NULL, NULL, &tv)) == -1 && errno == EINTR); if (rc == 0) { - close(pfd[0]); + fprintf(stderr, "%s: select timed out\n", __FUNCTION__); + close(uti_pfd[0]); return -ETIMEDOUT; } if (rc == -1) { - close(pfd[0]); + fprintf(stderr, "%s: select errno=%d\n", __FUNCTION__, errno); + close(uti_pfd[0]); return -errno; } - rc = read(pfd[0], &st, 1); - close(pfd[0]); - if (rc != 1) { - return -EAGAIN; - } +#endif return 0; } - close(pfd[0]); + close(uti_pfd[0]); tpid = fork(); if (tpid) { if (tpid == -1) { @@ -2914,17 +2949,42 @@ create_tracer(void *wp, int mck_tid, unsigned long key) } exit(0); } - if (ptrace(PTRACE_ATTACH, tid, 0, 0) == -1) { + +#if 0 + /* Reopen device because one process must be managed 
by one opened-device */ + close(fd); + fd = opendev(); + if (fd < 0) { + fprintf(stderr, "%s: ERROR: opendev returned %d\n", __FUNCTION__, errno); + exit(1); + } + + if (ioctl(fd, MCEXEC_UP_CREATE_PPD) != 0) { + fprintf(stderr, "%s: ERROR: MCEXEC_UP_CREATE_PPD returned %d\n", __FUNCTION__, errno); + exit(1); + } +#endif + + sem_wait(&uti_desc->arg); + //close(uti_pfd[0]); + + if (ptrace(PTRACE_ATTACH, uti_desc->tid, 0, 0) == -1) { fprintf(stderr, "PTRACE_ATTACH errno=%d\n", errno); exit(1); } waitpid(-1, &st, __WALL); - if (ptrace(PTRACE_SETOPTIONS, tid, 0, PTRACE_O_TRACESYSGOOD) == -1) { + if (ptrace(PTRACE_SETOPTIONS, uti_desc->tid, 0, PTRACE_O_TRACESYSGOOD) == -1) { fprintf(stderr, "PTRACE_SETOPTIONS errno=%d\n", errno); exit(1); } - write(pfd[1], " ", 1); - close(pfd[1]); + + /* Wake up tracee so that it can context-switch to McKernel code */ + rc = write(uti_pfd[1], &buf, sizeof(unsigned long)); + if (rc != sizeof(unsigned long)) { + fprintf(stderr, "%s: write returned %d\n", __FUNCTION__, rc); + exit(1); + } + close(uti_pfd[1]); for (i = 0; i < 4096; i++) if (i != fd @@ -2940,33 +3000,42 @@ create_tracer(void *wp, int mck_tid, unsigned long key) #endif for (i = 1; i <= 10; i++) { - param = (struct syscall_struct *)wp + i; + param = (struct syscall_struct *)uti_desc->wp + i; *(void **)param = param_top; param_top = param; } - memset(wp, '\0', sizeof(long)); + memset(uti_desc->wp, '\0', sizeof(long)); #ifdef DEBUG_UTI fprintf(stderr, "tracer PID=%d\n", getpid()); signal(SIGINT, debug_sig); #endif for (;;) { - ptrace(PTRACE_SYSCALL, tid, 0, sig); + ptrace(PTRACE_SYSCALL, uti_desc->tid, 0, sig); sig = 0; waitpid(-1, &st, __WALL); if (WIFEXITED(st) || WIFSIGNALED(st)) { unsigned long term_param[4]; - term_param[0] = pid; - term_param[1] = tid; - term_param[3] = key; + term_param[0] = uti_desc->pid; + term_param[1] = uti_desc->tid; + term_param[3] = uti_desc->key; code = st; if (exited == 2 || // exit_group WIFSIGNALED(st)) { code |= 0x0000000100000000; } 
term_param[2] = code; - ioctl(fd, MCEXEC_UP_TERMINATE_THREAD, term_param); + if (ioctl(fd, MCEXEC_UP_TERMINATE_THREAD, term_param) != 0) { + fprintf(stderr, "%s: ERROR: MCEXEC_UP_TERMINATE_THREAD returned %d\n", __FUNCTION__, errno); + } + __dprintf("%s: WIFEXITED=%d,WIFSIGNALED=%d,WTERMSIG=%d,exited=%d\n", __FUNCTION__, WIFEXITED(st), WIFSIGNALED(st), WTERMSIG(st), exited); +#if 0 + if (ptrace(PTRACE_DETACH, uti_desc->tid, 0, WIFSIGNALED(st) ? WTERMSIG(st) : 0) && errno != ESRCH) { + fprintf(stderr, "PTRACE_DETACH errno=%d\n", errno); + exit(1); + } +#endif break; } if (!WIFSTOPPED(st)) { @@ -2975,7 +3044,7 @@ create_tracer(void *wp, int mck_tid, unsigned long key) if (WSTOPSIG(st) & 0x80) { // syscall syscall_args args; - get_syscall_args(tid, &args); + get_syscall_args(uti_desc->tid, &args); #ifdef DEBUG_UTI if (get_syscall_return(&args) == -ENOSYS) { @@ -3000,8 +3069,8 @@ create_tracer(void *wp, int mck_tid, unsigned long key) switch (get_syscall_number(&args)) { case __NR_gettid: set_syscall_number(&args, -1); - set_syscall_return(&args, mck_tid); - set_syscall_args(tid, &args); + set_syscall_return(&args, uti_desc->mck_tid); + set_syscall_args(uti_desc->tid, &args); continue; case __NR_futex: case __NR_brk: @@ -3029,7 +3098,7 @@ create_tracer(void *wp, int mck_tid, unsigned long key) #endif /* POSTK_DEBUG_ARCH_DEP_78 */ case __NR_execve: set_syscall_number(&args, -1); - set_syscall_args(tid, &args); + set_syscall_args(uti_desc->tid, &args); continue; case __NR_ioctl: param = (struct syscall_struct *) @@ -3038,7 +3107,7 @@ create_tracer(void *wp, int mck_tid, unsigned long key) get_syscall_arg1(&args) == fd && get_syscall_arg2(&args) == MCEXEC_UP_SYSCALL_THREAD && - samepage(wp, param)) { + samepage(uti_desc->wp, param)) { set_syscall_arg1(&args, param->args[0]); set_syscall_arg2(&args, param->args[1]); set_syscall_arg3(&args, param->args[2]); @@ -3048,7 +3117,7 @@ create_tracer(void *wp, int mck_tid, unsigned long key) set_syscall_return(&args, param->ret); 
*(void **)param = param_top; param_top = param; - set_syscall_args(tid, &args); + set_syscall_args(uti_desc->tid, &args); } continue; default: @@ -3068,6 +3137,7 @@ create_tracer(void *wp, int mck_tid, unsigned long key) param->args[3] = get_syscall_arg4(&args); param->args[4] = get_syscall_arg5(&args); param->args[5] = get_syscall_arg6(&args); + param->uti_clv = uti_desc->uti_clv; param->ret = -EINVAL; set_syscall_number(&args, __NR_ioctl); set_syscall_arg1(&args, fd); @@ -3075,7 +3145,7 @@ create_tracer(void *wp, int mck_tid, unsigned long key) MCEXEC_UP_SYSCALL_THREAD); set_syscall_arg3(&args, (unsigned long)param); } - set_syscall_args(tid, &args); + set_syscall_args(uti_desc->tid, &args); } else { // signal sig = WSTOPSIG(st) & 0x7f; @@ -3083,45 +3153,47 @@ create_tracer(void *wp, int mck_tid, unsigned long key) } #ifdef DEBUG_UTI - fprintf(stderr, "offloaded thread called these syscalls\n"); - debug_sig(0); + //fprintf(stderr, "offloaded thread called these syscalls\n"); + //debug_sig(0); #endif exit(0); } static long -util_thread(unsigned long uctx_pa, int remote_tid, unsigned long pattr) +util_thread(struct thread_data_s *my_thread, unsigned long uctx_pa, int remote_tid, unsigned long pattr, unsigned long uti_clv) { void *lctx; void *rctx; - void *wp; void *param[6]; int rc = 0; + unsigned long buf; -#ifdef POSTK_DEBUG_ARCH_DEP_35 - wp = mmap(NULL, page_size * 3, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); -#else /* POSTK_DEBUG_ARCH_DEP_35 */ - wp = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); -#endif /* POSTK_DEBUG_ARCH_DEP_35 */ - if (wp == (void *)-1) { - rc = -errno; - goto out; +#if 0 + { + int error; + if ((error = pipe(uti_pfd)) == -1) { + fprintf(stderr, "%s: pipe returned %d\n", __FUNCTION__, error); + rc = error; goto out; + } + if ((error = create_tracer())) { + fprintf(stderr, "%s: create_tracer returned %d\n", __FUNCTION__, error); + rc = error; goto out; + } } +#endif #ifdef 
POSTK_DEBUG_ARCH_DEP_35 - lctx = (char *)wp + page_size; + lctx = (char *)uti_wp + page_size; rctx = (char *)lctx + page_size; -#else /* POSTK_DEBUG_ARCH_DEP_35 */ - lctx = (char *)wp + PAGE_SIZE; +#else + lctx = (char *)uti_wp + PAGE_SIZE; rctx = (char *)lctx + PAGE_SIZE; #endif /* POSTK_DEBUG_ARCH_DEP_35 */ param[0] = (void *)uctx_pa; param[1] = rctx; param[2] = lctx; - param[4] = wp; + param[4] = uti_wp; #ifdef POSTK_DEBUG_ARCH_DEP_35 param[5] = (void *)(page_size * 3); #else /* POSTK_DEBUG_ARCH_DEP_35 */ @@ -3134,11 +3206,37 @@ util_thread(unsigned long uctx_pa, int remote_tid, unsigned long pattr) } create_worker_thread(NULL); - if ((rc = create_tracer(wp, remote_tid, (unsigned long)param[3]))) { - fprintf(stderr, "create tracer %d\n", rc); + + /* Pass info to the tracer so that it can masquerade as the tracee */ + uti_desc->wp = uti_wp; + uti_desc->mck_tid = remote_tid; + uti_desc->key = (unsigned long)param[3]; + uti_desc->pid = getpid(); + uti_desc->tid = gettid(); + uti_desc->uti_clv = uti_clv; + +#if 0 + //usleep(100000); + ssize_t nwritten; + char *cur; + for(cur = (char*)&uti_desc; (nwritten = write(uti_pfd[1], cur, sizeof(struct uti_desc) - (cur - (char*)&uti_desc))) > 0; cur += nwritten) { } + if (nwritten < 0) { + fprintf(stderr, "write returned %ld errno=%d\n", nwritten, errno); rc = -errno; goto out; } + close(uti_pfd[1]); +#endif + sem_post(&uti_desc->arg); + + /* Wait until tracer attaches me. 
We can't use + futex because it would be captured and redirected by tracer */ + rc = read(uti_pfd[0], &buf, sizeof(unsigned long)); + if (rc != sizeof(unsigned long)) { + fprintf(stderr, "%s: write returned %d\n", __FUNCTION__, rc); + exit(1); + } + close(uti_pfd[0]); if (pattr) { struct uti_attr_desc desc; @@ -3155,11 +3253,11 @@ util_thread(unsigned long uctx_pa, int remote_tid, unsigned long pattr) pthread_exit(NULL); out: - if (wp) + if (uti_wp != (void*)-1) #ifdef POSTK_DEBUG_ARCH_DEP_35 - munmap(wp, page_size * 3); + munmap(uti_wp, page_size * 3); #else /* POSTK_DEBUG_ARCH_DEP_35 */ - munmap(wp, PAGE_SIZE * 3); + munmap(uti_wp, PAGE_SIZE * 3); #endif /* POSTK_DEBUG_ARCH_DEP_35 */ return rc; } @@ -4225,8 +4323,8 @@ return_execve2: case __NR_sched_setaffinity: if (w.sr.args[0] == 0) { - ret = util_thread(w.sr.args[1], w.sr.rtid, - w.sr.args[2]); + ret = util_thread(my_thread, w.sr.args[1], w.sr.rtid, + w.sr.args[2], w.sr.args[3]); } else { ret = munmap((void *)w.sr.args[1], diff --git a/kernel/futex.c b/kernel/futex.c index 327d36ff..c657dfa2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -71,14 +71,21 @@ #include #include #include +#include //#define DEBUG_PRINT_FUTEX #ifdef DEBUG_PRINT_FUTEX #undef DDEBUG_DEFAULT #define DDEBUG_DEFAULT DDEBUG_PRINT +#define uti_dkprintf(...) do { ((clv_override && linux_printk) ? (*linux_printk) : kprintf)(__VA_ARGS__); } while (0) +#else +#define uti_dkprintf(...) do { } while (0) #endif +#define uti_kprintf(...) do { ((clv_override && linux_printk) ? (*linux_printk) : kprintf)(__VA_ARGS__); } while (0) + +unsigned long ihk_mc_get_ns_per_tsc(void); int futex_cmpxchg_enabled; /** @@ -108,6 +115,9 @@ struct futex_q { union futex_key key; union futex_key *requeue_pi_key; uint32_t bitset; + + /* Used to wake-up a thread running on a Linux CPU */ + void *uti_futex_resp; }; /* @@ -180,11 +190,12 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. 
*/ static int -get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key) +get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key, struct cpu_local_var *clv_override) { unsigned long address = (unsigned long)uaddr; unsigned long phys; - struct process_vm *mm = cpu_local_var(current)->vm; + struct thread *thread = cpu_local_var_with_override(current, clv_override); + struct process_vm *mm = thread->vm; /* * The futex address must be "naturally" aligned. @@ -250,7 +261,7 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ -static void wake_futex(struct futex_q *q) +static void wake_futex(struct futex_q *q, struct cpu_local_var *clv_override) { struct thread *p = q->task; @@ -272,8 +283,31 @@ static void wake_futex(struct futex_q *q) barrier(); q->lock_ptr = NULL; - dkprintf("wake_futex(): waking up tid %d\n", p->tid); - sched_wakeup_thread(p, PS_NORMAL); + + if (q->uti_futex_resp) { + int rc; + uti_dkprintf("wake_futex(): waking up migrated-to-Linux thread (tid %d),uti_futex_resp=%p\n", p->tid, q->uti_futex_resp); + /* TODO: Add the case when a Linux thread waking up another Linux thread */ + if (clv_override) { + uti_dkprintf("%s: ERROR: A Linux thread is waking up migrated-to-Linux thread\n", __FUNCTION__); + } + if (p->spin_sleep == 0) { + uti_dkprintf("%s: INFO: woken up by someone else\n", __FUNCTION__); + } + + struct ikc_scd_packet pckt; + struct ihk_ikc_channel_desc *resp_channel = cpu_local_var_with_override(ikc2linux, clv_override); + pckt.msg = SCD_MSG_FUTEX_WAKE; + pckt.futex.resp = q->uti_futex_resp; + pckt.futex.spin_sleep = &p->spin_sleep; + rc = ihk_ikc_send(resp_channel, &pckt, 0); + if (rc) { + uti_dkprintf("%s: ERROR: ihk_ikc_send returned %d, resp_channel=%p\n", __FUNCTION__, rc, resp_channel); + } + } else { + uti_dkprintf("wake_futex(): waking up McKernel thread (tid %d)\n", p->tid); + 
sched_wakeup_thread(p, PS_NORMAL); + } } /* @@ -303,7 +337,7 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) /* * Wake up waiters matching bitset queued on this futex (uaddr). */ -static int futex_wake(uint32_t *uaddr, int fshared, int nr_wake, uint32_t bitset) +static int futex_wake(uint32_t *uaddr, int fshared, int nr_wake, uint32_t bitset, struct cpu_local_var *clv_override) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -314,7 +348,7 @@ static int futex_wake(uint32_t *uaddr, int fshared, int nr_wake, uint32_t bitset if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, clv_override); if ((ret != 0)) goto out; @@ -330,7 +364,7 @@ static int futex_wake(uint32_t *uaddr, int fshared, int nr_wake, uint32_t bitset if (!(this->bitset & bitset)) continue; - wake_futex(this); + wake_futex(this, clv_override); if (++ret >= nr_wake) break; } @@ -348,7 +382,8 @@ out: */ static int futex_wake_op(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, - int nr_wake, int nr_wake2, int op) + int nr_wake, int nr_wake2, int op, + struct cpu_local_var *clv_override) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; @@ -357,10 +392,10 @@ futex_wake_op(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, clv_override); if ((ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, clv_override); if ((ret != 0)) goto out_put_key1; @@ -394,7 +429,7 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { - wake_futex(this); + wake_futex(this, clv_override); if (++ret >= nr_wake) break; } @@ -406,7 +441,7 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex 
(&this->key, &key2)) { - wake_futex(this); + wake_futex(this, clv_override); if (++op_ret >= nr_wake2) break; } @@ -469,7 +504,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, */ static int futex_requeue(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, int nr_wake, int nr_requeue, uint32_t *cmpval, - int requeue_pi) + int requeue_pi, struct cpu_local_var *clv_override) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int drop_count = 0, task_count = 0, ret; @@ -477,10 +512,10 @@ static int futex_requeue(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, struct plist_head *head1; struct futex_q *this, *next; - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, clv_override); if ((ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, clv_override); if ((ret != 0)) goto out_put_key1; @@ -515,7 +550,7 @@ static int futex_requeue(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, */ /* RIKEN: no requeue_pi at this moment */ if (++task_count <= nr_wake) { - wake_futex(this); + wake_futex(this, clv_override); continue; } @@ -574,7 +609,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb, struct cpu_local_var *clv_override) { int prio; @@ -595,7 +630,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) q->list.plist.spinlock = &hb->lock; #endif plist_add(&q->list, &hb->chain); - q->task = cpu_local_var(current); + q->task = cpu_local_var_with_override(current, clv_override); ihk_mc_spinlock_unlock_noirq(&hb->lock); } @@ -658,19 +693,19 @@ retry: /* RIKEN: this function has been rewritten so that it returns the remaining * time in case we are waken. 
*/ -static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - uint64_t timeout) +static int64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + uint64_t timeout, struct cpu_local_var *clv_override) { - uint64_t time_remain = 0; + int64_t time_remain = 0; unsigned long irqstate; - struct thread *thread = cpu_local_var(current); + struct thread *thread = cpu_local_var_with_override(current, clv_override); /* * The task state is guaranteed to be set before another task can * wake it. * queue_me() calls spin_unlock() upon completion, serializing * access to the hash list and forcing a memory barrier. */ - xchg4(&(cpu_local_var(current)->status), PS_INTERRUPTIBLE); + xchg4(&(thread->status), PS_INTERRUPTIBLE); /* Indicate spin sleep. Note that schedule_timeout() with * idle_halt should use spin sleep because sleep with timeout @@ -682,25 +717,40 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate); } - queue_me(q, hb); + queue_me(q, hb, clv_override); if (!plist_node_empty(&q->list)) { + if (clv_override) { + uti_dkprintf("%s: tid: %d is trying to sleep\n", __FUNCTION__, thread->tid); + /* Note that the unit of timeout is nsec */ + time_remain = (*linux_wait_event)(q->uti_futex_resp, timeout); + + /* Note that time_remain == 0 indicates the condition evaluated to false after the timeout elapsed */ + if (time_remain < 0) { + if (time_remain == -ERESTARTSYS) { /* Interrupted by signal */ + uti_dkprintf("%s: DEBUG: wait_event returned -ERESTARTSYS\n", __FUNCTION__); + } else { + uti_kprintf("%s: ERROR: wait_event returned %d\n", __FUNCTION__, time_remain); + } + } + uti_dkprintf("%s: tid: %d woken up\n", __FUNCTION__, thread->tid); + } else { if (timeout) { - dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid); + dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", thread->tid); 
time_remain = schedule_timeout(timeout); } else { - dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid); + dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", thread->tid); spin_sleep_or_schedule(); time_remain = 0; } - - dkprintf("futex_wait_queue_me(): tid: %d woken up\n", cpu_local_var(current)->tid); + dkprintf("futex_wait_queue_me(): tid: %d woken up\n", thread->tid); + } } /* This does not need to be serialized */ - cpu_local_var(current)->status = PS_RUNNING; + thread->status = PS_RUNNING; thread->spin_sleep = 0; return time_remain; @@ -724,7 +774,8 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked */ static int futex_wait_setup(uint32_t __user *uaddr, uint32_t val, int fshared, - struct futex_q *q, struct futex_hash_bucket **hb) + struct futex_q *q, struct futex_hash_bucket **hb, + struct cpu_local_var *clv_override) { uint32_t uval; int ret; @@ -747,7 +798,7 @@ static int futex_wait_setup(uint32_t __user *uaddr, uint32_t val, int fshared, * rare, but normal. 
*/ q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key); + ret = get_futex_key(uaddr, fshared, &q->key, clv_override); if (ret != 0) return ret; @@ -771,46 +822,54 @@ static int futex_wait_setup(uint32_t __user *uaddr, uint32_t val, int fshared, } static int futex_wait(uint32_t __user *uaddr, int fshared, - uint32_t val, uint64_t timeout, uint32_t bitset, int clockrt) + uint32_t val, uint64_t timeout, uint32_t bitset, int clockrt, + struct cpu_local_var *clv_override) { struct futex_hash_bucket *hb; struct futex_q q; - uint64_t time_remain; + int64_t time_remain; int ret; if (!bitset) return -EINVAL; #ifdef PROFILE_ENABLE - if (cpu_local_var(current)->profile && - cpu_local_var(current)->profile_start_ts) { - cpu_local_var(current)->profile_elapsed_ts += - (rdtsc() - cpu_local_var(current)->profile_start_ts); - cpu_local_var(current)->profile_start_ts = 0; + if (cpu_local_var_with_override(current, clv_override)->profile && + cpu_local_var_with_override(current, clv_override)->profile_start_ts) { + cpu_local_var_with_override(current, clv_override)->profile_elapsed_ts += + (rdtsc() - cpu_local_var_with_override(current, clv_override)->profile_start_ts); + cpu_local_var_with_override(current, clv_override)->profile_start_ts = 0; } #endif q.bitset = bitset; q.requeue_pi_key = NULL; + q.uti_futex_resp = cpu_local_var_with_override(uti_futex_resp, clv_override); retry: /* Prepare to wait on uaddr. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); - if (ret) + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb, clv_override); + if (ret) { + uti_dkprintf("%s: tid=%d futex_wait_setup returns zero, no need to sleep\n", __FUNCTION__, cpu_local_var_with_override(current, clv_override)->tid); goto out; + } /* queue_me and wait for wakeup, timeout, or a signal. */ - time_remain = futex_wait_queue_me(hb, &q, timeout); + time_remain = futex_wait_queue_me(hb, &q, timeout, clv_override); /* If we were woken (and unqueued), we succeeded, whatever. 
*/ ret = 0; - if (!unqueue_me(&q)) + if (!unqueue_me(&q)) { + uti_dkprintf("%s: tid=%d unqueued\n", __FUNCTION__, cpu_local_var_with_override(current, clv_override)->tid); goto out_put_key; + } ret = -ETIMEDOUT; /* RIKEN: timer expired case (indicated by !time_remain) */ - if (timeout && !time_remain) + if (timeout && !time_remain) { + uti_dkprintf("%s: tid=%d timer expired\n", __FUNCTION__, cpu_local_var_with_override(current, clv_override)->tid); goto out_put_key; + } if (hassigpending(cpu_local_var(current))) { ret = -EINTR; @@ -825,19 +884,22 @@ out_put_key: put_futex_key(fshared, &q.key); out: #ifdef PROFILE_ENABLE - if (cpu_local_var(current)->profile) { - cpu_local_var(current)->profile_start_ts = rdtsc(); + if (cpu_local_var_with_override(current, clv_override)->profile) { + cpu_local_var_with_override(current, clv_override)->profile_start_ts = rdtsc(); } #endif return ret; } int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout, - uint32_t *uaddr2, uint32_t val2, uint32_t val3, int fshared) + uint32_t *uaddr2, uint32_t val2, uint32_t val3, int fshared, + struct cpu_local_var *clv_override) { int clockrt, ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; + uti_dkprintf("%s: uaddr=%p, op=%x, val=%x, timeout=%ld, uaddr2=%p, val2=%x, val3=%x, fshared=%d, clv=%p\n", __FUNCTION__, uaddr, op, val, timeout, uaddr2, val2, val3, fshared, clv_override); + clockrt = op & FUTEX_CLOCK_REALTIME; if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; @@ -846,21 +908,21 @@ int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout, case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); + ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt, clv_override); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, fshared, val, val3); + ret = futex_wake(uaddr, fshared, val, val3, 
clv_override); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0, clv_override); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 0); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 0, clv_override); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3, clv_override); break; /* RIKEN: these calls are not supported for now. case FUTEX_LOCK_PI: diff --git a/kernel/include/cls.h b/kernel/include/cls.h index 920914cd..f1f8b4c0 100644 --- a/kernel/include/cls.h +++ b/kernel/include/cls.h @@ -100,6 +100,9 @@ struct cpu_local_var { struct list_head smp_func_req_list; struct process_vm *on_fork_vm; + + /* UTI */ + void *uti_futex_resp; } __attribute__((aligned(64))); @@ -111,4 +114,6 @@ static struct cpu_local_var *get_this_cpu_local_var(void) #define cpu_local_var(name) get_this_cpu_local_var()->name +#define cpu_local_var_with_override(name, clv_override) (clv_override ? clv_override->name : get_this_cpu_local_var()->name) + #endif diff --git a/kernel/include/futex.h b/kernel/include/futex.h index a86b5cae..f09afcf9 100644 --- a/kernel/include/futex.h +++ b/kernel/include/futex.h @@ -150,6 +150,7 @@ union futex_key { extern int futex_init(void); +struct cpu_local_var; extern int futex( uint32_t __user * uaddr, @@ -159,7 +160,8 @@ futex( uint32_t __user * uaddr2, uint32_t val2, uint32_t val3, - int fshared + int fshared, + struct cpu_local_var *clv_override ); diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h index 52953682..d578d686 100644 --- a/kernel/include/syscall.h +++ b/kernel/include/syscall.h @@ -83,6 +83,8 @@ #define SCD_MSG_CPU_RW_REG 0x52 #define SCD_MSG_CPU_RW_REG_RESP 0x53 +#define SCD_MSG_FUTEX_WAKE 0x60 + /* Cloning flags. 
*/ # define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */ # define CLONE_VM 0x00000100 /* Set if VM shared between processes. */ @@ -276,6 +278,12 @@ struct ikc_scd_packet { struct { int eventfd_type; }; + + /* SCD_MSG_FUTEX_WAKE */ + struct { + void *resp; + int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */ + } futex; }; char padding[8]; }; @@ -475,6 +483,14 @@ int arch_cpu_read_write_register(struct ihk_os_cpu_register *desc, enum mcctrl_os_cpu_operation op); struct vm_range_numa_policy *vm_range_policy_search(struct process_vm *vm, uintptr_t addr); time_t time(void); +long do_futex(int n, unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, + unsigned long _uti_clv, + void *uti_futex_resp, + void *_linux_wait_event, + void *_linux_printk, + void *_linux_clock_gettime); #ifndef POSTK_DEBUG_ARCH_DEP_52 #define VDSO_MAXPAGES 2 @@ -592,4 +608,9 @@ struct move_pages_smp_req { #define PROCESS_VM_READ 0 #define PROCESS_VM_WRITE 1 +/* uti: function pointers pointing to Linux codes */ +extern long (*linux_wait_event)(void *_resp, unsigned long nsec_timeout); +extern int (*linux_printk)(const char *fmt, ...); +extern int (*linux_clock_gettime)(clockid_t clk_id, struct timespec *tp); + #endif diff --git a/kernel/include/time.h b/kernel/include/time.h index b4c1bffd..1d4a30c2 100644 --- a/kernel/include/time.h +++ b/kernel/include/time.h @@ -25,6 +25,8 @@ #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 +typedef int clockid_t; + typedef long int __time_t; /* POSIX.1b structure for a time value. 
This is like a `struct timeval' but diff --git a/kernel/init.c b/kernel/init.c index ddc6260b..25a805dd 100644 --- a/kernel/init.c +++ b/kernel/init.c @@ -251,6 +251,11 @@ static void nmi_init() ihk_set_nmi_mode_addr(phys); } +static void uti_init() +{ + ihk_set_mckernel_do_futex((unsigned long)do_futex); +} + static void rest_init(void) { handler_init(); @@ -266,6 +271,7 @@ static void rest_init(void) #endif /* !POSTK_DEBUG_TEMP_FIX_73 */ cpu_local_var_init(); nmi_init(); + uti_init(); time_init(); kmalloc_init(); diff --git a/kernel/syscall.c b/kernel/syscall.c index 80ca9aa1..cfe0c076 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -74,6 +74,13 @@ #define DDEBUG_DEFAULT DDEBUG_PRINT #endif +#define DEBUG_UTI +#ifdef DEBUG_UTI +#define uti_dkprintf(...) do { ((uti_clv && linux_printk) ? (*linux_printk) : kprintf)(__VA_ARGS__); } while (0) +#else +#define uti_dkprintf(...) do { } while (0) +#endif + //static ihk_atomic_t pid_cnt = IHK_ATOMIC_INIT(1024); /* generate system call handler's prototypes */ @@ -139,6 +146,10 @@ static void do_mod_exit(int status); */ #define NR_TIDS (allow_oversubscribe ? 
(num_processors * 2) : num_processors) +long (*linux_wait_event)(void *_resp, unsigned long nsec_timeout); +int (*linux_printk)(const char *fmt, ...); +int (*linux_clock_gettime)(clockid_t clk_id, struct timespec *tp); + static void send_syscall(struct syscall_request *req, int cpu, int pid, struct syscall_response *res) { struct ikc_scd_packet packet IHK_DMA_ALIGN; @@ -264,10 +275,21 @@ long do_syscall(struct syscall_request *req, int cpu, int pid) ++thread->in_syscall_offload; } - /* The current thread is the requester and any thread from - * the pool may serve the request */ + /* The current thread is the requester */ req->rtid = cpu_local_var(current)->tid; - req->ttid = 0; + + if (req->number == __NR_sched_setaffinity && req->args[0] == 0) { + /* mcexec thread serving migrate-to-Linux request must have + the same tid as the requesting McKernel thread because the + serving thread jumps to hfi driver and then jumps to + rus_vm_fault() without registering it into per thread data + by mcctrl_add_per_thread_data()). 
*/ + req->ttid = cpu_local_var(current)->tid/*0*/; + dkprintf("%s: uti, ttid=%d\n", __FUNCTION__, req->ttid); + } else { + /* Any thread from the pool may serve the request */ + req->ttid = 0; + } res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING; #ifdef POSTK_DEBUG_TEMP_FIX_26 /* do_syscall arg pid is not targetpid */ send_syscall(req, cpu, target_pid, &res); @@ -5323,8 +5345,16 @@ SYSCALL_DECLARE(shmdt) return 0; } /* sys_shmdt() */ -SYSCALL_DECLARE(futex) +long do_futex(int n, unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, + unsigned long _uti_clv, + void *uti_futex_resp, + void *_linux_wait_event, + void *_linux_printk, + void *_linux_clock_gettime) { + struct cpu_local_var *uti_clv = (struct cpu_local_var *)_uti_clv; uint64_t timeout = 0; // No timeout uint32_t val2 = 0; // Only one clock is used, ignore FUTEX_CLOCK_REALTIME @@ -5332,24 +5362,44 @@ SYSCALL_DECLARE(futex) int fshared = 1; int ret = 0; - uint32_t *uaddr = (uint32_t *)ihk_mc_syscall_arg0(ctx); - int op = (int)ihk_mc_syscall_arg1(ctx); - uint32_t val = (uint32_t)ihk_mc_syscall_arg2(ctx); - struct timespec *utime = (struct timespec*)ihk_mc_syscall_arg3(ctx); - uint32_t *uaddr2 = (uint32_t *)ihk_mc_syscall_arg4(ctx); - uint32_t val3 = (uint32_t)ihk_mc_syscall_arg5(ctx); + uint32_t *uaddr = (uint32_t *)arg0; + int op = (int)arg1; + uint32_t val = (uint32_t)arg2; + struct timespec *utime = (struct timespec*)arg3; + uint32_t *uaddr2 = (uint32_t *)arg4; + uint32_t val3 = (uint32_t)arg5; int flags = op; - struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor); - monitor->status = IHK_OS_MONITOR_KERNEL_HEAVY; - + + /* TODO: replace these with passing via struct smp_boot_param */ + if (_linux_printk && !linux_printk) { + linux_printk = (int (*)(const char *fmt, ...))_linux_printk; + } + if (_linux_wait_event && !linux_wait_event) { + linux_wait_event = (long (*)(void *_resp, unsigned long nsec_timeout))_linux_wait_event; + 
} + if (_linux_clock_gettime && !linux_clock_gettime) { + linux_clock_gettime = (int (*)(clockid_t clk_id, struct timespec *tp))_linux_clock_gettime; + } + + /* Fill in clv */ + if (uti_clv) { + uti_clv->uti_futex_resp = uti_futex_resp; + } + + /* monitor is per-cpu object */ + if (!uti_clv) { + struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor); + monitor->status = IHK_OS_MONITOR_KERNEL_HEAVY; + } + /* Cross-address space futex? */ if (op & FUTEX_PRIVATE_FLAG) { fshared = 0; } op = (op & FUTEX_CMD_MASK); - dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x, shared: %d\n", + uti_dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x, shared: %d\n", flags, (op == FUTEX_WAIT) ? "FUTEX_WAIT" : (op == FUTEX_WAIT_BITSET) ? "FUTEX_WAIT_BITSET" : @@ -5360,8 +5410,13 @@ SYSCALL_DECLARE(futex) (op == FUTEX_REQUEUE) ? "FUTEX_REQUEUE (NOT IMPL!)" : "unknown", (unsigned long)uaddr, val, utime, uaddr2, val3, *uaddr, fshared); + if ((op == FUTEX_WAIT || op == FUTEX_WAIT_BITSET) && utime) { + uti_dkprintf("%s: utime=%ld.%09ld\n", __FUNCTION__, utime->tv_sec, utime->tv_nsec); + } if (utime && (op == FUTEX_WAIT_BITSET || op == FUTEX_WAIT)) { unsigned long nsec_timeout; + if (!uti_clv) { + /* Use cycles for non-UTI case */ /* As per the Linux implementation FUTEX_WAIT specifies the duration of * the timeout, while FUTEX_WAIT_BITSET specifies the absolute timestamp */ @@ -5407,19 +5462,35 @@ SYSCALL_DECLARE(futex) else { nsec_timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec); } - timeout = nsec_timeout * 1000 / ihk_mc_get_ns_per_tsc(); - dkprintf("futex timeout: %lu\n", timeout); + + } + else{ + if (op == FUTEX_WAIT_BITSET) { /* User passed absolute time */ + struct timespec ats; + ret = (*linux_clock_gettime)((flags & FUTEX_CLOCK_REALTIME) ? 
CLOCK_REALTIME: CLOCK_MONOTONIC, &ats); + if (ret) { + return ret; + } + uti_dkprintf("%s: ats=%ld.%09ld\n", __FUNCTION__, ats.tv_sec, ats.tv_nsec); + /* Use nsec for UTI case */ + timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec) - + (ats.tv_sec * NS_PER_SEC + ats.tv_nsec); + } else { /* User passed relative time */ + /* Use nsec for UTI case */ + timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec); + } + } } /* Requeue parameter in 'utime' if op == FUTEX_CMP_REQUEUE. * number of waiters to wake in 'utime' if op == FUTEX_WAKE_OP. */ if (op == FUTEX_CMP_REQUEUE || op == FUTEX_WAKE_OP) - val2 = (uint32_t) (unsigned long) ihk_mc_syscall_arg3(ctx); + val2 = (uint32_t) (unsigned long) arg3; - ret = futex(uaddr, op, val, timeout, uaddr2, val2, val3, fshared); + ret = futex(uaddr, op, val, timeout, uaddr2, val2, val3, fshared, uti_clv); - dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x, shared: %d, ret: %d\n", + uti_dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x, shared: %d, ret: %d\n", op, (op == FUTEX_WAIT) ? "FUTEX_WAIT" : (op == FUTEX_WAIT_BITSET) ? 
"FUTEX_WAIT_BITSET" : @@ -5433,6 +5504,14 @@ SYSCALL_DECLARE(futex) return ret; } +SYSCALL_DECLARE(futex) +{ + return do_futex(n, ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx), + ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx), + ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_arg5(ctx), + 0UL, NULL, NULL, NULL, NULL); +} + static void do_exit(int code) { @@ -5474,7 +5553,7 @@ do_exit(int code) setint_user((int*)thread->clear_child_tid, 0); barrier(); futex((uint32_t *)thread->clear_child_tid, - FUTEX_WAKE, 1, 0, NULL, 0, 0, 1); + FUTEX_WAKE, 1, 0, NULL, 0, 0, 1, NULL); } mcs_rwlock_writer_lock(&proc->threads_lock, &lock); @@ -8998,6 +9077,7 @@ util_thread(struct uti_attr *arg) { volatile unsigned long *context; unsigned long pcontext; + struct cpu_local_var *uti_clv; struct syscall_request request IHK_DMA_ALIGN; long rc; struct thread *thread = cpu_local_var(current); @@ -9016,6 +9096,14 @@ util_thread(struct uti_attr *arg) pcontext = virt_to_phys((void *)context); save_uctx((void *)context, NULL); + /* Create a copy of clv and replace clv with it when the Linux thread calls in a McKernel function */ + uti_clv = kmalloc(sizeof(struct cpu_local_var), IHK_MC_AP_NOWAIT); + if (!uti_clv) { + ihk_mc_free_pages((void *)context, 1); + return -ENOMEM; + } + memcpy(uti_clv, get_this_cpu_local_var(), sizeof(struct cpu_local_var)); + request.number = __NR_sched_setaffinity; request.args[0] = 0; request.args[1] = pcontext; @@ -9025,19 +9113,23 @@ util_thread(struct uti_attr *arg) kattr.parent_cpuid = thread->parent_cpuid; request.args[2] = virt_to_phys(&kattr); } + request.args[3] = (unsigned long)uti_clv; thread->thread_offloaded = 1; rc = do_syscall(&request, ihk_mc_get_processor_id(), 0); thread->thread_offloaded = 0; free_address = context[0]; free_size = context[1]; ihk_mc_free_pages((void *)context, 1); + kfree(uti_clv); if (rc >= 0) { if (rc & 0x10000007f) { // exit_group || signal + dkprintf("%s: exit_group || signal\n", __FUNCTION__); thread->proc->nohost = 1; 
terminate((rc >> 8) & 255, rc & 255); } else { + dkprintf("%s: !exit_group && !signal\n", __FUNCTION__); request.number = __NR_sched_setaffinity; request.args[0] = 1; request.args[1] = free_address; diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h index 47705489..88cb5c20 100644 --- a/lib/include/ihk/mm.h +++ b/lib/include/ihk/mm.h @@ -229,6 +229,7 @@ char *ihk_get_kargs(void); int ihk_set_monitor(unsigned long addr, unsigned long size); int ihk_set_rusage(unsigned long addr, unsigned long size); int ihk_set_nmi_mode_addr(unsigned long addr); +int ihk_set_mckernel_do_futex(unsigned long addr); extern void (*__tlb_flush_handler)(int vector);