From a7c02254233347c63fd26bafc88c501e3f07445a Mon Sep 17 00:00:00 2001
From: Balazs Gerofi
Date: Tue, 8 May 2012 18:32:43 +0900
Subject: [PATCH] futex and rlimit

---
 kernel/Makefile.build              |    2 +-
 kernel/futex.c                     |  467 +++++++++++++++++++++++++++++
 kernel/host.c                      |    2 +-
 kernel/include/asm.h               |   38 +++
 kernel/include/futex.h             |  242 +++++++++++++++
 kernel/include/hash.h              |   70 +++++
 kernel/include/lwk/compiler-gcc.h  |   36 +++
 kernel/include/lwk/compiler-gcc4.h |   24 ++
 kernel/include/lwk/compiler.h      |  146 +++++++++
 kernel/include/lwk/futex.h         |  109 +++++++
 kernel/include/lwk/stddef.h        |   25 ++
 kernel/include/process.h           |   29 +-
 kernel/include/rlimit.h            |   88 ++++++
 kernel/include/waitq.h             |    2 +-
 kernel/mem.c                       |   22 +-
 kernel/process.c                   |   45 ++-
 kernel/syscall.c                   |  252 +++++++++++++---
 17 files changed, 1534 insertions(+), 65 deletions(-)
 create mode 100644 kernel/futex.c
 create mode 100644 kernel/include/asm.h
 create mode 100644 kernel/include/futex.h
 create mode 100644 kernel/include/hash.h
 create mode 100644 kernel/include/lwk/compiler-gcc.h
 create mode 100644 kernel/include/lwk/compiler-gcc4.h
 create mode 100644 kernel/include/lwk/compiler.h
 create mode 100644 kernel/include/lwk/futex.h
 create mode 100644 kernel/include/lwk/stddef.h
 create mode 100644 kernel/include/rlimit.h

diff --git a/kernel/Makefile.build b/kernel/Makefile.build
index 48e9670f..def70322 100644
--- a/kernel/Makefile.build
+++ b/kernel/Makefile.build
@@ -1,6 +1,6 @@
 AALDIR=$(AALBASE)/$(TARGET)
 OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
-OBJS += process.o copy.o waitq.o
+OBJS += process.o copy.o waitq.o futex.o
 DEPSRCS=$(wildcard $(SRC)/*.c)
 CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 00000000..b4586fb7
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,467 @@
+/*
+ * Kitten LWK futex code adaptation.
+ * Copyright (c) 2012 RIKEN AICS
+ */
+
+/*
+ * Copyright (c) 2008 Sandia National Laboratories
+ *
+ * Futex code adapted from Linux 2.6.27.9, original copyright below.
+ * Simplified to only support address-space (process-private) futexes.
+ * Removed demand-paging, cow, etc. complications since LWK doesn't
+ * require these.
+ */
+
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if 0
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __UACCESS__
+#include
+#endif
+
+#endif
+
+void futex_queue_init(struct futex_queue *queue)
+{
+	aal_mc_spinlock_init(&queue->lock);
+	INIT_LIST_HEAD(&queue->futex_list);
+}
+
+static int uaddr_is_valid(uint32_t __user *uaddr)
+{
+#ifdef __UACCESS__
+	return access_ok(VERIFY_WRITE, uaddr, sizeof(uint32_t));
+#else
+	return 1;
+#endif
+}
+
+static int futex_init(struct futex *futex, uint32_t __user *uaddr,
+		      uint32_t bitset)
+{
+	if (!uaddr_is_valid(uaddr))
+		return -EINVAL;
+
+	futex->uaddr = uaddr;
+	futex->bitset = bitset;
+	waitq_init(&futex->waitq);
+	return 0;
+}
+
+static struct futex_queue *get_queue(uint32_t __user *uaddr)
+{
+	uint64_t hash = hash_64((uint64_t)uaddr, FUTEX_HASHBITS);
+	return &cpu_local_var(current)->vm->futex_queues[hash];
+}
+
+static struct futex_queue *queue_lock(struct futex *futex, int *irqflags)
+{
+	struct futex_queue *queue = get_queue(futex->uaddr);
+	futex->lock_ptr = &queue->lock;
+	*irqflags = aal_mc_spinlock_lock(&queue->lock);
+	return queue;
+}
+
+static void queue_unlock(struct futex_queue *futex_queue, int irqflags)
+{
+	aal_mc_spinlock_unlock(&futex_queue->lock, irqflags);
+}
+
+static void queue_me(struct futex *futex, struct futex_queue *futex_queue)
+{
+	list_add_tail(&futex->link, &futex_queue->futex_list);
+}
+
+static int unqueue_me(struct futex *futex)
+{
+	aal_spinlock_t *lock_ptr;
+	int irqflags;
+	int status = 0;
+
+	/* In the common case we don't take the spinlock, which is nice. */
+retry:
+	lock_ptr = futex->lock_ptr;
+	barrier();
+	if (lock_ptr != NULL) {
+		irqflags = aal_mc_spinlock_lock(lock_ptr);
+		/*
+		 * q->lock_ptr can change between reading it and
+		 * spin_lock(), causing us to take the wrong lock.  This
+		 * corrects the race condition.
+		 *
+		 * Reasoning goes like this: if we have the wrong lock,
+		 * q->lock_ptr must have changed (maybe several times)
+		 * between reading it and the spin_lock().  It can
+		 * change again after the spin_lock() but only if it was
+		 * already changed before the spin_lock().  It cannot,
+		 * however, change back to the original value.  Therefore
+		 * we can detect whether we acquired the correct lock.
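+		 *
+		 * A concrete (hypothetical) interleaving, using the two
+		 * places in this file that retarget lock_ptr: a requeue
+		 * moves this futex from queue A to queue B while we are
+		 * in the middle of unqueue_me():
+		 *
+		 *	waiter (here)			waker
+		 *	lock_ptr = &A->lock;
+		 *					futex->lock_ptr = &B->lock;
+		 *	spin_lock(&A->lock);		<- wrong queue
+		 *	lock_ptr != futex->lock_ptr	-> unlock A, retry with B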
+		 */
+		if (lock_ptr != futex->lock_ptr) {
+			aal_mc_spinlock_unlock(lock_ptr, irqflags);
+			goto retry;
+		}
+
+		//WARN_ON(list_empty(&futex->link));
+		list_del(&futex->link);
+		aal_mc_spinlock_unlock(lock_ptr, irqflags);
+		status = 1;
+	}
+
+	return status;
+}
+
+static void lock_two_queues(struct futex_queue *queue1, int *irqflags1,
+			    struct futex_queue *queue2, int *irqflags2)
+{
+	if (queue1 < queue2)
+		*irqflags1 = aal_mc_spinlock_lock(&queue1->lock);
+
+	*irqflags2 = aal_mc_spinlock_lock(&queue2->lock);
+
+	if (queue1 > queue2)
+		*irqflags1 = aal_mc_spinlock_lock(&queue1->lock);
+}
+
+static void unlock_two_queues(struct futex_queue *queue1, int irqflags1,
+			      struct futex_queue *queue2, int irqflags2)
+{
+	if (queue1 == queue2) {
+		aal_mc_spinlock_unlock(&queue2->lock, irqflags2);
+	}
+	else {
+		aal_mc_spinlock_unlock(&queue2->lock, irqflags2);
+		aal_mc_spinlock_unlock(&queue1->lock, irqflags1);
+	}
+}
+
+/** Puts a task to sleep waiting on a futex. */
+static int futex_wait(uint32_t __user *uaddr, uint32_t val,
+		      uint64_t timeout, uint32_t bitset)
+{
+	DECLARE_WAITQ_ENTRY(wait, cpu_local_var(current));
+	int status;
+	uint32_t uval;
+	struct futex futex;
+	struct futex_queue *queue;
+	int irqflags;
+	uint64_t time_remain = 0;
+
+	if (!bitset)
+		return -EINVAL;
+
+	/* This verifies that uaddr is sane */
+	if ((status = futex_init(&futex, uaddr, bitset)) != 0)
+		return status;
+
+	/* Lock the futex queue corresponding to uaddr */
+	queue = queue_lock(&futex, &irqflags);
+
+	/* Get the value from user-space.  Since we don't have
+	 * paging, the only options are for this to succeed (with no
+	 * page faults) or fail, returning -EFAULT.  There is no way
+	 * for us to be put to sleep, so holding the queue's spinlock
+	 * is fine. */
+#ifdef __UACCESS__
+	if ((status = get_user(uval, uaddr)) != 0)
+		goto error;
+#else
+	uval = *uaddr;
+	status = 0;
+#endif
+
+	/* The user-space value must match the value passed in */
+	if (uval != val) {
+		status = -EWOULDBLOCK;
+		goto error;
+	}
+
+	/* Add ourself to the futex queue and drop our lock on it */
+	queue_me(&futex, queue);
+	queue_unlock(queue, irqflags);
+
+	/* Add ourself to the futex's waitq and go to sleep */
+	cpu_local_var(current)->status = PS_INTERRUPTIBLE;
+	waitq_add_entry(&futex.waitq, &wait);
+
+	if (!list_empty(&futex.link)) {
+		// We don't have timers for now, let's sleep forever,
+		// and pretend we were woken up
+		//time_remain = schedule_timeout(timeout);
+		schedule();
+		time_remain = 10;
+	}
+
+	cpu_local_var(current)->status = PS_RUNNING;
+
+	/*
+	 * NOTE: We don't remove ourself from the waitq because
+	 * we are the only user of it.
+	 */
+
+	/* If we were woken (and unqueued), we succeeded, whatever. */
+	if (!unqueue_me(&futex))
+		return 0;
+
+	if (time_remain == 0)
+		return -ETIMEDOUT;
+
+	/* We expect that there is a signal pending, but another thread
+	 * may have handled it for us already. */
+	return -EINTR;
+
+error:
+	queue_unlock(queue, irqflags);
+	return status;
+}
+
+/*
+ * The futex_queue's lock must be held when this is called.
+ * Afterwards, the futex_queue must not be accessed.
+ */
+static void wake_futex(struct futex *futex)
+{
+	list_del_init(&futex->link);
+	/*
+	 * The lock in waitq_wakeup() is a crucial memory barrier after the
+	 * list_del_init() and also before assigning to futex->lock_ptr.
+	 */
+	waitq_wakeup(&futex->waitq);
+	/*
+	 * The waiting task can free the futex as soon as this is written,
+	 * without taking any locks.  This must come last.
+	 *
+	 * A memory barrier is required here to prevent the following store
+	 * to lock_ptr from getting ahead of the wakeup.  Clearing the lock
+	 * at the end of waitq_wakeup() does not prevent this store from
+	 * moving.
+	 */
+	barrier();
+	futex->lock_ptr = NULL;
+}
+
+/** Wakes up nr_wake tasks waiting on a futex. */
+static int futex_wake(uint32_t __user *uaddr, int nr_wake, uint32_t bitset)
+{
+	struct futex_queue *queue;
+	struct list_head *head;
+	struct futex *this, *next;
+	int nr_woke = 0;
+	int irqflags;
+
+	if (!bitset)
+		return -EINVAL;
+
+	if (!uaddr_is_valid(uaddr))
+		return -EINVAL;
+
+	queue = get_queue(uaddr);
+	irqflags = aal_mc_spinlock_lock(&queue->lock);
+	head = &queue->futex_list;
+
+	list_for_each_entry_safe(this, next, head, link) {
+		if ((this->uaddr == uaddr) && (this->bitset & bitset)) {
+			wake_futex(this);
+			if (++nr_woke >= nr_wake)
+				break;
+		}
+	}
+
+	aal_mc_spinlock_unlock(&queue->lock, irqflags);
+	return nr_woke;
+}
+
+/** Conditionally wakes up tasks that are waiting on futexes. */
+static int futex_wake_op(uint32_t __user *uaddr1, uint32_t __user *uaddr2,
+			 int nr_wake1, int nr_wake2, int op)
+{
+	struct futex_queue *queue1, *queue2;
+	int irqflags1 = 0;
+	int irqflags2 = 0;
+	struct list_head *head;
+	struct futex *this, *next;
+	int op_result, nr_woke1 = 0, nr_woke2 = 0;
+
+	if (!uaddr_is_valid(uaddr1) || !uaddr_is_valid(uaddr2))
+		return -EINVAL;
+
+	queue1 = get_queue(uaddr1);
+	queue2 = get_queue(uaddr2);
+	lock_two_queues(queue1, &irqflags1, queue2, &irqflags2);
+
+	op_result = futex_atomic_op_inuser(op, (int *)uaddr2);
+	if (op_result < 0) {
+		unlock_two_queues(queue1, irqflags1, queue2, irqflags2);
+		return op_result;
+	}
+
+	head = &queue1->futex_list;
+	list_for_each_entry_safe(this, next, head, link) {
+		if (this->uaddr == uaddr1) {
+			wake_futex(this);
+			if (++nr_woke1 >= nr_wake1)
+				break;
+		}
+	}
+
+	if (op_result > 0) {
+		head = &queue2->futex_list;
+		list_for_each_entry_safe(this, next, head, link) {
+			if (this->uaddr == uaddr2) {
+				wake_futex(this);
+				if (++nr_woke2 >= nr_wake2)
+					break;
+			}
+		}
+	}
+
+	unlock_two_queues(queue1, irqflags1, queue2, irqflags2);
+	return nr_woke1 + nr_woke2;
+}
+
+/** Conditionally wakes up or requeues tasks that are waiting on futexes.
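+ *
+ * Wakes up to nr_wake waiters on uaddr1 and moves up to nr_requeue of the
+ * remaining ones over to uaddr2, provided *uaddr1 still equals cmpval.
+ * This is the primitive glibc uses to avoid a thundering herd on
+ * pthread_cond_broadcast(): one waiter is woken, the rest are requeued
+ * onto the mutex word, roughly (sketch, hypothetical variable names):
+ *
+ *	futex(&cond->seq, FUTEX_CMP_REQUEUE, 1, INT_MAX, &mutex->lock, seq);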
+ */
+static int futex_cmp_requeue(uint32_t __user *uaddr1, uint32_t __user *uaddr2,
+			     int nr_wake, int nr_requeue, uint32_t cmpval)
+{
+	struct futex_queue *queue1, *queue2;
+	int irqflags1, irqflags2;
+	struct list_head *head1, *head2;
+	struct futex *this, *next;
+	uint32_t curval;
+	int status, nr_woke = 0;
+
+	if (!uaddr_is_valid(uaddr1) || !uaddr_is_valid(uaddr2))
+		return -EINVAL;
+
+	queue1 = get_queue(uaddr1);
+	queue2 = get_queue(uaddr2);
+	lock_two_queues(queue1, &irqflags1, queue2, &irqflags2);
+
+#ifdef __UACCESS__
+	if ((status = get_user(curval, uaddr1)) != 0)
+		goto out_unlock;
+#else
+	curval = *uaddr1;
+	status = 0;
+#endif
+
+	if (curval != cmpval) {
+		status = -EAGAIN;
+		goto out_unlock;
+	}
+
+	head1 = &queue1->futex_list;
+	head2 = &queue2->futex_list;
+	list_for_each_entry_safe(this, next, head1, link) {
+		if (this->uaddr != uaddr1)
+			continue;
+		if (++nr_woke <= nr_wake) {
+			wake_futex(this);
+		} else {
+			/* If uaddr1 and uaddr2 hash to the
+			 * same futex queue, no need to requeue */
+			if (head1 != head2) {
+				list_move_tail(&this->link, head2);
+				this->lock_ptr = &queue2->lock;
+			}
+			this->uaddr = uaddr2;
+
+			if (nr_woke - nr_wake >= nr_requeue)
+				break;
+		}
+	}
+	status = nr_woke;
+
+out_unlock:
+	unlock_two_queues(queue1, irqflags1, queue2, irqflags2);
+	return status;
+}
+
+int futex(uint32_t __user *uaddr, int op, uint32_t val, uint64_t timeout,
+	  uint32_t __user *uaddr2, uint32_t val2, uint32_t val3)
+{
+	int status;
+
+	switch (op) {
+	case FUTEX_WAIT:
+		val3 = FUTEX_BITSET_MATCH_ANY;
+		/* fall through */
+	case FUTEX_WAIT_BITSET:
+		status = futex_wait(uaddr, val, timeout, val3);
+		break;
+	case FUTEX_WAKE:
+		val3 = FUTEX_BITSET_MATCH_ANY;
+		/* fall through */
+	case FUTEX_WAKE_BITSET:
+		status = futex_wake(uaddr, val, val3);
+		break;
+	case FUTEX_WAKE_OP:
+		status = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		status = futex_cmp_requeue(uaddr, uaddr2, val, val2, val3);
+		break;
+	default:
+		kprintf("sys_futex() op=%d not supported (pid: %d)\n",
+			op, cpu_local_var(current)->pid);
+
+		status = -ENOSYS;
+	}
+
+	return status;
+}
+
diff --git a/kernel/host.c b/kernel/host.c
index da75d26c..ebc13a3c 100644
--- a/kernel/host.c
+++ b/kernel/host.c
@@ -45,7 +45,7 @@ static void process_msg_prepare_process(unsigned long rphys)
 	       + sizeof(struct program_image_section) * n);
 
 	proc = create_process(p->entry);
-	proc->pid = p->pid;
+	proc->pid = 1024;
 
 	/* TODO: Clear it at the proper timing */
 	cpu_local_var(scp).post_idx = 0;
diff --git a/kernel/include/asm.h b/kernel/include/asm.h
new file mode 100644
index 00000000..aa37e8df
--- /dev/null
+++ b/kernel/include/asm.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_ASM_H
+#define _ASM_X86_ASM_H
+
+#ifdef __ASSEMBLY__
+# define __ASM_FORM(x)	x
+# define __ASM_EX_SEC	.section __ex_table
+#else
+# define __ASM_FORM(x)	" " #x " "
+# define __ASM_EX_SEC	" .section __ex_table,\"a\"\n"
+#endif
+
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+
+#define __ASM_SIZE(inst)	__ASM_SEL(inst##l, inst##q)
+#define __ASM_REG(reg)		__ASM_SEL(e##reg, r##reg)
+
+#define _ASM_PTR	__ASM_SEL(.long, .quad)
+#define _ASM_ALIGN	__ASM_SEL(.balign 4, .balign 8)
+#define _ASM_MOV_UL	__ASM_SIZE(mov)
+
+#define _ASM_INC	__ASM_SIZE(inc)
+#define _ASM_DEC	__ASM_SIZE(dec)
+#define _ASM_ADD	__ASM_SIZE(add)
+#define _ASM_SUB	__ASM_SIZE(sub)
+#define _ASM_XADD	__ASM_SIZE(xadd)
+#define _ASM_AX		__ASM_REG(ax)
+#define _ASM_BX		__ASM_REG(bx)
+#define _ASM_CX		__ASM_REG(cx)
+#define _ASM_DX		__ASM_REG(dx)
+
+/* Exception table entry */
+# define _ASM_EXTABLE(from,to) \
+	__ASM_EX_SEC \
+	_ASM_ALIGN "\n" \
+	_ASM_PTR #from "," #to "\n" \
+	" .previous\n"
+
+#endif /* _ASM_X86_ASM_H */
diff --git a/kernel/include/futex.h b/kernel/include/futex.h
new file mode 100644
index 00000000..72a5e1ac
--- /dev/null
+++ b/kernel/include/futex.h
@@ -0,0 +1,242 @@
+/* Kitten LWK futex adaptation */
+
+
+#ifndef _LWK_FUTEX_H
+#define _LWK_FUTEX_H
+
+/** \name Futex Commands
+ * @{
+ */
+#define FUTEX_WAIT		0
+#define FUTEX_WAKE		1
+#define FUTEX_CMP_REQUEUE	4
+#define FUTEX_WAKE_OP		5
+#define FUTEX_WAIT_BITSET	9
+#define FUTEX_WAKE_BITSET	10
+// @}
+
+#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_CLOCK_REALTIME	256
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+
+/** \name Futex Operations, used for FUTEX_WAKE_OP
+ * @{
+ */
+#define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
+#define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
+#define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
+#define FUTEX_OP_ANDN		3	/* *(int *)UADDR2 &= ~OPARG; */
+#define FUTEX_OP_XOR		4	/* *(int *)UADDR2 ^= OPARG; */
+
+#define FUTEX_OP_OPARG_SHIFT	8	/* Use (1 << OPARG) instead of OPARG. */
+
+#define FUTEX_OP_CMP_EQ		0	/* if (oldval == CMPARG) wake */
+#define FUTEX_OP_CMP_NE		1	/* if (oldval != CMPARG) wake */
+#define FUTEX_OP_CMP_LT		2	/* if (oldval < CMPARG) wake */
+#define FUTEX_OP_CMP_LE		3	/* if (oldval <= CMPARG) wake */
+#define FUTEX_OP_CMP_GT		4	/* if (oldval > CMPARG) wake */
+#define FUTEX_OP_CMP_GE		5	/* if (oldval >= CMPARG) wake */
+// @}
+
+/* FUTEX_WAKE_OP will perform atomically
+   int oldval = *(int *)UADDR2;
+   *(int *)UADDR2 = oldval OP OPARG;
+   if (oldval CMP CMPARG)
+     wake UADDR2;  */
+#define FUTEX_OP(op, oparg, cmp, cmparg) \
+	(((op & 0xf) << 28) | ((cmp & 0xf) << 24) \
+	 | ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
+
+/*
+ * bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
+ * match of any bit.
+ */
+#define FUTEX_BITSET_MATCH_ANY	0xffffffff
+
+#ifdef __KERNEL__
+
+#include
+#include
+#include
+#include
+
+#ifndef _ASM_X86_FUTEX_H
+#define _ASM_X86_FUTEX_H
+
+#ifdef __KERNEL__
+
+/* We don't deal with uaccess at the moment; since x86 can access
+ * userspace directly, we rely on glibc and the app developers.
+ */
+#ifdef __UACCESS__
+#include
+#endif
+
+#include
+#include
+
+#define __user
+
+#if 0
+#include
+#include
+#endif
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg)	\
+	asm volatile("1:\t" insn "\n"				\
+		     "2:\t.section .fixup,\"ax\"\n"		\
+		     "3:\tmov\t%3, %1\n"			\
+		     "\tjmp\t2b\n"				\
+		     "\t.previous\n"				\
+		     _ASM_EXTABLE(1b, 3b)			\
+		     : "=r" (oldval), "=r" (ret), "+m" (*uaddr)	\
+		     : "i" (-EFAULT), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg)	\
+	asm volatile("1:\tmovl %2, %0\n"			\
+		     "\tmovl\t%0, %3\n"				\
+		     "\t" insn "\n"				\
+		     "2:\tlock; cmpxchgl %3, %2\n"		\
+		     "\tjnz\t1b\n"				\
+		     "3:\t.section .fixup,\"ax\"\n"		\
+		     "4:\tmov\t%5, %1\n"			\
+		     "\tjmp\t3b\n"				\
+		     "\t.previous\n"				\
+		     _ASM_EXTABLE(1b, 4b)			\
+		     _ASM_EXTABLE(2b, 4b)			\
+		     : "=&a" (oldval), "=&r" (ret),		\
+		       "+m" (*uaddr), "=&r" (tem)		\
+		     : "r" (oparg), "i" (-EFAULT), "1" (0))
+
+static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+#ifdef __UACCESS__
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+#endif
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op1("lock; xaddl %0, %2", ret, oldval,
+				   uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ:
+			ret = (oldval == cmparg);
+			break;
+		case FUTEX_OP_CMP_NE:
+			ret = (oldval != cmparg);
+			break;
+		case FUTEX_OP_CMP_LT:
+			ret = (oldval < cmparg);
+			break;
+		case FUTEX_OP_CMP_GE:
+			ret = (oldval >= cmparg);
+			break;
+		case FUTEX_OP_CMP_LE:
+			ret = (oldval <= cmparg);
+			break;
+		case FUTEX_OP_CMP_GT:
+			ret = (oldval > cmparg);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
+						int newval)
+{
+#ifdef __UACCESS__
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+#endif
+
+	asm volatile("1:\tlock; cmpxchgl %3, %1\n"
+		     "2:\t.section .fixup, \"ax\"\n"
+		     "3:\tmov %2, %0\n"
+		     "\tjmp 2b\n"
+		     "\t.previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : "=a" (oldval), "+m" (*uaddr)
+		     : "i" (-EFAULT), "r" (newval), "0" (oldval)
+		     : "memory"
+	);
+
+	return oldval;
+}
+
+#endif // __KERNEL__
+#endif // _ASM_X86_FUTEX_H
+
+
+
+#define FUTEX_HASHBITS	8	/* 256 entries in each futex hash tbl */
+
+/** Futex tracking structure.
+ *
+ * A futex has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when list_empty(&futex->link) || futex->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * wake up futex->waitq, then make the second condition true.
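+ *
+ * In code, the waker side (see wake_futex() in futex.c) therefore does:
+ *
+ *	list_del_init(&futex->link);	// first condition becomes true
+ *	waitq_wakeup(&futex->waitq);
+ *	barrier();
+ *	futex->lock_ptr = NULL;		// second condition becomes true
+ *
+ * and the waiter may free its on-stack struct futex as soon as either
+ * condition holds.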
+ */
+struct futex {
+	struct list_head	link;
+	struct waitq		waitq;
+	aal_spinlock_t *	lock_ptr;
+	uint32_t __user *	uaddr;
+	uint32_t		bitset;
+};
+
+struct futex_queue {
+	aal_spinlock_t		lock;
+	struct list_head	futex_list;
+};
+
+extern void
+futex_queue_init(
+	struct futex_queue *	queue
+);
+
+extern int
+futex(
+	uint32_t __user *	uaddr,
+	int			op,
+	uint32_t		val,
+	uint64_t		timeout,
+	uint32_t __user *	uaddr2,
+	uint32_t		val2,
+	uint32_t		val3
+);
+
+
+#endif
+#endif
diff --git a/kernel/include/hash.h b/kernel/include/hash.h
new file mode 100644
index 00000000..5ff075df
--- /dev/null
+++ b/kernel/include/hash.h
@@ -0,0 +1,70 @@
+#ifndef _LINUX_HASH_H
+#define _LINUX_HASH_H
+/* Fast hashing routine for ints, longs and pointers.
+   (C) 2002 William Lee Irwin III, IBM */
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+
+#define BITS_PER_LONG 64
+
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
+
+#if BITS_PER_LONG == 32
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
+#define hash_long(val, bits) hash_32(val, bits)
+#elif BITS_PER_LONG == 64
+#define hash_long(val, bits) hash_64(val, bits)
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
+#else
+#error Wordsize not 32 or 64
+#endif
+
+static inline uint64_t hash_64(uint64_t val, unsigned int bits)
+{
+	uint64_t hash = val;
+
+	/* Sigh, gcc can't optimise this alone like it does for 32 bits. */
+	uint64_t n = hash;
+	n <<= 18;
+	hash -= n;
+	n <<= 33;
+	hash -= n;
+	n <<= 3;
+	hash += n;
+	n <<= 3;
+	hash -= n;
+	n <<= 4;
+	hash += n;
+	n <<= 2;
+	hash += n;
+
+	/* High bits are more random, so use them. */
+	return hash >> (64 - bits);
+}
+
+static inline uint32_t hash_32(uint32_t val, unsigned int bits)
+{
+	/* On some cpus multiply is faster, on others gcc will do shifts */
+	uint32_t hash = val * GOLDEN_RATIO_PRIME_32;
+
+	/* High bits are more random, so use them. */
+	return hash >> (32 - bits);
+}
+
+static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
+{
+	return hash_long((unsigned long)ptr, bits);
+}
+#endif /* _LINUX_HASH_H */
diff --git a/kernel/include/lwk/compiler-gcc.h b/kernel/include/lwk/compiler-gcc.h
new file mode 100644
index 00000000..e1af2ce6
--- /dev/null
+++ b/kernel/include/lwk/compiler-gcc.h
@@ -0,0 +1,36 @@
+/* Never include this file directly.  Include instead. */
+
+/*
+ * Common definitions for all gcc versions go here.
+ */
+
+
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs
+ * NOTE: already defined in aal/manycore/generic/include/aal/cpu.h
+ * #define barrier() __asm__ __volatile__("": : :"memory")
+ */
+
+/* This macro obfuscates arithmetic on a variable address so that gcc
+   shouldn't recognize the original var, and make assumptions about it */
+/*
+ * Versions of the ppc64 compiler before 4.1 had a bug where use of
+ * RELOC_HIDE could trash r30.  The bug can be worked around by changing
+ * the inline assembly constraint from =g to =r, in this particular
+ * case either is valid.
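+ *
+ * RELOC_HIDE(ptr, off) simply evaluates to ptr + off, routed through an
+ * asm that hides ptr's origin from the optimizer.  The classic use in
+ * Linux is per-CPU variable addressing, roughly (hypothetical names):
+ *
+ *	*RELOC_HIDE(&per_cpu__var, per_cpu_offset(cpu))
+ *
+ * i.e. compute the offset pointer without letting gcc assume anything
+ * about what it points to.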
+ */
+#define RELOC_HIDE(ptr, off)					\
+  ({ unsigned long __ptr;					\
+     __asm__ ("" : "=r"(__ptr) : "0"(ptr));			\
+     (typeof(ptr)) (__ptr + (off)); })
+
+
+#define inline		inline		__attribute__((always_inline))
+#define __inline__	__inline__	__attribute__((always_inline))
+#define __inline	__inline	__attribute__((always_inline))
+#define __deprecated	__attribute__((deprecated))
+#define noinline	__attribute__((noinline))
+#define __attribute_pure__	__attribute__((pure))
+#define __attribute_const__	__attribute__((__const__))
+#define __weak		__attribute__((weak))
+#define __noreturn	__attribute__((noreturn))
diff --git a/kernel/include/lwk/compiler-gcc4.h b/kernel/include/lwk/compiler-gcc4.h
new file mode 100644
index 00000000..d7d192e3
--- /dev/null
+++ b/kernel/include/lwk/compiler-gcc4.h
@@ -0,0 +1,24 @@
+/* Never include this file directly.  Include instead. */
+
+/* These definitions are for GCC v4.x. */
+#include
+
+#ifdef CONFIG_FORCED_INLINING
+# undef inline
+# undef __inline__
+# undef __inline
+# define inline		inline		__attribute__((always_inline))
+# define __inline__	__inline__	__attribute__((always_inline))
+# define __inline	__inline	__attribute__((always_inline))
+#endif
+
+#define __used			__attribute__((__used__))
+#define __must_check		__attribute__((warn_unused_result))
+#define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
+#define __always_inline		inline __attribute__((always_inline))
+
+/*
+ * A trick to suppress uninitialized variable warning without generating any
+ * code
+ */
+#define uninitialized_var(x) x = x
diff --git a/kernel/include/lwk/compiler.h b/kernel/include/lwk/compiler.h
new file mode 100644
index 00000000..533a0bfa
--- /dev/null
+++ b/kernel/include/lwk/compiler.h
@@ -0,0 +1,146 @@
+#ifndef _LWK_COMPILER_H
+#define _LWK_COMPILER_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef __CHECKER__
+# define __user		__attribute__((noderef, address_space(1)))
+# define __kernel	/* default address space */
+# define __safe		__attribute__((safe))
+# define __force	__attribute__((force))
+# define __nocast	__attribute__((nocast))
+# define __iomem	__attribute__((noderef, address_space(2)))
+# define __acquires(x)	__attribute__((context(0,1)))
+# define __releases(x)	__attribute__((context(1,0)))
+# define __acquire(x)	__context__(1)
+# define __release(x)	__context__(-1)
+# define __cond_lock(x)	((x) ? ({ __context__(1); 1; }) : 0)
+# define __unused(x)	x __attribute__((unused))
+extern void __chk_user_ptr(void __user *);
+extern void __chk_io_ptr(void __iomem *);
+#else
+# define __user
+# define __kernel
+# define __safe
+# define __force
+# define __nocast
+# define __iomem
+# define __chk_user_ptr(x) (void)0
+# define __chk_io_ptr(x) (void)0
+# define __builtin_warning(x, y...) (1)
+# define __acquires(x)
+# define __releases(x)
+# define __acquire(x) (void)0
+# define __release(x) (void)0
+# define __cond_lock(x) (x)
+# define __unused(x) x
+#endif
+
+#ifdef __KERNEL__
+
+#if __GNUC__ > 4
+#error no compiler-gcc.h file for this gcc version
+#elif __GNUC__ == 4
+# include
+#else
+# error Sorry, your compiler is too old/not recognized.
+#endif
+
+/*
+ * Generic compiler-dependent macros required for kernel
+ * build go below this comment.  Actual compiler/compiler version
+ * specific implementations come from the above header files
+ */
+
+#define likely(x)	__builtin_expect(!!(x), 1)
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+
+/* Optimization barrier */
+#ifndef barrier
+# define barrier() __memory_barrier()
+#endif
+
+#ifndef RELOC_HIDE
+# define RELOC_HIDE(ptr, off)					\
+  ({ unsigned long __ptr;					\
+     __ptr = (unsigned long) (ptr);				\
+     (typeof(ptr)) (__ptr + (off)); })
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __KERNEL__
+/*
+ * Allow us to mark functions as 'deprecated' and have gcc emit a nice
+ * warning for each use, in hopes of speeding the functions removal.
+ * Usage is:
+ *		int __deprecated foo(void)
+ */
+#ifndef __deprecated
+# define __deprecated		/* unimplemented */
+#endif
+
+#ifndef __must_check
+#define __must_check
+#endif
+
+/*
+ * Allow us to avoid 'defined but not used' warnings on functions and data,
+ * as well as force them to be emitted to the assembly file.
+ *
+ * As of gcc 3.4, static functions that are not marked with attribute((used))
+ * may be elided from the assembly file.  As of gcc 3.4, static data not so
+ * marked will not be elided, but this may change in a future gcc version.
+ *
+ * In prior versions of gcc, such functions and data would be emitted, but
+ * would be warned about except with attribute((unused)).
+ */
+#ifndef __used
+# define __used			/* unimplemented */
+#endif
+
+/*
+ * From the GCC manual:
+ *
+ * Many functions have no effects except the return value and their
+ * return value depends only on the parameters and/or global
+ * variables.  Such a function can be subject to common subexpression
+ * elimination and loop optimization just as an arithmetic operator
+ * would be.
+ * [...]
+ */
+#ifndef __attribute_pure__
+# define __attribute_pure__	/* unimplemented */
+#endif
+
+#ifndef noinline
+#define noinline
+#endif
+
+#ifndef __always_inline
+#define __always_inline inline
+#endif
+
+#endif /* __KERNEL__ */
+
+/*
+ * From the GCC manual:
+ *
+ * Many functions do not examine any values except their arguments,
+ * and have no effects except the return value.  Basically this is
+ * just slightly more strict class than the `pure' attribute above,
+ * since function is not allowed to read global memory.
+ *
+ * Note that a function that has pointer arguments and examines the
+ * data pointed to must _not_ be declared `const'.  Likewise, a
+ * function that calls a non-`const' function usually must not be
+ * `const'.  It does not make sense for a `const' function to return
+ * `void'.
+ */
+#ifndef __attribute_const__
+# define __attribute_const__	/* unimplemented */
+#endif
+
+#endif /* _LWK_COMPILER_H */
diff --git a/kernel/include/lwk/futex.h b/kernel/include/lwk/futex.h
new file mode 100644
index 00000000..5f4f2f24
--- /dev/null
+++ b/kernel/include/lwk/futex.h
@@ -0,0 +1,109 @@
+#ifndef _LWK_FUTEX_H
+#define _LWK_FUTEX_H
+
+/** \name Futex Commands
+ * @{
+ */
+#define FUTEX_WAIT		0
+#define FUTEX_WAKE		1
+#define FUTEX_CMP_REQUEUE	4
+#define FUTEX_WAKE_OP		5
+#define FUTEX_WAIT_BITSET	9
+#define FUTEX_WAKE_BITSET	10
+// @}
+
+#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_CLOCK_REALTIME	256
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+
+/** \name Futex Operations, used for FUTEX_WAKE_OP
+ * @{
+ */
+#define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
+#define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
+#define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
+#define FUTEX_OP_ANDN		3	/* *(int *)UADDR2 &= ~OPARG; */
+#define FUTEX_OP_XOR		4	/* *(int *)UADDR2 ^= OPARG; */
+
+#define FUTEX_OP_OPARG_SHIFT	8	/* Use (1 << OPARG) instead of OPARG. */
+
+#define FUTEX_OP_CMP_EQ		0	/* if (oldval == CMPARG) wake */
+#define FUTEX_OP_CMP_NE		1	/* if (oldval != CMPARG) wake */
+#define FUTEX_OP_CMP_LT		2	/* if (oldval < CMPARG) wake */
+#define FUTEX_OP_CMP_LE		3	/* if (oldval <= CMPARG) wake */
+#define FUTEX_OP_CMP_GT		4	/* if (oldval > CMPARG) wake */
+#define FUTEX_OP_CMP_GE		5	/* if (oldval >= CMPARG) wake */
+// @}
+
+/* FUTEX_WAKE_OP will perform atomically
+   int oldval = *(int *)UADDR2;
+   *(int *)UADDR2 = oldval OP OPARG;
+   if (oldval CMP CMPARG)
+     wake UADDR2;  */
+#define FUTEX_OP(op, oparg, cmp, cmparg) \
+	(((op & 0xf) << 28) | ((cmp & 0xf) << 24) \
+	 | ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
+
+/*
+ * bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
+ * match of any bit.
+ */
+#define FUTEX_BITSET_MATCH_ANY	0xffffffff
+
+#ifdef __KERNEL__
+
+#include
+#include
+#include
+#include
+
+#define FUTEX_HASHBITS	8	/* 256 entries in each futex hash tbl */
+
+/** Futex tracking structure.
+ *
+ * A futex has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when list_empty(&futex->link) || futex->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * wake up futex->waitq, then make the second condition true.
+ */
+struct futex {
+	struct list_head	link;
+	struct waitq		waitq;
+	spinlock_t *		lock_ptr;
+	uint32_t __user *	uaddr;
+	uint32_t		bitset;
+};
+
+struct futex_queue {
+	spinlock_t		lock;
+	struct list_head	futex_list;
+};
+
+extern void
+futex_queue_init(
+	struct futex_queue *	queue
+);
+
+extern int
+futex(
+	uint32_t __user *	uaddr,
+	int			op,
+	uint32_t		val,
+	uint64_t		timeout,
+	uint32_t __user *	uaddr2,
+	uint32_t		val2,
+	uint32_t		val3
+);
+
+extern long
+sys_futex(
+	uint32_t __user *	uaddr,
+	int			op,
+	uint32_t		val,
+	struct timespec __user * utime,
+	uint32_t __user *	uaddr2,
+	uint32_t		val3
+);
+
+#endif
+#endif
diff --git a/kernel/include/lwk/stddef.h b/kernel/include/lwk/stddef.h
new file mode 100644
index 00000000..66551f34
--- /dev/null
+++ b/kernel/include/lwk/stddef.h
@@ -0,0 +1,25 @@
+#ifndef _LWK_STDDEF_H
+#define _LWK_STDDEF_H
+
+#include
+
+#undef NULL
+#if defined(__cplusplus)
+#define NULL 0
+#else
+#define NULL ((void *)0)
+#endif
+
+#ifdef __KERNEL__
+#define false	0
+#define true	1
+#endif
+
+#undef offsetof
+#ifdef __compiler_offsetof
+#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
+#else
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#endif
diff --git a/kernel/include/process.h b/kernel/include/process.h
index bbe346df..e081ee21 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -20,6 +20,7 @@
 
 #define PS_NORMAL	(PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)
 
+
 struct vm_range {
 	struct list_head list;
 	unsigned long start, end;
@@ -33,16 +34,9 @@ struct vm_regions {
 	unsigned long brk_start, brk_end;
 	unsigned long map_start, map_end;
 	unsigned long stack_start, stack_end;
-	unsigned long tlsblock_base, tlsblock_limit;
 };
 
-struct process_vm {
-	aal_atomic_t refcount;
-
-	struct page_table *page_table;
-	struct list_head vm_range_list;
-	struct vm_regions region;
-};
+struct process_vm;
 
 struct process {
 	int pid;
@@ -54,13 +48,30 @@ struct process {
 	aal_mc_kernel_context_t ctx;
 	aal_mc_user_context_t *uctx;
 
-	struct list_head sched_list;  // Runqueue
+	// Runqueue list entry
+	struct list_head sched_list;
 
 	struct thread {
 		int *clear_child_tid;
+		unsigned long tlsblock_base, tlsblock_limit;
 	} thread;
 };
 
+#include
+#include
+
+struct process_vm {
+	aal_atomic_t refcount;
+
+	struct page_table *page_table;
+	struct list_head vm_range_list;
+	struct vm_regions region;
+
+	// Address space private futexes
+	struct futex_queue futex_queues[1 << FUTEX_HASHBITS];
+};
+
+
 struct process *create_process(unsigned long user_pc);
 struct process *clone_process(struct process *org, unsigned long pc,
                               unsigned long sp);
diff --git a/kernel/include/rlimit.h b/kernel/include/rlimit.h
new file mode 100644
index 00000000..0631b914
--- /dev/null
+++ b/kernel/include/rlimit.h
@@ -0,0 +1,88 @@
+
+#ifndef __RLIMIT_H
+#define __RLIMIT_H
+
+/* Kinds of resource limit. */
+enum __rlimit_resource
+{
+	/* Per-process CPU limit, in seconds. */
+	RLIMIT_CPU = 0,
+#define RLIMIT_CPU RLIMIT_CPU
+
+	/* Largest file that can be created, in bytes. */
+	RLIMIT_FSIZE = 1,
+#define RLIMIT_FSIZE RLIMIT_FSIZE
+
+	/* Maximum size of data segment, in bytes. */
+	RLIMIT_DATA = 2,
+#define RLIMIT_DATA RLIMIT_DATA
+
+	/* Maximum size of stack segment, in bytes. */
+	RLIMIT_STACK = 3,
+#define RLIMIT_STACK RLIMIT_STACK
+
+	/* Largest core file that can be created, in bytes. */
+	RLIMIT_CORE = 4,
+#define RLIMIT_CORE RLIMIT_CORE
+
+	/* Largest resident set size, in bytes.
+	   This affects swapping; processes that are exceeding their
+	   resident set size will be more likely to have physical memory
+	   taken from them. */
+	__RLIMIT_RSS = 5,
+#define RLIMIT_RSS __RLIMIT_RSS
+
+	/* Number of open files. */
+	RLIMIT_NOFILE = 7,
+	__RLIMIT_OFILE = RLIMIT_NOFILE, /* BSD name for same. */
+#define RLIMIT_NOFILE RLIMIT_NOFILE
+#define RLIMIT_OFILE __RLIMIT_OFILE
+
+	/* Address space limit. */
+	RLIMIT_AS = 9,
+#define RLIMIT_AS RLIMIT_AS
+
+	/* Number of processes. */
+	__RLIMIT_NPROC = 6,
+#define RLIMIT_NPROC __RLIMIT_NPROC
+
+	/* Locked-in-memory address space. */
+	__RLIMIT_MEMLOCK = 8,
+#define RLIMIT_MEMLOCK __RLIMIT_MEMLOCK
+
+	/* Maximum number of file locks. */
+	__RLIMIT_LOCKS = 10,
+#define RLIMIT_LOCKS __RLIMIT_LOCKS
+
+	/* Maximum number of pending signals. */
+	__RLIMIT_SIGPENDING = 11,
+#define RLIMIT_SIGPENDING __RLIMIT_SIGPENDING
+
+	/* Maximum bytes in POSIX message queues. */
+	__RLIMIT_MSGQUEUE = 12,
+#define RLIMIT_MSGQUEUE __RLIMIT_MSGQUEUE
+
+	/* Maximum nice priority allowed to raise to.
+	   Nice levels 19 .. -20 correspond to 0 .. 39
+	   values of this resource limit. */
+	__RLIMIT_NICE = 13,
+#define RLIMIT_NICE __RLIMIT_NICE
+
+	/* Maximum realtime priority allowed for non-privileged
+	   processes. */
+	__RLIMIT_RTPRIO = 14,
+#define RLIMIT_RTPRIO __RLIMIT_RTPRIO
+
+	__RLIMIT_NLIMITS = 15,
+	__RLIM_NLIMITS = __RLIMIT_NLIMITS
+#define RLIMIT_NLIMITS __RLIMIT_NLIMITS
+#define RLIM_NLIMITS __RLIM_NLIMITS
+};
+
+
+struct rlimit {
+	uint64_t rlim_cur;	/* Soft limit */
+	uint64_t rlim_max;	/* Hard limit (ceiling for rlim_cur) */
+};
+
+#endif
diff --git a/kernel/include/waitq.h b/kernel/include/waitq.h
index 066c9540..235b99cd 100644
--- a/kernel/include/waitq.h
+++ b/kernel/include/waitq.h
@@ -5,8 +5,8 @@
 #include
 #include
-#include
 
+struct process;
 struct waitq_entry;
 
 typedef int (*waitq_func_t)(struct waitq_entry *wait, unsigned mode,
diff --git a/kernel/mem.c b/kernel/mem.c
index 359186f9..567b56cf 100644
--- a/kernel/mem.c
+++ b/kernel/mem.c
@@ -45,7 +45,27 @@ static struct aal_mc_pa_ops allocator = {
 
 static void page_fault_handler(unsigned long address, void *regs)
 {
-	kprintf("Page fault for %016lx\n", address);
+	struct vm_range *range, *next;
+	char found = 0;
+
+	kprintf("[%d] Page fault for 0x%lX\n",
+	        aal_mc_get_processor_id(), address);
+
+	list_for_each_entry_safe(range, next,
+	                         &cpu_local_var(current)->vm->vm_range_list,
+	                         list) {
+
+		if (range->start <= address && range->end > address) {
+			kprintf("address is in range, flag: 0x%X!\n", range->flag);
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		kprintf("address is out of range!\n");
\n"); + + /* TODO */ aal_mc_debug_show_interrupt_context(regs); panic("page fault"); diff --git a/kernel/process.c b/kernel/process.c index 7c525f3d..24629ab3 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -19,22 +19,30 @@ extern long do_arch_prctl(unsigned long code, unsigned long address); void init_process_vm(struct process_vm *vm) { + int i; + aal_atomic_set(&vm->refcount, 1); INIT_LIST_HEAD(&vm->vm_range_list); vm->page_table = aal_mc_pt_create(); - vm->region.tlsblock_base = 0; + + /* Initialize futex queues */ + for (i = 0; i < (1 << FUTEX_HASHBITS); ++i) + futex_queue_init(&vm->futex_queues[i]); + } struct process *create_process(unsigned long user_pc) { struct process *proc; - proc = aal_mc_alloc_pages(1, 0); + proc = aal_mc_alloc_pages(3, 0); + if (!proc) + return NULL; memset(proc, 0, sizeof(struct process)); aal_mc_init_user_process(&proc->ctx, &proc->uctx, - ((char *)proc) + PAGE_SIZE, user_pc, 0); + ((char *)proc) + 3 * PAGE_SIZE, user_pc, 0); proc->vm = (struct process_vm *)(proc + 1); @@ -50,7 +58,7 @@ struct process *clone_process(struct process *org, unsigned long pc, proc = aal_mc_alloc_pages(1, 0); - memset(proc, 0, sizeof(struct process)); + memset(proc, 0, sizeof(*proc)); aal_mc_init_user_process(&proc->ctx, &proc->uctx, ((char *)proc) + PAGE_SIZE, pc, sp); @@ -101,9 +109,9 @@ int add_process_memory_range(struct process *process, range->phys = phys; range->flag = flag; - dkprintf("range: %lx - %lx => %lx - %lx\n", + dkprintf("range: 0x%lX - 0x%lX => 0x%lX - 0x%lX (%ld)\n", range->start, range->end, range->phys, range->phys + - range->end - range->start); + range->end - range->start, range->end - range->start); if (flag & VR_REMOTE) { update_process_page_table(process, range, AAL_PTA_REMOTE); @@ -122,14 +130,17 @@ int add_process_memory_range(struct process *process, return 0; } + +#define NR_STACK_PAGES 2 + void init_process_stack(struct process *process) { - char *stack = aal_mc_alloc_pages(1, 0); - unsigned long *p = (unsigned long *)(stack + PAGE_SIZE); + char *stack = aal_mc_alloc_pages(NR_STACK_PAGES, 0); + unsigned long *p = (unsigned long *)(stack + (NR_STACK_PAGES * PAGE_SIZE)); - memset(stack, 0, PAGE_SIZE); + memset(stack, 0, NR_STACK_PAGES * PAGE_SIZE); - add_process_memory_range(process, USER_END - PAGE_SIZE, + add_process_memory_range(process, USER_END - (NR_STACK_PAGES * PAGE_SIZE), USER_END, virt_to_phys(stack), VR_STACK); @@ -147,7 +158,7 @@ void init_process_stack(struct process *process) aal_mc_modify_user_context(process->uctx, AAL_UCR_STACK_POINTER, USER_END - sizeof(unsigned long) * 9); process->vm->region.stack_end = USER_END; - process->vm->region.stack_start = USER_END - PAGE_SIZE; + process->vm->region.stack_start = USER_END - (NR_STACK_PAGES * PAGE_SIZE); } @@ -231,11 +242,11 @@ static void idle(void) { //unsigned int flags; //flags = aal_mc_spinlock_lock(&cpu_status_lock); - cpu_local_var(status) = CPU_STATUS_IDLE; //aal_mc_spinlock_unlock(&cpu_status_lock, flags); while (1) { cpu_enable_interrupt(); schedule(); + cpu_local_var(status) = CPU_STATUS_IDLE; cpu_halt(); } } @@ -307,8 +318,13 @@ void schedule(void) prev ? prev->pid : 0, next ? 
 
 	aal_mc_load_page_table(next->vm->page_table);
-	do_arch_prctl(ARCH_SET_FS, next->vm->region.tlsblock_base);
-	cpu_local_var(status) = CPU_STATUS_RUNNING;
+
+	kprintf("[%d] schedule: tlsblock_base: 0x%lX\n",
+	        aal_mc_get_processor_id(), next->thread.tlsblock_base);
+	do_arch_prctl(ARCH_SET_FS, next->thread.tlsblock_base);
+
+	if (next != &cpu_local_var(idle))
+		cpu_local_var(status) = CPU_STATUS_RUNNING;
 
 	if (prev) {
 		aal_mc_switch_context(&prev->ctx, &next->ctx);
@@ -355,6 +371,7 @@ void __runq_add_proc(struct process *proc, int cpu_id)
 	++v->runq_len;
 	proc->cpu_id = cpu_id;
 	proc->status = PS_RUNNING;
+	get_cpu_local_var(cpu_id)->status = CPU_STATUS_RUNNING;
 
 	dkprintf("runq_add_proc(): pid %d added to CPU[%d]'s runq\n",
 	         proc->pid, cpu_id);
diff --git a/kernel/syscall.c b/kernel/syscall.c
index a8e6936b..9c9e008f 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -12,9 +13,12 @@
 #include
 #include
 #include
+#include
+#include
 
 /* Headers taken from kitten LWK */
 #include
+#include
 
 #define SYSCALL_BY_IKC
 
@@ -26,6 +30,10 @@
 #define dkprintf(...)
 #endif
 
+static aal_spinlock_t sysc_lock = { 0 };
+
+static aal_atomic_t pid_cnt = AAL_ATOMIC_INIT(1024);
+
 int memcpy_async(unsigned long dest, unsigned long src,
                  unsigned long len, int wait, unsigned long *notify);
 
@@ -58,6 +66,7 @@ static void send_syscall(struct syscall_request *req)
 	packet.arg = cpu_local_var(scp).request_rpa;
 
 	aal_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
+	//aal_ikc_send(get_cpu_local_var(0)->syscall_channel, &packet, 0);
 #endif
 }
 
@@ -151,8 +160,59 @@ SYSCALL_DECLARE(open)
 	SYSCALL_FOOTER;
 }
 
+static DECLARE_WAITQ(my_waitq);
+
 SYSCALL_DECLARE(ioctl)
 {
+
+	switch (aal_mc_syscall_arg0(ctx)) {
+
+	case 0: {
+		struct waitq_entry my_wait;
+		waitq_init_entry(&my_wait, cpu_local_var(current));
+
+		dkprintf("CPU[%d] pid[%d] going to sleep...\n",
+		         cpu_local_var(current)->cpu_id,
+		         cpu_local_var(current)->pid);
+
+		waitq_prepare_to_wait(&my_waitq, &my_wait, PS_INTERRUPTIBLE);
+		schedule();
+
+		waitq_finish_wait(&my_waitq, &my_wait);
+
+		dkprintf("CPU[%d] pid[%d] woke up!\n",
+		         cpu_local_var(current)->cpu_id,
+		         cpu_local_var(current)->pid);
+
+		break;
+	}
+
+	case 1:
+
+		dkprintf("CPU[%d] pid[%d] waking up everyone..\n",
+		         cpu_local_var(current)->cpu_id,
+		         cpu_local_var(current)->pid);
+
+		waitq_wakeup(&my_waitq);
+
+		break;
+
+	case 2:
+
+		dkprintf("[%d] pid %d made an ioctl\n",
+		         cpu_local_var(current)->cpu_id,
+		         cpu_local_var(current)->pid);
+
+		break;
+
+	default:
+		dkprintf("ioctl() unimplemented\n");
+
+	}
+
+	return 0;
+
+#if 0
 	SYSCALL_HEADER;
 
 	/* Very ad-hoc for termios */
@@ -163,6 +223,7 @@ SYSCALL_DECLARE(ioctl)
 	}
 
 	return -EINVAL;
+#endif
 }
 
 SYSCALL_DECLARE(read)
@@ -195,9 +256,14 @@ SYSCALL_DECLARE(pwrite)
 
 SYSCALL_DECLARE(close)
 {
+	kprintf("[%d] close()\n", aal_mc_get_processor_id());
+	return -EBADF;
+
+/*
 	SYSCALL_HEADER;
 	SYSCALL_ARGS_1(D);
 	SYSCALL_FOOTER;
+*/
 }
 
 SYSCALL_DECLARE(lseek)
@@ -322,6 +388,11 @@ long do_arch_prctl(unsigned long code, unsigned long address)
 
 	switch (code) {
 		case ARCH_SET_FS:
+			kprintf("[%d] arch_prctl: ARCH_SET_FS: 0x%lX\n",
+			        aal_mc_get_processor_id(), address);
+			cpu_local_var(current)->thread.tlsblock_base = address;
+			err = aal_mc_arch_set_special_register(type, address);
+			break;
 		case ARCH_SET_GS:
 			err = aal_mc_arch_set_special_register(type, address);
 			break;
@@ -390,63 +461,73 @@ SYSCALL_DECLARE(clone)
 
 SYSCALL_DECLARE(clone)
 {
-	int i;
-	int cpuid = -1;
-	int clone_flags = aal_mc_syscall_arg0(ctx);
-	//unsigned long flags; /* spinlock */
-	struct aal_mc_cpu_info *cpu_info = aal_mc_get_cpu_info();
-	struct process *new;
+	int i;
+	int cpuid = -1;
+	int clone_flags = aal_mc_syscall_arg0(ctx);
+	//unsigned long flags; /* spinlock */
+	struct aal_mc_cpu_info *cpu_info = aal_mc_get_cpu_info();
+	struct process *new;
 
-	kputs(";sys_clone\n");
+	dkprintf("[%d] clone(): stack_pointer: 0x%lX\n",
+	         aal_mc_get_processor_id(),
+	         (unsigned long)aal_mc_syscall_arg1(ctx));
 
 	//flags = aal_mc_spinlock_lock(&cpu_status_lock);
 	for (i = 0; i < cpu_info->ncpus; i++) {
-		if(get_cpu_local_var(i)->status == CPU_STATUS_IDLE)
+		if (get_cpu_local_var(i)->status == CPU_STATUS_IDLE) {
 			cpuid = i;
+			break;
+		}
 	}
-	if(cpuid < 0) return -EAGAIN;
+
+	if (cpuid < 0)
+		return -EAGAIN;
+
 	new = clone_process(cpu_local_var(current), aal_mc_syscall_pc(ctx),
 	                    aal_mc_syscall_arg1(ctx));
+
+	if (!new) {
+		return -ENOMEM;
+	}
 
-	/* TODO: allocate new pid */
-	new->pid = 0xc107e;
-
-	if (clone_flags & CLONE_SETTLS) {
-		dkprintf("clone_flags & CLONE_SETTLS\n");
+	/* Allocate new pid */
+	new->pid = aal_atomic_inc_return(&pid_cnt);
+
+	if (clone_flags & CLONE_PARENT_SETTID) {
+		dkprintf("clone_flags & CLONE_PARENT_SETTID: 0x%lX\n",
+		         (unsigned long)aal_mc_syscall_arg2(ctx));
 
-		new->vm->region.tlsblock_base
+		*(int*)aal_mc_syscall_arg2(ctx) = new->pid;
+	}
+
+	if (clone_flags & CLONE_CHILD_CLEARTID) {
+		dkprintf("clone_flags & CLONE_CHILD_CLEARTID: 0x%lX\n",
+		         (unsigned long)aal_mc_syscall_arg3(ctx));
+
+		new->thread.clear_child_tid = (int*)aal_mc_syscall_arg3(ctx);
+	}
+
+	if (clone_flags & CLONE_SETTLS) {
+		dkprintf("clone_flags & CLONE_SETTLS: 0x%lX\n",
+		         (unsigned long)aal_mc_syscall_arg4(ctx));
+
+		new->thread.tlsblock_base = (unsigned long)aal_mc_syscall_arg4(ctx);
 	}
-	else
-		new->vm->region.tlsblock_base = 0;
-
-	if (clone_flags & CLONE_PARENT_SETTID) {
-		unsigned long pptid;
-		int *vptid;
-		if (aal_mc_pt_virt_to_phys(cpu_local_var(current)->vm->page_table,
-		                           (int*)aal_mc_syscall_arg2(ctx), &pptid))
-			return -EFAULT;
-
-		vptid = (int *)phys_to_virt(pptid);
-		*vptid = 1;
+	else {
+		new->thread.tlsblock_base = 0;
 	}
 
-	new->thread.clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID)
-		? (int*)aal_mc_syscall_arg3(ctx)
-		: NULL;
-
 	aal_mc_syscall_ret(new->uctx) = 0;
+
 	runq_add_proc(new, cpuid);
+	//get_cpu_local_var(cpuid)->next = new;
 
-	//get_cpu_local_var(cpuid)->status = CPU_STATUS_RUNNING;
 	//aal_mc_spinlock_unlock(&cpu_status_lock, flags);
-	aal_mc_interrupt_cpu(aal_mc_get_cpu_info()->hw_ids[cpuid], 0xd1);
 
 	dkprintf("clone: kicking scheduler!\n");
-	while (1) { cpu_halt(); }
+	aal_mc_interrupt_cpu(get_x86_cpu_local_variable(cpuid)->apic_id, 0xd1);
+
+	//while (1) { cpu_halt(); }
 
 	return new->pid;
 }
@@ -459,6 +540,7 @@ SYSCALL_DECLARE(set_tid_address)
 	return cpu_local_var(current)->pid;
 }
 
+
 SYSCALL_DECLARE(set_robust_list)
 {
 	return -ENOSYS;
@@ -501,6 +583,94 @@ SYSCALL_DECLARE(writev)
 	return ret;
 }
 
+SYSCALL_DECLARE(futex)
+{
+	// TODO: timespec support!
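+	// Until the LWK grows timers, the dummy timeout below is passed
+	// through unchanged; futex_wait() ignores it and simply sleeps
+	// until it is explicitly woken (see the schedule() call and the
+	// time_remain fixup there).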
+	//struct timespec _utime;
+	uint64_t timeout = 1000;	// MAX_SCHEDULE_TIMEOUT;
+	uint32_t val2 = 0;
+
+	uint32_t *uaddr = (uint32_t *)aal_mc_syscall_arg0(ctx);
+	int op = (int)aal_mc_syscall_arg1(ctx);
+	uint32_t val = (uint32_t)aal_mc_syscall_arg2(ctx);
+	//struct timespec __user *utime = aal_mc_syscall_arg3(ctx);
+	uint32_t *uaddr2 = (uint32_t *)aal_mc_syscall_arg4(ctx);
+	uint32_t val3 = (uint32_t)aal_mc_syscall_arg5(ctx);
+
+	/* Mask off the FUTEX_PRIVATE_FLAG,
+	 * assume all futexes are address space private */
+	op = (op & FUTEX_CMD_MASK);
+
+#if 0
+	if (utime && (op == FUTEX_WAIT)) {
+		if (copy_from_user(&_utime, utime, sizeof(_utime)) != 0)
+			return -EFAULT;
+		if (!timespec_valid(&_utime))
+			return -EINVAL;
+		timeout = timespec_to_ns(_utime);
+	}
+#endif
+
+	/* Requeue parameter in 'utime' if op == FUTEX_CMP_REQUEUE.
+	 * number of waiters to wake in 'utime' if op == FUTEX_WAKE_OP. */
+	if (op == FUTEX_CMP_REQUEUE || op == FUTEX_WAKE_OP)
+		val2 = (uint32_t) (unsigned long) aal_mc_syscall_arg3(ctx);
+
+	return futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+}
+
+SYSCALL_DECLARE(exit)
+{
+	/* If there is a clear_child_tid address set, clear it and wake it.
+	 * This unblocks any pthread_join() waiters. */
+	if (cpu_local_var(current)->thread.clear_child_tid) {
+
+		kprintf("exit clear_child!\n");
+
+		*cpu_local_var(current)->thread.clear_child_tid = 0;
+		barrier();
+		futex((uint32_t *)cpu_local_var(current)->thread.clear_child_tid,
+		      FUTEX_WAKE, 1, 0, NULL, 0, 0);
+	}
+
+	runq_del_proc(cpu_local_var(current), cpu_local_var(current)->cpu_id);
+	free_process_memory(cpu_local_var(current));
+
+	cpu_local_var(current) = NULL;
+	schedule();
+
+	return 0;
+}
+
+SYSCALL_DECLARE(getrlimit)
+{
+	int ret;
+	int resource = aal_mc_syscall_arg0(ctx);
+	struct rlimit *rlm = (struct rlimit *)aal_mc_syscall_arg1(ctx);
+
+	switch (resource) {
+
+	case RLIMIT_STACK:
+
+		dkprintf("[%d] getrlimit() RLIMIT_STACK\n", aal_mc_get_processor_id());
+		rlm->rlim_cur = (1024*1024);
+		rlm->rlim_max = (16384*1024);
+		ret = 0;
+		break;
+
+	default:
+
+		return -ENOSYS;
+	}
+
+	return ret;
+}
+
+SYSCALL_DECLARE(noop)
+{
+	kprintf("noop()\n");
+	return -EFAULT;
+}
 
 static long (*syscall_table[])(int, aal_mc_user_context_t *) = {
 	[0] = sys_read,
@@ -513,13 +683,17 @@ static long (*syscall_table[])(int, aal_mc_user_context_t *) = {
 	[10] = sys_mprotect,
 	[11] = sys_munmap,
 	[12] = sys_brk,
+	[14] = sys_noop,
 	[16] = sys_ioctl,
 	[17] = sys_pread,
 	[18] = sys_pwrite,
 	[20] = sys_writev,
+	[28] = sys_noop,
 	[39] = sys_getpid,
 	[56] = sys_clone,
+	[60] = sys_exit,
 	[63] = sys_uname,
+	[97] = sys_getrlimit,
 	[102] = sys_getxid,
 	[104] = sys_getxid,
 	[107] = sys_getxid,
@@ -527,6 +701,7 @@ static long (*syscall_table[])(int, aal_mc_user_context_t *) = {
 	[110] = sys_getxid,
 	[111] = sys_getxid,
 	[158] = sys_arch_prctl,
+	[202] = sys_futex,
 	[218] = sys_set_tid_address,
 	[231] = sys_exit_group,
 	[273] = sys_set_robust_list,
@@ -563,7 +738,6 @@ long syscall(int num, aal_mc_user_context_t *ctx)
 	if (syscall_table[num]) {
 		l = syscall_table[num](num, ctx);
 		dkprintf(" %lx\n", l);
-		return l;
 	} else {
 		dkprintf("USC[%3d](%lx, %lx, %lx, %lx, %lx) @ %lx | %lx\n", num,
 		         aal_mc_syscall_arg0(ctx), aal_mc_syscall_arg1(ctx),
@@ -571,8 +745,10 @@ long syscall(int num, aal_mc_user_context_t *ctx)
 		         aal_mc_syscall_arg4(ctx), aal_mc_syscall_pc(ctx),
 		         aal_mc_syscall_sp(ctx));
 		//while(1);
-		return -ENOSYS;
+		l = -ENOSYS;
 	}
+
+	return l;
 }
 
 void __host_update_process_range(struct process *process,