From 83db56a040c7d66342f8b061c6bb4ecda37b59ac Mon Sep 17 00:00:00 2001
From: Balazs Gerofi <bgerofi@riken.jp>
Date: Fri, 10 May 2013 14:23:14 +0900
Subject: [PATCH] futex adaptation from Linux 2.6.34 (Intel MPSS Linux)

---
 kernel/Makefile.build    |    2 +-
 kernel/futex.c           | 1048 ++++++++++++++++++++++++++------------
 kernel/include/futex.h   |   80 ++-
 kernel/include/jhash.h   |  145 ++++++
 kernel/include/plist.h   |  273 ++++++++++
 kernel/include/process.h |    3 -
 kernel/init.c            |    2 +
 kernel/plist.c           |  123 +++++
 kernel/process.c         |    9 +-
 kernel/syscall.c         |   62 +--
 10 files changed, 1341 insertions(+), 406 deletions(-)
 create mode 100644 kernel/include/jhash.h
 create mode 100644 kernel/include/plist.h
 create mode 100644 kernel/plist.c

diff --git a/kernel/Makefile.build b/kernel/Makefile.build
index 10a6c84c..36bf10b2 100644
--- a/kernel/Makefile.build
+++ b/kernel/Makefile.build
@@ -1,6 +1,6 @@
 IHKDIR=$(IHKBASE)/$(TARGETDIR)
 OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
-OBJS += process.o copy.o waitq.o futex.o timer.o
+OBJS += process.o copy.o waitq.o futex.o timer.o plist.o
 DEPSRCS=$(wildcard $(SRC)/*.c)
 CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
diff --git a/kernel/futex.c b/kernel/futex.c
index 91e4dd50..19fc874e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1,15 +1,7 @@
 /*
- * Kitten LWK futex code adaptation.
- * Copyright (c) 2012 RIKEN AICS
- */
-
-/*
- * Copyright (c) 2008 Sandia National Laboratories
- *
- * Futex code adapted from Linux 2.6.27.9, original copyright below.
- * Simplified to only support address-space (process-private) futexes.
- * Removed demand-paging, cow, etc. complications since LWK doesn't
- * require these.
+ * Linux futex adaptation.
+ * (C) Copyright 2013 RIKEN AICS
+ * Balazs Gerofi <bgerofi@riken.jp>
  */
 
 /*
@@ -33,6 +25,10 @@
  * PRIVATE futexes by Eric Dumazet
  * Copyright (C) 2007 Eric Dumazet
  *
+ * Requeue-PI support by Darren Hart
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -57,90 +53,555 @@
 #include
 #include
-#include
+#include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
-#if 0
-#include
-#include
-#include
-#include
-#include
-#include
+//#define DEBUG_PRINT_FUTEX
 
-#ifdef __UACCESS__
-#include
-#endif
-
-#endif
-
-void futex_queue_init(struct futex_queue *queue)
-{
-	ihk_mc_spinlock_init(&queue->lock);
-	INIT_LIST_HEAD(&queue->futex_list);
-}
-
-static int uaddr_is_valid(uint32_t __user *uaddr)
-{
-#ifdef __UACCESS__
-	return access_ok(VERIFY_WRITE, uaddr, sizeof(uint32_t));
+#ifdef DEBUG_PRINT_FUTEX
+#define dkprintf kprintf
 #else
-	return 1;
+#define dkprintf(...)
 #endif
+
+int futex_cmpxchg_enabled;
+
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @task:		the task waiting on the futex
+ * @lock_ptr:		the hash bucket lock
+ * @key:		the key the futex is hashed on
+ * @requeue_pi_key:	the requeue_pi target futex key
+ * @bitset:		bitset for the optional bitmasked wakeup
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * the second.
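+ *
+ * A waiter can therefore check for its own wakeup without taking any
+ * lock, e.g. (illustrative sketch only; the real lock-free check lives
+ * in unqueue_me() below):
+ *
+ *	if (plist_node_empty(&q->list) || q->lock_ptr == NULL)
+ *		return;	/* already woken, the futex_q is ours again */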
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See unqueue_me_pi().
+ */
+struct futex_q {
+	struct plist_node list;
+
+	struct process *task;
+	ihk_spinlock_t *lock_ptr;
+	union futex_key key;
+	union futex_key *requeue_pi_key;
+	uint32_t bitset;
+};
+
+/*
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
+ */
+struct futex_hash_bucket {
+	ihk_spinlock_t lock;
+	struct plist_head chain;
+};
+
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+
+/*
+ * We hash on the keys returned from get_futex_key (see below).
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+	uint32_t hash = jhash2((uint32_t*)&key->both.word,
+			(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+			key->both.offset);
+	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+}
 
-static int futex_init(struct futex *futex, uint32_t __user *uaddr,
-		      uint32_t bitset)
+/*
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-	if (!uaddr_is_valid(uaddr))
-		return -EINVAL;
+	return (key1 && key2
+		&& key1->both.word == key2->both.word
+		&& key1->both.ptr == key2->both.ptr
+		&& key1->both.offset == key2->both.offset);
+}
+
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+	/* RIKEN: only !fshared futexes... */
+	return;
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+	/* RIKEN: only !fshared futexes... */
+	return;
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr:	virtual address of the futex
+ * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @key:	address where result is stored.
+ *
+ * Returns a negative error code or 0.
+ * The key words are stored in *key on success.
+ *
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * offset_within_page). For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+static int
+get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
+{
+	unsigned long address = (unsigned long)uaddr;
+	struct process_vm *mm = cpu_local_var(current)->vm;
+
+	/*
+	 * The futex address must be "naturally" aligned.
+	 */
+	key->both.offset = address % PAGE_SIZE;
+	if ((address % sizeof(uint32_t)) != 0)
+		return -EINVAL;
+	address -= key->both.offset;
+
+	/*
+	 * PROCESS_PRIVATE futexes are fast.
+	 * As the mm cannot disappear under us and the 'key' only needs
+	 * the virtual address, we don't even have to find the underlying vma.
+	 * Note: we do have to check 'uaddr' is a valid user address,
+	 * but access_ok() should be faster than find_vma().
+	 */
+	if (!fshared) {
+
+		key->private.mm = mm;
+		key->private.address = address;
+		get_futex_key_refs(key);
+		return 0;
+	}
+
+	/* RIKEN: No shared futex support...
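+	 * (a shared futex would need the (inode, pgoff, offset) form of
+	 * union futex_key; that variant is kept under #if 0 in
+	 * include/futex.h for reference)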
*/ + return -EFAULT; +} + + +static inline +void put_futex_key(int fshared, union futex_key *key) +{ + drop_futex_key_refs(key); +} + +static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uint32_t newval) +{ + int curval; + + /* RIKEN: futexes are on not swappable memory */ + curval = futex_atomic_cmpxchg_inatomic((int*)uaddr, (int)uval, (int)newval); + + return curval; +} + +static int get_futex_value_locked(uint32_t *dest, uint32_t *from) +{ + /* RIKEN: futexes are always on not swappable pages */ + *dest = *from; - futex->uaddr = uaddr; - futex->bitset = bitset; - waitq_init(&futex->waitq); return 0; } -static struct futex_queue *get_queue(uint32_t __user *uaddr) +/* + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. + */ +static void wake_futex(struct futex_q *q) { - uint64_t hash = hash_64((uint64_t)uaddr, FUTEX_HASHBITS); - return &cpu_local_var(current)->vm->futex_queues[hash]; + struct process *p = q->task; + + /* + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. + */ + + plist_del(&q->list, &q->list.plist); + /* + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. + */ + barrier(); + q->lock_ptr = NULL; + + sched_wakeup_process(p, PS_NORMAL); } -static struct futex_queue *queue_lock(struct futex *futex, int *irqflags) +/* + * Express the locking dependencies for lockdep: + */ +static inline void +double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - struct futex_queue *queue = get_queue(futex->uaddr); - futex->lock_ptr = &queue->lock; - *irqflags = ihk_mc_spinlock_lock(&queue->lock); - return queue; + if (hb1 <= hb2) { + ihk_mc_spinlock_lock_noirq(&hb1->lock); + if (hb1 < hb2) + ihk_mc_spinlock_lock_noirq(&hb2->lock); + } else { /* hb1 > hb2 */ + ihk_mc_spinlock_lock_noirq(&hb2->lock); + ihk_mc_spinlock_lock_noirq(&hb1->lock); + } } -static void queue_unlock(struct futex_queue *futex_queue, int irqflags) +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - ihk_mc_spinlock_unlock(&futex_queue->lock, irqflags); + ihk_mc_spinlock_unlock_noirq(&hb1->lock); + if (hb1 != hb2) + ihk_mc_spinlock_unlock_noirq(&hb2->lock); } -static void queue_me(struct futex *futex, struct futex_queue *futex_queue) +/* + * Wake up waiters matching bitset queued on this futex (uaddr). + */ +static int futex_wake(uint32_t *uaddr, int fshared, int nr_wake, uint32_t bitset) { - list_add_tail(&futex->link, &futex_queue->futex_list); + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + struct plist_head *head; + union futex_key key = FUTEX_KEY_INIT; + int ret; + + if (!bitset) + return -EINVAL; + + ret = get_futex_key(uaddr, fshared, &key); + if ((ret != 0)) + goto out; + + hb = hash_futex(&key); + ihk_mc_spinlock_lock_noirq(&hb->lock); + head = &hb->chain; + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key)) { + + /* RIKEN: no pi state... 
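+			 * (wake-side PI handling would also require the
+			 * rt_mutex machinery; the FUTEX_*_PI commands are
+			 * stubbed out in futex() below)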
*/ + /* Check if one of the bits is set in both bitsets */ + if (!(this->bitset & bitset)) + continue; + + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + ihk_mc_spinlock_unlock_noirq(&hb->lock); + put_futex_key(fshared, &key); +out: + return ret; } -static int unqueue_me(struct futex *futex) +/* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int +futex_wake_op(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, + int nr_wake, int nr_wake2, int op) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head; + struct futex_q *this, *next; + int ret, op_ret; + +retry: + ret = get_futex_key(uaddr1, fshared, &key1); + if ((ret != 0)) + goto out; + ret = get_futex_key(uaddr2, fshared, &key2); + if ((ret != 0)) + goto out_put_key1; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + +retry_private: + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, (int*)uaddr2); + if ((op_ret < 0)) { + + double_unlock_hb(hb1, hb2); + + if ((op_ret != -EFAULT)) { + ret = op_ret; + goto out_put_keys; + } + + /* RIKEN: set ret to 0 as if fault_in_user_writeable() returned it */ + ret = 0; + + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + goto retry; + } + + head = &hb1->chain; + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key1)) { + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + head = &hb2->chain; + + op_ret = 0; + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key2)) { + wake_futex(this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + + double_unlock_hb(hb1, hb2); +out_put_keys: + put_futex_key(fshared, &key2); +out_put_key1: + put_futex_key(fshared, &key1); +out: + return ret; +} + +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if ((&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.spinlock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. 
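+ *
+ * A typical user of the non-PI requeue path is glibc's
+ * pthread_cond_broadcast(), which wakes one waiter and requeues the rest
+ * onto the mutex word, roughly (field names hypothetical):
+ *
+ *	futex(&cond->futex, FUTEX_CMP_REQUEUE, 1, INT_MAX,
+ *	      &mutex->lock, cond->futex);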
+ * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error + */ +static int futex_requeue(uint32_t *uaddr1, int fshared, uint32_t *uaddr2, + int nr_wake, int nr_requeue, uint32_t *cmpval, + int requeue_pi) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head1; + struct futex_q *this, *next; + + ret = get_futex_key(uaddr1, fshared, &key1); + if ((ret != 0)) + goto out; + ret = get_futex_key(uaddr2, fshared, &key2); + if ((ret != 0)) + goto out_put_key1; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + + double_lock_hb(hb1, hb2); + + if ((cmpval != NULL)) { + uint32_t curval; + + ret = get_futex_value_locked(&curval, uaddr1); + + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + head1 = &hb1->chain; + plist_for_each_entry_safe(this, next, head1, list) { + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) + continue; + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + /* RIKEN: no requeue_pi at this moment */ + if (++task_count <= nr_wake) { + wake_futex(this); + continue; + } + + requeue_futex(this, hb1, hb2, &key2); + drop_count++; + } + +out_unlock: + double_unlock_hb(hb1, hb2); + + /* + * drop_futex_key_refs() must be called outside the spinlocks. During + * the requeue we moved futex_q's from the hash bucket at key1 to the + * one at key2 and updated their key pointer. We no longer need to + * hold the references to key1. + */ + while (--drop_count >= 0) + drop_futex_key_refs(&key1); + + put_futex_key(fshared, &key2); +out_put_key1: + put_futex_key(fshared, &key1); +out: + return ret ? ret : task_count; +} + +/* The key must be already stored in q->key. */ +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + get_futex_key_refs(&q->key); + hb = hash_futex(&q->key); + q->lock_ptr = &hb->lock; + + ihk_mc_spinlock_lock_noirq(&hb->lock); + return hb; +} + +static inline void +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +{ + ihk_mc_spinlock_unlock_noirq(&hb->lock); + drop_futex_key_refs(&q->key); +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +{ + int prio; + + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. + * + * RIKEN: no priorities at the moment, everyone is 10. 
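+	 *
+	 * For reference, Linux 2.6.34 derives the value as
+	 *	prio = min(current->normal_prio, MAX_RT_PRIO);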
+ */ + prio = 10; + + plist_node_init(&q->list, prio); +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.spinlock = &hb->lock; +#endif + plist_add(&q->list, &hb->chain); + q->task = cpu_local_var(current); + ihk_mc_spinlock_unlock_noirq(&hb->lock); +} + +/** + * unqueue_me() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must + * be paired with exactly one earlier call to queue_me(). + * + * Returns: + * 1 - if the futex_q was still queued (and we removed unqueued it) + * 0 - if the futex_q was already removed by the waking thread + */ +static int unqueue_me(struct futex_q *q) { ihk_spinlock_t *lock_ptr; - int irqflags; - int status = 0; + int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = futex->lock_ptr; + lock_ptr = q->lock_ptr; barrier(); if (lock_ptr != NULL) { - irqflags = ihk_mc_spinlock_lock(lock_ptr); + ihk_mc_spinlock_lock_noirq(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This @@ -154,95 +615,46 @@ retry: * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ - if (lock_ptr != futex->lock_ptr) { - ihk_mc_spinlock_unlock(lock_ptr, irqflags); + if (lock_ptr != q->lock_ptr) { + ihk_mc_spinlock_unlock_noirq(lock_ptr); goto retry; } + plist_del(&q->list, &q->list.plist); - //WARN_ON(list_empty(&futex->link)); - list_del(&futex->link); - ihk_mc_spinlock_unlock(lock_ptr, irqflags); - status = 1; + ihk_mc_spinlock_unlock_noirq(lock_ptr); + ret = 1; } - return status; + drop_futex_key_refs(&q->key); + return ret; } -static void lock_two_queues(struct futex_queue *queue1, int *irqflags1, - struct futex_queue *queue2, int *irqflags2) -{ - if (queue1 < queue2) - *irqflags1 = ihk_mc_spinlock_lock(&queue1->lock); - - *irqflags2 = ihk_mc_spinlock_lock(&queue2->lock); - - if (queue1 > queue2) - *irqflags1 = ihk_mc_spinlock_lock(&queue1->lock); -} +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ -static void unlock_two_queues(struct futex_queue *queue1, int irqflags1, - struct futex_queue *queue2, int irqflags2) +/* RIKEN: this function has been rewritten so that it returns the remaining + * time in case we are waken. + */ +static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + uint64_t timeout) { - if (queue1 == queue2) { - ihk_mc_spinlock_unlock(&queue2->lock, irqflags2); - } - else { - ihk_mc_spinlock_unlock(&queue2->lock, irqflags2); - ihk_mc_spinlock_unlock(&queue1->lock, irqflags1); - } -} - -/** Puts a task to sleep waiting on a futex. */ -static int futex_wait(uint32_t __user *uaddr, uint32_t val, - uint64_t timeout, uint32_t bitset) -{ - DECLARE_WAITQ_ENTRY(wait, cpu_local_var(current)); - int status; - uint32_t uval; - struct futex futex; - struct futex_queue *queue; - int irqflags; uint64_t time_remain = 0; - - if (!bitset) - return -EINVAL; - - /* This verifies that uaddr is sane */ - if ((status = futex_init(&futex, uaddr, bitset)) != 0) - return status; - - /* Lock the futex queue corresponding to uaddr */ - queue = queue_lock(&futex, &irqflags); - - /* Get the value from user-space. 
Since we don't have - * paging, the only options are for this to succeed (with no - * page faults) or fail, returning -EFAULT. There is no way - * for us to be put to sleep, so holding the queue's spinlock - * is fine. */ -#ifdef __UACCESS__ - if ((status = get_user(uval, uaddr)) != 0) - goto error; -#else - uval = *uaddr; - status = 0; -#endif - - /* The user-space value must match the value passed in */ - if (uval != val) { - status = -EWOULDBLOCK; - goto error; - } + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using set_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + xchg4(&(cpu_local_var(current)->status), PS_INTERRUPTIBLE); + queue_me(q, hb); - /* Add ourself to the futex's waitq and go to sleep */ - cpu_local_var(current)->status = PS_INTERRUPTIBLE; - waitq_add_entry(&futex.waitq, &wait); - - /* Add ourself to the futex queue and drop our lock on it */ - queue_me(&futex, queue); - queue_unlock(queue, irqflags); - - if (!list_empty(&futex.link)) { + if (!plist_node_empty(&q->list)) { + /* RIKEN: use mcos timers */ if (timeout) { time_remain = schedule_timeout(timeout); } @@ -251,221 +663,217 @@ static int futex_wait(uint32_t __user *uaddr, uint32_t val, time_remain = 0; } } - - cpu_local_var(current)->status = PS_RUNNING; - - /* - * NOTE: We don't remove ourself from the waitq because - * we are the only user of it. - */ - /* If we were woken (and unqueued), we succeeded, whatever. */ - if (!unqueue_me(&futex)) - return 0; - - if (time_remain == 0) - return -ETIMEDOUT; - - /* We expect that there is a signal pending, but another thread - * may have handled it for us already. */ - return -EINTR; - -error: - queue_unlock(queue, irqflags); - return status; + /* This does not need to be serialized */ + cpu_local_var(current)->status = PS_RUNNING; + + return time_remain; } -/* - * The futex_queue's lock must be held when this is called. - * Afterwards, the futex_queue must not be accessed. +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked */ -static void wake_futex(struct futex *futex) +static int futex_wait_setup(uint32_t __user *uaddr, uint32_t val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) { - list_del_init(&futex->link); + uint32_t uval; + int ret; + /* - * The lock in waitq_wakeup() is a crucial memory barrier after the - * list_del_init() and also before assigning to futex->lock_ptr. - */ - waitq_wakeup(&futex->waitq); - /* - * The waiting task can free the futex as soon as this is written, - * without taking any locks. This must come last. + * Access the page AFTER the hash-bucket is locked. + * Order is important: * - * A memory barrier is required here to prevent the following store - * to lock_ptr from getting ahead of the wakeup. 
Clearing the lock - * at the end of waitq_wakeup() does not prevent this store from - * moving. + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we queued after testing *uaddr, that would open + * a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * A consequence is that futex_wait() can return zero and absorb + * a wakeup when *uaddr != val on entry to the syscall. This is + * rare, but normal. */ - barrier(); - futex->lock_ptr = NULL; + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key); + if ((ret != 0)) + return ret; + + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + /* RIKEN: get_futex_value_locked() always returns 0 on mckernel */ + + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } + + if (ret) + put_futex_key(fshared, &q->key); + return ret; } -/** Wakes up nr_wake tasks waiting on a futex. */ -static int futex_wake(uint32_t __user *uaddr, int nr_wake, uint32_t bitset) +static int futex_wait(uint32_t __user *uaddr, int fshared, + uint32_t val, uint64_t timeout, uint32_t bitset, int clockrt) { - struct futex_queue *queue; - struct list_head *head; - struct futex *this, *next; - int nr_woke = 0; - int irqflags; + struct futex_hash_bucket *hb; + struct futex_q q; + uint64_t time_remain; + int ret; if (!bitset) return -EINVAL; - if (!uaddr_is_valid(uaddr)) - return -EINVAL; + q.bitset = bitset; + q.requeue_pi_key = NULL; - queue = get_queue(uaddr); - irqflags = ihk_mc_spinlock_lock(&queue->lock); - head = &queue->futex_list; + /* RIKEN: futex_wait_queue_me() calls schedule_timeout() if timer is set */ - list_for_each_entry_safe(this, next, head, link) { - if ((this->uaddr == uaddr) && (this->bitset & bitset)) { - wake_futex(this); - if (++nr_woke >= nr_wake) - break; - } - } +retry: + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out; - ihk_mc_spinlock_unlock(&queue->lock, irqflags); - return nr_woke; + /* queue_me and wait for wakeup, timeout, or a signal. */ + time_remain = futex_wait_queue_me(hb, &q, timeout); + + /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; + if (!unqueue_me(&q)) + goto out_put_key; + ret = -ETIMEDOUT; + + /* RIKEN: timer expired case (indicated by !time_remain) */ + if (timeout && !time_remain) + goto out_put_key; + + /* RIKEN: no signals */ + put_futex_key(fshared, &q.key); + goto retry; + +out_put_key: + put_futex_key(fshared, &q.key); +out: + return ret; } -/** Conditionally wakes up tasks that are waiting on futexes. 
*/ -static int futex_wake_op(uint32_t __user *uaddr1, uint32_t __user *uaddr2, - int nr_wake1, int nr_wake2, int op) +int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout, + uint32_t *uaddr2, uint32_t val2, uint32_t val3) { - struct futex_queue *queue1, *queue2; - int irqflags1 = 0; - int irqflags2 = 0; - struct list_head *head; - struct futex *this, *next; - int op_result, nr_woke1 = 0, nr_woke2 = 0; + int clockrt, ret = -ENOSYS; + int cmd = op & FUTEX_CMD_MASK; + int fshared = 0; - if (!uaddr_is_valid(uaddr1) || !uaddr_is_valid(uaddr2)) - return -EINVAL; - - queue1 = get_queue(uaddr1); - queue2 = get_queue(uaddr2); - lock_two_queues(queue1, &irqflags1, queue2, &irqflags2); - - op_result = futex_atomic_op_inuser(op, (int *)uaddr2); - if (op_result < 0) { - unlock_two_queues(queue1, irqflags1, queue2, irqflags2); - return op_result; + /* RIKEN: Assume address space private futexes. + if (!(op & FUTEX_PRIVATE_FLAG)) { + fshared = 1; } + */ - head = &queue1->futex_list; - list_for_each_entry_safe(this, next, head, link) { - if (this->uaddr == uaddr1) { - wake_futex(this); - if (++nr_woke1 >= nr_wake1) - break; - } + clockrt = op & FUTEX_CLOCK_REALTIME; + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + + switch (cmd) { + case FUTEX_WAIT: + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAIT_BITSET: + ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); + break; + case FUTEX_WAKE: + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAKE_BITSET: + ret = futex_wake(uaddr, fshared, val, val3); + break; + case FUTEX_REQUEUE: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + break; + case FUTEX_CMP_REQUEUE: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); + break; + case FUTEX_WAKE_OP: + ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); + break; + /* RIKEN: these calls are not supported for now. + case FUTEX_LOCK_PI: + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + break; + case FUTEX_UNLOCK_PI: + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); + break; + case FUTEX_TRYLOCK_PI: + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; + */ + default: + kprintf("futex() invalid cmd: %d \n", cmd); + ret = -ENOSYS; } - - if (op_result > 0) { - head = &queue2->futex_list; - list_for_each_entry_safe(this, next, head, link) { - if (this->uaddr == uaddr2) { - wake_futex(this); - if (++nr_woke2 >= nr_wake2) - break; - } - } - } - - unlock_two_queues(queue1, irqflags1, queue2, irqflags2); - return nr_woke1 + nr_woke2; + return ret; } -/** Conditionally wakes up or requeues tasks that are waiting on futexes. 
*/ -static int futex_cmp_requeue(uint32_t __user *uaddr1, uint32_t __user *uaddr2, - int nr_wake, int nr_requeue, uint32_t cmpval) -{ - struct futex_queue *queue1, *queue2; - int irqflags1, irqflags2; - struct list_head *head1, *head2; - struct futex *this, *next; - uint32_t curval; - int status, nr_woke = 0; - - if (!uaddr_is_valid(uaddr1) || !uaddr_is_valid(uaddr2)) - return -EINVAL; - - queue1 = get_queue(uaddr1); - queue2 = get_queue(uaddr2); - lock_two_queues(queue1, &irqflags1, queue2, &irqflags2); - -#ifdef __UACCESS__ - if ((status = get_user(curval, uaddr1)) != 0) - goto out_unlock; -#else - curval = *uaddr1; - status = 0; +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif - if (curval != cmpval) { - status = -EAGAIN; - goto out_unlock; - } - - head1 = &queue1->futex_list; - head2 = &queue2->futex_list; - list_for_each_entry_safe(this, next, head1, link) { - if (this->uaddr != uaddr1) - continue; - if (++nr_woke <= nr_wake) { - wake_futex(this); - } else { - /* If uaddr1 and uaddr2 hash to the - * same futex queue, no need to requeue */ - if (head1 != head2) { - list_move_tail(&this->link, head2); - this->lock_ptr = &queue2->lock; - } - this->uaddr = uaddr2; - - if (nr_woke - nr_wake >= nr_requeue) - break; - } - } - status = nr_woke; - -out_unlock: - unlock_two_queues(queue1, irqflags1, queue2, irqflags2); - return status; -} - -int futex(uint32_t __user *uaddr, int op, uint32_t val, uint64_t timeout, - uint32_t __user *uaddr2, uint32_t val2, uint32_t val3) +int futex_init(void) { - int status; + int curval; + int i; - switch (op) { - case FUTEX_WAIT: - val3 = FUTEX_BITSET_MATCH_ANY; - case FUTEX_WAIT_BITSET: - status = futex_wait(uaddr, val, timeout, val3); - break; - case FUTEX_WAKE: - val3 = FUTEX_BITSET_MATCH_ANY; - case FUTEX_WAKE_BITSET: - status = futex_wake(uaddr, val, val3); - break; - case FUTEX_WAKE_OP: - status = futex_wake_op(uaddr, uaddr2, val, val2, val3); - break; - case FUTEX_CMP_REQUEUE: - status = futex_cmp_requeue(uaddr, uaddr2, val, val2, val3); - break; - default: - kprintf("sys_futex() op=%d not supported (pid: )\n", - op, &cpu_local_var(current)->pid); - - status = -ENOSYS; + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. + */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) { + dkprintf("futex_cmpxchg_enabled = 1 ??\n"); + futex_cmpxchg_enabled = 1; } - return status; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + ihk_mc_spinlock_init(&futex_queues[i].lock); + } + + return 0; } diff --git a/kernel/include/futex.h b/kernel/include/futex.h index 2f700cf7..6c9d46df 100644 --- a/kernel/include/futex.h +++ b/kernel/include/futex.h @@ -1,24 +1,50 @@ -/* Kitten LWK futex adaptation */ +/* + * Linux futex adaptation. 
+ * (C) Copyright 2013 RIKEN AICS + * Balazs Gerofi + */ - -#ifndef _LWK_FUTEX_H -#define _LWK_FUTEX_H +#ifndef _FUTEX_H +#define _FUTEX_H /** \name Futex Commands * @{ */ #define FUTEX_WAIT 0 #define FUTEX_WAKE 1 +#define FUTEX_FD 2 +#define FUTEX_REQUEUE 3 #define FUTEX_CMP_REQUEUE 4 #define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 // @} #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 #define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) +#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) +#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG) +#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) + + /** \name Futex Operations, used for FUTEX_WAKE_OP * @{ */ @@ -201,30 +227,34 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, #define FUTEX_HASHBITS 8 /* 256 entries in each futex hash tbl */ -/** Futex tracking structure. - * - * A futex has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when list_empty(&futex->link) || futex->lock_ptr == 0. - * The order of wakup is always to make the first condition true, then - * wake up futex->waitq, then make the second condition true. - */ -struct futex { - struct list_head link; - struct waitq waitq; - ihk_spinlock_t * lock_ptr; - uint32_t __user * uaddr; - uint32_t bitset; +#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */ +#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */ + +struct process_vm; + +union futex_key { +#if 0 + struct { + unsigned long pgoff; + struct inode *inode; + int offset; + } shared; +#endif + struct { + unsigned long address; + struct process_vm *mm; + int offset; + } private; + struct { + unsigned long word; + void *ptr; + int offset; + } both; }; -struct futex_queue { - ihk_spinlock_t lock; - struct list_head futex_list; -}; +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } -extern void -futex_queue_init( - struct futex_queue * queue -); +extern int futex_init(void); extern int futex( diff --git a/kernel/include/jhash.h b/kernel/include/jhash.h new file mode 100644 index 00000000..a026476d --- /dev/null +++ b/kernel/include/jhash.h @@ -0,0 +1,145 @@ +#ifndef _LINUX_JHASH_H +#define _LINUX_JHASH_H + +/* RIKEN: u32 replaced to uint32_t + * + * jhash.h: Jenkins hash support. + * + * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup2.c, by Bob Jenkins, December 1996, Public Domain. + * hash(), hash2(), hash3, and mix() are externally useful functions. 
+ * Routines to test the hash are included if SELF_TEST is defined.
+ * You can use this free for any purpose. It has no warranty.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are surely my fault. -DaveM
+ */
+
+/* NOTE: Arguments are modified. */
+#define __jhash_mix(a, b, c) \
+{ \
+	a -= b; a -= c; a ^= (c>>13); \
+	b -= c; b -= a; b ^= (a<<8); \
+	c -= a; c -= b; c ^= (b>>13); \
+	a -= b; a -= c; a ^= (c>>12); \
+	b -= c; b -= a; b ^= (a<<16); \
+	c -= a; c -= b; c ^= (b>>5); \
+	a -= b; a -= c; a ^= (c>>3); \
+	b -= c; b -= a; b ^= (a<<10); \
+	c -= a; c -= b; c ^= (b>>15); \
+}
+
+/* The golden ratio: an arbitrary value */
+#define JHASH_GOLDEN_RATIO	0x9e3779b9
+
+/* The most generic version; it hashes an arbitrary sequence
+ * of bytes. No alignment or length assumptions are made about
+ * the input key.
+ */
+static inline uint32_t jhash(const void *key, uint32_t length, uint32_t initval)
+{
+	uint32_t a, b, c, len;
+	const uint8_t *k = key;
+
+	len = length;
+	a = b = JHASH_GOLDEN_RATIO;
+	c = initval;
+
+	while (len >= 12) {
+		a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24));
+		b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24));
+		c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24));
+
+		__jhash_mix(a,b,c);
+
+		k += 12;
+		len -= 12;
+	}
+
+	c += length;
+	switch (len) {
+	case 11: c += ((uint32_t)k[10]<<24);
+	case 10: c += ((uint32_t)k[9]<<16);
+	case 9 : c += ((uint32_t)k[8]<<8);
+	case 8 : b += ((uint32_t)k[7]<<24);
+	case 7 : b += ((uint32_t)k[6]<<16);
+	case 6 : b += ((uint32_t)k[5]<<8);
+	case 5 : b += k[4];
+	case 4 : a += ((uint32_t)k[3]<<24);
+	case 3 : a += ((uint32_t)k[2]<<16);
+	case 2 : a += ((uint32_t)k[1]<<8);
+	case 1 : a += k[0];
+	};
+
+	__jhash_mix(a,b,c);
+
+	return c;
+}
+
+/* A special optimized version that handles one or more uint32_ts.
+ * The length parameter here is the number of uint32_ts in the key.
+ */
+static inline uint32_t jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
+{
+	uint32_t a, b, c, len;
+
+	a = b = JHASH_GOLDEN_RATIO;
+	c = initval;
+	len = length;
+
+	while (len >= 3) {
+		a += k[0];
+		b += k[1];
+		c += k[2];
+		__jhash_mix(a, b, c);
+		k += 3; len -= 3;
+	}
+
+	c += length * 4;
+
+	switch (len) {
+	case 2 : b += k[1];
+	case 1 : a += k[0];
+	};
+
+	__jhash_mix(a,b,c);
+
+	return c;
+}
+
+/* A special ultra-optimized version that knows it is hashing exactly
+ * 3, 2 or 1 word(s).
+ *
+ * NOTE: In particular the "c += length; __jhash_mix(a,b,c);" normally
+ * done at the end is not done here.
+ */
+static inline uint32_t jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
+{
+	a += JHASH_GOLDEN_RATIO;
+	b += JHASH_GOLDEN_RATIO;
+	c += initval;
+
+	__jhash_mix(a, b, c);
+
+	return c;
+}
+
+static inline uint32_t jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
+{
+	return jhash_3words(a, b, 0, initval);
+}
+
+static inline uint32_t jhash_1word(uint32_t a, uint32_t initval)
+{
+	return jhash_3words(a, 0, 0, initval);
+}
+
+#endif /* _LINUX_JHASH_H */
diff --git a/kernel/include/plist.h b/kernel/include/plist.h
new file mode 100644
index 00000000..80231129
--- /dev/null
+++ b/kernel/include/plist.h
@@ -0,0 +1,273 @@
+/*
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez .
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker
+ *
+ * (C) 2005 Thomas Gleixner
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This is a priority-sorted list of nodes; each node has a
+ * priority from INT_MIN (highest) to INT_MAX (lowest).
+ *
+ * Addition is O(K), removal is O(1), change of priority of a node is
+ * O(K) and K is the number of RT priority levels used in the system.
+ * (1 <= K <= 99)
+ *
+ * This list is really a list of lists:
+ *
+ *  - The tier 1 list is the prio_list, different priority nodes.
+ *
+ *  - The tier 2 list is the node_list, serialized nodes.
+ *
+ * Simple ASCII art explanation:
+ *
+ * |HEAD          |
+ * |              |
+ * |prio_list.prev|<------------------------------------|
+ * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-|
+ * |10            |   |10|   |21|   |21|   |21|   |40|   (prio)
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
+ * |node_list.prev|<------------------------------------|
+ *
+ * The nodes on the prio_list list are sorted by priority to simplify
+ * the insertion of new nodes. There are no nodes with duplicate
+ * priorities on the list.
+ *
+ * The nodes on the node_list are ordered by priority and can contain
+ * entries which have the same priority. Those entries are ordered
+ * FIFO.
+ *
+ * Addition means: look for the prio_list node in the prio_list
+ * for the priority of the node and insert it before the node_list
+ * entry of the next prio_list node. If it is the first node of
+ * that priority, add it to the prio_list in the right position and
+ * insert it into the serialized node_list list.
+ *
+ * Removal means remove it from the node_list and remove it from
+ * the prio_list if the node_list list_head is non-empty. In case
+ * of removal from the prio_list it must be checked whether other
+ * entries of the same priority are on the list or not. If there
+ * is another entry of the same priority then this entry has to
+ * replace the removed entry on the prio_list. If the entry which
+ * is removed is the only entry of this priority then a simple
+ * remove from both lists is sufficient.
+ *
+ * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
+ * is lowest priority.
+ *
+ * No locking is done, up to the caller.
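+ *
+ * A minimal usage sketch with a caller-provided IHK lock (names are
+ * illustrative; the APIs are the ones declared in this header):
+ *
+ *	ihk_mc_spinlock_lock_noirq(&lock);
+ *	plist_node_init(&node, prio);
+ *	plist_add(&node, &head);
+ *	...
+ *	plist_del(&node, &head);
+ *	ihk_mc_spinlock_unlock_noirq(&lock);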
+ * + */ +#ifndef _LINUX_PLIST_H_ +#define _LINUX_PLIST_H_ + +#include +#include + +struct plist_head { + struct list_head prio_list; + struct list_head node_list; +#ifdef CONFIG_DEBUG_PI_LIST + raw_spinlock_t *rawlock; + spinlock_t *spinlock; +#endif +}; + +struct plist_node { + int prio; + struct plist_head plist; +}; + +#ifdef CONFIG_DEBUG_PI_LIST +# define PLIST_HEAD_LOCK_INIT(_lock) .spinlock = _lock +# define PLIST_HEAD_LOCK_INIT_RAW(_lock) .rawlock = _lock +#else +# define PLIST_HEAD_LOCK_INIT(_lock) +# define PLIST_HEAD_LOCK_INIT_RAW(_lock) +#endif + +#define _PLIST_HEAD_INIT(head) \ + .prio_list = LIST_HEAD_INIT((head).prio_list), \ + .node_list = LIST_HEAD_INIT((head).node_list) + +/** + * PLIST_HEAD_INIT - static struct plist_head initializer + * @head: struct plist_head variable name + * @_lock: lock to initialize for this list + */ +#define PLIST_HEAD_INIT(head, _lock) \ +{ \ + _PLIST_HEAD_INIT(head), \ + PLIST_HEAD_LOCK_INIT(&(_lock)) \ +} + +/** + * PLIST_HEAD_INIT_RAW - static struct plist_head initializer + * @head: struct plist_head variable name + * @_lock: lock to initialize for this list + */ +#define PLIST_HEAD_INIT_RAW(head, _lock) \ +{ \ + _PLIST_HEAD_INIT(head), \ + PLIST_HEAD_LOCK_INIT_RAW(&(_lock)) \ +} + +/** + * PLIST_NODE_INIT - static struct plist_node initializer + * @node: struct plist_node variable name + * @__prio: initial node priority + */ +#define PLIST_NODE_INIT(node, __prio) \ +{ \ + .prio = (__prio), \ + .plist = { _PLIST_HEAD_INIT((node).plist) }, \ +} + +/** + * plist_head_init - dynamic struct plist_head initializer + * @head: &struct plist_head pointer + * @lock: spinlock protecting the list (debugging) + */ +static inline void +plist_head_init(struct plist_head *head, ihk_spinlock_t *lock) +{ + INIT_LIST_HEAD(&head->prio_list); + INIT_LIST_HEAD(&head->node_list); +#ifdef CONFIG_DEBUG_PI_LIST + head->spinlock = lock; + head->rawlock = NULL; +#endif +} + +/** + * plist_head_init_raw - dynamic struct plist_head initializer + * @head: &struct plist_head pointer + * @lock: raw_spinlock protecting the list (debugging) + */ +static inline void +plist_head_init_raw(struct plist_head *head, ihk_spinlock_t *lock) +{ + INIT_LIST_HEAD(&head->prio_list); + INIT_LIST_HEAD(&head->node_list); +#ifdef CONFIG_DEBUG_PI_LIST + head->rawlock = lock; + head->spinlock = NULL; +#endif +} + +/** + * plist_node_init - Dynamic struct plist_node initializer + * @node: &struct plist_node pointer + * @prio: initial node priority + */ +static inline void plist_node_init(struct plist_node *node, int prio) +{ + node->prio = prio; + plist_head_init(&node->plist, NULL); +} + +extern void plist_add(struct plist_node *node, struct plist_head *head); +extern void plist_del(struct plist_node *node, struct plist_head *head); + +/** + * plist_for_each - iterate over the plist + * @pos: the type * to use as a loop counter + * @head: the head for your list + */ +#define plist_for_each(pos, head) \ + list_for_each_entry(pos, &(head)->node_list, plist.node_list) + +/** + * plist_for_each_safe - iterate safely over a plist of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage + * @head: the head for your list + * + * Iterate over a plist of given type, safe against removal of list entry. 
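+ *
+ * Example, draining a list one node at a time (illustrative):
+ *	plist_for_each_safe(pos, n, &head)
+ *		plist_del(pos, &head);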
+ */ +#define plist_for_each_safe(pos, n, head) \ + list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list) + +/** + * plist_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter + * @head: the head for your list + * @mem: the name of the list_struct within the struct + */ +#define plist_for_each_entry(pos, head, mem) \ + list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list) + +/** + * plist_for_each_entry_safe - iterate safely over list of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage + * @head: the head for your list + * @m: the name of the list_struct within the struct + * + * Iterate over list of given type, safe against removal of list entry. + */ +#define plist_for_each_entry_safe(pos, n, head, m) \ + list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list) + +/** + * plist_head_empty - return !0 if a plist_head is empty + * @head: &struct plist_head pointer + */ +static inline int plist_head_empty(const struct plist_head *head) +{ + return list_empty(&head->node_list); +} + +/** + * plist_node_empty - return !0 if plist_node is not on a list + * @node: &struct plist_node pointer + */ +static inline int plist_node_empty(const struct plist_node *node) +{ + return plist_head_empty(&node->plist); +} + +/* All functions below assume the plist_head is not empty. */ + +/** + * plist_first_entry - get the struct for the first entry + * @head: the &struct plist_head pointer + * @type: the type of the struct this is embedded in + * @member: the name of the list_struct within the struct + */ +#ifdef CONFIG_DEBUG_PI_LIST +# define plist_first_entry(head, type, member) \ +({ \ + WARN_ON(plist_head_empty(head)); \ + container_of(plist_first(head), type, member); \ +}) +#else +# define plist_first_entry(head, type, member) \ + container_of(plist_first(head), type, member) +#endif + +/** + * plist_first - return the first node (and thus, highest priority) + * @head: the &struct plist_head pointer + * + * Assumes the plist is _not_ empty. + */ +static inline struct plist_node *plist_first(const struct plist_head *head) +{ + return list_entry(head->node_list.next, + struct plist_node, plist.node_list); +} + +#endif diff --git a/kernel/include/process.h b/kernel/include/process.h index 489897c6..24e5ec17 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -73,9 +73,6 @@ struct process_vm { struct list_head vm_range_list; struct vm_regions region; - // Address space private futexes - struct futex_queue futex_queues[1 << FUTEX_HASHBITS]; - ihk_spinlock_t page_table_lock; ihk_spinlock_t memory_range_lock; // to protect the followings: diff --git a/kernel/init.c b/kernel/init.c index cea67803..6376b680 100644 --- a/kernel/init.c +++ b/kernel/init.c @@ -216,6 +216,8 @@ int main(void) post_init(); + futex_init(); + kputs("MCK/IHK booted.\n"); #ifdef DCFA_KMOD diff --git a/kernel/plist.c b/kernel/plist.c new file mode 100644 index 00000000..5c0d1f28 --- /dev/null +++ b/kernel/plist.c @@ -0,0 +1,123 @@ +/* + * lib/plist.c + * + * Descending-priority-sorted double-linked list + * + * (C) 2002-2003 Intel Corp + * Inaky Perez-Gonzalez . + * + * 2001-2005 (c) MontaVista Software, Inc. + * Daniel Walker + * + * (C) 2005 Thomas Gleixner + * + * Simplifications of the original code by + * Oleg Nesterov + * + * Licensed under the FSF's GNU Public License v2 or later. + * + * Based on simple lists (include/linux/list.h). 
+ * + * This file contains the add / del functions which are considered to + * be too large to inline. See include/linux/plist.h for further + * information. + */ + +#include +#include + +#ifdef CONFIG_DEBUG_PI_LIST + +static void plist_check_prev_next(struct list_head *t, struct list_head *p, + struct list_head *n) +{ + WARN(n->prev != p || p->next != n, + "top: %p, n: %p, p: %p\n" + "prev: %p, n: %p, p: %p\n" + "next: %p, n: %p, p: %p\n", + t, t->next, t->prev, + p, p->next, p->prev, + n, n->next, n->prev); +} + +static void plist_check_list(struct list_head *top) +{ + struct list_head *prev = top, *next = top->next; + + plist_check_prev_next(top, prev, next); + while (next != top) { + prev = next; + next = prev->next; + plist_check_prev_next(top, prev, next); + } +} + +static void plist_check_head(struct plist_head *head) +{ + WARN_ON(!head->rawlock && !head->spinlock); + if (head->rawlock) + WARN_ON_SMP(!raw_spin_is_locked(head->rawlock)); + if (head->spinlock) + WARN_ON_SMP(!spin_is_locked(head->spinlock)); + plist_check_list(&head->prio_list); + plist_check_list(&head->node_list); +} + +#else +# define plist_check_head(h) do { } while (0) +#endif + +/** + * plist_add - add @node to @head + * + * @node: &struct plist_node pointer + * @head: &struct plist_head pointer + */ +void plist_add(struct plist_node *node, struct plist_head *head) +{ + struct plist_node *iter; + + plist_check_head(head); +#if 0 + WARN_ON(!plist_node_empty(node)); +#endif + + list_for_each_entry(iter, &head->prio_list, plist.prio_list) { + if (node->prio < iter->prio) + goto lt_prio; + else if (node->prio == iter->prio) { + iter = list_entry(iter->plist.prio_list.next, + struct plist_node, plist.prio_list); + goto eq_prio; + } + } + +lt_prio: + list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); +eq_prio: + list_add_tail(&node->plist.node_list, &iter->plist.node_list); + + plist_check_head(head); +} + +/** + * plist_del - Remove a @node from plist. 
+ *
+ * @node:	&struct plist_node pointer - entry to be removed
+ * @head:	&struct plist_head pointer - list head
+ */
+void plist_del(struct plist_node *node, struct plist_head *head)
+{
+	plist_check_head(head);
+
+	if (!list_empty(&node->plist.prio_list)) {
+		struct plist_node *next = plist_first(&node->plist);
+
+		list_move_tail(&next->plist.prio_list, &node->plist.prio_list);
+		list_del_init(&node->plist.prio_list);
+	}
+
+	list_del_init(&node->plist.node_list);
+
+	plist_check_head(head);
+}
diff --git a/kernel/process.c b/kernel/process.c
index f7b00b20..885a2a64 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -19,25 +19,18 @@
 
 #define USER_STACK_NR_PAGES 8192
-#define KERNEL_STACK_NR_PAGES 16
+#define KERNEL_STACK_NR_PAGES 24
 
 extern long do_arch_prctl(unsigned long code, unsigned long address);
 
 void init_process_vm(struct process_vm *vm)
 {
-	int i;
-
 	ihk_mc_spinlock_init(&vm->memory_range_lock);
 	ihk_mc_spinlock_init(&vm->page_table_lock);
 
 	ihk_atomic_set(&vm->refcount, 1);
 	INIT_LIST_HEAD(&vm->vm_range_list);
 	vm->page_table = ihk_mc_pt_create();
-
-	/* Initialize futex queues */
-	for (i = 0; i < (1 << FUTEX_HASHBITS); ++i)
-		futex_queue_init(&vm->futex_queues[i]);
-
 }
 
 struct process *create_process(unsigned long user_pc)
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 4040381a..a3d16fc0 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -370,7 +370,7 @@ SYSCALL_DECLARE(exit_group)
 SYSCALL_DECLARE(mmap)
 {
 	struct vm_regions *region = &cpu_local_var(current)->vm->region;
-	unsigned long lockr; 
+	unsigned long lockr;
 
 	dkprintf("syscall.c,mmap,addr=%lx,len=%lx,prot=%lx,flags=%x,fd=%x,offset=%lx\n",
 		ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
@@ -865,14 +865,22 @@ SYSCALL_DECLARE(futex)
 	uint32_t *uaddr2 = (uint32_t *)ihk_mc_syscall_arg4(ctx);
 	uint32_t val3 = (uint32_t)ihk_mc_syscall_arg5(ctx);
 
-	dkprintf("futex,uaddr=%lx,op=%x, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x\n", (unsigned long)uaddr, op, val, utime, uaddr2, val3, *uaddr);
-
 	/* Mask off the FUTEX_PRIVATE_FLAG,
 	 * assume all futexes are address space private */
 	op = (op & FUTEX_CMD_MASK);
+
+	dkprintf("futex op=[%x, %s], uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x\n",
+		op,
+		(op == FUTEX_WAIT) ? "FUTEX_WAIT" :
+		(op == FUTEX_WAIT_BITSET) ? "FUTEX_WAIT_BITSET" :
+		(op == FUTEX_WAKE) ? "FUTEX_WAKE" :
+		(op == FUTEX_WAKE_OP) ? "FUTEX_WAKE_OP" :
+		(op == FUTEX_WAKE_BITSET) ? "FUTEX_WAKE_BITSET" :
+		(op == FUTEX_CMP_REQUEUE) ? "FUTEX_CMP_REQUEUE" :
+		(op == FUTEX_REQUEUE) ? "FUTEX_REQUEUE (NOT IMPL!)" : "unknown",
+		(unsigned long)uaddr, val, utime, uaddr2, val3, *uaddr);
 
 	if (utime && (op == FUTEX_WAIT_BITSET || op == FUTEX_WAIT)) {
-		/* gettimeofday(&tv_now, NULL) from host */
 		struct syscall_request request IHK_DMA_ALIGN;
 		struct timeval tv_now;
 		request.number = 96;
@@ -904,6 +912,7 @@
 		long diff_nsec = nsec_timeout - nsec_now;
 
 		timeout = (diff_nsec / 1000) * 1100; // (usec * 1.1GHz)
+		dkprintf("futex timeout: %lu\n", timeout);
 	}
 
 	/* Requeue parameter in 'utime' if op == FUTEX_CMP_REQUEUE.
@@ -911,51 +920,6 @@ SYSCALL_DECLARE(futex) if (op == FUTEX_CMP_REQUEUE || op == FUTEX_WAKE_OP) val2 = (uint32_t) (unsigned long) ihk_mc_syscall_arg3(ctx); - // we don't have timer interrupt and wakeup, so fake it by just pausing - if (utime && (op == FUTEX_WAIT_BITSET || op == FUTEX_WAIT)) { - // gettimeofday(&tv_now, NULL); - struct syscall_request request IHK_DMA_ALIGN; - struct timeval tv_now; - request.number = 96; - -#if 1 - unsigned long __phys; - if (ihk_mc_pt_virt_to_phys(cpu_local_var(current)->vm->page_table, - (void *)&tv_now, - &__phys)) { - return -EFAULT; - } - request.args[0] = __phys; - - int r = do_syscall(&request, ctx); - if(r < 0) { - return -EFAULT; - } - - dkprintf("futex,FUTEX_WAIT_BITSET,arg3!=NULL,pc=%lx\n", (unsigned long)ihk_mc_syscall_pc(ctx)); - - dkprintf(" now->tv_sec=%016ld,tv_nsec=%016ld\n", tv_now.tv_sec, tv_now.tv_usec * 1000); - dkprintf("utime->tv_sec=%016ld,tv_nsec=%016ld\n", utime->tv_sec, utime->tv_nsec); - - long nsec_now = ((long)tv_now.tv_sec * 1000000000ULL) + - tv_now.tv_usec * 1000; - long nsec_timeout = ((long)utime->tv_sec * 1000000000ULL) + - utime->tv_nsec * 1; - long diff_nsec = nsec_timeout - nsec_now; - - /* - if(diff_nsec > 0) { - dkprintf("pausing %016ldnsec\n", diff_nsec); - arch_delay(diff_nsec/1000); // unit is usec - } - */ - timeout = (diff_nsec / 1000) * 1100; // (usec * 1.1GHz) -#else - arch_delay(200000); // unit is usec - return -ETIMEDOUT; -#endif - } - return futex(uaddr, op, val, timeout, uaddr2, val2, val3); }
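
--
For reference, below is a minimal user-space exerciser of the FUTEX_WAIT /
FUTEX_WAKE paths this patch implements. It is a sketch against the
host-Linux futex(2) ABI (the McKernel side services the same command
values); it is an illustration only and is not part of the patch:

	#include <linux/futex.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int futex_word;

	static long sys_futex(int *uaddr, int op, int val)
	{
		return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
	}

	static void *waiter(void *arg)
	{
		/* Block only while futex_word is still 0 (compare-and-block). */
		while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0)
			sys_futex(&futex_word, FUTEX_WAIT_PRIVATE, 0);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, waiter, NULL);
		sleep(1);

		/* Publish the new value first, then wake one waiter. */
		__atomic_store_n(&futex_word, 1, __ATOMIC_RELEASE);
		sys_futex(&futex_word, FUTEX_WAKE_PRIVATE, 1);

		pthread_join(t, NULL);
		puts("woken");
		return 0;
	}

Build with: cc -pthread example.c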