22 files changed, 1479 insertions, 566 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a62..80b5ce716596 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
+ *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ *  Copyright (C) IBM Corporation, 2009
+ *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
 */
 struct futex_q {
        struct plist_node list;
-        /* There can only be a single waiter */
+        /* Waiter reference */
-        wait_queue_head_t waiter;
+        struct task_struct *task;
        /* Which hash list lock to use: */
        spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
        /* Optional priority inheritance state: */
        struct futex_pi_state *pi_state;
-        struct task_struct *task;
+        /* rt_waiter storage for requeue_pi: */
+        struct rt_mutex_waiter *rt_waiter;
        /* Bitset for the optional bitmasked wakeup */
        u32 bitset;
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
        drop_futex_key_refs(key);
 }
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb:     the hash bucket the futex_q's reside in
+ * @key:    the futex key (to distinguish it from other futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+                                        union futex_key *key)
+{
+        struct futex_q *this;
+        plist_for_each_entry(this, &hb->chain, list) {
+                if (match_futex(&this->key, key))
+                        return this;
+        }
+        return NULL;
+}
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
 {
        u32 curval;
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        return 0;
 }
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr:              the pi futex user address
+ * @hb:                 the pi futex hash bucket
+ * @key:                the futex key associated with uaddr and hb
+ * @ps:                 the pi_state pointer where we store the result of the
+ *                      lookup
+ * @task:               the task to perform the atomic lock work for.  This will
+ *                      be "current" except in the case of requeue pi.
+ * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Returns:
+ *  0 - ready to wait
+ *  1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+                                union futex_key *key,
+                                struct futex_pi_state **ps,
+                                struct task_struct *task, int set_waiters)
+{
+        int lock_taken, ret, ownerdied = 0;
+        u32 uval, newval, curval;
+retry:
+        ret = lock_taken = 0;
+        /*
+         * To avoid races, we attempt to take the lock here again
+         * (by doing a 0 -> TID atomic cmpxchg), while holding all
+         * the locks. It will most likely not succeed.
+         */
+        newval = task_pid_vnr(task);
+        if (set_waiters)
+                newval |= FUTEX_WAITERS;
+        curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+        if (unlikely(curval == -EFAULT))
+                return -EFAULT;
+        /*
+         * Detect deadlocks.
+         */
+        if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+                return -EDEADLK;
+        /*
+         * Surprise - we got the lock. Just return to userspace:
+         */
+        if (unlikely(!curval))
+                return 1;
+        uval = curval;
+        /*
+         * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+         * to wake at the next unlock.
+         */
+        newval = curval | FUTEX_WAITERS;
+        /*
+         * There are two cases in which we take over the futex. First,
+         * the futex has no owner (the owner TID is 0), as happens with
+         * OWNER_DIED. Second, a previous pass found that the owner
+         * died (ownerdied is set); that take over is unconditional.
+         *
+         * This is safe as we are protected by the hash bucket lock !
+         */
+        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+                /* Keep the OWNER_DIED bit */
+                newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+                ownerdied = 0;
+                lock_taken = 1;
+        }
+        curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+        if (unlikely(curval == -EFAULT))
+                return -EFAULT;
+        if (unlikely(curval != uval))
+                goto retry;
+        /*
+         * We took the lock due to owner died take over.
+         */
+        if (unlikely(lock_taken))
+                return 1;
+        /*
+         * We don't have the lock. Look up the PI state (or create it if
+         * we are the first waiter):
+         */
+        ret = lookup_pi_state(uval, hb, key, ps);
+        if (unlikely(ret)) {
+                switch (ret) {
+                case -ESRCH:
+                        /*
+                         * No owner found for this futex. Check if the
+                         * OWNER_DIED bit is set to figure out whether
+                         * this is a robust futex or not.
+                         */
+                        if (get_futex_value_locked(&curval, uaddr))
+                                return -EFAULT;
+                        /*
+                         * We simply start over in case of a robust
+                         * futex. The code above will take the futex
+                         * and return happy.
+                         */
+                        if (curval & FUTEX_OWNER_DIED) {
+                                ownerdied = 1;
+                                goto retry;
+                        }
+                default:
+                        break;
+                }
+        }
+        return ret;
+}
 /*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed.
 */
 static void wake_futex(struct futex_q *q)
 {
-        plist_del(&q->list, &q->list.plist);
+        struct task_struct *p = q->task;
        /*
-         * The lock in wake_up_all() is a crucial memory barrier after the
+         * We set q->lock_ptr = NULL _before_ we wake up the task. If
-         * plist_del() and also before assigning to q->lock_ptr.
+         * a non futex wake up happens on another CPU then the task
+         * might exit and p would dereference a non existing task
+         * struct. Prevent this by holding a reference on p across the
+         * wake up.
         */
-        wake_up(&q->waiter);
+        get_task_struct(p);
+        plist_del(&q->list, &q->list.plist);
        /*
-         * The waiting task can free the futex_q as soon as this is written,
+         * The waiting task can free the futex_q as soon as
-         * without taking any locks.  This must come last.
+         * q->lock_ptr = NULL is written, without taking any locks. A
-         *
+         * memory barrier is required here to prevent the following
-         * A memory barrier is required here to prevent the following store to
+         * store to lock_ptr from getting ahead of the plist_del.
-         * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
-         * end of wake_up() does not prevent this store from moving.
         */
        smp_wmb();
        q->lock_ptr = NULL;
+        wake_up_state(p, TASK_NORMAL);
+        put_task_struct(p);
 }
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        plist_for_each_entry_safe(this, next, head, list) {
                if (match_futex (&this->key, &key)) {
-                        if (this->pi_state) {
+                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
                                break;
                        }
@@ -802,24 +959,185 @@ out:
        return ret;
 }
-/*
+/**
- * Requeue all waiters hashed on one physical page to another
+ * requeue_futex() - Requeue a futex_q from one hb to another
- * physical page.
+ * @q:          the futex_q to requeue
+ * @hb1:        the source hash_bucket
+ * @hb2:        the target hash_bucket
+ * @key2:       the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+                   struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+        /*
+         * If key1 and key2 hash to the same bucket, no need to
+         * requeue.
+         */
+        if (likely(&hb1->chain != &hb2->chain)) {
+                plist_del(&q->list, &hb1->chain);
+                plist_add(&q->list, &hb2->chain);
+                q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+                q->list.plist.lock = &hb2->lock;
+#endif
+        }
+        get_futex_key_refs(key2);
+        q->key = *key2;
+}
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q:   the futex_q
+ * @key: the key of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.  Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+{
+        drop_futex_key_refs(&q->key);
+        get_futex_key_refs(key);
+        q->key = *key;
+        WARN_ON(plist_node_empty(&q->list));
+        plist_del(&q->list, &q->list.plist);
+        WARN_ON(!q->rt_waiter);
+        q->rt_waiter = NULL;
+        wake_up_state(q->task, TASK_NORMAL);
+}
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex:            the user address of the to futex
+ * @hb1:                the from futex hash bucket, must be locked by the caller
+ * @hb2:                the to futex hash bucket, must be locked by the caller
+ * @key1:               the from futex key
+ * @key2:               the to futex key
+ * @ps:                 address to store the pi_state pointer
+ * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed.  If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ *  0 - failed to acquire the lock atomically
+ *  1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+                                 struct futex_hash_bucket *hb1,
+                                 struct futex_hash_bucket *hb2,
+                                 union futex_key *key1, union futex_key *key2,
+                                 struct futex_pi_state **ps, int set_waiters)
+{
+        struct futex_q *top_waiter = NULL;
+        u32 curval;
+        int ret;
+        if (get_futex_value_locked(&curval, pifutex))
+                return -EFAULT;
+        /*
+         * Find the top_waiter and determine if there are additional waiters.
+         * If the caller intends to requeue more than 1 waiter to pifutex,
+         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+         * as we have means to handle the possible fault.  If not, don't set
+         * the bit unnecessarily as it will force the subsequent unlock to enter
+         * the kernel.
+         */
+        top_waiter = futex_top_waiter(hb1, key1);
+        /* There are no waiters, nothing for us to do. */
+        if (!top_waiter)
+                return 0;
+        /*
+         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
+         * the contended case or if set_waiters is 1.  The pi_state is returned
+         * in ps in contended cases.
+         */
+        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+                                   set_waiters);
+        if (ret == 1)
+                requeue_pi_wake_futex(top_waiter, key2);
+        return ret;
+}
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1:     source futex user address
+ * @uaddr2:     target futex user address
+ * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ *              pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ *  <0 - on error
 */
 static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-                         int nr_wake, int nr_requeue, u32 *cmpval)
+                         int nr_wake, int nr_requeue, u32 *cmpval,
+                         int requeue_pi)
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+        int drop_count = 0, task_count = 0, ret;
+        struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head1;
        struct futex_q *this, *next;
-        int ret, drop_count = 0;
+        u32 curval2;
+        if (requeue_pi) {
+                /*
+                 * requeue_pi requires a pi_state, try to allocate it now
+                 * without any locks in case it fails.
+                 */
+                if (refill_pi_state_cache())
+                        return -ENOMEM;
+                /*
+                 * requeue_pi must wake as many tasks as it can, up to nr_wake
+                 * + nr_requeue, since it acquires the rt_mutex prior to
+                 * returning to userspace, so as to not leave the rt_mutex with
+                 * waiters and no owner.  However, second and third wake-ups
+                 * cannot be predicted as they involve race conditions with the
+                 * first wake and a fault while looking up the pi_state.  Both
+                 * pthread_cond_signal() and pthread_cond_broadcast() should
+                 * use nr_wake=1.
+                 */
+                if (nr_wake != 1)
+                        return -EINVAL;
+        }
 retry:
+        if (pi_state != NULL) {
+                /*
+                 * We will have to lookup the pi_state again, so free this one
+                 * to keep the accounting correct.
+                 */
+                free_pi_state(pi_state);
+                pi_state = NULL;
+        }
        ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
-        ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ);
+        ret = get_futex_key(uaddr2, fshared, &key2,
+                            requeue_pi ? VERIFY_WRITE : VERIFY_READ);
        if (unlikely(ret != 0))
                goto out_put_key1;
@@ -854,32 +1172,99 @@ retry_private:
                }
        }
+        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+                /*
+                 * Attempt to acquire uaddr2 and wake the top waiter. If we
+                 * intend to requeue waiters, force setting the FUTEX_WAITERS
+                 * bit.  We force this here where we are able to easily handle
+                 * faults rather than in the requeue loop below.
+                 */
+                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+                                                 &key2, &pi_state, nr_requeue);
+                /*
+                 * At this point the top_waiter has either taken uaddr2 or is
+                 * waiting on it.  If the former, then the pi_state will not
+                 * exist yet, look it up one more time to ensure we have a
+                 * reference to it.
+                 */
+                if (ret == 1) {
+                        WARN_ON(pi_state);
+                        task_count++;
+                        ret = get_futex_value_locked(&curval2, uaddr2);
+                        if (!ret)
+                                ret = lookup_pi_state(curval2, hb2, &key2,
+                                                      &pi_state);
+                }
+                switch (ret) {
+                case 0:
+                        break;
+                case -EFAULT:
+                        double_unlock_hb(hb1, hb2);
+                        put_futex_key(fshared, &key2);
+                        put_futex_key(fshared, &key1);
+                        ret = get_user(curval2, uaddr2);
+                        if (!ret)
+                                goto retry;
+                        goto out;
+                case -EAGAIN:
+                        /* The owner was exiting, try again. */
+                        double_unlock_hb(hb1, hb2);
+                        put_futex_key(fshared, &key2);
+                        put_futex_key(fshared, &key1);
+                        cond_resched();
+                        goto retry;
+                default:
+                        goto out_unlock;
+                }
+        }
        head1 = &hb1->chain;
        plist_for_each_entry_safe(this, next, head1, list) {
-                if (!match_futex (&this->key, &key1))
+                if (task_count - nr_wake >= nr_requeue)
+                        break;
+                if (!match_futex(&this->key, &key1))
                        continue;
-                if (++ret <= nr_wake) {
+                WARN_ON(!requeue_pi && this->rt_waiter);
+                WARN_ON(requeue_pi && !this->rt_waiter);
+                /*
+                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
+                 * lock, we already woke the top_waiter.  If not, it will be
+                 * woken by futex_unlock_pi().
+                 */
+                if (++task_count <= nr_wake && !requeue_pi) {
                        wake_futex(this);
-                } else {
+                        continue;
-                        /*
+                }
-                         * If key1 and key2 hash to the same bucket, no need to
-                         * requeue.
-                         */
-                        if (likely(head1 != &hb2->chain)) {
-                                plist_del(&this->list, &hb1->chain);
-                                plist_add(&this->list, &hb2->chain);
-                                this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-                                this->list.plist.lock = &hb2->lock;
-#endif
-                        }
-                        this->key = key2;
-                        get_futex_key_refs(&key2);
-                        drop_count++;
-                        if (ret - nr_wake >= nr_requeue)
+                /*
-                                break;
+                 * Requeue nr_requeue waiters and possibly one more in the case
+                 * of requeue_pi if we couldn't acquire the lock atomically.
+                 */
+                if (requeue_pi) {
+                        /* Prepare the waiter to take the rt_mutex. */
+                        atomic_inc(&pi_state->refcount);
+                        this->pi_state = pi_state;
+                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+                                                        this->rt_waiter,
+                                                        this->task, 1);
+                        if (ret == 1) {
+                                /* We got the lock. */
+                                requeue_pi_wake_futex(this, &key2);
+                                continue;
+                        } else if (ret) {
+                                /* -EDEADLK */
+                                this->pi_state = NULL;
+                                free_pi_state(pi_state);
+                                goto out_unlock;
+                        }
                }
+                requeue_futex(this, hb1, hb2, &key2);
+                drop_count++;
        }
 out_unlock:
@@ -899,7 +1284,9 @@ out_put_keys:
 out_put_key1:
        put_futex_key(fshared, &key1);
 out:
-        return ret;
+        if (pi_state != NULL)
+                free_pi_state(pi_state);
+        return ret ? ret : task_count;
 }
 /* The key must be already stored in q->key. */
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
        struct futex_hash_bucket *hb;
-        init_waitqueue_head(&q->waiter);
        get_futex_key_refs(&q->key);
        hb = hash_futex(&q->key);
        q->lock_ptr = &hb->lock;
@@ -1119,35 +1504,149 @@ handle_fault:
 */
 #define FLAGS_SHARED            0x01
 #define FLAGS_CLOCKRT           0x02
+#define FLAGS_HAS_TIMEOUT       0x04
 static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait(u32 __user *uaddr, int fshared,
+/**
-                      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr:      user address of the futex
+ * @fshared:    whether the futex is shared (1) or not (0)
+ * @q:          futex_q (contains pi_state and access to the rt_mutex)
+ * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ *  1 - success, lock taken
+ *  0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+                       int locked)
 {
-        struct task_struct *curr = current;
+        struct task_struct *owner;
-        struct restart_block *restart;
+        int ret = 0;
-        DECLARE_WAITQUEUE(wait, curr);
-        struct futex_hash_bucket *hb;
-        struct futex_q q;
-        u32 uval;
-        int ret;
-        struct hrtimer_sleeper t;
-        int rem = 0;
-        if (!bitset)
+        if (locked) {
-                return -EINVAL;
+                /*
+                 * Got the lock. We might not be the anticipated owner if we
+                 * did a lock-steal - fix up the PI-state in that case:
+                 */
+                if (q->pi_state->owner != current)
+                        ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+                goto out;
+        }
-        q.pi_state = NULL;
+        /*
-        q.bitset = bitset;
+         * Catch the rare case, where the lock was released when we were on the
-retry:
+         * way back before we locked the hash bucket.
-        q.key = FUTEX_KEY_INIT;
+         */
-        ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ);
+        if (q->pi_state->owner == current) {
-        if (unlikely(ret != 0))
+                /*
+                 * Try to get the rt_mutex now. This might fail as some other
+                 * task acquired the rt_mutex after we removed ourself from the
+                 * rt_mutex waiters list.
+                 */
+                if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+                        locked = 1;
+                        goto out;
+                }
+                /*
+                 * pi_state is incorrect, some other task did a lock steal and
+                 * we returned due to timeout or signal without taking the
+                 * rt_mutex. Too late. We can access the rt_mutex_owner without
+                 * locking, as the other task is now blocked on the hash bucket
+                 * lock. Fix the state up.
+                 */
+                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+                ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
                goto out;
+        }
-retry_private:
+        /*
-        hb = queue_lock(&q);
+         * Paranoia check. If we did not take the lock, then we should not be
+         * the owner, nor the pending owner, of the rt_mutex.
+         */
+        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+                                "pi-state %p\n", ret,
+                                q->pi_state->pi_mutex.owner,
+                                q->pi_state->owner);
+out:
+        return ret ? ret : locked;
+}
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb:         the futex hash bucket, must be locked by the caller
+ * @q:          the futex_q to queue up on
+ * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+                                struct hrtimer_sleeper *timeout)
+{
+        queue_me(q, hb);
+        /*
+         * There might have been scheduling since the queue_me(), as we
+         * cannot hold a spinlock across the get_user() in case it
+         * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+         * queueing ourselves into the futex hash. This code thus has to
+         * rely on the futex_wake() code removing us from hash when it
+         * wakes us up.
+         */
+        set_current_state(TASK_INTERRUPTIBLE);
+        /* Arm the timer */
+        if (timeout) {
+                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+                if (!hrtimer_active(&timeout->timer))
+                        timeout->task = NULL;
+        }
+        /*
+         * !plist_node_empty() is safe here without any lock.
+         * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
+         */
+        if (likely(!plist_node_empty(&q->list))) {
+                /*
+                 * If the timer has already expired, current will already be
+                 * flagged for rescheduling. Only call schedule if there
+                 * is no timeout, or if it has yet to expire.
+                 */
+                if (!timeout || timeout->task)
+                        schedule();
+        }
+        __set_current_state(TASK_RUNNING);
+}
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr:      the futex userspace address
+ * @val:        the expected value
+ * @fshared:    whether the futex is shared (1) or not (0)
+ * @q:          the associated futex_q
+ * @hb:         storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket.  Get the futex value and
+ * compare it with the expected value.  Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ *  0 - uaddr contains val and hb has been locked
+ * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+                           struct futex_q *q, struct futex_hash_bucket **hb)
+{
+        u32 uval;
+        int ret;
        /*
         * Access the page AFTER the hash-bucket is locked.
@@ -1165,95 +1664,83 @@ retry_private:
         * A consequence is that futex_wait() can return zero and absorb
         * a wakeup when *uaddr != val on entry to the syscall.  This is
         * rare, but normal.
-         *
-         * For shared futexes, we hold the mmap semaphore, so the mapping
-         * cannot have changed since we looked it up in get_futex_key.
         */
+retry:
+        q->key = FUTEX_KEY_INIT;
+        ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+        if (unlikely(ret != 0))
+                return ret;
+retry_private:
+        *hb = queue_lock(q);
        ret = get_futex_value_locked(&uval, uaddr);
-        if (unlikely(ret)) {
+        if (ret) {
-                queue_unlock(&q, hb);
+                queue_unlock(q, *hb);
                ret = get_user(uval, uaddr);
                if (ret)
-                        goto out_put_key;
+                        goto out;
                if (!fshared)
                        goto retry_private;
-                put_futex_key(fshared, &q.key);
+                put_futex_key(fshared, &q->key);
                goto retry;
        }
-        ret = -EWOULDBLOCK;
-        if (unlikely(uval != val)) {
-                queue_unlock(&q, hb);
-                goto out_put_key;
-        }
-        /* Only actually queue if *uaddr contained val.  */
+        if (uval != val) {
-        queue_me(&q, hb);
+                queue_unlock(q, *hb);
+                ret = -EWOULDBLOCK;
+        }
-        /*
+out:
-         * There might have been scheduling since the queue_me(), as we
+        if (ret)
-         * cannot hold a spinlock across the get_user() in case it
+                put_futex_key(fshared, &q->key);
-         * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+        return ret;
-         * queueing ourselves into the futex hash.  This code thus has to
+}
-         * rely on the futex_wake() code removing us from hash when it
-         * wakes us up.
-         */
-        /* add_wait_queue is the barrier after __set_current_state. */
+static int futex_wait(u32 __user *uaddr, int fshared,
-        __set_current_state(TASK_INTERRUPTIBLE);
+                      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
-        add_wait_queue(&q.waiter, &wait);
+{
-        /*
+        struct hrtimer_sleeper timeout, *to = NULL;
-         * !plist_node_empty() is safe here without any lock.
+        struct restart_block *restart;
-         * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+        struct futex_hash_bucket *hb;
-         */
+        struct futex_q q;
-        if (likely(!plist_node_empty(&q.list))) {
+        int ret;
-                if (!abs_time)
-                        schedule();
-                else {
-                        hrtimer_init_on_stack(&t.timer,
-                                              clockrt ? CLOCK_REALTIME :
-                                              CLOCK_MONOTONIC,
-                                              HRTIMER_MODE_ABS);
-                        hrtimer_init_sleeper(&t, current);
-                        hrtimer_set_expires_range_ns(&t.timer, *abs_time,
-                                                     current->timer_slack_ns);
-                        hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
-                        if (!hrtimer_active(&t.timer))
-                                t.task = NULL;
-                        /*
+        if (!bitset)
-                         * the timer could have already expired, in which
+                return -EINVAL;
-                         * case current would be flagged for rescheduling.
-                         * Don't bother calling schedule.
-                         */
-                        if (likely(t.task))
-                                schedule();
-                        hrtimer_cancel(&t.timer);
+        q.pi_state = NULL;
+        q.bitset = bitset;
+        q.rt_waiter = NULL;
-                        /* Flag if a timeout occured */
+        if (abs_time) {
-                        rem = (t.task == NULL);
+                to = &timeout;
-                        destroy_hrtimer_on_stack(&t.timer);
+                hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                }
+                                      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+                hrtimer_init_sleeper(to, current);
+                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+                                             current->timer_slack_ns);
        }
-        __set_current_state(TASK_RUNNING);
-        /*
+        /* Prepare to wait on uaddr. */
-         * NOTE: we don't remove ourselves from the waitqueue because
+        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
-         * we are the only user of it.
+        if (ret)
-         */
+                goto out;
+        /* queue_me and wait for wakeup, timeout, or a signal. */
+        futex_wait_queue_me(hb, &q, to);
        /* If we were woken (and unqueued), we succeeded, whatever. */
        ret = 0;
        if (!unqueue_me(&q))
                goto out_put_key;
        ret = -ETIMEDOUT;
-        if (rem)
+        if (to && !to->task)
                goto out_put_key;
        /*
@@ -1270,7 +1757,7 @@ retry_private:
        restart->futex.val = val;
        restart->futex.time = abs_time->tv64;
        restart->futex.bitset = bitset;
-        restart->futex.flags = 0;
+        restart->futex.flags = FLAGS_HAS_TIMEOUT;
        if (fshared)
                restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1769,10 @@ retry_private:
 out_put_key:
        put_futex_key(fshared, &q.key);
 out:
+        if (to) {
+                hrtimer_cancel(&to->timer);
+                destroy_hrtimer_on_stack(&to->timer);
+        }
        return ret;
 }
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
 {
        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
        int fshared = 0;
-        ktime_t t;
+        ktime_t t, *tp = NULL;
-        t.tv64 = restart->futex.time;
+        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+                t.tv64 = restart->futex.time;
+                tp = &t;
+        }
        restart->fn = do_no_restart_syscall;
        if (restart->futex.flags & FLAGS_SHARED)
                fshared = 1;
-        return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+        return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
                                restart->futex.bitset,
                                restart->futex.flags & FLAGS_CLOCKRT);
 }
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                         int detect, ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
-        struct task_struct *curr = current;
        struct futex_hash_bucket *hb;
-        u32 uval, newval, curval;
+        u32 uval;
        struct futex_q q;
-        int ret, lock_taken, ownerdied = 0;
+        int res, ret;
        if (refill_pi_state_cache())
                return -ENOMEM;
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
        }
        q.pi_state = NULL;
+        q.rt_waiter = NULL;
 retry:
        q.key = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1833,15 @@ retry:
 retry_private:
        hb = queue_lock(&q);
-retry_locked:
+        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
-        ret = lock_taken = 0;
-        /*
-         * To avoid races, we attempt to take the lock here again
-         * (by doing a 0 -> TID atomic cmpxchg), while holding all
-         * the locks. It will most likely not succeed.
-         */
-        newval = task_pid_vnr(current);
-        curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-        if (unlikely(curval == -EFAULT))
-                goto uaddr_faulted;
-        /*
-         * Detect deadlocks. In case of REQUEUE_PI this is a valid
-         * situation and we return success to user space.
-         */
-        if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
-                ret = -EDEADLK;
-                goto out_unlock_put_key;
-        }
-        /*
-         * Surprise - we got the lock. Just return to userspace:
-         */
-        if (unlikely(!curval))
-                goto out_unlock_put_key;
-        uval = curval;
-        /*
-         * Set the WAITERS flag, so the owner will know it has someone
-         * to wake at next unlock
-         */
-        newval = curval | FUTEX_WAITERS;
-        /*
-         * There are two cases, where a futex might have no owner (the
-         * owner TID is 0): OWNER_DIED. We take over the futex in this
-         * case. We also do an unconditional take over, when the owner
-         * of the futex died.
-         *
-         * This is safe as we are protected by the hash bucket lock !
-         */
-        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-                /* Keep the OWNER_DIED bit */
-                newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
-                ownerdied = 0;
-                lock_taken = 1;
-        }
-        curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-        if (unlikely(curval == -EFAULT))
-                goto uaddr_faulted;
-        if (unlikely(curval != uval))
-                goto retry_locked;
-        /*
-         * We took the lock due to owner died take over.
-         */
-        if (unlikely(lock_taken))
-                goto out_unlock_put_key;
-        /*
-         * We dont have the lock. Look up the PI state (or create it if
-         * we are the first waiter):
-         */
-        ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
        if (unlikely(ret)) {
                switch (ret) {
+                case 1:
+                        /* We got the lock. */
+                        ret = 0;
+                        goto out_unlock_put_key;
+                case -EFAULT:
+                        goto uaddr_faulted;
                case -EAGAIN:
                        /*
                         * Task is exiting and we just wait for the
@@ -1423,25 +1851,6 @@ retry_locked:
                        put_futex_key(fshared, &q.key);
                        cond_resched();
                        goto retry;
-                case -ESRCH:
-                        /*
-                         * No owner found for this futex. Check if the
-                         * OWNER_DIED bit is set to figure out whether
-                         * this is a robust futex or not.
-                         */
-                        if (get_futex_value_locked(&curval, uaddr))
-                                goto uaddr_faulted;
-                        /*
-                         * We simply start over in case of a robust
-                         * futex. The code above will take the futex
-                         * and return happy.
-                         */
-                        if (curval & FUTEX_OWNER_DIED) {
-                                ownerdied = 1;
-                                goto retry_locked;
-                        }
                default:
                        goto out_unlock_put_key;
                }
@@ -1465,71 +1874,21 @@ retry_locked:
        }
        spin_lock(q.lock_ptr);
+        /*
-        if (!ret) {
+         * Fixup the pi_state owner and possibly acquire the lock if we
-                /*
+         * haven't already.
-                 * Got the lock. We might not be the anticipated owner
+         */
-                 * if we did a lock-steal - fix up the PI-state in
+        res = fixup_owner(uaddr, fshared, &q, !ret);
-                 * that case:
+        /*
-                 */
+         * If fixup_owner() returned an error, propagate that.  If it acquired
-                if (q.pi_state->owner != curr)
+         * the lock, clear our -ETIMEDOUT or -EINTR.
-                        ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
+         */
-        } else {
+        if (res)
-                /*
+                ret = (res < 0) ? res : 0;
-                 * Catch the rare case, where the lock was released
-                 * when we were on the way back before we locked the
-                 * hash bucket.
-                 */
-                if (q.pi_state->owner == curr) {
-                        /*
-                         * Try to get the rt_mutex now. This might
-                         * fail as some other task acquired the
-                         * rt_mutex after we removed ourself from the
-                         * rt_mutex waiters list.
-                         */
-                        if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-                                ret = 0;
-                        else {
-                                /*
-                                 * pi_state is incorrect, some other
-                                 * task did a lock steal and we
-                                 * returned due to timeout or signal
-                                 * without taking the rt_mutex. Too
-                                 * late. We can access the
-                                 * rt_mutex_owner without locking, as
-                                 * the other task is now blocked on
-                                 * the hash bucket lock. Fix the state
-                                 * up.
-                                 */
-                                struct task_struct *owner;
-                                int res;
-                                owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-                                res = fixup_pi_state_owner(uaddr, &q, owner,
-                                                           fshared);
-                                /* propagate -EFAULT, if the fixup failed */
-                                if (res)
-                                        ret = res;
-                        }
-                } else {
-                        /*
-                         * Paranoia check. If we did not take the lock
-                         * in the trylock above, then we should not be
-                         * the owner of the rtmutex, neither the real
-                         * nor the pending one:
-                         */
-                        if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
-                                printk(KERN_ERR "futex_lock_pi: ret = %d "
-                                       "pi-mutex: %p pi-state %p\n", ret,
-                                       q.pi_state->pi_mutex.owner,
-                                       q.pi_state->owner);
-                }
-        }
        /*
-         * If fixup_pi_state_owner() faulted and was unable to handle the
+         * If fixup_owner() faulted and was unable to handle the fault, unlock
-         * fault, unlock it and return the fault to userspace.
+         * it and return the fault to userspace.
         */
        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
                rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1896,7 @@ retry_locked:
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
-        if (to)
+        goto out;
-                destroy_hrtimer_on_stack(&to->timer);
-        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 out_unlock_put_key:
        queue_unlock(&q, hb);
@@ -1549,7 +1906,7 @@ out_put_key:
 out:
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
-        return ret;
+        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 uaddr_faulted:
        /*
@@ -1572,7 +1929,6 @@ uaddr_faulted:
        goto retry;
 }
 /*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1674,6 +2030,229 @@ pi_faulted:
        return ret;
 }
+/**
+ * handle_early_requeue_pi_wakeup() - Classify a wakeup that beat the requeue
+ * @hb:         the hash bucket the futex_q was originally queued on
+ * @q:          the futex_q that was woken while waiting to be requeued
+ * @key2:       the futex key of the requeue target futex
+ * @timeout:    the sleeper used for the wait (NULL if no timeout was armed)
+ *
+ * If q->key does not match @key2 the task was woken while still queued on
+ * the initial futex, i.e. before any requeue took place.  In that case the
+ * futex_q is removed from its plist, its key references are dropped, and the
+ * cause of the wakeup (timeout vs. signal) is reported to the caller.
+ *
+ * Must be called with the hb lock held.
+ *
+ * Returns
+ *  0 - q->key matches @key2, no early wakeup
+ * <0 - -ETIMEDOUT (timeout fired) or -ERESTARTNOINTR (anything else)
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+                                   struct futex_q *q, union futex_key *key2,
+                                   struct hrtimer_sleeper *timeout)
+{
+        /*
+         * Holding hb (without hb2) is sufficient here: the wakeup code
+         * cannot switch q.key from uaddr to uaddr2 while we hold hb, and
+         * a further requeue away from uaddr2 is impossible because a PI
+         * aware futex is not supported as a requeue source.
+         */
+        if (match_futex(&q->key, key2))
+                return 0;
+        WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+        /*
+         * A timeout or a signal woke us before the requeue happened.
+         * Take the futex_q off the queue, then work out which one it was.
+         */
+        plist_del(&q->list, &q->list.plist);
+        drop_futex_key_refs(&q->key);
+        /* The sleeper's task pointer is clear once the timer has expired. */
+        if (timeout && !timeout->task)
+                return -ETIMEDOUT;
+        return -ERESTARTNOINTR;
+}
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr:      the futex we initially wait on (non-pi)
+ * @fshared:    whether the futexes are shared (1) or not (0).  They must be
+ *              the same type, no requeueing from private to shared, etc.
+ * @val:        the expected value of uaddr
+ * @abs_time:   absolute timeout
+ * @bitset:     32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt:    whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2:     the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue and subsequent unlock
+ * 3) signal (before or after requeue)
+ * 4) timeout (before or after requeue)
+ *
+ * If 3 happens before the requeue, we cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, we return -EWOULDBLOCK (there is no way to restart with
+ * futex_lock_pi() directly, see the comment in the function body).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ *  0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+                                 u32 val, ktime_t *abs_time, u32 bitset,
+                                 int clockrt, u32 __user *uaddr2)
+{
+        struct hrtimer_sleeper timeout, *to = NULL;
+        struct rt_mutex_waiter rt_waiter;
+        struct rt_mutex *pi_mutex = NULL;
+        struct futex_hash_bucket *hb;
+        union futex_key key2;
+        struct futex_q q;
+        int res, ret;
+        if (!bitset)
+                return -EINVAL;
+        if (abs_time) {
+                to = &timeout;
+                hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+                                      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+                hrtimer_init_sleeper(to, current);
+                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+                                             current->timer_slack_ns);
+        }
+        /*
+         * The waiter is allocated on our stack, manipulated by the requeue
+         * code while we sleep on uaddr.
+         */
+        debug_rt_mutex_init_waiter(&rt_waiter);
+        rt_waiter.task = NULL;
+        q.pi_state = NULL;
+        q.bitset = bitset;
+        q.rt_waiter = &rt_waiter;
+        key2 = FUTEX_KEY_INIT;
+        ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+        if (unlikely(ret != 0))
+                goto out;
+        /* Prepare to wait on uaddr. */
+        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+        if (ret)
+                goto out_key2;
+        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+        futex_wait_queue_me(hb, &q, to);
+        spin_lock(&hb->lock);
+        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+        spin_unlock(&hb->lock);
+        if (ret)
+                goto out_put_keys;
+        /*
+         * In order for us to be here, we know our q.key == key2, and since
+         * we took the hb->lock above, we also know that futex_requeue() has
+         * completed and we no longer have to concern ourselves with a wakeup
+         * race with the atomic proxy lock acquisition by the requeue code.
+         */
+        /* Check if the requeue code acquired the second futex for us. */
+        if (!q.rt_waiter) {
+                /*
+                 * Got the lock. We might not be the anticipated owner if we
+                 * did a lock-steal - fix up the PI-state in that case.
+                 */
+                if (q.pi_state && (q.pi_state->owner != current)) {
+                        spin_lock(q.lock_ptr);
+                        ret = fixup_pi_state_owner(uaddr2, &q, current,
+                                                   fshared);
+                        spin_unlock(q.lock_ptr);
+                }
+        } else {
+                /*
+                 * We have been woken up by futex_unlock_pi(), a timeout, or a
+                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
+                 * the pi_state.
+                 */
+                /* Test the pointer itself; its address is never NULL. */
+                WARN_ON(!q.pi_state);
+                pi_mutex = &q.pi_state->pi_mutex;
+                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+                debug_rt_mutex_free_waiter(&rt_waiter);
+                spin_lock(q.lock_ptr);
+                /*
+                 * Fixup the pi_state owner and possibly acquire the lock if we
+                 * haven't already.
+                 */
+                res = fixup_owner(uaddr2, fshared, &q, !ret);
+                /*
+                 * If fixup_owner() returned an error, propagate that.  If it
+                 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+                 */
+                if (res)
+                        ret = (res < 0) ? res : 0;
+                /* Unqueue and drop the lock. */
+                unqueue_me_pi(&q);
+        }
+        /*
+         * If fixup_pi_state_owner() faulted and was unable to handle the
+         * fault, unlock the rt_mutex and return the fault to userspace.
+         * pi_mutex is only set on the rt_waiter path above; it is still NULL
+         * when the requeue code acquired the lock for us, so it must be
+         * checked before it is handed to rt_mutex_owner().
+         */
+        if (ret == -EFAULT) {
+                if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
+                        rt_mutex_unlock(pi_mutex);
+        } else if (ret == -EINTR) {
+                /*
+                 * We've already been requeued, but we have no way to
+                 * restart by calling futex_lock_pi() directly. We
+                 * could restart the syscall, but that will look at
+                 * the user space value and return right away. So we
+                 * drop back with EWOULDBLOCK to tell user space that
+                 * "val" has been changed. That's the same what the
+                 * restart of the syscall would do in
+                 * futex_wait_setup().
+                 */
+                ret = -EWOULDBLOCK;
+        }
+out_put_keys:
+        put_futex_key(fshared, &q.key);
+out_key2:
+        put_futex_key(fshared, &key2);
+out:
+        if (to) {
+                hrtimer_cancel(&to->timer);
+                destroy_hrtimer_on_stack(&to->timer);
+        }
+        return ret;
+}
 /*
 * Support for robust futexes: the kernel cleans up held futexes at
 * thread exit time.
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                fshared = 1;
        clockrt = op & FUTEX_CLOCK_REALTIME;
-        if (clockrt && cmd != FUTEX_WAIT_BITSET)
+        if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                return -ENOSYS;
        switch (cmd) {
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                ret = futex_wake(uaddr, fshared, val, val3);
                break;
        case FUTEX_REQUEUE:
-                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
                break;
        case FUTEX_CMP_REQUEUE:
-                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+                                    0);
                break;
        case FUTEX_WAKE_OP:
                ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                if (futex_cmpxchg_enabled)
                        ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
                break;
+        case FUTEX_WAIT_REQUEUE_PI:
+                val3 = FUTEX_BITSET_MATCH_ANY;
+                ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+                                            clockrt, uaddr2);
+                break;
+        case FUTEX_CMP_REQUEUE_PI:
+                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+                                    1);
+                break;
        default:
                ret = -ENOSYS;
        }
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        int cmd = op & FUTEX_CMD_MASK;
        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-                      cmd == FUTEX_WAIT_BITSET)) {
+                      cmd == FUTEX_WAIT_BITSET ||
+                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
                        return -EFAULT;
                if (!timespec_valid(&ts))
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                tp = &t;
        }
        /*
-         * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
         */
        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-            cmd == FUTEX_WAKE_OP)
+            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
                val2 = (u32) (unsigned long) utime;
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
        spin_lock(&desc->lock);
        mask_ack_irq(desc, irq);
-        desc = irq_remap_to_desc(irq, desc);
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        desc->status &= ~IRQ_INPROGRESS;
 out:
        desc->chip->eoi(irq);
-        desc = irq_remap_to_desc(irq, desc);
        spin_unlock(&desc->lock);
 }
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                    !desc->action)) {
                desc->status |= (IRQ_PENDING | IRQ_MASKED);
                mask_ack_irq(desc, irq);
-                desc = irq_remap_to_desc(irq, desc);
                goto out_unlock;
        }
        kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
        /* Start handling the irq */
        if (desc->chip->ack)
                desc->chip->ack(irq);
-        desc = irq_remap_to_desc(irq, desc);
        /* Mark the IRQ currently in progress.*/
        desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
-        if (desc->chip->eoi) {
+        if (desc->chip->eoi)
                desc->chip->eoi(irq);
-                desc = irq_remap_to_desc(irq, desc);
-        }
 }
 void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
        /* Uninstall? */
        if (handle == handle_bad_irq) {
-                if (desc->chip != &no_irq_chip) {
+                if (desc->chip != &no_irq_chip)
                        mask_ack_irq(desc, irq);
-                        desc = irq_remap_to_desc(irq, desc);
-                }
                desc->status |= IRQ_DISABLED;
                desc->depth = 1;
        }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744f..18041a254d32 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
 */
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
        .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
-        int node;
        void *ptr;
-        node = cpu_to_node(cpu);
+        if (slab_is_available())
-        ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
+                ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+                                   GFP_ATOMIC, node);
+        else
+                ptr = alloc_bootmem_node(NODE_DATA(node),
+                                nr * sizeof(*desc->kstat_irqs));
        /*
         * don't overwite if can not get new one
         * init_copy_kstat_irqs() could still use old one
         */
        if (ptr) {
-                printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
+                printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
-                         cpu, node);
                desc->kstat_irqs = ptr;
        }
 }
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
 {
        memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
        spin_lock_init(&desc->lock);
        desc->irq = irq;
 #ifdef CONFIG_SMP
-        desc->cpu = cpu;
+        desc->node = node;
 #endif
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-        init_kstat_irqs(desc, cpu, nr_cpu_ids);
+        init_kstat_irqs(desc, node, nr_cpu_ids);
        if (!desc->kstat_irqs) {
                printk(KERN_ERR "can not alloc kstat_irqs\n");
                BUG_ON(1);
        }
-        if (!init_alloc_desc_masks(desc, cpu, false)) {
+        if (!alloc_desc_masks(desc, node, false)) {
                printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
                BUG_ON(1);
        }
-        arch_init_chip_data(desc, cpu);
+        init_desc_masks(desc);
+        arch_init_chip_data(desc, node);
 }
 /*
@@ -169,7 +173,8 @@ int __init early_irq_init(void)
                desc[i].irq = i;
                desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-                init_alloc_desc_masks(&desc[i], 0, true);
+                alloc_desc_masks(&desc[i], 0, true);
+                init_desc_masks(&desc[i]);
                irq_desc_ptrs[i] = desc + i;
        }
@@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
        return NULL;
 }
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 {
        struct irq_desc *desc;
        unsigned long flags;
-        int node;
        if (irq >= nr_irqs) {
                WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
        if (desc)
                goto out_unlock;
-        node = cpu_to_node(cpu);
+        if (slab_is_available())
-        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+                desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-        printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
+        else
-                 irq, cpu, node);
+                desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+        printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
        if (!desc) {
                printk(KERN_ERR "can not alloc irq_desc\n");
                BUG_ON(1);
        }
-        init_one_irq_desc(irq, desc, cpu);
+        init_one_irq_desc(irq, desc, node);
        irq_desc_ptrs[irq] = desc;
@@ -256,7 +262,8 @@ int __init early_irq_init(void)
        for (i = 0; i < count; i++) {
                desc[i].irq = i;
-                init_alloc_desc_masks(&desc[i], 0, true);
+                alloc_desc_masks(&desc[i], 0, true);
+                init_desc_masks(&desc[i]);
                desc[i].kstat_irqs = kstat_irqs_all[i];
        }
        return arch_early_irq_init();
@@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
        return (irq < NR_IRQS) ? irq_desc + irq : NULL;
 }
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 {
        return irq_to_desc(irq);
 }
@@ -453,11 +460,8 @@ unsigned int __do_IRQ(unsigned int irq)
                /*
                 * No locking required for CPU-local interrupts:
                 */
-                if (desc->chip->ack) {
+                if (desc->chip->ack)
                        desc->chip->ack(irq);
-                        /* get new one */
-                        desc = irq_remap_to_desc(irq, desc);
-                }
                if (likely(!(desc->status & IRQ_DISABLED))) {
                        action_ret = handle_IRQ_event(irq, desc->action);
                        if (!noirqdebug)
@@ -468,10 +472,8 @@ unsigned int __do_IRQ(unsigned int irq)
        }
        spin_lock(&desc->lock);
-        if (desc->chip->ack) {
+        if (desc->chip->ack)
                desc->chip->ack(irq);
-                desc = irq_remap_to_desc(irq, desc);
-        }
        /*
         * REPLAY is when Linux resends an IRQ that was dropped earlier
         * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..73468253143b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
 extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 extern void clear_kstat_irqs(struct irq_desc *desc);
 extern spinlock_t sparse_irq_lock;
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
 extern int irq_select_affinity_usr(unsigned int irq);
+extern void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
 /*
 * Debugging printout:
 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243..aaf5c9d05770 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
        return 1;
 }
-static void
+void
 irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
 {
        struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
        spin_lock_irqsave(&desc->lock, flags);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-        if (desc->status & IRQ_MOVE_PCNTXT)
+        if (desc->status & IRQ_MOVE_PCNTXT) {
-                desc->chip->set_affinity(irq, cpumask);
+                if (!desc->chip->set_affinity(irq, cpumask)) {
+                        cpumask_copy(desc->affinity, cpumask);
+                        irq_set_thread_affinity(desc, cpumask);
+                }
+        }
        else {
                desc->status |= IRQ_MOVE_PENDING;
                cpumask_copy(desc->pending_mask, cpumask);
        }
 #else
-        cpumask_copy(desc->affinity, cpumask);
+        if (!desc->chip->set_affinity(irq, cpumask)) {
-        desc->chip->set_affinity(irq, cpumask);
+                cpumask_copy(desc->affinity, cpumask);
+                irq_set_thread_affinity(desc, cpumask);
+        }
 #endif
-        irq_set_thread_affinity(desc, cpumask);
        desc->status |= IRQ_AFFINITY_SET;
        spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..cfe767ca1545 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
 #include <linux/irq.h>
+#include <linux/interrupt.h>
+#include "internals.h"
 void move_masked_irq(int irq)
 {
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
         * masking the irqs.
         */
        if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-                   < nr_cpu_ids)) {
+                   < nr_cpu_ids))
-                cpumask_and(desc->affinity,
+                if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
-                            desc->pending_mask, cpu_online_mask);
+                        cpumask_copy(desc->affinity, desc->pending_mask);
-                desc->chip->set_affinity(irq, desc->affinity);
+                        irq_set_thread_affinity(desc, desc->pending_mask);
-        }
+                }
        cpumask_clear(desc->pending_mask);
 }
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..2f69bee57bf2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
 static void init_copy_kstat_irqs(struct irq_desc *old_desc,
                                 struct irq_desc *desc,
-                                 int cpu, int nr)
+                                 int node, int nr)
 {
-        init_kstat_irqs(desc, cpu, nr);
+        init_kstat_irqs(desc, node, nr);
        if (desc->kstat_irqs != old_desc->kstat_irqs)
                memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-                 struct irq_desc *desc, int cpu)
+                 struct irq_desc *desc, int node)
 {
        memcpy(desc, old_desc, sizeof(struct irq_desc));
-        if (!init_alloc_desc_masks(desc, cpu, false)) {
+        if (!alloc_desc_masks(desc, node, false)) {
                printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
                                "for migration.\n", irq);
                return false;
        }
        spin_lock_init(&desc->lock);
-        desc->cpu = cpu;
+        desc->node = node;
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-        init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+        init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
        init_copy_desc_masks(old_desc, desc);
-        arch_init_copy_chip_data(old_desc, desc, cpu);
+        arch_init_copy_chip_data(old_desc, desc, node);
        return true;
 }
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-                                                int cpu)
+                                                int node)
 {
        struct irq_desc *desc;
        unsigned int irq;
        unsigned long flags;
-        int node;
        irq = old_desc->irq;
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
        if (desc && old_desc != desc)
                goto out_unlock;
-        node = cpu_to_node(cpu);
        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
        if (!desc) {
                printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
                desc = old_desc;
                goto out_unlock;
        }
-        if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
+        if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
                /* still use old one */
                kfree(desc);
                desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
        /* free the old one */
        free_one_irq_desc(old_desc, desc);
-        spin_unlock(&old_desc->lock);
        kfree(old_desc);
-        spin_lock(&desc->lock);
        return desc;
@@ -109,24 +105,14 @@ out_unlock:
        return desc;
 }
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 {
-        int old_cpu;
-        int node, old_node;
        /* those all static, do move them */
        if (desc->irq < NR_IRQS_LEGACY)
                return desc;
-        old_cpu = desc->cpu;
+        if (desc->node != node)
-        if (old_cpu != cpu) {
+                desc = __real_move_irq_desc(desc, node);
-                node = cpu_to_node(cpu);
-                old_node = cpu_to_node(old_cpu);
-                if (old_node != node)
-                        desc = __real_move_irq_desc(desc, cpu);
-                else
-                        desc->cpu = cpu;
-        }
        return desc;
 }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..e5cc0cd28d54 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                /* didnt get the lock, go to sleep: */
                spin_unlock_mutex(&lock->wait_lock, flags);
-                __schedule();
+                preempt_enable_no_resched();
+                schedule();
+                preempt_disable();
                spin_lock_mutex(&lock->wait_lock, flags);
        }
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
        return ret;
 }
 EXPORT_SYMBOL(mutex_trylock);
+/**
+ * atomic_dec_and_mutex_lock - decrement a counter, taking a mutex on zero
+ * @cnt: the atomic counter to decrement
+ * @lock: the mutex that is left held when the counter reaches 0
+ *
+ * Returns 1 with @lock held if the decrement reached 0; otherwise returns
+ * 0 with @lock not held.
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+        int hit_zero;
+        /* Lockless decrement, refused only when the counter is at 1. */
+        if (atomic_add_unless(cnt, -1, 1))
+                return 0;
+        /* The counter may drop to 0: do the decrement under the mutex. */
+        mutex_lock(lock);
+        hit_zero = atomic_dec_and_test(cnt);
+        /* The decrement did not reach 0 after all: give the lock back. */
+        if (!hit_zero)
+                mutex_unlock(lock);
+        return hit_zero ? 1 : 0;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..820c5af44f3e 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 * assigned pending owner [which might not have taken the
 * lock yet]:
 */
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+                                    struct task_struct *task)
 {
        struct task_struct *pendowner = rt_mutex_owner(lock);
        struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
        if (!rt_mutex_owner_pending(lock))
                return 0;
-        if (pendowner == current)
+        if (pendowner == task)
                return 1;
        spin_lock_irqsave(&pendowner->pi_lock, flags);
-        if (current->prio >= pendowner->prio) {
+        if (task->prio >= pendowner->prio) {
                spin_unlock_irqrestore(&pendowner->pi_lock, flags);
                return 0;
        }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
         * We are going to steal the lock and a waiter was
         * enqueued on the pending owners pi_waiters queue. So
         * we have to enqueue this waiter into
-         * current->pi_waiters list. This covers the case,
+         * task->pi_waiters list. This covers the case,
-         * where current is boosted because it holds another
+         * where task is boosted because it holds another
         * lock and gets unboosted because the booster is
         * interrupted, so we would delay a waiter with higher
-         * priority as current->normal_prio.
+         * priority as task->normal_prio.
         *
         * Note: in the rare case of a SCHED_OTHER task changing
         * its priority and thus stealing the lock, next->task
-         * might be current:
+         * might be task:
         */
-        if (likely(next->task != current)) {
+        if (likely(next->task != task)) {
-                spin_lock_irqsave(&current->pi_lock, flags);
+                spin_lock_irqsave(&task->pi_lock, flags);
-                plist_add(&next->pi_list_entry, &current->pi_waiters);
+                plist_add(&next->pi_list_entry, &task->pi_waiters);
-                __rt_mutex_adjust_prio(current);
+                __rt_mutex_adjust_prio(task);
-                spin_unlock_irqrestore(&current->pi_lock, flags);
+                spin_unlock_irqrestore(&task->pi_lock, flags);
        }
        return 1;
 }
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
         */
        mark_rt_mutex_waiters(lock);
-        if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+        if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
                return 0;
        /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
 */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
                                   struct rt_mutex_waiter *waiter,
+                                   struct task_struct *task,
                                   int detect_deadlock)
 {
        struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        unsigned long flags;
        int chain_walk = 0, res;
-        spin_lock_irqsave(&current->pi_lock, flags);
+        spin_lock_irqsave(&task->pi_lock, flags);
-        __rt_mutex_adjust_prio(current);
+        __rt_mutex_adjust_prio(task);
-        waiter->task = current;
+        waiter->task = task;
        waiter->lock = lock;
-        plist_node_init(&waiter->list_entry, current->prio);
+        plist_node_init(&waiter->list_entry, task->prio);
-        plist_node_init(&waiter->pi_list_entry, current->prio);
+        plist_node_init(&waiter->pi_list_entry, task->prio);
        /* Get the top priority waiter on the lock */
        if (rt_mutex_has_waiters(lock))
                top_waiter = rt_mutex_top_waiter(lock);
        plist_add(&waiter->list_entry, &lock->wait_list);
-        current->pi_blocked_on = waiter;
+        task->pi_blocked_on = waiter;
-        spin_unlock_irqrestore(&current->pi_lock, flags);
+        spin_unlock_irqrestore(&task->pi_lock, flags);
        if (waiter == rt_mutex_top_waiter(lock)) {
                spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        spin_unlock(&lock->wait_lock);
        res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
-                                         current);
+                                         task);
        spin_lock(&lock->wait_lock);
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
        rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
 }
-/*
+/**
- * Slow path lock function:
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock:                the rt_mutex to take
+ * @state:               the state the task should block in (TASK_INTERRUPTIBLE
+ *                       or TASK_UNINTERRUPTIBLE)
+ * @timeout:             the pre-initialized and started timer, or NULL for none
+ * @waiter:              the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:     passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
 */
 static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
-                  struct hrtimer_sleeper *timeout,
+                    struct hrtimer_sleeper *timeout,
-                  int detect_deadlock)
+                    struct rt_mutex_waiter *waiter,
+                    int detect_deadlock)
 {
-        struct rt_mutex_waiter waiter;
        int ret = 0;
-        debug_rt_mutex_init_waiter(&waiter);
-        waiter.task = NULL;
-        spin_lock(&lock->wait_lock);
-        /* Try to acquire the lock again: */
-        if (try_to_take_rt_mutex(lock)) {
-                spin_unlock(&lock->wait_lock);
-                return 0;
-        }
-        set_current_state(state);
-        /* Setup the timer, when timeout != NULL */
-        if (unlikely(timeout)) {
-                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-                if (!hrtimer_active(&timeout->timer))
-                        timeout->task = NULL;
-        }
        for (;;) {
                /* Try to acquire the lock: */
                if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
                }
                /*
-                 * waiter.task is NULL the first time we come here and
+                 * waiter->task is NULL the first time we come here and
                 * when we have been woken up by the previous owner
                 * but the lock got stolen by a higher prio task.
                 */
-                if (!waiter.task) {
+                if (!waiter->task) {
-                        ret = task_blocks_on_rt_mutex(lock, &waiter,
+                        ret = task_blocks_on_rt_mutex(lock, waiter, current,
                                                      detect_deadlock);
                        /*
                         * If we got woken up by the owner then start loop
                         * all over without going into schedule to try
                         * to get the lock now:
                         */
-                        if (unlikely(!waiter.task)) {
+                        if (unlikely(!waiter->task)) {
                                /*
                                 * Reset the return value. We might
                                 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
                spin_unlock(&lock->wait_lock);
-                debug_rt_mutex_print_deadlock(&waiter);
+                debug_rt_mutex_print_deadlock(waiter);
-                if (waiter.task)
+                if (waiter->task)
                        schedule_rt_mutex(lock);
                spin_lock(&lock->wait_lock);
                set_current_state(state);
        }
+        return ret;
+}
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+                  struct hrtimer_sleeper *timeout,
+                  int detect_deadlock)
+{
+        struct rt_mutex_waiter waiter;
+        int ret = 0;
+        debug_rt_mutex_init_waiter(&waiter);
+        waiter.task = NULL;
+        spin_lock(&lock->wait_lock);
+        /* Try to acquire the lock again: */
+        if (try_to_take_rt_mutex(lock)) {
+                spin_unlock(&lock->wait_lock);
+                return 0;
+        }
+        set_current_state(state);
+        /* Setup the timer, when timeout != NULL */
+        if (unlikely(timeout)) {
+                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+                if (!hrtimer_active(&timeout->timer))
+                        timeout->task = NULL;
+        }
+        ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+                                  detect_deadlock);
        set_current_state(TASK_RUNNING);
        if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 /**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
- *                                     the timeout structure is provided
+ *                      the timeout structure is provided
- *                                     by the caller
+ *                      by the caller
 *
 * @lock:               the rt_mutex to be locked
 * @timeout:            timeout structure or NULL (no timeout)
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 }
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
-/***
+/**
 * rt_mutex_destroy - mark a mutex unusable
 * @lock: the mutex to be destroyed
 *
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 }
 /**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:               the rt_mutex to take
+ * @waiter:             the pre-initialized rt_mutex_waiter
+ * @task:               the task to prepare
+ * @detect_deadlock:    perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * lock->wait_lock is taken internally and released again on every return
+ * path.
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                              struct rt_mutex_waiter *waiter,
+                              struct task_struct *task, int detect_deadlock)
+{
+        int ret;
+        spin_lock(&lock->wait_lock);
+        mark_rt_mutex_waiters(lock);
+        if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+                /* We got the lock for task. */
+                debug_rt_mutex_lock(lock);
+                rt_mutex_set_owner(lock, task, 0);
+                /* Drop wait_lock here too; do not return with it held. */
+                spin_unlock(&lock->wait_lock);
+                rt_mutex_deadlock_account_lock(lock, task);
+                return 1;
+        }
+        /* Lock is owned and cannot be stolen: enqueue @waiter for @task. */
+        ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+        if (ret && !waiter->task) {
+                /*
+                 * Reset the return value. We might have
+                 * returned with -EDEADLK and the owner
+                 * released the lock while we were walking the
+                 * pi chain.  Let the waiter sort it out.
+                 */
+                ret = 0;
+        }
+        spin_unlock(&lock->wait_lock);
+        debug_rt_mutex_print_deadlock(waiter);
+        return ret;
+}
+/**
 * rt_mutex_next_owner - return the next owner of the lock
 *
 * @lock: the rt lock query
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
        return rt_mutex_top_waiter(lock)->task;
 }
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock:               the rt_mutex we were woken on
+ * @to:                 the timeout, NULL if none. The hrtimer should already
+ *                      have been started.
+ * @waiter:             the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:    perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started on our behalf by another thread.
+ * Takes and releases lock->wait_lock internally and blocks in
+ * TASK_INTERRUPTIBLE state while waiting.
+ *
+ * Returns:
+ *  0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+                               struct hrtimer_sleeper *to,
+                               struct rt_mutex_waiter *waiter,
+                               int detect_deadlock)
+{
+        int ret;
+        spin_lock(&lock->wait_lock);
+        /* __rt_mutex_slowlock() must be entered with wait_lock held. */
+        set_current_state(TASK_INTERRUPTIBLE);
+        ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+                                  detect_deadlock);
+        set_current_state(TASK_RUNNING);
+        /* Still enqueued after the wait loop: take the waiter off the lock. */
+        if (unlikely(waiter->task))
+                remove_waiter(lock, waiter);
+        /*
+         * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+         * have to fix that up.
+         */
+        fixup_rt_mutex_waiters(lock);
+        spin_unlock(&lock->wait_lock);
+        /*
+         * Readjust priority, when we did not get the lock. We might have been
+         * the pending owner and boosted. Since we did not take the lock, the
+         * PI boost has to go.
+         */
+        if (unlikely(ret))
+                rt_mutex_adjust_prio(current);
+        return ret;
+}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                  struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                     struct rt_mutex_waiter *waiter,
+                                     struct task_struct *task,
+                                     int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+                                      struct hrtimer_sleeper *to,
+                                      struct rt_mutex_waiter *waiter,
+                                      int detect_deadlock);
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..076e403b9c88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
        struct list_head migration_queue;
 #endif
+        /* calc_load related fields */
+        unsigned long calc_load_update;
+        long calc_load_active;
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
        int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
+static void calc_load_account_active(struct rq *this_rq);
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2458,6 +2464,17 @@ out:
        return success;
 }
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
        return try_to_wake_up(p, TASK_ALL, 0);
@@ -2766,7 +2783,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
-        arch_enter_lazy_cpu_mode();
+        arch_start_context_switch(prev);
        if (unlikely(!mm)) {
                next->active_mm = oldmm;
@@ -2856,19 +2873,72 @@ unsigned long nr_iowait(void)
        return sum;
 }
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+/**
+ * get_avenrun - get the load average array
+ * @loads:      pointer to dest load array
+ * @offset:     offset to add
+ * @shift:      shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 {
-        unsigned long i, running = 0, uninterruptible = 0;
+        loads[0] = (avenrun[0] + offset) << shift;
+        loads[1] = (avenrun[1] + offset) << shift;
+        loads[2] = (avenrun[2] + offset) << shift;
+}
-        for_each_online_cpu(i) {
+static unsigned long
-                running += cpu_rq(i)->nr_running;
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
-                uninterruptible += cpu_rq(i)->nr_uninterruptible;
+{
-        }
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        return load >> FSHIFT;
+}
-        if (unlikely((long)uninterruptible < 0))
+/*
-                uninterruptible = 0;
+ * calc_global_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+        unsigned long upd = calc_load_update + 10;
+        long active;
+        if (time_before(jiffies, upd))
+                return;
+        active = atomic_long_read(&calc_load_tasks);
+        active = active > 0 ? active * FIXED_1 : 0;
-        return running + uninterruptible;
+        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+        calc_load_update += LOAD_FREQ;
+}
+/*
+ * Fold the change in this runqueue's active count (nr_running plus
+ * nr_uninterruptible) since the last call into calc_load_tasks.
+ *
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+        long nr_active = this_rq->nr_running;
+        long delta;
+        nr_active += (long) this_rq->nr_uninterruptible;
+        /* Nothing changed since the last fold: leave the counter alone. */
+        if (nr_active == this_rq->calc_load_active)
+                return;
+        delta = nr_active - this_rq->calc_load_active;
+        this_rq->calc_load_active = nr_active;
+        atomic_long_add(delta, &calc_load_tasks);
 }
 /*
@@ -2899,6 +2969,11 @@ static void update_cpu_load(struct rq *this_rq)
                        new_load += scale-1;
                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
        }
+        if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+                this_rq->calc_load_update += LOAD_FREQ;
+                calc_load_account_active(this_rq);
+        }
 }
 #ifdef CONFIG_SMP
@@ -4240,10 +4315,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
        atomic_t load_balancer;
        cpumask_var_t cpu_mask;
+        cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
 };
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:        The cpu whose sched_domain hierarchy is searched.
+ * @flag:       The sched_domain flag bit(s) to look for.
+ *
+ * Walks the domains of @cpu in for_each_domain() order and returns the
+ * first one that has @flag set. If none matches, the value @sd holds when
+ * for_each_domain() terminates is returned (presumably NULL - confirm
+ * against the for_each_domain() definition).
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+        for_each_domain(cpu, sd) {
+                /* Keep walking until a domain carries the flag. */
+                if (!sd || !(sd->flags & flag))
+                        continue;
+                break;
+        }
+        return sd;
+}
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:        The cpu whose domains we're iterating over.
+ * @sd:         Cursor variable (struct sched_domain *), assigned on each
+ *              step.
+ * @flag:       The flag to filter the sched_domains to be iterated.
+ *
+ * Starts at lowest_flag_domain(@cpu, @flag) and follows ->parent upwards.
+ * The walk stops at the first domain that does not have @flag set, or
+ * when the parent chain ends, so flagged domains above an unflagged one
+ * are not visited.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+        for (sd = lowest_flag_domain(cpu, flag); \
+                (sd && (sd->flags & (flag))); sd = sd->parent)
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:  group to be checked for semi-idleness
+ *
+ * Returns:     1 if the group is semi-idle. 0 otherwise.
+ *
+ * A sched_group is semi-idle if it has at least one idle CPU and at least
+ * one non-idle CPU.
+ *
+ * Side effect: nohz.ilb_grp_nohz_mask is overwritten with the intersection
+ * of nohz.cpu_mask and the cpus of @ilb_group.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                    sched_group_cpus(ilb_group));
+        /*
+         * Empty intersection: no cpu of the group is idle.
+         * Intersection equal to the group: every cpu of the group is idle.
+         * Neither of the two counts as semi-idle.
+         */
+        return !cpumask_empty(nohz.ilb_grp_nohz_mask) &&
+               !cpumask_equal(nohz.ilb_grp_nohz_mask,
+                              sched_group_cpus(ilb_group));
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:        The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:     The id of the idle load balancer if it exists,
+ *              else a value >= nr_cpu_ids.
+ *
+ * With power-aware balancing enabled and at least two cpus in
+ * nohz.cpu_mask, the SD_POWERSAVINGS_BALANCE domains of @cpu are walked
+ * and the first cpu left in nohz.ilb_grp_nohz_mask by the first semi-idle
+ * group is picked. The idea is to avoid using completely idle
+ * packages/cores just for idle load balancing when other idle cpus are
+ * better suited. In every other case the first cpu in nohz.cpu_mask is
+ * returned.
+ */
+static int find_new_ilb(int cpu)
+{
+        struct sched_domain *sd;
+        struct sched_group *grp;
+        /* Semi-idle selection only applies to power-aware balancing. */
+        if (!sched_smt_power_savings && !sched_mc_power_savings)
+                return cpumask_first(nohz.cpu_mask);
+        /* Zero or one idle cpu: no point walking the domain hierarchy. */
+        if (cpumask_weight(nohz.cpu_mask) < 2)
+                return cpumask_first(nohz.cpu_mask);
+        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+                /* Visit each group once; stop when back at sd->groups. */
+                grp = sd->groups;
+                do {
+                        if (is_semi_idle_group(grp))
+                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+                        grp = grp->next;
+                } while (grp != sd->groups);
+        }
+        /* No semi-idle group found: fall back to the simple choice. */
+        return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+/*
+ * Variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
+ * @call_cpu is ignored and the first cpu in nohz.cpu_mask is returned.
+ */
+static inline int find_new_ilb(int call_cpu)
+{
+        return cpumask_first(nohz.cpu_mask);
+}
+#endif
 /*
 * This routine will try to nominate the ilb (idle load balancing)
 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4489,24 @@ int select_nohz_load_balancer(int stop_tick)
                        /* make me the ilb owner */
                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu)
+                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                        int new_ilb;
+                        if (!(sched_smt_power_savings ||
+                                                sched_mc_power_savings))
+                                return 1;
+                        /*
+                         * Check to see if there is a more power-efficient
+                         * ilb.
+                         */
+                        new_ilb = find_new_ilb(cpu);
+                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                                atomic_set(&nohz.load_balancer, -1);
+                                resched_cpu(new_ilb);
+                                return 0;
+                        }
                        return 1;
+                }
        } else {
                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
@@ -4468,15 +4675,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                }
                if (atomic_read(&nohz.load_balancer) == -1) {
-                        /*
+                        int ilb = find_new_ilb(cpu);
-                         * simple selection for now: Nominate the
-                         * first cpu in the nohz list to be the next
-                         * ilb owner.
-                         *
-                         * TBD: Traverse the sched domains and nominate
-                         * the nearest cpu in the nohz.cpu_mask.
-                         */
-                        int ilb = cpumask_first(nohz.cpu_mask);
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
@@ -5007,13 +5206,15 @@ pick_next_task(struct rq *rq)
 /*
 * schedule() is the main scheduler function.
 */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
+need_resched:
+        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@ -5070,15 +5271,9 @@ need_resched_nonpreemptible:
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
-}
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-        preempt_disable();
-        __schedule();
        preempt_enable_no_resched();
-        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+        if (need_resched())
                goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5416,7 @@ EXPORT_SYMBOL(default_wake_function);
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, int sync, void *key)
 {
        wait_queue_t *curr, *next;
@@ -5241,6 +5436,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
 */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
@@ -5279,6 +5477,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 * with each other. This can prevent needless bouncing between CPUs.
 *
 * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
 */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
@@ -5315,6 +5516,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 * awakened in the same order in which they were queued.
 *
 * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
 */
 void complete(struct completion *x)
 {
@@ -5332,6 +5536,9 @@ EXPORT_SYMBOL(complete);
 * @x:  holds the state of this particular completion
 *
 * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
 */
 void complete_all(struct completion *x)
 {
@@ -6490,8 +6697,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
        free = stack_not_used(p);
 #endif
-        printk(KERN_CONT "%5lu %5d %6d\n", free,
+        printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-                task_pid_nr(p), task_pid_nr(p->real_parent));
+                task_pid_nr(p), task_pid_nr(p->real_parent),
+                (unsigned long)task_thread_info(p)->flags);
        show_stack(p, NULL);
 }
@@ -6970,6 +7178,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
        }
 }
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7420,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                /* Update our root-domain */
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
+                rq->calc_load_update = calc_load_update;
+                rq->calc_load_active = 0;
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7243,7 +7461,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                cpuset_unlock();
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
+                calc_global_load_remove(rq);
                /*
                 * No need to migrate the tasks: it was best-effort if
                 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7971,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
 * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ *
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
 */
 struct static_sched_group {
        struct sched_group sg;
@@ -7875,7 +8094,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                        struct sched_domain *sd;
                        sd = &per_cpu(phys_domains, j).sd;
-                        if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                        if (j != group_first_cpu(sd->groups)) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
@@ -7953,7 +8172,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        WARN_ON(!sd || !sd->groups);
-        if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+        if (cpu != group_first_cpu(sd->groups))
                return;
        child = sd->child;
@@ -8938,6 +9157,8 @@ void __init sched_init(void)
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
                rq->nr_running = 0;
+                rq->calc_load_active = 0;
+                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9266,9 @@ void __init sched_init(void)
         * when this runqueue becomes "idle".
         */
        init_idle(current, smp_processor_id());
+        calc_load_update = jiffies + LOAD_FREQ;
        /*
         * During early bootup we pretend to be a normal task:
         */
@@ -9055,6 +9279,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
        alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+        alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
        alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10025,13 @@ static int sched_rt_global_constraints(void)
        if (sysctl_sched_rt_period <= 0)
                return -EINVAL;
+        /*
+         * There are always some RT tasks in the root group
+         * -- migration, kstopmachine, etc.
+         */
+        if (sysctl_sched_rt_runtime == 0)
+                return -EBUSY;
        spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89574cd..344712a5e3ed 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
                vec->count = 0;
                if (bootmem)
                        alloc_bootmem_cpumask_var(&vec->mask);
-                else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+                else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
                        goto cleanup;
        }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
        find_matching_se(&se, &pse);
-        while (se) {
+        BUG_ON(!pse);
-                BUG_ON(!pse);
-                if (wakeup_preempt_entity(se, pse) == 1) {
+        if (wakeup_preempt_entity(se, pse) == 1)
-                        resched_task(curr);
+                resched_task(curr);
-                        break;
-                }
-                se = parent_entity(se);
-                pse = parent_entity(pse);
-        }
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
        schedstat_inc(rq, sched_goidle);
+        /* adjust the active tasks as we might go into a long sleep */
+        calc_load_account_active(rq);
        return rq->idle;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f2c66f8f9712..9bf0d2a73045 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void)
        unsigned int i;
        for_each_possible_cpu(i)
-                alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+                zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
                                        GFP_KERNEL, cpu_to_node(i));
 }
 #endif /* CONFIG_SMP */
diff --git a/kernel/smp.c b/kernel/smp.c
index 858baac568ee..ad63d8501207 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+                if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                cpu_to_node(cpu)))
                        return NOTIFY_BAD;
                break;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511..f674f332a024 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void)
        return 0;
 }
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
 {
        return 0;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45bd711a242e..944ba03cae19 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = {
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "bootloader_version",
+                .data           = &bootloader_version,
+                .maxlen         = sizeof (int),
+                .mode           = 0444,
+                .proc_handler   = &proc_dointvec,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "kstack_depth_to_print",
                .data           = &kstack_depth_to_print,
                .maxlen         = sizeof(int),
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 /*
 * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
 */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a26ed294f938 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-        return nr_active() * FIXED_1;
-}
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-        unsigned long active_tasks; /* fixed-point */
-        static int count = LOAD_FREQ;
-        count -= ticks;
-        if (unlikely(count < 0)) {
-                active_tasks = count_active_tasks();
-                do {
-                        CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-                        CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-                        CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-                        count += LOAD_FREQ;
-                } while (count < 0);
-        }
-}
-/*
 * This function runs timers and the timer-tq in bottom half context.
 */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-        update_wall_time();
-        calc_load(ticks);
-}
-/*
 * The 64-bit jiffies value is not atomic - you MUST NOT read it
 * without sampling the sequence number in xtime_lock.
 * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
        jiffies_64 += ticks;
-        update_times(ticks);
+        update_wall_time();
+        calc_global_load();
 }
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
        unsigned long mem_total, sav_total;
        unsigned int mem_unit, bitcount;
-        unsigned long seq;
+        struct timespec tp;
        memset(info, 0, sizeof(struct sysinfo));
-        do {
+        ktime_get_ts(&tp);
-                struct timespec tp;
+        monotonic_to_bootbased(&tp);
-                seq = read_seqbegin(&xtime_lock);
+        info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
-                /*
-                 * This is annoying.  The below is the same thing
-                 * posix_get_clock_monotonic() does, but it wants to
-                 * take the lock which we want to cover the loads stuff
-                 * too.
-                 */
-                getnstimeofday(&tp);
-                tp.tv_sec += wall_to_monotonic.tv_sec;
-                tp.tv_nsec += wall_to_monotonic.tv_nsec;
-                monotonic_to_bootbased(&tp);
-                if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-                        tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-                        tp.tv_sec++;
-                }
-                info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
-                info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+        get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
-                info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-                info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
-                info->procs = nr_threads;
+        info->procs = nr_threads;
-        } while (read_seqretry(&xtime_lock, seq));
        si_meminfo(info);
        si_swapinfo(info);
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
        if (!list_empty(&wait->task_list))
                list_del_init(&wait->task_list);
        else if (waitqueue_active(q))
-                __wake_up_common(q, mode, 1, 0, key);
+                __wake_up_locked_key(q, mode, key);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);