Diffstat (limited to 'kernel')
-rw-r--r--   kernel/futex.c                    235
-rw-r--r--   kernel/hw_breakpoint.c              2
-rw-r--r--   kernel/kprobes.c                  565
-rw-r--r--   kernel/perf_event.c               573
-rw-r--r--   kernel/power/suspend.c              3
-rw-r--r--   kernel/rcutiny.c                  105
-rw-r--r--   kernel/rcutiny_plugin.h           433
-rw-r--r--   kernel/rcutorture.c               270
-rw-r--r--   kernel/rcutree.c                  156
-rw-r--r--   kernel/rcutree.h                   61
-rw-r--r--   kernel/rcutree_plugin.h           135
-rw-r--r--   kernel/rcutree_trace.c             12
-rw-r--r--   kernel/sched.c                     71
-rw-r--r--   kernel/srcu.c                       8
-rw-r--r--   kernel/sysctl.c                    16
-rw-r--r--   kernel/sysctl_binary.c              1
-rw-r--r--   kernel/trace/Kconfig               15
-rw-r--r--   kernel/trace/power-traces.c         5
-rw-r--r--   kernel/trace/trace_event_perf.c    31
-rw-r--r--   kernel/trace/trace_events.c         6
-rw-r--r--   kernel/trace/trace_export.c        14
-rw-r--r--   kernel/watchdog.c                   9
22 files changed, 2000 insertions, 726 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d..3019b92e691 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
123 u32 bitset; 131 u32 bitset;
124}; 132};
125 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me()*/
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
126/* 140/*
127 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
128 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
@@ -283,8 +297,7 @@ again:
283 return 0; 297 return 0;
284} 298}
285 299
286static inline 300static inline void put_futex_key(union futex_key *key)
287void put_futex_key(int fshared, union futex_key *key)
288{ 301{
289 drop_futex_key_refs(key); 302 drop_futex_key_refs(key);
290} 303}
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
870/* 883/*
871 * Wake up waiters matching bitset queued on this futex (uaddr). 884 * Wake up waiters matching bitset queued on this futex (uaddr).
872 */ 885 */
873static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 886static int
887futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
874{ 888{
875 struct futex_hash_bucket *hb; 889 struct futex_hash_bucket *hb;
876 struct futex_q *this, *next; 890 struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
881 if (!bitset) 895 if (!bitset)
882 return -EINVAL; 896 return -EINVAL;
883 897
884 ret = get_futex_key(uaddr, fshared, &key); 898 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
885 if (unlikely(ret != 0)) 899 if (unlikely(ret != 0))
886 goto out; 900 goto out;
887 901
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
907 } 921 }
908 922
909 spin_unlock(&hb->lock); 923 spin_unlock(&hb->lock);
910 put_futex_key(fshared, &key); 924 put_futex_key(&key);
911out: 925out:
912 return ret; 926 return ret;
913} 927}
@@ -917,7 +931,7 @@ out:
917 * to this virtual address: 931 * to this virtual address:
918 */ 932 */
919static int 933static int
920futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 934futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
921 int nr_wake, int nr_wake2, int op) 935 int nr_wake, int nr_wake2, int op)
922{ 936{
923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 937 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
927 int ret, op_ret; 941 int ret, op_ret;
928 942
929retry: 943retry:
930 ret = get_futex_key(uaddr1, fshared, &key1); 944 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
931 if (unlikely(ret != 0)) 945 if (unlikely(ret != 0))
932 goto out; 946 goto out;
933 ret = get_futex_key(uaddr2, fshared, &key2); 947 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
934 if (unlikely(ret != 0)) 948 if (unlikely(ret != 0))
935 goto out_put_key1; 949 goto out_put_key1;
936 950
@@ -962,11 +976,11 @@ retry_private:
962 if (ret) 976 if (ret)
963 goto out_put_keys; 977 goto out_put_keys;
964 978
965 if (!fshared) 979 if (!(flags & FLAGS_SHARED))
966 goto retry_private; 980 goto retry_private;
967 981
968 put_futex_key(fshared, &key2); 982 put_futex_key(&key2);
969 put_futex_key(fshared, &key1); 983 put_futex_key(&key1);
970 goto retry; 984 goto retry;
971 } 985 }
972 986
@@ -996,9 +1010,9 @@ retry_private:
996 1010
997 double_unlock_hb(hb1, hb2); 1011 double_unlock_hb(hb1, hb2);
998out_put_keys: 1012out_put_keys:
999 put_futex_key(fshared, &key2); 1013 put_futex_key(&key2);
1000out_put_key1: 1014out_put_key1:
1001 put_futex_key(fshared, &key1); 1015 put_futex_key(&key1);
1002out: 1016out:
1003 return ret; 1017 return ret;
1004} 1018}
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1133/** 1147/**
1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1148 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1135 * @uaddr1: source futex user address 1149 * @uaddr1: source futex user address
1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 1150 * @flags: futex flags (FLAGS_SHARED, etc.)
1137 * @uaddr2: target futex user address 1151 * @uaddr2: target futex user address
1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1152 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1153 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL) 1154 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1155 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1142 * pi futex (pi to pi requeue is not supported) 1156 * pi futex (pi to pi requeue is not supported)
1143 * 1157 *
1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1158 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1145 * uaddr2 atomically on behalf of the top waiter. 1159 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1148 * >=0 - on success, the number of tasks requeued or woken 1162 * >=0 - on success, the number of tasks requeued or woken
1149 * <0 - on error 1163 * <0 - on error
1150 */ 1164 */
1151static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1165static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1152 int nr_wake, int nr_requeue, u32 *cmpval, 1166 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1153 int requeue_pi) 1167 u32 *cmpval, int requeue_pi)
1154{ 1168{
1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1169 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156 int drop_count = 0, task_count = 0, ret; 1170 int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
1191 pi_state = NULL; 1205 pi_state = NULL;
1192 } 1206 }
1193 1207
1194 ret = get_futex_key(uaddr1, fshared, &key1); 1208 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1195 if (unlikely(ret != 0)) 1209 if (unlikely(ret != 0))
1196 goto out; 1210 goto out;
1197 ret = get_futex_key(uaddr2, fshared, &key2); 1211 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1198 if (unlikely(ret != 0)) 1212 if (unlikely(ret != 0))
1199 goto out_put_key1; 1213 goto out_put_key1;
1200 1214
@@ -1216,11 +1230,11 @@ retry_private:
1216 if (ret) 1230 if (ret)
1217 goto out_put_keys; 1231 goto out_put_keys;
1218 1232
1219 if (!fshared) 1233 if (!(flags & FLAGS_SHARED))
1220 goto retry_private; 1234 goto retry_private;
1221 1235
1222 put_futex_key(fshared, &key2); 1236 put_futex_key(&key2);
1223 put_futex_key(fshared, &key1); 1237 put_futex_key(&key1);
1224 goto retry; 1238 goto retry;
1225 } 1239 }
1226 if (curval != *cmpval) { 1240 if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
1260 break; 1274 break;
1261 case -EFAULT: 1275 case -EFAULT:
1262 double_unlock_hb(hb1, hb2); 1276 double_unlock_hb(hb1, hb2);
1263 put_futex_key(fshared, &key2); 1277 put_futex_key(&key2);
1264 put_futex_key(fshared, &key1); 1278 put_futex_key(&key1);
1265 ret = fault_in_user_writeable(uaddr2); 1279 ret = fault_in_user_writeable(uaddr2);
1266 if (!ret) 1280 if (!ret)
1267 goto retry; 1281 goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
1269 case -EAGAIN: 1283 case -EAGAIN:
1270 /* The owner was exiting, try again. */ 1284 /* The owner was exiting, try again. */
1271 double_unlock_hb(hb1, hb2); 1285 double_unlock_hb(hb1, hb2);
1272 put_futex_key(fshared, &key2); 1286 put_futex_key(&key2);
1273 put_futex_key(fshared, &key1); 1287 put_futex_key(&key1);
1274 cond_resched(); 1288 cond_resched();
1275 goto retry; 1289 goto retry;
1276 default: 1290 default:
@@ -1352,9 +1366,9 @@ out_unlock:
1352 drop_futex_key_refs(&key1); 1366 drop_futex_key_refs(&key1);
1353 1367
1354out_put_keys: 1368out_put_keys:
1355 put_futex_key(fshared, &key2); 1369 put_futex_key(&key2);
1356out_put_key1: 1370out_put_key1:
1357 put_futex_key(fshared, &key1); 1371 put_futex_key(&key1);
1358out: 1372out:
1359 if (pi_state != NULL) 1373 if (pi_state != NULL)
1360 free_pi_state(pi_state); 1374 free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
1494 * private futexes. 1508 * private futexes.
1495 */ 1509 */
1496static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1510static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1497 struct task_struct *newowner, int fshared) 1511 struct task_struct *newowner)
1498{ 1512{
1499 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1513 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1500 struct futex_pi_state *pi_state = q->pi_state; 1514 struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
1587 goto retry; 1601 goto retry;
1588} 1602}
1589 1603
1590/*
1591 * In case we must use restart_block to restart a futex_wait,
1592 * we encode in the 'flags' shared capability
1593 */
1594#define FLAGS_SHARED 0x01
1595#define FLAGS_CLOCKRT 0x02
1596#define FLAGS_HAS_TIMEOUT 0x04
1597
1598static long futex_wait_restart(struct restart_block *restart); 1604static long futex_wait_restart(struct restart_block *restart);
1599 1605
1600/** 1606/**
1601 * fixup_owner() - Post lock pi_state and corner case management 1607 * fixup_owner() - Post lock pi_state and corner case management
1602 * @uaddr: user address of the futex 1608 * @uaddr: user address of the futex
1603 * @fshared: whether the futex is shared (1) or not (0)
1604 * @q: futex_q (contains pi_state and access to the rt_mutex) 1609 * @q: futex_q (contains pi_state and access to the rt_mutex)
1605 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1610 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1606 * 1611 *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
1613 * 0 - success, lock not taken 1618 * 0 - success, lock not taken
1614 * <0 - on error (-EFAULT) 1619 * <0 - on error (-EFAULT)
1615 */ 1620 */
1616static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1621static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1617 int locked)
1618{ 1622{
1619 struct task_struct *owner; 1623 struct task_struct *owner;
1620 int ret = 0; 1624 int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1625 * did a lock-steal - fix up the PI-state in that case: 1629 * did a lock-steal - fix up the PI-state in that case:
1626 */ 1630 */
1627 if (q->pi_state->owner != current) 1631 if (q->pi_state->owner != current)
1628 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1632 ret = fixup_pi_state_owner(uaddr, q, current);
1629 goto out; 1633 goto out;
1630 } 1634 }
1631 1635
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1652 * lock. Fix the state up. 1656 * lock. Fix the state up.
1653 */ 1657 */
1654 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1658 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1655 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1659 ret = fixup_pi_state_owner(uaddr, q, owner);
1656 goto out; 1660 goto out;
1657 } 1661 }
1658 1662
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1715 * futex_wait_setup() - Prepare to wait on a futex 1719 * futex_wait_setup() - Prepare to wait on a futex
1716 * @uaddr: the futex userspace address 1720 * @uaddr: the futex userspace address
1717 * @val: the expected value 1721 * @val: the expected value
1718 * @fshared: whether the futex is shared (1) or not (0) 1722 * @flags: futex flags (FLAGS_SHARED, etc.)
1719 * @q: the associated futex_q 1723 * @q: the associated futex_q
1720 * @hb: storage for hash_bucket pointer to be returned to caller 1724 * @hb: storage for hash_bucket pointer to be returned to caller
1721 * 1725 *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1728 * 0 - uaddr contains val and hb has been locked 1732 * 0 - uaddr contains val and hb has been locked
1729 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1733 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1730 */ 1734 */
1731static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1735static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1732 struct futex_q *q, struct futex_hash_bucket **hb) 1736 struct futex_q *q, struct futex_hash_bucket **hb)
1733{ 1737{
1734 u32 uval; 1738 u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1752 * rare, but normal. 1756 * rare, but normal.
1753 */ 1757 */
1754retry: 1758retry:
1755 q->key = FUTEX_KEY_INIT; 1759 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1756 ret = get_futex_key(uaddr, fshared, &q->key);
1757 if (unlikely(ret != 0)) 1760 if (unlikely(ret != 0))
1758 return ret; 1761 return ret;
1759 1762
@@ -1769,10 +1772,10 @@ retry_private:
1769 if (ret) 1772 if (ret)
1770 goto out; 1773 goto out;
1771 1774
1772 if (!fshared) 1775 if (!(flags & FLAGS_SHARED))
1773 goto retry_private; 1776 goto retry_private;
1774 1777
1775 put_futex_key(fshared, &q->key); 1778 put_futex_key(&q->key);
1776 goto retry; 1779 goto retry;
1777 } 1780 }
1778 1781
@@ -1783,32 +1786,29 @@ retry_private:
1783 1786
1784out: 1787out:
1785 if (ret) 1788 if (ret)
1786 put_futex_key(fshared, &q->key); 1789 put_futex_key(&q->key);
1787 return ret; 1790 return ret;
1788} 1791}
1789 1792
1790static int futex_wait(u32 __user *uaddr, int fshared, 1793static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1791 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1794 ktime_t *abs_time, u32 bitset)
1792{ 1795{
1793 struct hrtimer_sleeper timeout, *to = NULL; 1796 struct hrtimer_sleeper timeout, *to = NULL;
1794 struct restart_block *restart; 1797 struct restart_block *restart;
1795 struct futex_hash_bucket *hb; 1798 struct futex_hash_bucket *hb;
1796 struct futex_q q; 1799 struct futex_q q = futex_q_init;
1797 int ret; 1800 int ret;
1798 1801
1799 if (!bitset) 1802 if (!bitset)
1800 return -EINVAL; 1803 return -EINVAL;
1801
1802 q.pi_state = NULL;
1803 q.bitset = bitset; 1804 q.bitset = bitset;
1804 q.rt_waiter = NULL;
1805 q.requeue_pi_key = NULL;
1806 1805
1807 if (abs_time) { 1806 if (abs_time) {
1808 to = &timeout; 1807 to = &timeout;
1809 1808
1810 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1809 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1811 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1810 CLOCK_REALTIME : CLOCK_MONOTONIC,
1811 HRTIMER_MODE_ABS);
1812 hrtimer_init_sleeper(to, current); 1812 hrtimer_init_sleeper(to, current);
1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1814 current->timer_slack_ns); 1814 current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments 1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs. 1820 * q.key refs.
1821 */ 1821 */
1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1823 if (ret) 1823 if (ret)
1824 goto out; 1824 goto out;
1825 1825
@@ -1852,12 +1852,7 @@ retry:
1852 restart->futex.val = val; 1852 restart->futex.val = val;
1853 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1854 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
1855 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1855 restart->futex.flags = flags;
1856
1857 if (fshared)
1858 restart->futex.flags |= FLAGS_SHARED;
1859 if (clockrt)
1860 restart->futex.flags |= FLAGS_CLOCKRT;
1861 1856
1862 ret = -ERESTART_RESTARTBLOCK; 1857 ret = -ERESTART_RESTARTBLOCK;
1863 1858
@@ -1873,7 +1868,6 @@ out:
1873static long futex_wait_restart(struct restart_block *restart) 1868static long futex_wait_restart(struct restart_block *restart)
1874{ 1869{
1875 u32 __user *uaddr = restart->futex.uaddr; 1870 u32 __user *uaddr = restart->futex.uaddr;
1876 int fshared = 0;
1877 ktime_t t, *tp = NULL; 1871 ktime_t t, *tp = NULL;
1878 1872
1879 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1873 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
1881 tp = &t; 1875 tp = &t;
1882 } 1876 }
1883 restart->fn = do_no_restart_syscall; 1877 restart->fn = do_no_restart_syscall;
1884 if (restart->futex.flags & FLAGS_SHARED) 1878
1885 fshared = 1; 1879 return (long)futex_wait(uaddr, restart->futex.flags,
1886 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1880 restart->futex.val, tp, restart->futex.bitset);
1887 restart->futex.bitset,
1888 restart->futex.flags & FLAGS_CLOCKRT);
1889} 1881}
1890 1882
1891 1883
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
1895 * if there are waiters then it will block, it does PI, etc. (Due to 1887 * if there are waiters then it will block, it does PI, etc. (Due to
1896 * races the kernel might see a 0 value of the futex too.) 1888 * races the kernel might see a 0 value of the futex too.)
1897 */ 1889 */
1898static int futex_lock_pi(u32 __user *uaddr, int fshared, 1890static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1899 int detect, ktime_t *time, int trylock) 1891 ktime_t *time, int trylock)
1900{ 1892{
1901 struct hrtimer_sleeper timeout, *to = NULL; 1893 struct hrtimer_sleeper timeout, *to = NULL;
1902 struct futex_hash_bucket *hb; 1894 struct futex_hash_bucket *hb;
1903 struct futex_q q; 1895 struct futex_q q = futex_q_init;
1904 int res, ret; 1896 int res, ret;
1905 1897
1906 if (refill_pi_state_cache()) 1898 if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1914 hrtimer_set_expires(&to->timer, *time); 1906 hrtimer_set_expires(&to->timer, *time);
1915 } 1907 }
1916 1908
1917 q.pi_state = NULL;
1918 q.rt_waiter = NULL;
1919 q.requeue_pi_key = NULL;
1920retry: 1909retry:
1921 q.key = FUTEX_KEY_INIT; 1910 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1922 ret = get_futex_key(uaddr, fshared, &q.key);
1923 if (unlikely(ret != 0)) 1911 if (unlikely(ret != 0))
1924 goto out; 1912 goto out;
1925 1913
@@ -1941,7 +1929,7 @@ retry_private:
1941 * exit to complete. 1929 * exit to complete.
1942 */ 1930 */
1943 queue_unlock(&q, hb); 1931 queue_unlock(&q, hb);
1944 put_futex_key(fshared, &q.key); 1932 put_futex_key(&q.key);
1945 cond_resched(); 1933 cond_resched();
1946 goto retry; 1934 goto retry;
1947 default: 1935 default:
@@ -1971,7 +1959,7 @@ retry_private:
1971 * Fixup the pi_state owner and possibly acquire the lock if we 1959 * Fixup the pi_state owner and possibly acquire the lock if we
1972 * haven't already. 1960 * haven't already.
1973 */ 1961 */
1974 res = fixup_owner(uaddr, fshared, &q, !ret); 1962 res = fixup_owner(uaddr, &q, !ret);
1975 /* 1963 /*
1976 * If fixup_owner() returned an error, proprogate that. If it acquired 1964 * If fixup_owner() returned an error, proprogate that. If it acquired
1977 * the lock, clear our -ETIMEDOUT or -EINTR. 1965 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
1995 queue_unlock(&q, hb); 1983 queue_unlock(&q, hb);
1996 1984
1997out_put_key: 1985out_put_key:
1998 put_futex_key(fshared, &q.key); 1986 put_futex_key(&q.key);
1999out: 1987out:
2000 if (to) 1988 if (to)
2001 destroy_hrtimer_on_stack(&to->timer); 1989 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
2008 if (ret) 1996 if (ret)
2009 goto out_put_key; 1997 goto out_put_key;
2010 1998
2011 if (!fshared) 1999 if (!(flags & FLAGS_SHARED))
2012 goto retry_private; 2000 goto retry_private;
2013 2001
2014 put_futex_key(fshared, &q.key); 2002 put_futex_key(&q.key);
2015 goto retry; 2003 goto retry;
2016} 2004}
2017 2005
@@ -2020,7 +2008,7 @@ uaddr_faulted:
2020 * This is the in-kernel slowpath: we look up the PI state (if any), 2008 * This is the in-kernel slowpath: we look up the PI state (if any),
2021 * and do the rt-mutex unlock. 2009 * and do the rt-mutex unlock.
2022 */ 2010 */
2023static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2011static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2024{ 2012{
2025 struct futex_hash_bucket *hb; 2013 struct futex_hash_bucket *hb;
2026 struct futex_q *this, *next; 2014 struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
2038 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2026 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2039 return -EPERM; 2027 return -EPERM;
2040 2028
2041 ret = get_futex_key(uaddr, fshared, &key); 2029 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2042 if (unlikely(ret != 0)) 2030 if (unlikely(ret != 0))
2043 goto out; 2031 goto out;
2044 2032
@@ -2093,14 +2081,14 @@ retry:
2093 2081
2094out_unlock: 2082out_unlock:
2095 spin_unlock(&hb->lock); 2083 spin_unlock(&hb->lock);
2096 put_futex_key(fshared, &key); 2084 put_futex_key(&key);
2097 2085
2098out: 2086out:
2099 return ret; 2087 return ret;
2100 2088
2101pi_faulted: 2089pi_faulted:
2102 spin_unlock(&hb->lock); 2090 spin_unlock(&hb->lock);
2103 put_futex_key(fshared, &key); 2091 put_futex_key(&key);
2104 2092
2105 ret = fault_in_user_writeable(uaddr); 2093 ret = fault_in_user_writeable(uaddr);
2106 if (!ret) 2094 if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2148/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2149 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2150 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2151 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2152 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2153 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2154 * @abs_time: absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2186 * 0 - On success
2199 * <0 - On error 2187 * <0 - On error
2200 */ 2188 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2189static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2190 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2191 u32 __user *uaddr2)
2204{ 2192{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2193 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2194 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2195 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2196 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2197 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2198 struct futex_q q = futex_q_init;
2211 int res, ret; 2199 int res, ret;
2212 2200
2213 if (!bitset) 2201 if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2203
2216 if (abs_time) { 2204 if (abs_time) {
2217 to = &timeout; 2205 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2206 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2207 CLOCK_REALTIME : CLOCK_MONOTONIC,
2208 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2209 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2210 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2211 current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2218 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2219 rt_waiter.task = NULL;
2231 2220
2232 key2 = FUTEX_KEY_INIT; 2221 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2222 if (unlikely(ret != 0))
2235 goto out; 2223 goto out;
2236 2224
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2225 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2226 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2227 q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2230 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2231 * count.
2245 */ 2232 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2233 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2234 if (ret)
2248 goto out_key2; 2235 goto out_key2;
2249 2236
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2260 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2261 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2262 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2263 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2264 spin_unlock(q.lock_ptr);
2279 } 2265 }
2280 } else { 2266 } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2279 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2280 * haven't already.
2295 */ 2281 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2282 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2283 /*
2298 * If fixup_owner() returned an error, proprogate that. If it 2284 * If fixup_owner() returned an error, proprogate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2285 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2310 }
2325 2311
2326out_put_keys: 2312out_put_keys:
2327 put_futex_key(fshared, &q.key); 2313 put_futex_key(&q.key);
2328out_key2: 2314out_key2:
2329 put_futex_key(fshared, &key2); 2315 put_futex_key(&key2);
2330 2316
2331out: 2317out:
2332 if (to) { 2318 if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
2551long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2537long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2552 u32 __user *uaddr2, u32 val2, u32 val3) 2538 u32 __user *uaddr2, u32 val2, u32 val3)
2553{ 2539{
2554 int clockrt, ret = -ENOSYS; 2540 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2555 int cmd = op & FUTEX_CMD_MASK; 2541 unsigned int flags = 0;
2556 int fshared = 0;
2557 2542
2558 if (!(op & FUTEX_PRIVATE_FLAG)) 2543 if (!(op & FUTEX_PRIVATE_FLAG))
2559 fshared = 1; 2544 flags |= FLAGS_SHARED;
2560 2545
2561 clockrt = op & FUTEX_CLOCK_REALTIME; 2546 if (op & FUTEX_CLOCK_REALTIME) {
2562 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2547 flags |= FLAGS_CLOCKRT;
2563 return -ENOSYS; 2548 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2549 return -ENOSYS;
2550 }
2564 2551
2565 switch (cmd) { 2552 switch (cmd) {
2566 case FUTEX_WAIT: 2553 case FUTEX_WAIT:
2567 val3 = FUTEX_BITSET_MATCH_ANY; 2554 val3 = FUTEX_BITSET_MATCH_ANY;
2568 case FUTEX_WAIT_BITSET: 2555 case FUTEX_WAIT_BITSET:
2569 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2556 ret = futex_wait(uaddr, flags, val, timeout, val3);
2570 break; 2557 break;
2571 case FUTEX_WAKE: 2558 case FUTEX_WAKE:
2572 val3 = FUTEX_BITSET_MATCH_ANY; 2559 val3 = FUTEX_BITSET_MATCH_ANY;
2573 case FUTEX_WAKE_BITSET: 2560 case FUTEX_WAKE_BITSET:
2574 ret = futex_wake(uaddr, fshared, val, val3); 2561 ret = futex_wake(uaddr, flags, val, val3);
2575 break; 2562 break;
2576 case FUTEX_REQUEUE: 2563 case FUTEX_REQUEUE:
2577 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2564 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2578 break; 2565 break;
2579 case FUTEX_CMP_REQUEUE: 2566 case FUTEX_CMP_REQUEUE:
2580 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2567 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2581 0);
2582 break; 2568 break;
2583 case FUTEX_WAKE_OP: 2569 case FUTEX_WAKE_OP:
2584 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2570 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2585 break; 2571 break;
2586 case FUTEX_LOCK_PI: 2572 case FUTEX_LOCK_PI:
2587 if (futex_cmpxchg_enabled) 2573 if (futex_cmpxchg_enabled)
2588 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2574 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2589 break; 2575 break;
2590 case FUTEX_UNLOCK_PI: 2576 case FUTEX_UNLOCK_PI:
2591 if (futex_cmpxchg_enabled) 2577 if (futex_cmpxchg_enabled)
2592 ret = futex_unlock_pi(uaddr, fshared); 2578 ret = futex_unlock_pi(uaddr, flags);
2593 break; 2579 break;
2594 case FUTEX_TRYLOCK_PI: 2580 case FUTEX_TRYLOCK_PI:
2595 if (futex_cmpxchg_enabled) 2581 if (futex_cmpxchg_enabled)
2596 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2582 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2597 break; 2583 break;
2598 case FUTEX_WAIT_REQUEUE_PI: 2584 case FUTEX_WAIT_REQUEUE_PI:
2599 val3 = FUTEX_BITSET_MATCH_ANY; 2585 val3 = FUTEX_BITSET_MATCH_ANY;
2600 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2586 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2601 clockrt, uaddr2); 2587 uaddr2);
2602 break; 2588 break;
2603 case FUTEX_CMP_REQUEUE_PI: 2589 case FUTEX_CMP_REQUEUE_PI:
2604 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2590 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2605 1);
2606 break; 2591 break;
2607 default: 2592 default:
2608 ret = -ENOSYS; 2593 ret = -ENOSYS;
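
The futex.c hunks above fold the old fshared/clockrt parameters into a single flags word (FLAGS_SHARED, FLAGS_CLOCKRT, FLAGS_HAS_TIMEOUT) that do_futex() derives once from the op argument and that futex_wait() can store verbatim in restart_block. A minimal standalone sketch of that derivation follows; it is illustrative only, not the kernel code, and it assumes the usual uapi bit values for FUTEX_PRIVATE_FLAG, FUTEX_CLOCK_REALTIME, and FUTEX_WAIT_BITSET from <linux/futex.h>:

/*
 * Illustrative sketch (not the kernel code): how the reworked do_futex()
 * turns the futex op word into the per-call flags used throughout futex.c.
 * FLAGS_* are the values defined at the top of futex.c in the hunk above;
 * the FUTEX_* bits are assumed to be the standard uapi values.
 */
#include <stdio.h>

#define FUTEX_PRIVATE_FLAG	128
#define FUTEX_CLOCK_REALTIME	256
#define FUTEX_WAIT_BITSET	9

#define FLAGS_SHARED		0x01
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

static unsigned int futex_op_to_flags(int op)
{
	unsigned int flags = 0;

	if (!(op & FUTEX_PRIVATE_FLAG))
		flags |= FLAGS_SHARED;	/* process-shared unless PRIVATE is set */
	if (op & FUTEX_CLOCK_REALTIME)
		flags |= FLAGS_CLOCKRT;	/* time the wait against CLOCK_REALTIME */

	return flags;
}

int main(void)
{
	/* e.g. a shared FUTEX_WAIT_BITSET with the realtime-clock bit set */
	printf("flags = %#x\n",
	       futex_op_to_flags(FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME));
	return 0;
}

This is also why futex_wait_restart() in the hunks above no longer rebuilds fshared and clockrt by hand: restart->futex.flags is passed straight back to futex_wait().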
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e5325825aeb..086adf25a55 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void)
641 641
642 constraints_initialized = 1; 642 constraints_initialized = 1;
643 643
644 perf_pmu_register(&perf_breakpoint); 644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645 645
646 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
647 647
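
The single hw_breakpoint.c change tracks the perf_event.c API rework listed in the diffstat: perf_pmu_register() now takes a human-readable name and a type id in addition to the struct pmu pointer. A sketch of a caller after the change, inferred only from the call site above (the real prototype lives in include/linux/perf_event.h; the pmu callbacks are elided here):

/*
 * Sketch only, not verbatim kernel code: registering a PMU with the
 * three-argument perf_pmu_register() implied by the hunk above.
 */
#include <linux/init.h>
#include <linux/perf_event.h>

extern struct pmu perf_breakpoint;	/* callbacks defined elsewhere */

static int __init breakpoint_pmu_init(void)
{
	/* a fixed PERF_TYPE_* id is used here, matching the call above */
	return perf_pmu_register(&perf_breakpoint, "breakpoint",
				 PERF_TYPE_BREAKPOINT);
}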
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106..7663e5df0e6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
433 struct optimized_kprobe *op, *tmp; 483 /* Optimization never be done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
442 * Wait for quiesence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiesence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: Must be called with kprobe_mutex held */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
1247 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent (and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
1471 * !disarmed could be happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If the probe being disabled has special handlers, update the aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
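Illustrative aside, not part of the patch: a minimal module-style sketch of the register/unregister API whose teardown the hunk above reworks; the handler, module names, and target symbol are placeholders. unregister_kprobe() runs the __unregister_kprobe_top() / scheduler-synchronization / __unregister_kprobe_bottom() sequence internally.

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
	{
		pr_info("kprobe hit at %p\n", p->addr);
		return 0;
	}

	static struct kprobe my_kp = {
		.symbol_name = "do_fork",	/* placeholder target symbol */
		.pre_handler = my_pre_handler,
	};

	static int __init my_kprobe_init(void)
	{
		/* May create or join an aggrprobe if other probes share the address. */
		return register_kprobe(&my_kp);
	}

	static void __exit my_kprobe_exit(void)
	{
		/* Performs the two-phase teardown shown in the hunk above. */
		unregister_kprobe(&my_kp);
	}

	module_init(my_kprobe_init);
	module_exit(my_kprobe_exit);
	MODULE_LICENSE("GPL");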
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
1957already_disabled: 2161 /* Wait for disarming all kprobes by optimizer */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2870feee81d..11847bf1e8c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -133,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
133 } 136 }
134} 137}
135 138
139static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
140{
141 /*
142 * only top level events have the pid namespace they were created in
143 */
144 if (event->parent)
145 event = event->parent;
146
147 return task_tgid_nr_ns(p, event->ns);
148}
149
150static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
151{
152 /*
153 * only top level events have the pid namespace they were created in
154 */
155 if (event->parent)
156 event = event->parent;
157
158 return task_pid_nr_ns(p, event->ns);
159}
160
136/* 161/*
137 * If we inherit events we want to return the parent event id 162 * If we inherit events we want to return the parent event id
138 * to userspace. 163 * to userspace.
@@ -312,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
312 ctx->nr_stat++; 337 ctx->nr_stat++;
313} 338}
314 339
340/*
341 * Called at perf_event creation and when events are attached/detached from a
342 * group.
343 */
344static void perf_event__read_size(struct perf_event *event)
345{
346 int entry = sizeof(u64); /* value */
347 int size = 0;
348 int nr = 1;
349
350 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
351 size += sizeof(u64);
352
353 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
354 size += sizeof(u64);
355
356 if (event->attr.read_format & PERF_FORMAT_ID)
357 entry += sizeof(u64);
358
359 if (event->attr.read_format & PERF_FORMAT_GROUP) {
360 nr += event->group_leader->nr_siblings;
361 size += sizeof(u64);
362 }
363
364 size += entry * nr;
365 event->read_size = size;
366}
367
368static void perf_event__header_size(struct perf_event *event)
369{
370 struct perf_sample_data *data;
371 u64 sample_type = event->attr.sample_type;
372 u16 size = 0;
373
374 perf_event__read_size(event);
375
376 if (sample_type & PERF_SAMPLE_IP)
377 size += sizeof(data->ip);
378
379 if (sample_type & PERF_SAMPLE_ADDR)
380 size += sizeof(data->addr);
381
382 if (sample_type & PERF_SAMPLE_PERIOD)
383 size += sizeof(data->period);
384
385 if (sample_type & PERF_SAMPLE_READ)
386 size += event->read_size;
387
388 event->header_size = size;
389}
390
391static void perf_event__id_header_size(struct perf_event *event)
392{
393 struct perf_sample_data *data;
394 u64 sample_type = event->attr.sample_type;
395 u16 size = 0;
396
397 if (sample_type & PERF_SAMPLE_TID)
398 size += sizeof(data->tid_entry);
399
400 if (sample_type & PERF_SAMPLE_TIME)
401 size += sizeof(data->time);
402
403 if (sample_type & PERF_SAMPLE_ID)
404 size += sizeof(data->id);
405
406 if (sample_type & PERF_SAMPLE_STREAM_ID)
407 size += sizeof(data->stream_id);
408
409 if (sample_type & PERF_SAMPLE_CPU)
410 size += sizeof(data->cpu_entry);
411
412 event->id_header_size = size;
413}
414
315static void perf_group_attach(struct perf_event *event) 415static void perf_group_attach(struct perf_event *event)
316{ 416{
317 struct perf_event *group_leader = event->group_leader; 417 struct perf_event *group_leader = event->group_leader, *pos;
318 418
319 /* 419 /*
320 * We can have double attach due to group movement in perf_event_open. 420 * We can have double attach due to group movement in perf_event_open.
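Worked example, not part of the patch, to make the read_size precalculation above concrete:

	/*
	 * With read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID
	 * on an event with no siblings, perf_event__read_size() computes
	 *
	 *	entry = sizeof(u64) + sizeof(u64) = 16	(value + id)
	 *	size  = sizeof(u64)               =  8	(time_enabled)
	 *	nr    = 1
	 *
	 * so event->read_size = 8 + 16 * 1 = 24 bytes; a read() on the event
	 * fd must supply at least that much buffer (see perf_read_hw() below).
	 */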
@@ -333,6 +433,11 @@ static void perf_group_attach(struct perf_event *event)
333 433
334 list_add_tail(&event->group_entry, &group_leader->sibling_list); 434 list_add_tail(&event->group_entry, &group_leader->sibling_list);
335 group_leader->nr_siblings++; 435 group_leader->nr_siblings++;
436
437 perf_event__header_size(group_leader);
438
439 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
440 perf_event__header_size(pos);
336} 441}
337 442
338/* 443/*
@@ -391,7 +496,7 @@ static void perf_group_detach(struct perf_event *event)
391 if (event->group_leader != event) { 496 if (event->group_leader != event) {
392 list_del_init(&event->group_entry); 497 list_del_init(&event->group_entry);
393 event->group_leader->nr_siblings--; 498 event->group_leader->nr_siblings--;
394 return; 499 goto out;
395 } 500 }
396 501
397 if (!list_empty(&event->group_entry)) 502 if (!list_empty(&event->group_entry))
@@ -410,6 +515,12 @@ static void perf_group_detach(struct perf_event *event)
410 /* Inherit group flags from the previous leader */ 515 /* Inherit group flags from the previous leader */
411 sibling->group_flags = event->group_flags; 516 sibling->group_flags = event->group_flags;
412 } 517 }
518
519out:
520 perf_event__header_size(event->group_leader);
521
522 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
523 perf_event__header_size(tmp);
413} 524}
414 525
415static inline int 526static inline int
@@ -1073,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1073 /* 1184 /*
1074 * not supported on inherited events 1185 * not supported on inherited events
1075 */ 1186 */
1076 if (event->attr.inherit) 1187 if (event->attr.inherit || !is_sampling_event(event))
1077 return -EINVAL; 1188 return -EINVAL;
1078 1189
1079 atomic_add(refresh, &event->event_limit); 1190 atomic_add(refresh, &event->event_limit);
@@ -2289,31 +2400,6 @@ static int perf_release(struct inode *inode, struct file *file)
2289 return perf_event_release_kernel(event); 2400 return perf_event_release_kernel(event);
2290} 2401}
2291 2402
2292static int perf_event_read_size(struct perf_event *event)
2293{
2294 int entry = sizeof(u64); /* value */
2295 int size = 0;
2296 int nr = 1;
2297
2298 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2299 size += sizeof(u64);
2300
2301 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2302 size += sizeof(u64);
2303
2304 if (event->attr.read_format & PERF_FORMAT_ID)
2305 entry += sizeof(u64);
2306
2307 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2308 nr += event->group_leader->nr_siblings;
2309 size += sizeof(u64);
2310 }
2311
2312 size += entry * nr;
2313
2314 return size;
2315}
2316
2317u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2403u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2318{ 2404{
2319 struct perf_event *child; 2405 struct perf_event *child;
@@ -2428,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2428 if (event->state == PERF_EVENT_STATE_ERROR) 2514 if (event->state == PERF_EVENT_STATE_ERROR)
2429 return 0; 2515 return 0;
2430 2516
2431 if (count < perf_event_read_size(event)) 2517 if (count < event->read_size)
2432 return -ENOSPC; 2518 return -ENOSPC;
2433 2519
2434 WARN_ON_ONCE(event->ctx->parent_ctx); 2520 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2514 int ret = 0; 2600 int ret = 0;
2515 u64 value; 2601 u64 value;
2516 2602
2517 if (!event->attr.sample_period) 2603 if (!is_sampling_event(event))
2518 return -EINVAL; 2604 return -EINVAL;
2519 2605
2520 if (copy_from_user(&value, arg, sizeof(value))) 2606 if (copy_from_user(&value, arg, sizeof(value)))
@@ -3305,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3305 } while (len); 3391 } while (len);
3306} 3392}
3307 3393
3394static void __perf_event_header__init_id(struct perf_event_header *header,
3395 struct perf_sample_data *data,
3396 struct perf_event *event)
3397{
3398 u64 sample_type = event->attr.sample_type;
3399
3400 data->type = sample_type;
3401 header->size += event->id_header_size;
3402
3403 if (sample_type & PERF_SAMPLE_TID) {
3404 /* namespace issues */
3405 data->tid_entry.pid = perf_event_pid(event, current);
3406 data->tid_entry.tid = perf_event_tid(event, current);
3407 }
3408
3409 if (sample_type & PERF_SAMPLE_TIME)
3410 data->time = perf_clock();
3411
3412 if (sample_type & PERF_SAMPLE_ID)
3413 data->id = primary_event_id(event);
3414
3415 if (sample_type & PERF_SAMPLE_STREAM_ID)
3416 data->stream_id = event->id;
3417
3418 if (sample_type & PERF_SAMPLE_CPU) {
3419 data->cpu_entry.cpu = raw_smp_processor_id();
3420 data->cpu_entry.reserved = 0;
3421 }
3422}
3423
3424static void perf_event_header__init_id(struct perf_event_header *header,
3425 struct perf_sample_data *data,
3426 struct perf_event *event)
3427{
3428 if (event->attr.sample_id_all)
3429 __perf_event_header__init_id(header, data, event);
3430}
3431
3432static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3433 struct perf_sample_data *data)
3434{
3435 u64 sample_type = data->type;
3436
3437 if (sample_type & PERF_SAMPLE_TID)
3438 perf_output_put(handle, data->tid_entry);
3439
3440 if (sample_type & PERF_SAMPLE_TIME)
3441 perf_output_put(handle, data->time);
3442
3443 if (sample_type & PERF_SAMPLE_ID)
3444 perf_output_put(handle, data->id);
3445
3446 if (sample_type & PERF_SAMPLE_STREAM_ID)
3447 perf_output_put(handle, data->stream_id);
3448
3449 if (sample_type & PERF_SAMPLE_CPU)
3450 perf_output_put(handle, data->cpu_entry);
3451}
3452
3453static void perf_event__output_id_sample(struct perf_event *event,
3454 struct perf_output_handle *handle,
3455 struct perf_sample_data *sample)
3456{
3457 if (event->attr.sample_id_all)
3458 __perf_event__output_id_sample(handle, sample);
3459}
3460
3308int perf_output_begin(struct perf_output_handle *handle, 3461int perf_output_begin(struct perf_output_handle *handle,
3309 struct perf_event *event, unsigned int size, 3462 struct perf_event *event, unsigned int size,
3310 int nmi, int sample) 3463 int nmi, int sample)
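Illustrative layout note, not part of the patch, showing what the precalculated id_header_size corresponds to on the ring buffer:

	/*
	 * With attr.sample_id_all set and sample_type = PERF_SAMPLE_TID |
	 * PERF_SAMPLE_TIME | PERF_SAMPLE_CPU, every non-sample record
	 * (MMAP, COMM, LOST, ...) is followed by
	 *
	 *	{ u32 pid, tid; }	 8 bytes
	 *	{ u64 time;     }	 8 bytes
	 *	{ u32 cpu, res; }	 8 bytes
	 *
	 * i.e. the 24 bytes that perf_event__id_header_size() stores in
	 * event->id_header_size and that __perf_event__output_id_sample()
	 * appends above.
	 */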
@@ -3312,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3312 struct perf_buffer *buffer; 3465 struct perf_buffer *buffer;
3313 unsigned long tail, offset, head; 3466 unsigned long tail, offset, head;
3314 int have_lost; 3467 int have_lost;
3468 struct perf_sample_data sample_data;
3315 struct { 3469 struct {
3316 struct perf_event_header header; 3470 struct perf_event_header header;
3317 u64 id; 3471 u64 id;
@@ -3338,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3338 goto out; 3492 goto out;
3339 3493
3340 have_lost = local_read(&buffer->lost); 3494 have_lost = local_read(&buffer->lost);
3341 if (have_lost) 3495 if (have_lost) {
3342 size += sizeof(lost_event); 3496 lost_event.header.size = sizeof(lost_event);
3497 perf_event_header__init_id(&lost_event.header, &sample_data,
3498 event);
3499 size += lost_event.header.size;
3500 }
3343 3501
3344 perf_output_get_handle(handle); 3502 perf_output_get_handle(handle);
3345 3503
@@ -3370,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3370 if (have_lost) { 3528 if (have_lost) {
3371 lost_event.header.type = PERF_RECORD_LOST; 3529 lost_event.header.type = PERF_RECORD_LOST;
3372 lost_event.header.misc = 0; 3530 lost_event.header.misc = 0;
3373 lost_event.header.size = sizeof(lost_event);
3374 lost_event.id = event->id; 3531 lost_event.id = event->id;
3375 lost_event.lost = local_xchg(&buffer->lost, 0); 3532 lost_event.lost = local_xchg(&buffer->lost, 0);
3376 3533
3377 perf_output_put(handle, lost_event); 3534 perf_output_put(handle, lost_event);
3535 perf_event__output_id_sample(event, handle, &sample_data);
3378 } 3536 }
3379 3537
3380 return 0; 3538 return 0;
@@ -3407,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle)
3407 rcu_read_unlock(); 3565 rcu_read_unlock();
3408} 3566}
3409 3567
3410static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3411{
3412 /*
3413 * only top level events have the pid namespace they were created in
3414 */
3415 if (event->parent)
3416 event = event->parent;
3417
3418 return task_tgid_nr_ns(p, event->ns);
3419}
3420
3421static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3422{
3423 /*
3424 * only top level events have the pid namespace they were created in
3425 */
3426 if (event->parent)
3427 event = event->parent;
3428
3429 return task_pid_nr_ns(p, event->ns);
3430}
3431
3432static void perf_output_read_one(struct perf_output_handle *handle, 3568static void perf_output_read_one(struct perf_output_handle *handle,
3433 struct perf_event *event, 3569 struct perf_event *event,
3434 u64 enabled, u64 running) 3570 u64 enabled, u64 running)
@@ -3603,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3603{ 3739{
3604 u64 sample_type = event->attr.sample_type; 3740 u64 sample_type = event->attr.sample_type;
3605 3741
3606 data->type = sample_type;
3607
3608 header->type = PERF_RECORD_SAMPLE; 3742 header->type = PERF_RECORD_SAMPLE;
3609 header->size = sizeof(*header); 3743 header->size = sizeof(*header) + event->header_size;
3610 3744
3611 header->misc = 0; 3745 header->misc = 0;
3612 header->misc |= perf_misc_flags(regs); 3746 header->misc |= perf_misc_flags(regs);
3613 3747
3614 if (sample_type & PERF_SAMPLE_IP) { 3748 __perf_event_header__init_id(header, data, event);
3615 data->ip = perf_instruction_pointer(regs);
3616
3617 header->size += sizeof(data->ip);
3618 }
3619
3620 if (sample_type & PERF_SAMPLE_TID) {
3621 /* namespace issues */
3622 data->tid_entry.pid = perf_event_pid(event, current);
3623 data->tid_entry.tid = perf_event_tid(event, current);
3624
3625 header->size += sizeof(data->tid_entry);
3626 }
3627
3628 if (sample_type & PERF_SAMPLE_TIME) {
3629 data->time = perf_clock();
3630
3631 header->size += sizeof(data->time);
3632 }
3633
3634 if (sample_type & PERF_SAMPLE_ADDR)
3635 header->size += sizeof(data->addr);
3636
3637 if (sample_type & PERF_SAMPLE_ID) {
3638 data->id = primary_event_id(event);
3639
3640 header->size += sizeof(data->id);
3641 }
3642
3643 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3644 data->stream_id = event->id;
3645
3646 header->size += sizeof(data->stream_id);
3647 }
3648
3649 if (sample_type & PERF_SAMPLE_CPU) {
3650 data->cpu_entry.cpu = raw_smp_processor_id();
3651 data->cpu_entry.reserved = 0;
3652
3653 header->size += sizeof(data->cpu_entry);
3654 }
3655
3656 if (sample_type & PERF_SAMPLE_PERIOD)
3657 header->size += sizeof(data->period);
3658 3749
3659 if (sample_type & PERF_SAMPLE_READ) 3750 if (sample_type & PERF_SAMPLE_IP)
3660 header->size += perf_event_read_size(event); 3751 data->ip = perf_instruction_pointer(regs);
3661 3752
3662 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3753 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3663 int size = 1; 3754 int size = 1;
@@ -3722,23 +3813,26 @@ perf_event_read_event(struct perf_event *event,
3722 struct task_struct *task) 3813 struct task_struct *task)
3723{ 3814{
3724 struct perf_output_handle handle; 3815 struct perf_output_handle handle;
3816 struct perf_sample_data sample;
3725 struct perf_read_event read_event = { 3817 struct perf_read_event read_event = {
3726 .header = { 3818 .header = {
3727 .type = PERF_RECORD_READ, 3819 .type = PERF_RECORD_READ,
3728 .misc = 0, 3820 .misc = 0,
3729 .size = sizeof(read_event) + perf_event_read_size(event), 3821 .size = sizeof(read_event) + event->read_size,
3730 }, 3822 },
3731 .pid = perf_event_pid(event, task), 3823 .pid = perf_event_pid(event, task),
3732 .tid = perf_event_tid(event, task), 3824 .tid = perf_event_tid(event, task),
3733 }; 3825 };
3734 int ret; 3826 int ret;
3735 3827
3828 perf_event_header__init_id(&read_event.header, &sample, event);
3736 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3829 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3737 if (ret) 3830 if (ret)
3738 return; 3831 return;
3739 3832
3740 perf_output_put(&handle, read_event); 3833 perf_output_put(&handle, read_event);
3741 perf_output_read(&handle, event); 3834 perf_output_read(&handle, event);
3835 perf_event__output_id_sample(event, &handle, &sample);
3742 3836
3743 perf_output_end(&handle); 3837 perf_output_end(&handle);
3744} 3838}
@@ -3768,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event,
3768 struct perf_task_event *task_event) 3862 struct perf_task_event *task_event)
3769{ 3863{
3770 struct perf_output_handle handle; 3864 struct perf_output_handle handle;
3865 struct perf_sample_data sample;
3771 struct task_struct *task = task_event->task; 3866 struct task_struct *task = task_event->task;
3772 int size, ret; 3867 int ret, size = task_event->event_id.header.size;
3773 3868
3774 size = task_event->event_id.header.size; 3869 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3775 ret = perf_output_begin(&handle, event, size, 0, 0);
3776 3870
3871 ret = perf_output_begin(&handle, event,
3872 task_event->event_id.header.size, 0, 0);
3777 if (ret) 3873 if (ret)
3778 return; 3874 goto out;
3779 3875
3780 task_event->event_id.pid = perf_event_pid(event, task); 3876 task_event->event_id.pid = perf_event_pid(event, task);
3781 task_event->event_id.ppid = perf_event_pid(event, current); 3877 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3785,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event,
3785 3881
3786 perf_output_put(&handle, task_event->event_id); 3882 perf_output_put(&handle, task_event->event_id);
3787 3883
3884 perf_event__output_id_sample(event, &handle, &sample);
3885
3788 perf_output_end(&handle); 3886 perf_output_end(&handle);
3887out:
3888 task_event->event_id.header.size = size;
3789} 3889}
3790 3890
3791static int perf_event_task_match(struct perf_event *event) 3891static int perf_event_task_match(struct perf_event *event)
@@ -3900,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event,
3900 struct perf_comm_event *comm_event) 4000 struct perf_comm_event *comm_event)
3901{ 4001{
3902 struct perf_output_handle handle; 4002 struct perf_output_handle handle;
4003 struct perf_sample_data sample;
3903 int size = comm_event->event_id.header.size; 4004 int size = comm_event->event_id.header.size;
3904 int ret = perf_output_begin(&handle, event, size, 0, 0); 4005 int ret;
4006
4007 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4008 ret = perf_output_begin(&handle, event,
4009 comm_event->event_id.header.size, 0, 0);
3905 4010
3906 if (ret) 4011 if (ret)
3907 return; 4012 goto out;
3908 4013
3909 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4014 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3910 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4015 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3912,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event,
3912 perf_output_put(&handle, comm_event->event_id); 4017 perf_output_put(&handle, comm_event->event_id);
3913 perf_output_copy(&handle, comm_event->comm, 4018 perf_output_copy(&handle, comm_event->comm,
3914 comm_event->comm_size); 4019 comm_event->comm_size);
4020
4021 perf_event__output_id_sample(event, &handle, &sample);
4022
3915 perf_output_end(&handle); 4023 perf_output_end(&handle);
4024out:
4025 comm_event->event_id.header.size = size;
3916} 4026}
3917 4027
3918static int perf_event_comm_match(struct perf_event *event) 4028static int perf_event_comm_match(struct perf_event *event)
@@ -3957,7 +4067,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3957 comm_event->comm_size = size; 4067 comm_event->comm_size = size;
3958 4068
3959 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4069 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3960
3961 rcu_read_lock(); 4070 rcu_read_lock();
3962 list_for_each_entry_rcu(pmu, &pmus, entry) { 4071 list_for_each_entry_rcu(pmu, &pmus, entry) {
3963 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4072 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
@@ -4038,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event,
4038 struct perf_mmap_event *mmap_event) 4147 struct perf_mmap_event *mmap_event)
4039{ 4148{
4040 struct perf_output_handle handle; 4149 struct perf_output_handle handle;
4150 struct perf_sample_data sample;
4041 int size = mmap_event->event_id.header.size; 4151 int size = mmap_event->event_id.header.size;
4042 int ret = perf_output_begin(&handle, event, size, 0, 0); 4152 int ret;
4043 4153
4154 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4155 ret = perf_output_begin(&handle, event,
4156 mmap_event->event_id.header.size, 0, 0);
4044 if (ret) 4157 if (ret)
4045 return; 4158 goto out;
4046 4159
4047 mmap_event->event_id.pid = perf_event_pid(event, current); 4160 mmap_event->event_id.pid = perf_event_pid(event, current);
4048 mmap_event->event_id.tid = perf_event_tid(event, current); 4161 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4050,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event,
4050 perf_output_put(&handle, mmap_event->event_id); 4163 perf_output_put(&handle, mmap_event->event_id);
4051 perf_output_copy(&handle, mmap_event->file_name, 4164 perf_output_copy(&handle, mmap_event->file_name,
4052 mmap_event->file_size); 4165 mmap_event->file_size);
4166
4167 perf_event__output_id_sample(event, &handle, &sample);
4168
4053 perf_output_end(&handle); 4169 perf_output_end(&handle);
4170out:
4171 mmap_event->event_id.header.size = size;
4054} 4172}
4055 4173
4056static int perf_event_mmap_match(struct perf_event *event, 4174static int perf_event_mmap_match(struct perf_event *event,
@@ -4205,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4205static void perf_log_throttle(struct perf_event *event, int enable) 4323static void perf_log_throttle(struct perf_event *event, int enable)
4206{ 4324{
4207 struct perf_output_handle handle; 4325 struct perf_output_handle handle;
4326 struct perf_sample_data sample;
4208 int ret; 4327 int ret;
4209 4328
4210 struct { 4329 struct {
@@ -4226,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4226 if (enable) 4345 if (enable)
4227 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4346 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4228 4347
4229 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4348 perf_event_header__init_id(&throttle_event.header, &sample, event);
4349
4350 ret = perf_output_begin(&handle, event,
4351 throttle_event.header.size, 1, 0);
4230 if (ret) 4352 if (ret)
4231 return; 4353 return;
4232 4354
4233 perf_output_put(&handle, throttle_event); 4355 perf_output_put(&handle, throttle_event);
4356 perf_event__output_id_sample(event, &handle, &sample);
4234 perf_output_end(&handle); 4357 perf_output_end(&handle);
4235} 4358}
4236 4359
@@ -4246,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4246 struct hw_perf_event *hwc = &event->hw; 4369 struct hw_perf_event *hwc = &event->hw;
4247 int ret = 0; 4370 int ret = 0;
4248 4371
4372 /*
4373 * Non-sampling counters might still use the PMI to fold short
4374 * hardware counters, ignore those.
4375 */
4376 if (unlikely(!is_sampling_event(event)))
4377 return 0;
4378
4249 if (!throttle) { 4379 if (!throttle) {
4250 hwc->interrupts++; 4380 hwc->interrupts++;
4251 } else { 4381 } else {
@@ -4391,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4391 if (!regs) 4521 if (!regs)
4392 return; 4522 return;
4393 4523
4394 if (!hwc->sample_period) 4524 if (!is_sampling_event(event))
4395 return; 4525 return;
4396 4526
4397 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4527 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4554,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4554 struct hw_perf_event *hwc = &event->hw; 4684 struct hw_perf_event *hwc = &event->hw;
4555 struct hlist_head *head; 4685 struct hlist_head *head;
4556 4686
4557 if (hwc->sample_period) { 4687 if (is_sampling_event(event)) {
4558 hwc->last_period = hwc->sample_period; 4688 hwc->last_period = hwc->sample_period;
4559 perf_swevent_set_period(event); 4689 perf_swevent_set_period(event);
4560 } 4690 }
@@ -4811,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event)
4811 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4941 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4812 return -ENOENT; 4942 return -ENOENT;
4813 4943
4814 /*
4815 * Raw tracepoint data is a severe data leak, only allow root to
4816 * have these.
4817 */
4818 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4819 perf_paranoid_tracepoint_raw() &&
4820 !capable(CAP_SYS_ADMIN))
4821 return -EPERM;
4822
4823 err = perf_trace_init(event); 4944 err = perf_trace_init(event);
4824 if (err) 4945 if (err)
4825 return err; 4946 return err;
@@ -4842,7 +4963,7 @@ static struct pmu perf_tracepoint = {
4842 4963
4843static inline void perf_tp_register(void) 4964static inline void perf_tp_register(void)
4844{ 4965{
4845 perf_pmu_register(&perf_tracepoint); 4966 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4846} 4967}
4847 4968
4848static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4969static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4932,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4932static void perf_swevent_start_hrtimer(struct perf_event *event) 5053static void perf_swevent_start_hrtimer(struct perf_event *event)
4933{ 5054{
4934 struct hw_perf_event *hwc = &event->hw; 5055 struct hw_perf_event *hwc = &event->hw;
5056 s64 period;
5057
5058 if (!is_sampling_event(event))
5059 return;
4935 5060
4936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5061 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4937 hwc->hrtimer.function = perf_swevent_hrtimer; 5062 hwc->hrtimer.function = perf_swevent_hrtimer;
4938 if (hwc->sample_period) {
4939 s64 period = local64_read(&hwc->period_left);
4940 5063
4941 if (period) { 5064 period = local64_read(&hwc->period_left);
4942 if (period < 0) 5065 if (period) {
4943 period = 10000; 5066 if (period < 0)
5067 period = 10000;
4944 5068
4945 local64_set(&hwc->period_left, 0); 5069 local64_set(&hwc->period_left, 0);
4946 } else { 5070 } else {
4947 period = max_t(u64, 10000, hwc->sample_period); 5071 period = max_t(u64, 10000, hwc->sample_period);
4948 } 5072 }
4949 __hrtimer_start_range_ns(&hwc->hrtimer, 5073 __hrtimer_start_range_ns(&hwc->hrtimer,
4950 ns_to_ktime(period), 0, 5074 ns_to_ktime(period), 0,
4951 HRTIMER_MODE_REL_PINNED, 0); 5075 HRTIMER_MODE_REL_PINNED, 0);
4952 }
4953} 5076}
4954 5077
4955static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5078static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4956{ 5079{
4957 struct hw_perf_event *hwc = &event->hw; 5080 struct hw_perf_event *hwc = &event->hw;
4958 5081
4959 if (hwc->sample_period) { 5082 if (is_sampling_event(event)) {
4960 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5083 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4961 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5084 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4962 5085
@@ -5184,8 +5307,61 @@ static void free_pmu_context(struct pmu *pmu)
5184out: 5307out:
5185 mutex_unlock(&pmus_lock); 5308 mutex_unlock(&pmus_lock);
5186} 5309}
5310static struct idr pmu_idr;
5311
5312static ssize_t
5313type_show(struct device *dev, struct device_attribute *attr, char *page)
5314{
5315 struct pmu *pmu = dev_get_drvdata(dev);
5316
5317 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5318}
5319
5320static struct device_attribute pmu_dev_attrs[] = {
5321 __ATTR_RO(type),
5322 __ATTR_NULL,
5323};
5324
5325static int pmu_bus_running;
5326static struct bus_type pmu_bus = {
5327 .name = "event_source",
5328 .dev_attrs = pmu_dev_attrs,
5329};
5330
5331static void pmu_dev_release(struct device *dev)
5332{
5333 kfree(dev);
5334}
5335
5336static int pmu_dev_alloc(struct pmu *pmu)
5337{
5338 int ret = -ENOMEM;
5339
5340 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5341 if (!pmu->dev)
5342 goto out;
5343
5344 device_initialize(pmu->dev);
5345 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5346 if (ret)
5347 goto free_dev;
5348
5349 dev_set_drvdata(pmu->dev, pmu);
5350 pmu->dev->bus = &pmu_bus;
5351 pmu->dev->release = pmu_dev_release;
5352 ret = device_add(pmu->dev);
5353 if (ret)
5354 goto free_dev;
5355
5356out:
5357 return ret;
5358
5359free_dev:
5360 put_device(pmu->dev);
5361 goto out;
5362}
5187 5363
5188int perf_pmu_register(struct pmu *pmu) 5364int perf_pmu_register(struct pmu *pmu, char *name, int type)
5189{ 5365{
5190 int cpu, ret; 5366 int cpu, ret;
5191 5367
@@ -5195,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu)
5195 if (!pmu->pmu_disable_count) 5371 if (!pmu->pmu_disable_count)
5196 goto unlock; 5372 goto unlock;
5197 5373
5374 pmu->type = -1;
5375 if (!name)
5376 goto skip_type;
5377 pmu->name = name;
5378
5379 if (type < 0) {
5380 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5381 if (!err)
5382 goto free_pdc;
5383
5384 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5385 if (err) {
5386 ret = err;
5387 goto free_pdc;
5388 }
5389 }
5390 pmu->type = type;
5391
5392 if (pmu_bus_running) {
5393 ret = pmu_dev_alloc(pmu);
5394 if (ret)
5395 goto free_idr;
5396 }
5397
5398skip_type:
5198 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5399 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5199 if (pmu->pmu_cpu_context) 5400 if (pmu->pmu_cpu_context)
5200 goto got_cpu_context; 5401 goto got_cpu_context;
5201 5402
5202 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5403 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5203 if (!pmu->pmu_cpu_context) 5404 if (!pmu->pmu_cpu_context)
5204 goto free_pdc; 5405 goto free_dev;
5205 5406
5206 for_each_possible_cpu(cpu) { 5407 for_each_possible_cpu(cpu) {
5207 struct perf_cpu_context *cpuctx; 5408 struct perf_cpu_context *cpuctx;
@@ -5245,6 +5446,14 @@ unlock:
5245 5446
5246 return ret; 5447 return ret;
5247 5448
5449free_dev:
5450 device_del(pmu->dev);
5451 put_device(pmu->dev);
5452
5453free_idr:
5454 if (pmu->type >= PERF_TYPE_MAX)
5455 idr_remove(&pmu_idr, pmu->type);
5456
5248free_pdc: 5457free_pdc:
5249 free_percpu(pmu->pmu_disable_count); 5458 free_percpu(pmu->pmu_disable_count);
5250 goto unlock; 5459 goto unlock;
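Illustrative aside, not part of the patch: a sketch of how a driver would use the new registration signature; the my_* names and stub callbacks are placeholders. Passing -1 asks perf_pmu_register() to allocate a dynamic type from pmu_idr, and the name becomes the event_source device created by pmu_dev_alloc() above.

	#include <linux/module.h>
	#include <linux/perf_event.h>

	static int my_event_init(struct perf_event *event)
	{
		return 0;	/* a real driver would validate event->attr here */
	}

	static int  my_add(struct perf_event *event, int flags)   { return 0; }
	static void my_del(struct perf_event *event, int flags)   { }
	static void my_start(struct perf_event *event, int flags) { }
	static void my_stop(struct perf_event *event, int flags)  { }
	static void my_read(struct perf_event *event)              { }

	static struct pmu my_pmu = {
		.task_ctx_nr	= perf_invalid_context,
		.event_init	= my_event_init,
		.add		= my_add,
		.del		= my_del,
		.start		= my_start,
		.stop		= my_stop,
		.read		= my_read,
	};

	static int __init my_pmu_module_init(void)
	{
		/* -1: dynamic type from pmu_idr; "my_pmu" names the sysfs device. */
		return perf_pmu_register(&my_pmu, "my_pmu", -1);
	}
	module_init(my_pmu_module_init);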
@@ -5264,6 +5473,10 @@ void perf_pmu_unregister(struct pmu *pmu)
5264 synchronize_rcu(); 5473 synchronize_rcu();
5265 5474
5266 free_percpu(pmu->pmu_disable_count); 5475 free_percpu(pmu->pmu_disable_count);
5476 if (pmu->type >= PERF_TYPE_MAX)
5477 idr_remove(&pmu_idr, pmu->type);
5478 device_del(pmu->dev);
5479 put_device(pmu->dev);
5267 free_pmu_context(pmu); 5480 free_pmu_context(pmu);
5268} 5481}
5269 5482
@@ -5273,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5273 int idx; 5486 int idx;
5274 5487
5275 idx = srcu_read_lock(&pmus_srcu); 5488 idx = srcu_read_lock(&pmus_srcu);
5489
5490 rcu_read_lock();
5491 pmu = idr_find(&pmu_idr, event->attr.type);
5492 rcu_read_unlock();
5493 if (pmu)
5494 goto unlock;
5495
5276 list_for_each_entry_rcu(pmu, &pmus, entry) { 5496 list_for_each_entry_rcu(pmu, &pmus, entry) {
5277 int ret = pmu->event_init(event); 5497 int ret = pmu->event_init(event);
5278 if (!ret) 5498 if (!ret)
@@ -5738,6 +5958,12 @@ SYSCALL_DEFINE5(perf_event_open,
5738 mutex_unlock(&current->perf_event_mutex); 5958 mutex_unlock(&current->perf_event_mutex);
5739 5959
5740 /* 5960 /*
5961 * Precalculate sample_data sizes
5962 */
5963 perf_event__header_size(event);
5964 perf_event__id_header_size(event);
5965
5966 /*
5741 * Drop the reference on the group_event after placing the 5967 * Drop the reference on the group_event after placing the
5742 * new event on the sibling_list. This ensures destruction 5968 * new event on the sibling_list. This ensures destruction
5743 * of the group leader will find the pointer to itself in 5969 * of the group leader will find the pointer to itself in
@@ -6090,6 +6316,12 @@ inherit_event(struct perf_event *parent_event,
6090 child_event->overflow_handler = parent_event->overflow_handler; 6316 child_event->overflow_handler = parent_event->overflow_handler;
6091 6317
6092 /* 6318 /*
6319 * Precalculate sample_data sizes
6320 */
6321 perf_event__header_size(child_event);
6322 perf_event__id_header_size(child_event);
6323
6324 /*
6093 * Link it up in the child's context: 6325 * Link it up in the child's context:
6094 */ 6326 */
6095 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6327 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6320,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6320 mutex_unlock(&swhash->hlist_mutex); 6552 mutex_unlock(&swhash->hlist_mutex);
6321} 6553}
6322 6554
6323#ifdef CONFIG_HOTPLUG_CPU 6555#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6324static void perf_pmu_rotate_stop(struct pmu *pmu) 6556static void perf_pmu_rotate_stop(struct pmu *pmu)
6325{ 6557{
6326 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6558 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6374,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu)
6374static inline void perf_event_exit_cpu(int cpu) { } 6606static inline void perf_event_exit_cpu(int cpu) { }
6375#endif 6607#endif
6376 6608
6609static int
6610perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6611{
6612 int cpu;
6613
6614 for_each_online_cpu(cpu)
6615 perf_event_exit_cpu(cpu);
6616
6617 return NOTIFY_OK;
6618}
6619
6620/*
6621 * Run the perf reboot notifier at the very last possible moment so that
6622 * the generic watchdog code runs as long as possible.
6623 */
6624static struct notifier_block perf_reboot_notifier = {
6625 .notifier_call = perf_reboot,
6626 .priority = INT_MIN,
6627};
6628
6377static int __cpuinit 6629static int __cpuinit
6378perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6630perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6379{ 6631{
@@ -6402,14 +6654,45 @@ void __init perf_event_init(void)
6402{ 6654{
6403 int ret; 6655 int ret;
6404 6656
6657 idr_init(&pmu_idr);
6658
6405 perf_event_init_all_cpus(); 6659 perf_event_init_all_cpus();
6406 init_srcu_struct(&pmus_srcu); 6660 init_srcu_struct(&pmus_srcu);
6407 perf_pmu_register(&perf_swevent); 6661 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6408 perf_pmu_register(&perf_cpu_clock); 6662 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6409 perf_pmu_register(&perf_task_clock); 6663 perf_pmu_register(&perf_task_clock, NULL, -1);
6410 perf_tp_register(); 6664 perf_tp_register();
6411 perf_cpu_notifier(perf_cpu_notify); 6665 perf_cpu_notifier(perf_cpu_notify);
6666 register_reboot_notifier(&perf_reboot_notifier);
6412 6667
6413 ret = init_hw_breakpoint(); 6668 ret = init_hw_breakpoint();
6414 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6669 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6415} 6670}
6671
6672static int __init perf_event_sysfs_init(void)
6673{
6674 struct pmu *pmu;
6675 int ret;
6676
6677 mutex_lock(&pmus_lock);
6678
6679 ret = bus_register(&pmu_bus);
6680 if (ret)
6681 goto unlock;
6682
6683 list_for_each_entry(pmu, &pmus, entry) {
6684 if (!pmu->name || pmu->type < 0)
6685 continue;
6686
6687 ret = pmu_dev_alloc(pmu);
6688 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6689 }
6690 pmu_bus_running = 1;
6691 ret = 0;
6692
6693unlock:
6694 mutex_unlock(&pmus_lock);
6695
6696 return ret;
6697}
6698device_initcall(perf_event_sysfs_init);
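Illustrative userspace-side sketch, not part of the patch: the event_source bus registered above lets tools resolve a dynamically assigned PMU type from sysfs before calling perf_event_open(); the helper below assumes that layout.

	#include <stdio.h>

	/* Returns the PMU's perf_event_attr.type, or -1 if it cannot be read. */
	static int read_pmu_type(const char *name)
	{
		char path[128];
		int type = -1;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/bus/event_source/devices/%s/type", name);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (fscanf(f, "%d", &type) != 1)
			type = -1;
		fclose(f);
		return type;
	}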
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ecf770509d0..031d5e3a619 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <trace/events/power.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state)
201 if (!suspend_ops) 202 if (!suspend_ops)
202 return -ENOSYS; 203 return -ENOSYS;
203 204
205 trace_machine_suspend(state);
204 if (suspend_ops->begin) { 206 if (suspend_ops->begin) {
205 error = suspend_ops->begin(state); 207 error = suspend_ops->begin(state);
206 if (error) 208 if (error)
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state)
229 Close: 231 Close:
230 if (suspend_ops->end) 232 if (suspend_ops->end)
231 suspend_ops->end(); 233 suspend_ops->end();
234 trace_machine_suspend(PWR_EVENT_EXIT);
232 return error; 235 return error;
233 236
234 Recover_platform: 237 Recover_platform:
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342a..03449372474 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
155 * Helper function for rcu_process_callbacks() that operates on the 140 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
156 * specified rcu_ctrlblk structure. 141 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
193 morework = rcu_boost();
194 local_irq_save(flags);
195 work = have_rcu_kthread_work;
196 have_rcu_kthread_work = morework;
197 local_irq_restore(flags);
198 if (work) {
199 rcu_process_callbacks(&rcu_sched_ctrlblk);
200 rcu_process_callbacks(&rcu_bh_ctrlblk);
201 rcu_preempt_process_callbacks();
202 }
203 schedule_timeout_interruptible(1); /* Leave CPU for others. */
204 }
205
206 return 0; /* Not reached, but needed to shut gcc up. */
207}
208
209/*
210 * Wake up rcu_kthread() to process callbacks now eligible for invocation
211 * or to boost readers.
212 */
213static void invoke_rcu_kthread(void)
214{
215 unsigned long flags;
216
217 local_irq_save(flags);
218 have_rcu_kthread_work = 1;
219 wake_up(&rcu_kthread_wq);
220 local_irq_restore(flags);
196} 221}
197 222
198/* 223/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 255 local_irq_save(flags);
231 *rcp->curtail = head; 256 *rcp->curtail = head;
232 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
258 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 259 local_irq_restore(flags);
234} 260}
235 261
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
282} 308}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 309EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310
285void __init rcu_init(void) 311/*
312 * Spawn the kthread that invokes RCU callbacks.
313 */
314static int __init rcu_spawn_kthreads(void)
286{ 315{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 316 struct sched_param sp;
317
318 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
319 sp.sched_priority = RCU_BOOST_PRIO;
320 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
321 return 0;
288} 322}
323early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745f..015abaea962 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
188 * NULL if the end of the list has been reached.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
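
The tasks=%c%c%c field in show_tiny_preempt_stats() relies on a compact C idiom: a two-character string literal indexed by a boolean, so a false condition picks the flag letter and a true condition picks '.'. A minimal stand-alone illustration of the idiom (the variable names and values below are invented for the example):

#include <stdio.h>

int main(void)
{
	int blkd_list_empty = 0;	/* pretend ->blkd_tasks is non-empty */
	int no_gp_tasks = 1;		/* pretend ->gp_tasks is NULL        */

	/* "T."[0] == 'T' and "T."[1] == '.', so the index selects the flag. */
	printf("tasks=%c%c\n", "T."[blkd_list_empty], "N."[no_gp_tasks]);
	return 0;			/* prints "tasks=T." */
}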
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
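
rcu_boost() gets its effect from rt_mutex priority inheritance: the mutex is initialized as if already held by the preempted reader (rt_mutex_init_proxy_locked()), so when the boosting kthread blocks in rt_mutex_lock() the reader inherits the kthread's priority until it releases the mutex from rcu_read_unlock_special(). As a rough user-space analogy only (not the in-kernel mechanism), a POSIX priority-inheritance mutex produces the same boosting effect; the sketch below omits the SCHED_FIFO setup a real demonstration would need, and every name in it is illustrative:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m;

static void *reader(void *arg)
{
	pthread_mutex_lock(&m);		/* stands in for the preempted RCU reader */
	/*
	 * Long critical section: while a higher-priority thread is blocked
	 * on m, PTHREAD_PRIO_INHERIT raises this thread's priority.
	 */
	pthread_mutex_unlock(&m);	/* analogous to the unboost at rcu_read_unlock() */
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_t t;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&m, &attr);

	pthread_create(&t, NULL, reader, NULL);
	pthread_mutex_lock(&m);		/* the "booster": blocking here boosts the owner */
	pthread_mutex_unlock(&m);
	pthread_join(t, NULL);
	return 0;
}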
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
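
RCU_BOOST_DELAY_JIFFIES simply converts the configured boost delay from milliseconds into jiffies, rounding up. A quick stand-alone check of the arithmetic, assuming HZ=250 and CONFIG_RCU_BOOST_DELAY=500 (both values are only examples):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define HZ			250	/* assumed tick rate              */
#define RCU_BOOST_DELAY_MS	500	/* assumed CONFIG_RCU_BOOST_DELAY */

int main(void)
{
	/* 500 ms * 250 Hz / 1000 = 125 jiffies before boosting starts. */
	printf("%d jiffies\n", DIV_ROUND_UP(RCU_BOOST_DELAY_MS * HZ, 1000));
	return 0;
}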
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If readers are still blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if we were boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
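
With CONFIG_RCU_TRACE enabled and debugfs mounted at the usual /sys/kernel/debug, the module above exposes these counters as a single file whose contents follow the seq_printf() formats shown earlier (plus the extra boost lines when CONFIG_RCU_BOOST is also set). The values in this sample are purely illustrative:

# cat /sys/kernel/debug/rcu/rcudata
rcu_preempt: qlen=2 gp=42 g43/p43/c42 tasks=TN.
rcu_sched: qlen: 0
rcu_bh: qlen: 0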
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515..89613f97ff2 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
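
Both busy-wait loops in rcu_torture_boost() use the test jiffies - t > ULONG_MAX / 2, an unsigned-wraparound-safe way of asking "is jiffies still before t?": as long as jiffies has not reached t, the unsigned subtraction wraps around to a huge value. A small stand-alone demonstration (the counter values are made up):

#include <limits.h>
#include <stdio.h>

/* Nonzero while "now" has not yet reached "when", even across wraparound. */
static int still_before(unsigned long now, unsigned long when)
{
	return now - when > ULONG_MAX / 2;
}

int main(void)
{
	unsigned long start = ULONG_MAX - 5;	/* "jiffies" about to wrap    */
	unsigned long deadline = start + 10;	/* wraps to a small value (4) */

	printf("%d\n", still_before(start, deadline));		/* 1: keep waiting */
	printf("%d\n", still_before(start + 20, deadline));	/* 0: deadline hit */
	return 0;
}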
820
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c4798..d0ddfea6579 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
692 * missed some grace periods that other CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to an online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
1017 /* current DYING CPU is cleared in the cpu_online_mask */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
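
rcu_send_cbs_to_online() splices the dying CPU's entire callback list onto the receiving CPU's list in O(1) by way of the tail pointer: the receiver's tail slot is made to point at the donor's head, and the receiver's tail pointer then becomes the donor's tail. A minimal user-space sketch of the same singly-linked-list splice (the struct and variable names are invented for illustration):

#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at the last ->next slot (or at head) */
};

/* Append everything on "from" to "to", then empty "from". */
static void splice(struct cblist *to, struct cblist *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;	/* hook the donor list onto the receiver's tail */
	to->tail = from->tail;	/* the receiver's tail is now the donor's tail  */
	from->head = NULL;
	from->tail = &from->head;
}

int main(void)
{
	struct cb a = { NULL, 1 }, b = { NULL, 2 };
	struct cblist x = { &a, &a.next };	/* x: [1] */
	struct cblist y = { &b, &b.next };	/* y: [2] */

	splice(&x, &y);				/* x: [1, 2], y: [] */
	for (struct cb *p = x.head; p != NULL; p = p->next)
		printf("%d\n", p->id);
	return 0;
}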
1035 1036
1036/* 1037/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
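
The restructured slow path above only runs when the callback queue has grown by qhimark entries since the last check, after which the snapshot is reset; that hysteresis keeps call_rcu() cheap in the common case. A stripped-down sketch of the same pattern, with an invented threshold and a printf() standing in for the expensive grace-period work:

#include <stdio.h>

#define QHIMARK 10000	/* assumed threshold, like the qhimark module parameter */

static long qlen;		/* callbacks currently queued           */
static long qlen_last_check;	/* qlen at the last expensive check     */

static void enqueue_callback(void)
{
	if (++qlen > qlen_last_check + QHIMARK) {
		/*
		 * Expensive path: in the kernel this is where a new grace
		 * period is started or force_quiescent_state() is kicked.
		 */
		printf("kick at qlen=%ld\n", qlen);
		qlen_last_check = qlen;	/* re-arm the hysteresis */
	}
}

int main(void)
{
	for (int i = 0; i < 25000; i++)
		enqueue_callback();	/* kicks at 10001 and 20002 */
	return 0;
}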
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
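
The updated comment describes a classic completion-count pattern: the count starts at 1 rather than 0, so callbacks queued (and possibly already invoked) on early CPUs cannot drive it to zero before every CPU has registered its own callback; only the caller's final drop of that initial reference can complete the barrier. A small user-space sketch of the pattern using C11 atomics (the CPU count and all names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int barrier_count;
static int barrier_done;

/* Runs when a given "CPU"'s barrier callback is finally invoked. */
static void barrier_callback(void)
{
	if (atomic_fetch_sub(&barrier_count, 1) == 1)
		barrier_done = 1;		/* last reference dropped */
}

int main(void)
{
	/*
	 * Start at 1 so that no callback can finish the barrier before
	 * every CPU has queued (and counted) its own callback.
	 */
	atomic_store(&barrier_count, 1);

	for (int cpu = 0; cpu < NCPUS; cpu++)
		atomic_fetch_add(&barrier_count, 1);	/* queueing phase   */

	for (int cpu = 0; cpu < NCPUS; cpu++)
		barrier_callback();			/* invocation phase */

	/* Drop the initial reference; only now can the barrier complete. */
	if (atomic_fetch_sub(&barrier_count, 1) == 1)
		barrier_done = 1;

	printf("done=%d\n", barrier_done);		/* prints done=1 */
	return 0;
}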
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c1..e8f057e44e3 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
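
To make the geometry concrete, here is what these macros compute for one assumed configuration, NR_CPUS=4096 with CONFIG_RCU_FANOUT=16 (so RCU_FANOUT_LEAF is also 16); the configuration is only an example:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define NR_CPUS			4096	/* assumed */
#define FANOUT			16	/* assumed CONFIG_RCU_FANOUT == RCU_FANOUT_LEAF */

int main(void)
{
	/* NR_CPUS <= 16 * 16 * 16 = 4096, so the tree needs three levels. */
	int lvl0 = 1;						/* root */
	int lvl1 = DIV_ROUND_UP(NR_CPUS, FANOUT * FANOUT);	/* 16   */
	int lvl2 = DIV_ROUND_UP(NR_CPUS, FANOUT);		/* 256  */
	int lvl3 = NR_CPUS;					/* leaves, one per CPU */

	/* NUM_RCU_NODES excludes the per-CPU level: 1 + 16 + 256 = 273. */
	printf("levels: %d/%d/%d (+%d CPUs), rcu_node structures: %d\n",
	       lvl0, lvl1, lvl2, lvl3, lvl0 + lvl1 + lvl2);
	return 0;
}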
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f..a3638710dc6 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from dying CPU to other online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
1039 * above condition is already met when the control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
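The long comment block added above describes a ticket-lock-style protocol built from the two counters sync_sched_expedited_started and sync_sched_expedited_done. As a rough illustration of that counter protocol only -- not the kernel code, and with the CPU-stopping step replaced by a made-up force_grace_period() stub -- a user-space C sketch might look like this:

/* Illustrative user-space model of the started/done "ticket" counters
 * described in the comment above; force_grace_period() is a stand-in
 * for try_stop_cpus(), not a real kernel interface. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int started;
static atomic_int done;

static bool force_grace_period(void)
{
	return true;	/* pretend stopping the CPUs always succeeds here */
}

static void expedited(void)
{
	/* Take a ticket; every caller gets a unique, increasing value. */
	int snap = atomic_fetch_add(&started, 1) + 1;
	int s;

	while (!force_grace_period()) {
		/* Someone who finished after our ticket covers us too. */
		s = atomic_load(&done);
		if (s - snap >= 0)
			return;
	}

	/* Publish our grace period unless a later caller already did. */
	do {
		s = atomic_load(&done);
		if (s - snap >= 0)
			break;
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}

int main(void)
{
	expedited();
	printf("started=%d done=%d\n", atomic_load(&started),
	       atomic_load(&done));
	return 0;
}

The real implementation additionally refetches sync_sched_expedited_started after each failed try_stop_cpus() so that later callers can piggyback on the grace period, and uses UINT_CMP_GE() so the comparison stays correct across counter wraparound.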
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d12..c8e97853b97 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index 114a0deb2b0..04949089e76 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8067,8 +8067,6 @@ void __init sched_init(void)
8067 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8067 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8068#endif /* SMP */ 8068#endif /* SMP */
8069 8069
8070 perf_event_init();
8071
8072 scheduler_running = 1; 8070 scheduler_running = 1;
8073} 8071}
8074 8072
@@ -9241,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = {
9241}; 9239};
9242#endif /* CONFIG_CGROUP_CPUACCT */ 9240#endif /* CONFIG_CGROUP_CPUACCT */
9243 9241
9244#ifndef CONFIG_SMP
9245
9246void synchronize_sched_expedited(void)
9247{
9248 barrier();
9249}
9250EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9251
9252#else /* #ifndef CONFIG_SMP */
9253
9254static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9255
9256static int synchronize_sched_expedited_cpu_stop(void *data)
9257{
9258 /*
9259 * There must be a full memory barrier on each affected CPU
9260 * between the time that try_stop_cpus() is called and the
9261 * time that it returns.
9262 *
9263 * In the current initial implementation of cpu_stop, the
9264 * above condition is already met when the control reaches
9265 * this point and the following smp_mb() is not strictly
9266 * necessary. Do smp_mb() anyway for documentation and
9267 * robustness against future implementation changes.
9268 */
9269 smp_mb(); /* See above comment block. */
9270 return 0;
9271}
9272
9273/*
9274 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9275 * approach to force grace period to end quickly. This consumes
9276 * significant time on all CPUs, and is thus not recommended for
9277 * any sort of common-case code.
9278 *
9279 * Note that it is illegal to call this function while holding any
9280 * lock that is acquired by a CPU-hotplug notifier. Failing to
9281 * observe this restriction will result in deadlock.
9282 */
9283void synchronize_sched_expedited(void)
9284{
9285 int snap, trycount = 0;
9286
9287 smp_mb(); /* ensure prior mod happens before capturing snap. */
9288 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9289 get_online_cpus();
9290 while (try_stop_cpus(cpu_online_mask,
9291 synchronize_sched_expedited_cpu_stop,
9292 NULL) == -EAGAIN) {
9293 put_online_cpus();
9294 if (trycount++ < 10)
9295 udelay(trycount * num_online_cpus());
9296 else {
9297 synchronize_sched();
9298 return;
9299 }
9300 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9301 smp_mb(); /* ensure test happens before caller kfree */
9302 return;
9303 }
9304 get_online_cpus();
9305 }
9306 atomic_inc(&synchronize_sched_expedited_count);
9307 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9308 put_online_cpus();
9309}
9310EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9311
9312#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e0750053..98d8c1e80ed 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
203 * all srcu_read_lock() calls using the old counters have completed. 204 * all srcu_read_lock() calls using the old counters have completed.
204 * Their corresponding critical sections might well be still 205 * Their corresponding critical sections might well be still
205 * executing, but the srcu_read_lock() primitives themselves 206 * executing, but the srcu_read_lock() primitives themselves
206 * will have finished executing. 207 * will have finished executing. We initially give readers
208 * an arbitrarily chosen 10 microseconds to get out of their
209 * SRCU read-side critical sections, then loop waiting 1/HZ
210 * seconds per iteration.
207 */ 211 */
208 212
213 if (srcu_readers_active_idx(sp, idx))
214 udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
209 while (srcu_readers_active_idx(sp, idx)) 215 while (srcu_readers_active_idx(sp, idx))
210 schedule_timeout_interruptible(1); 216 schedule_timeout_interruptible(1);
211 217
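The new comment in __synchronize_srcu() describes a two-phase wait: a short busy delay first, then a coarse sleep-and-poll loop. A tiny, hypothetical user-space sketch of that shape (reader_count() and the timing constants are illustrative stand-ins, not the SRCU internals):

/* Model of the wait shape above: spin briefly, then poll slowly.
 * reader_count() stands in for srcu_readers_active_idx(). */
#include <unistd.h>
#include <stdatomic.h>

static atomic_int readers;

static int reader_count(void)
{
	return atomic_load(&readers);
}

static void wait_for_readers(void)
{
	if (reader_count())
		usleep(10);		/* first give readers ~10us to finish */
	while (reader_count())
		usleep(10000);		/* then poll at roughly 1/HZ (HZ=100) */
}

int main(void)
{
	wait_for_readers();
	return 0;
}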
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 121e4fff03d..ae5cbb1e3ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -744,21 +744,21 @@ static struct ctl_table kern_table[] = {
744 .extra1 = &zero, 744 .extra1 = &zero,
745 .extra2 = &one, 745 .extra2 = &one,
746 }, 746 },
747#endif
748#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
749 { 747 {
750 .procname = "unknown_nmi_panic", 748 .procname = "nmi_watchdog",
751 .data = &unknown_nmi_panic, 749 .data = &watchdog_enabled,
752 .maxlen = sizeof (int), 750 .maxlen = sizeof (int),
753 .mode = 0644, 751 .mode = 0644,
754 .proc_handler = proc_dointvec, 752 .proc_handler = proc_dowatchdog_enabled,
755 }, 753 },
754#endif
755#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
756 { 756 {
757 .procname = "nmi_watchdog", 757 .procname = "unknown_nmi_panic",
758 .data = &nmi_watchdog_enabled, 758 .data = &unknown_nmi_panic,
759 .maxlen = sizeof (int), 759 .maxlen = sizeof (int),
760 .mode = 0644, 760 .mode = 0644,
761 .proc_handler = proc_nmi_enabled, 761 .proc_handler = proc_dointvec,
762 }, 762 },
763#endif 763#endif
764#if defined(CONFIG_X86) 764#if defined(CONFIG_X86)
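With this change the nmi_watchdog sysctl is backed by the generic lockup detector (proc_dowatchdog_enabled) rather than the x86-only APIC code, while unknown_nmi_panic stays x86-specific. The knob remains visible at /proc/sys/kernel/nmi_watchdog, so it can be toggled from user space in the usual way; a minimal C sketch, assuming that path exists on the running kernel:

/* Write "0" or "1" to /proc/sys/kernel/nmi_watchdog to disable or
 * enable the hard-lockup watchdog at run time (needs root). */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *val = (argc > 1) ? argv[1] : "1";
	FILE *f = fopen("/proc/sys/kernel/nmi_watchdog", "w");

	if (!f) {
		perror("nmi_watchdog");
		return 1;
	}
	fprintf(f, "%s\n", val);
	fclose(f);
	return 0;
}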
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c578606..4b2545a136f 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ea37e2ff416..14674dce77a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
69 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
70 bool 70 bool
71 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
72config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
73 bool 88 bool
74 89
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a0616..f55fcf61b22 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670..19a359d5e6d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
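The new perf_trace_event_perm() check lets unprivileged users keep counting tracepoint events (no PERF_SAMPLE_RAW in attr.sample_type) and allows raw data only for per-task events on tracepoints flagged TRACE_EVENT_FL_CAP_ANY; otherwise raw tracepoint payloads require CAP_SYS_ADMIN when perf is in paranoid mode. A user-space sketch of the counting-only case that this check permits (the tracepoint id in attr.config is system-specific and just a placeholder here):

/* Open a tracepoint as a counting-only perf event: attr.sample_type
 * carries no PERF_SAMPLE_RAW, so no raw payload is requested. The id
 * must be read from the event's tracefs/debugfs "id" file; 1 is a
 * placeholder. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = 1;		/* placeholder tracepoint id */
	attr.disabled = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		     -1 /* any cpu */, -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	usleep(100000);			/* do some work to be counted */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	read(fd, &count, sizeof(count));
	printf("events: %lld\n", count);
	close(fd);
	return 0;
}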
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab193..35fde09b81d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac2..4b74d71705c 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
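The reworked __array() macro builds the "type[len]" field-type string in the single shared event_storage buffer and serializes access with event_storage_mutex, so concurrent event registrations cannot clobber each other's strings. A small user-space sketch of the same pattern, with illustrative names rather than the kernel's:

/* One shared scratch buffer, serialized by a mutex, used to build a
 * transient "type[len]" string and hand it to a consumer while the
 * lock is still held. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t storage_mutex = PTHREAD_MUTEX_INITIALIZER;
static char storage[128];

static int define_array_field(const char *type, int len,
			      int (*consume)(const char *))
{
	int ret;

	pthread_mutex_lock(&storage_mutex);
	snprintf(storage, sizeof(storage), "%s[%d]", type, len);
	ret = consume(storage);	/* buffer is stable while locked */
	pthread_mutex_unlock(&storage_mutex);
	return ret;
}

static int print_field(const char *s)
{
	return printf("field: %s\n", s) < 0;
}

int main(void)
{
	return define_array_field("char", 16, print_field);
}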
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c812c4927ca..6e7b575ac33 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str)
57{ 57{
58 if (!strncmp(str, "panic", 5)) 58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 59 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1;
60 return 1; 62 return 1;
61} 63}
62__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -548,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
548 .notifier_call = cpu_callback 550 .notifier_call = cpu_callback
549}; 551};
550 552
551static int __init spawn_watchdog_task(void) 553void __init lockup_detector_init(void)
552{ 554{
553 void *cpu = (void *)(long)smp_processor_id(); 555 void *cpu = (void *)(long)smp_processor_id();
554 int err; 556 int err;
555 557
556 if (no_watchdog) 558 if (no_watchdog)
557 return 0; 559 return;
558 560
559 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
560 WARN_ON(notifier_to_errno(err)); 562 WARN_ON(notifier_to_errno(err));
@@ -562,6 +564,5 @@ static int __init spawn_watchdog_task(void)
562 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 564 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
563 register_cpu_notifier(&cpu_nfb); 565 register_cpu_notifier(&cpu_nfb);
564 566
565 return 0; 567 return;
566} 568}
567early_initcall(spawn_watchdog_task);