Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |   1
-rw-r--r--  kernel/exit.c                |  10
-rw-r--r--  kernel/locking/qspinlock.c   | 117
-rw-r--r--  kernel/membarrier.c          |  70
-rw-r--r--  kernel/rcu/Kconfig           |   3
-rw-r--r--  kernel/rcu/rcu.h             | 128
-rw-r--r--  kernel/rcu/rcu_segcblist.c   | 108
-rw-r--r--  kernel/rcu/rcu_segcblist.h   |  28
-rw-r--r--  kernel/rcu/rcuperf.c         |  17
-rw-r--r--  kernel/rcu/rcutorture.c      |  83
-rw-r--r--  kernel/rcu/srcutiny.c        |   8
-rw-r--r--  kernel/rcu/srcutree.c        |  50
-rw-r--r--  kernel/rcu/tiny.c            |   2
-rw-r--r--  kernel/rcu/tiny_plugin.h     |  47
-rw-r--r--  kernel/rcu/tree.c            | 174
-rw-r--r--  kernel/rcu/tree.h            |  15
-rw-r--r--  kernel/rcu/tree_exp.h        |   2
-rw-r--r--  kernel/rcu/tree_plugin.h     | 238
-rw-r--r--  kernel/rcu/update.c          |  18
-rw-r--r--  kernel/sched/Makefile        |   1
-rw-r--r--  kernel/sched/completion.c    |  11
-rw-r--r--  kernel/sched/core.c          |  38
-rw-r--r--  kernel/sched/membarrier.c    | 152
-rw-r--r--  kernel/task_work.c           |   8
-rw-r--r--  kernel/torture.c             |   2
25 files changed, 544 insertions, 787 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
108obj-$(CONFIG_JUMP_LABEL) += jump_label.o 108obj-$(CONFIG_JUMP_LABEL) += jump_label.o
109obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 109obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
110obj-$(CONFIG_TORTURE_TEST) += torture.o 110obj-$(CONFIG_TORTURE_TEST) += torture.o
111obj-$(CONFIG_MEMBARRIER) += membarrier.o
112 111
113obj-$(CONFIG_HAS_IOMEM) += memremap.o 112obj-$(CONFIG_HAS_IOMEM) += memremap.o
114 113
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548faa9f37..f9ef3ecc78c1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
764{ 764{
765 struct task_struct *tsk = current; 765 struct task_struct *tsk = current;
766 int group_dead; 766 int group_dead;
767 TASKS_RCU(int tasks_rcu_i);
768 767
769 profile_task_exit(tsk); 768 profile_task_exit(tsk);
770 kcov_task_exit(tsk); 769 kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
819 * Ensure that we must observe the pi_state in exit_mm() -> 818 * Ensure that we must observe the pi_state in exit_mm() ->
820 * mm_release() -> exit_pi_state_list(). 819 * mm_release() -> exit_pi_state_list().
821 */ 820 */
822 raw_spin_unlock_wait(&tsk->pi_lock); 821 raw_spin_lock_irq(&tsk->pi_lock);
822 raw_spin_unlock_irq(&tsk->pi_lock);
823 823
824 if (unlikely(in_atomic())) { 824 if (unlikely(in_atomic())) {
825 pr_info("note: %s[%d] exited with preempt_count %d\n", 825 pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
881 */ 881 */
882 flush_ptrace_hw_breakpoint(tsk); 882 flush_ptrace_hw_breakpoint(tsk);
883 883
884 TASKS_RCU(preempt_disable()); 884 exit_tasks_rcu_start();
885 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
886 TASKS_RCU(preempt_enable());
887 exit_notify(tsk, group_dead); 885 exit_notify(tsk, group_dead);
888 proc_exit_connector(tsk); 886 proc_exit_connector(tsk);
889 mpol_put_task_policy(tsk); 887 mpol_put_task_policy(tsk);
@@ -918,7 +916,7 @@ void __noreturn do_exit(long code)
918 if (tsk->nr_dirtied) 916 if (tsk->nr_dirtied)
919 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 917 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
920 exit_rcu(); 918 exit_rcu();
921 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); 919 exit_tasks_rcu_finish();
922 920
923 do_task_dead(); 921 do_task_dead();
924} 922}
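
The two new helpers above replace the open-coded TASKS_RCU sequence that used the local tasks_rcu_i index. A minimal sketch of what they can look like (presumably in kernel/rcu/update.c, which this series also touches), assuming the SRCU index is stashed in the task structure; the field name rcu_tasks_idx below is an assumption, not something shown in this diff:

/* Hold off Tasks RCU across the late-exit window; sketch only. */
void exit_tasks_rcu_start(void)
{
        preempt_disable();
        current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
        preempt_enable();
}

void exit_tasks_rcu_finish(void)
{
        preempt_disable();
        __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
        preempt_enable();
}

For CONFIG_TASKS_RCU=n kernels both would be empty stubs, matching the old no-op TASKS_RCU() macro.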
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153e8a48..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
268#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath 268#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
269#endif 269#endif
270 270
271/*
272 * Various notes on spin_is_locked() and spin_unlock_wait(), which are
273 * 'interesting' functions:
274 *
275 * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
276 * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
277 * PPC). Also qspinlock has a similar issue per construction, the setting of
278 * the locked byte can be unordered acquiring the lock proper.
279 *
280 * This gets to be 'interesting' in the following cases, where the /should/s
281 * end up false because of this issue.
282 *
283 *
284 * CASE 1:
285 *
286 * So the spin_is_locked() correctness issue comes from something like:
287 *
288 * CPU0 CPU1
289 *
290 * global_lock(); local_lock(i)
291 * spin_lock(&G) spin_lock(&L[i])
292 * for (i) if (!spin_is_locked(&G)) {
293 * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
294 * return;
295 * }
296 * // deal with fail
297 *
298 * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
299 * that there is exclusion between the two critical sections.
300 *
301 * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
302 * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
303 * /should/ be constrained by the ACQUIRE from spin_lock(&G).
304 *
305 * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
306 *
307 *
308 * CASE 2:
309 *
310 * For spin_unlock_wait() there is a second correctness issue, namely:
311 *
312 * CPU0 CPU1
313 *
314 * flag = set;
315 * smp_mb(); spin_lock(&l)
316 * spin_unlock_wait(&l); if (!flag)
317 * // add to lockless list
318 * spin_unlock(&l);
319 * // iterate lockless list
320 *
321 * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
322 * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
323 * semantics etc..)
324 *
325 * Where flag /should/ be ordered against the locked store of l.
326 */
327
328/*
329 * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
330 * issuing an _unordered_ store to set _Q_LOCKED_VAL.
331 *
332 * This means that the store can be delayed, but no later than the
333 * store-release from the unlock. This means that simply observing
334 * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
335 *
336 * There are two paths that can issue the unordered store:
337 *
338 * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
339 *
340 * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
341 * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
342 *
343 * However, in both cases we have other !0 state we've set before to queue
344 * ourseves:
345 *
346 * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
347 * load is constrained by that ACQUIRE to not pass before that, and thus must
348 * observe the store.
349 *
350 * For (2) we have a more intersting scenario. We enqueue ourselves using
351 * xchg_tail(), which ends up being a RELEASE. This in itself is not
352 * sufficient, however that is followed by an smp_cond_acquire() on the same
353 * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
354 * guarantees we must observe that store.
355 *
356 * Therefore both cases have other !0 state that is observable before the
357 * unordered locked byte store comes through. This means we can use that to
358 * wait for the lock store, and then wait for an unlock.
359 */
360#ifndef queued_spin_unlock_wait
361void queued_spin_unlock_wait(struct qspinlock *lock)
362{
363 u32 val;
364
365 for (;;) {
366 val = atomic_read(&lock->val);
367
368 if (!val) /* not locked, we're done */
369 goto done;
370
371 if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
372 break;
373
374 /* not locked, but pending, wait until we observe the lock */
375 cpu_relax();
376 }
377
378 /* any unlock is good */
379 while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
380 cpu_relax();
381
382done:
383 smp_acquire__after_ctrl_dep();
384}
385EXPORT_SYMBOL(queued_spin_unlock_wait);
386#endif
387
388#endif /* _GEN_PV_LOCK_SLOWPATH */ 271#endif /* _GEN_PV_LOCK_SLOWPATH */
389 272
390/** 273/**
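
With queued_spin_unlock_wait() and its supporting commentary removed, any remaining caller that needed its ordering is expected to take and release the lock instead, as the kernel/exit.c hunk above does for ->pi_lock. The general replacement pattern, sketched on an arbitrary spinlock_t:

        /* Old: wait for any current holder to release the lock. */
        spin_unlock_wait(&lock);

        /*
         * New: a full acquire/release pair, which orders at least as
         * strongly against both prior and subsequent critical sections.
         */
        spin_lock(&lock);
        spin_unlock(&lock);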
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
1/*
2 * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 *
4 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20
21/*
22 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
23 * except MEMBARRIER_CMD_QUERY.
24 */
25#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
26
27/**
28 * sys_membarrier - issue memory barriers on a set of threads
29 * @cmd: Takes command values defined in enum membarrier_cmd.
30 * @flags: Currently needs to be 0. For future extensions.
31 *
32 * If this system call is not implemented, -ENOSYS is returned. If the
33 * command specified does not exist, or if the command argument is invalid,
34 * this system call returns -EINVAL. For a given command, with flags argument
35 * set to 0, this system call is guaranteed to always return the same value
36 * until reboot.
37 *
38 * All memory accesses performed in program order from each targeted thread
39 * is guaranteed to be ordered with respect to sys_membarrier(). If we use
40 * the semantic "barrier()" to represent a compiler barrier forcing memory
41 * accesses to be performed in program order across the barrier, and
42 * smp_mb() to represent explicit memory barriers forcing full memory
43 * ordering across the barrier, we have the following ordering table for
44 * each pair of barrier(), sys_membarrier() and smp_mb():
45 *
46 * The pair ordering is detailed as (O: ordered, X: not ordered):
47 *
48 * barrier() smp_mb() sys_membarrier()
49 * barrier() X X O
50 * smp_mb() X O O
51 * sys_membarrier() O O O
52 */
53SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
54{
55 /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
56 if (tick_nohz_full_enabled())
57 return -ENOSYS;
58 if (unlikely(flags))
59 return -EINVAL;
60 switch (cmd) {
61 case MEMBARRIER_CMD_QUERY:
62 return MEMBARRIER_CMD_BITMASK;
63 case MEMBARRIER_CMD_SHARED:
64 if (num_online_cpus() > 1)
65 synchronize_sched();
66 return 0;
67 default:
68 return -EINVAL;
69 }
70}
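
Per the diffstat, the implementation reappears as kernel/sched/membarrier.c (152 lines), so the code above is moved rather than dropped. The deleted header comment documents the user-facing contract; a minimal user-space sketch of the query-then-barrier pattern it describes, using only the two commands shown above:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Raw syscall wrapper; no libc wrapper is assumed here. */
static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0) {
                perror("membarrier");   /* e.g. ENOSYS on nohz_full kernels */
                return 1;
        }
        if (mask & MEMBARRIER_CMD_SHARED)
                membarrier(MEMBARRIER_CMD_SHARED, 0);   /* system-wide memory barrier */
        return 0;
}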
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
69 This option selects the full-fledged version of SRCU. 69 This option selects the full-fledged version of SRCU.
70 70
71config TASKS_RCU 71config TASKS_RCU
72 bool 72 def_bool PREEMPT
73 default n
74 select SRCU 73 select SRCU
75 help 74 help
76 This option enables a task-based RCU implementation that uses 75 This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..e4b43fef89f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do { \
356 356
357#ifdef CONFIG_TINY_RCU 357#ifdef CONFIG_TINY_RCU
358/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ 358/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
359static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ 359static inline bool rcu_gp_is_normal(void) { return true; }
360{ 360static inline bool rcu_gp_is_expedited(void) { return false; }
361 return true; 361static inline void rcu_expedite_gp(void) { }
362} 362static inline void rcu_unexpedite_gp(void) { }
363static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
364{
365 return false;
366}
367
368static inline void rcu_expedite_gp(void)
369{
370}
371
372static inline void rcu_unexpedite_gp(void)
373{
374}
375#else /* #ifdef CONFIG_TINY_RCU */ 363#else /* #ifdef CONFIG_TINY_RCU */
376bool rcu_gp_is_normal(void); /* Internal RCU use. */ 364bool rcu_gp_is_normal(void); /* Internal RCU use. */
377bool rcu_gp_is_expedited(void); /* Internal RCU use. */ 365bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
419 *gpnum = 0; 407 *gpnum = 0;
420 *completed = 0; 408 *completed = 0;
421} 409}
422static inline void rcutorture_record_test_transition(void) 410static inline void rcutorture_record_test_transition(void) { }
423{ 411static inline void rcutorture_record_progress(unsigned long vernum) { }
424}
425static inline void rcutorture_record_progress(unsigned long vernum)
426{
427}
428#ifdef CONFIG_RCU_TRACE 412#ifdef CONFIG_RCU_TRACE
429void do_trace_rcu_torture_read(const char *rcutorturename, 413void do_trace_rcu_torture_read(const char *rcutorturename,
430 struct rcu_head *rhp, 414 struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
460#endif 444#endif
461 445
462#ifdef CONFIG_TINY_RCU 446#ifdef CONFIG_TINY_RCU
463 447static inline unsigned long rcu_batches_started(void) { return 0; }
464/* 448static inline unsigned long rcu_batches_started_bh(void) { return 0; }
465 * Return the number of grace periods started. 449static inline unsigned long rcu_batches_started_sched(void) { return 0; }
466 */ 450static inline unsigned long rcu_batches_completed(void) { return 0; }
467static inline unsigned long rcu_batches_started(void) 451static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
468{ 452static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
469 return 0; 453static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
470} 454static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
471 455static inline unsigned long
472/* 456srcu_batches_completed(struct srcu_struct *sp) { return 0; }
473 * Return the number of bottom-half grace periods started. 457static inline void rcu_force_quiescent_state(void) { }
474 */ 458static inline void rcu_bh_force_quiescent_state(void) { }
475static inline unsigned long rcu_batches_started_bh(void) 459static inline void rcu_sched_force_quiescent_state(void) { }
476{ 460static inline void show_rcu_gp_kthreads(void) { }
477 return 0;
478}
479
480/*
481 * Return the number of sched grace periods started.
482 */
483static inline unsigned long rcu_batches_started_sched(void)
484{
485 return 0;
486}
487
488/*
489 * Return the number of grace periods completed.
490 */
491static inline unsigned long rcu_batches_completed(void)
492{
493 return 0;
494}
495
496/*
497 * Return the number of bottom-half grace periods completed.
498 */
499static inline unsigned long rcu_batches_completed_bh(void)
500{
501 return 0;
502}
503
504/*
505 * Return the number of sched grace periods completed.
506 */
507static inline unsigned long rcu_batches_completed_sched(void)
508{
509 return 0;
510}
511
512/*
513 * Return the number of expedited grace periods completed.
514 */
515static inline unsigned long rcu_exp_batches_completed(void)
516{
517 return 0;
518}
519
520/*
521 * Return the number of expedited sched grace periods completed.
522 */
523static inline unsigned long rcu_exp_batches_completed_sched(void)
524{
525 return 0;
526}
527
528static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
529{
530 return 0;
531}
532
533static inline void rcu_force_quiescent_state(void)
534{
535}
536
537static inline void rcu_bh_force_quiescent_state(void)
538{
539}
540
541static inline void rcu_sched_force_quiescent_state(void)
542{
543}
544
545static inline void show_rcu_gp_kthreads(void)
546{
547}
548
549#else /* #ifdef CONFIG_TINY_RCU */ 461#else /* #ifdef CONFIG_TINY_RCU */
550extern unsigned long rcutorture_testseq; 462extern unsigned long rcutorture_testseq;
551extern unsigned long rcutorture_vernum; 463extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..7649fcd2c4c7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
36} 36}
37 37
38/* 38/*
39 * Debug function to actually count the number of callbacks.
40 * If the number exceeds the limit specified, return -1.
41 */
42long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
43{
44 int cnt = 0;
45 struct rcu_head **rhpp = &rclp->head;
46
47 for (;;) {
48 if (!*rhpp)
49 return cnt;
50 if (++cnt > lim)
51 return -1;
52 rhpp = &(*rhpp)->next;
53 }
54}
55
56/*
57 * Dequeue the oldest rcu_head structure from the specified callback 39 * Dequeue the oldest rcu_head structure from the specified callback
58 * list. This function assumes that the callback is non-lazy, but 40 * list. This function assumes that the callback is non-lazy, but
59 * the caller can later invoke rcu_cblist_dequeued_lazy() if it 41 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
103} 85}
104 86
105/* 87/*
106 * Is the specified segment of the specified rcu_segcblist structure
107 * empty of callbacks?
108 */
109bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
110{
111 if (seg == RCU_DONE_TAIL)
112 return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
113 return rsclp->tails[seg - 1] == rsclp->tails[seg];
114}
115
116/*
117 * Does the specified rcu_segcblist structure contain callbacks that 88 * Does the specified rcu_segcblist structure contain callbacks that
118 * are ready to be invoked? 89 * are ready to be invoked?
119 */ 90 */
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
134} 105}
135 106
136/* 107/*
137 * Dequeue and return the first ready-to-invoke callback. If there
138 * are no ready-to-invoke callbacks, return NULL. Disables interrupts
139 * to avoid interference. Does not protect from interference from other
140 * CPUs or tasks.
141 */
142struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
143{
144 unsigned long flags;
145 int i;
146 struct rcu_head *rhp;
147
148 local_irq_save(flags);
149 if (!rcu_segcblist_ready_cbs(rsclp)) {
150 local_irq_restore(flags);
151 return NULL;
152 }
153 rhp = rsclp->head;
154 BUG_ON(!rhp);
155 rsclp->head = rhp->next;
156 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
157 if (rsclp->tails[i] != &rhp->next)
158 break;
159 rsclp->tails[i] = &rsclp->head;
160 }
161 smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
162 WRITE_ONCE(rsclp->len, rsclp->len - 1);
163 local_irq_restore(flags);
164 return rhp;
165}
166
167/*
168 * Account for the fact that a previously dequeued callback turned out
169 * to be marked as lazy.
170 */
171void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
172{
173 unsigned long flags;
174
175 local_irq_save(flags);
176 rsclp->len_lazy--;
177 local_irq_restore(flags);
178}
179
180/*
181 * Return a pointer to the first callback in the specified rcu_segcblist 108 * Return a pointer to the first callback in the specified rcu_segcblist
182 * structure. This is useful for diagnostics. 109 * structure. This is useful for diagnostics.
183 */ 110 */
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
203} 130}
204 131
205/* 132/*
206 * Does the specified rcu_segcblist structure contain callbacks that
207 * have not yet been processed beyond having been posted, that is,
208 * does it contain callbacks in its last segment?
209 */
210bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
211{
212 return rcu_segcblist_is_enabled(rsclp) &&
213 !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
214}
215
216/*
217 * Enqueue the specified callback onto the specified rcu_segcblist 133 * Enqueue the specified callback onto the specified rcu_segcblist
218 * structure, updating accounting as needed. Note that the ->len 134 * structure, updating accounting as needed. Note that the ->len
219 * field may be accessed locklessly, hence the WRITE_ONCE(). 135 * field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
503 return true; 419 return true;
504 return false; 420 return false;
505} 421}
422
423/*
424 * Merge the source rcu_segcblist structure into the destination
425 * rcu_segcblist structure, then initialize the source. Any pending
426 * callbacks from the source get to start over. It is best to
427 * advance and accelerate both the destination and the source
428 * before merging.
429 */
430void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
431 struct rcu_segcblist *src_rsclp)
432{
433 struct rcu_cblist donecbs;
434 struct rcu_cblist pendcbs;
435
436 rcu_cblist_init(&donecbs);
437 rcu_cblist_init(&pendcbs);
438 rcu_segcblist_extract_count(src_rsclp, &donecbs);
439 rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
440 rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
441 rcu_segcblist_insert_count(dst_rsclp, &donecbs);
442 rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
443 rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
444 rcu_segcblist_init(src_rsclp);
445}
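
The requirement in the comment above (advance both lists before merging) is what the new CPU-hotplug migration path enforces; condensed from the rcu_migrate_callbacks() hunk in kernel/rcu/tree.c later in this diff, the caller sequence is:

        raw_spin_lock_rcu_node(rnp_root);               /* irqs already disabled */
        rcu_advance_cbs(rsp, rnp_root, rdp);            /* leverage recent GPs */
        rcu_advance_cbs(rsp, rnp_root, my_rdp);         /* assign GP to pending CBs */
        rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
        raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);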
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
31 rclp->len_lazy--; 31 rclp->len_lazy--;
32} 32}
33 33
34/*
35 * Interim function to return rcu_cblist head pointer. Longer term, the
36 * rcu_cblist will be used more pervasively, removing the need for this
37 * function.
38 */
39static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
40{
41 return rclp->head;
42}
43
44/*
45 * Interim function to return rcu_cblist head pointer. Longer term, the
46 * rcu_cblist will be used more pervasively, removing the need for this
47 * function.
48 */
49static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
50{
51 WARN_ON_ONCE(!rclp->head);
52 return rclp->tail;
53}
54
55void rcu_cblist_init(struct rcu_cblist *rclp); 34void rcu_cblist_init(struct rcu_cblist *rclp);
56long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
57struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); 35struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
58 36
59/* 37/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
134 112
135void rcu_segcblist_init(struct rcu_segcblist *rsclp); 113void rcu_segcblist_init(struct rcu_segcblist *rsclp);
136void rcu_segcblist_disable(struct rcu_segcblist *rsclp); 114void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
137bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
138bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); 115bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
139bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); 116bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
140struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
141void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
142struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); 117struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
143struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); 118struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
144bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
145void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, 119void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
146 struct rcu_head *rhp, bool lazy); 120 struct rcu_head *rhp, bool lazy);
147bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, 121bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
162bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); 136bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
163bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, 137bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
164 unsigned long seq); 138 unsigned long seq);
139void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
140 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
317 .name = "sched" 317 .name = "sched"
318}; 318};
319 319
320#ifdef CONFIG_TASKS_RCU
321
322/* 320/*
323 * Definitions for RCU-tasks perf testing. 321 * Definitions for RCU-tasks perf testing.
324 */ 322 */
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
346 .name = "tasks" 344 .name = "tasks"
347}; 345};
348 346
349#define RCUPERF_TASKS_OPS &tasks_ops,
350
351static bool __maybe_unused torturing_tasks(void) 347static bool __maybe_unused torturing_tasks(void)
352{ 348{
353 return cur_ops == &tasks_ops; 349 return cur_ops == &tasks_ops;
354} 350}
355 351
356#else /* #ifdef CONFIG_TASKS_RCU */
357
358#define RCUPERF_TASKS_OPS
359
360static bool __maybe_unused torturing_tasks(void)
361{
362 return false;
363}
364
365#endif /* #else #ifdef CONFIG_TASKS_RCU */
366
367/* 352/*
368 * If performance tests complete, wait for shutdown to commence. 353 * If performance tests complete, wait for shutdown to commence.
369 */ 354 */
@@ -658,7 +643,7 @@ rcu_perf_init(void)
658 int firsterr = 0; 643 int firsterr = 0;
659 static struct rcu_perf_ops *perf_ops[] = { 644 static struct rcu_perf_ops *perf_ops[] = {
660 &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, 645 &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
661 RCUPERF_TASKS_OPS 646 &tasks_ops,
662 }; 647 };
663 648
664 if (!torture_init_begin(perf_type, verbose, &perf_runnable)) 649 if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..45f2ffbc1e78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
199static u64 notrace rcu_trace_clock_local(void) 199static u64 notrace rcu_trace_clock_local(void)
200{ 200{
201 u64 ts = trace_clock_local(); 201 u64 ts = trace_clock_local();
202 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); 202
203 (void)do_div(ts, NSEC_PER_USEC);
203 return ts; 204 return ts;
204} 205}
205#else /* #ifdef CONFIG_RCU_TRACE */ 206#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
496 .fqs = NULL, 497 .fqs = NULL,
497 .stats = NULL, 498 .stats = NULL,
498 .irq_capable = 1, 499 .irq_capable = 1,
499 .name = "rcu_busted" 500 .name = "busted"
500}; 501};
501 502
502/* 503/*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
522 523
523 delay = torture_random(rrsp) % 524 delay = torture_random(rrsp) %
524 (nrealreaders * 2 * longdelay * uspertick); 525 (nrealreaders * 2 * longdelay * uspertick);
525 if (!delay) 526 if (!delay && in_task())
526 schedule_timeout_interruptible(longdelay); 527 schedule_timeout_interruptible(longdelay);
527 else 528 else
528 rcu_read_delay(rrsp); 529 rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
561 562
562static void srcu_torture_stats(void) 563static void srcu_torture_stats(void)
563{ 564{
564 int __maybe_unused cpu; 565 srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
565 int idx;
566
567#ifdef CONFIG_TREE_SRCU
568 idx = srcu_ctlp->srcu_idx & 0x1;
569 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
570 torture_type, TORTURE_FLAG, idx);
571 for_each_possible_cpu(cpu) {
572 unsigned long l0, l1;
573 unsigned long u0, u1;
574 long c0, c1;
575 struct srcu_data *counts;
576
577 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
578 u0 = counts->srcu_unlock_count[!idx];
579 u1 = counts->srcu_unlock_count[idx];
580
581 /*
582 * Make sure that a lock is always counted if the corresponding
583 * unlock is counted.
584 */
585 smp_rmb();
586
587 l0 = counts->srcu_lock_count[!idx];
588 l1 = counts->srcu_lock_count[idx];
589
590 c0 = l0 - u0;
591 c1 = l1 - u1;
592 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
593 }
594 pr_cont("\n");
595#elif defined(CONFIG_TINY_SRCU)
596 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
597 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
598 torture_type, TORTURE_FLAG, idx,
599 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
600 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
601#endif
602} 566}
603 567
604static void srcu_torture_synchronize_expedited(void) 568static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
620 .call = srcu_torture_call, 584 .call = srcu_torture_call,
621 .cb_barrier = srcu_torture_barrier, 585 .cb_barrier = srcu_torture_barrier,
622 .stats = srcu_torture_stats, 586 .stats = srcu_torture_stats,
587 .irq_capable = 1,
623 .name = "srcu" 588 .name = "srcu"
624}; 589};
625 590
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
652 .call = srcu_torture_call, 617 .call = srcu_torture_call,
653 .cb_barrier = srcu_torture_barrier, 618 .cb_barrier = srcu_torture_barrier,
654 .stats = srcu_torture_stats, 619 .stats = srcu_torture_stats,
620 .irq_capable = 1,
655 .name = "srcud" 621 .name = "srcud"
656}; 622};
657 623
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
696 .name = "sched" 662 .name = "sched"
697}; 663};
698 664
699#ifdef CONFIG_TASKS_RCU
700
701/* 665/*
702 * Definitions for RCU-tasks torture testing. 666 * Definitions for RCU-tasks torture testing.
703 */ 667 */
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
735 .name = "tasks" 699 .name = "tasks"
736}; 700};
737 701
738#define RCUTORTURE_TASKS_OPS &tasks_ops,
739
740static bool __maybe_unused torturing_tasks(void) 702static bool __maybe_unused torturing_tasks(void)
741{ 703{
742 return cur_ops == &tasks_ops; 704 return cur_ops == &tasks_ops;
743} 705}
744 706
745#else /* #ifdef CONFIG_TASKS_RCU */
746
747#define RCUTORTURE_TASKS_OPS
748
749static bool __maybe_unused torturing_tasks(void)
750{
751 return false;
752}
753
754#endif /* #else #ifdef CONFIG_TASKS_RCU */
755
756/* 707/*
757 * RCU torture priority-boost testing. Runs one real-time thread per 708 * RCU torture priority-boost testing. Runs one real-time thread per
758 * CPU for moderate bursts, repeatedly registering RCU callbacks and 709 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
1114 return 0; 1065 return 0;
1115} 1066}
1116 1067
1068static void rcu_torture_timer_cb(struct rcu_head *rhp)
1069{
1070 kfree(rhp);
1071}
1072
1117/* 1073/*
1118 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 1074 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
1119 * incrementing the corresponding element of the pipeline array. The 1075 * incrementing the corresponding element of the pipeline array. The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
1176 __this_cpu_inc(rcu_torture_batch[completed]); 1132 __this_cpu_inc(rcu_torture_batch[completed]);
1177 preempt_enable(); 1133 preempt_enable();
1178 cur_ops->readunlock(idx); 1134 cur_ops->readunlock(idx);
1135
1136 /* Test call_rcu() invocation from interrupt handler. */
1137 if (cur_ops->call) {
1138 struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
1139
1140 if (rhp)
1141 cur_ops->call(rhp, rcu_torture_timer_cb);
1142 }
1179} 1143}
1180 1144
1181/* 1145/*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
1354 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, 1318 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
1355 &flags, &gpnum, &completed); 1319 &flags, &gpnum, &completed);
1356 wtp = READ_ONCE(writer_task); 1320 wtp = READ_ONCE(writer_task);
1357 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", 1321 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
1358 rcu_torture_writer_state_getname(), 1322 rcu_torture_writer_state_getname(),
1359 rcu_torture_writer_state, 1323 rcu_torture_writer_state,
1360 gpnum, completed, flags, 1324 gpnum, completed, flags,
1361 wtp == NULL ? ~0UL : wtp->state); 1325 wtp == NULL ? ~0UL : wtp->state,
1326 wtp == NULL ? -1 : (int)task_cpu(wtp));
1362 show_rcu_gp_kthreads(); 1327 show_rcu_gp_kthreads();
1363 rcu_ftrace_dump(DUMP_ALL); 1328 rcu_ftrace_dump(DUMP_ALL);
1364 } 1329 }
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
1749 int firsterr = 0; 1714 int firsterr = 0;
1750 static struct rcu_torture_ops *torture_ops[] = { 1715 static struct rcu_torture_ops *torture_ops[] = {
1751 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, 1716 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
1752 &sched_ops, RCUTORTURE_TASKS_OPS 1717 &sched_ops, &tasks_ops,
1753 }; 1718 };
1754 1719
1755 if (!torture_init_begin(torture_type, verbose, &torture_runnable)) 1720 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
33#include "rcu_segcblist.h" 33#include "rcu_segcblist.h"
34#include "rcu.h" 34#include "rcu.h"
35 35
36int rcu_scheduler_active __read_mostly;
37
36static int init_srcu_struct_fields(struct srcu_struct *sp) 38static int init_srcu_struct_fields(struct srcu_struct *sp)
37{ 39{
38 sp->srcu_lock_nesting[0] = 0; 40 sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
193 destroy_rcu_head_on_stack(&rs.head); 195 destroy_rcu_head_on_stack(&rs.head);
194} 196}
195EXPORT_SYMBOL_GPL(synchronize_srcu); 197EXPORT_SYMBOL_GPL(synchronize_srcu);
198
199/* Lockdep diagnostics. */
200void __init rcu_scheduler_starting(void)
201{
202 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
203}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..729a8706751d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
51 51
52static void srcu_invoke_callbacks(struct work_struct *work); 52static void srcu_invoke_callbacks(struct work_struct *work);
53static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); 53static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
54static void process_srcu(struct work_struct *work);
54 55
55/* 56/*
56 * Initialize SRCU combining tree. Note that statically allocated 57 * Initialize SRCU combining tree. Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
896 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); 897 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
897 wait_for_completion(&rcu.completion); 898 wait_for_completion(&rcu.completion);
898 destroy_rcu_head_on_stack(&rcu.head); 899 destroy_rcu_head_on_stack(&rcu.head);
900
901 /*
902 * Make sure that later code is ordered after the SRCU grace
903 * period. This pairs with the raw_spin_lock_irq_rcu_node()
904 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
905 * because the current CPU might have been totally uninvolved with
906 * (and thus unordered against) that grace period.
907 */
908 smp_mb();
899} 909}
900 910
901/** 911/**
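
The guarantee this smp_mb() backs up can be stated as a litmus test in the same style as the spin_unlock_wait() commentary removed earlier, with x and y ordinary shared variables (sketch):

        CPU0 (updater)                      CPU1 (reader)

        WRITE_ONCE(x, 1);                   idx = srcu_read_lock(&sp);
        synchronize_srcu(&sp);              r1 = READ_ONCE(y);
        WRITE_ONCE(y, 1);                   r2 = READ_ONCE(x);
                                            srcu_read_unlock(&sp, idx);

The outcome r1 == 1 && r2 == 0 must be forbidden even when CPU0 never executed any of the grace-period machinery itself, which is exactly the case the new barrier covers.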
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1194/* 1204/*
1195 * This is the work-queue function that handles SRCU grace periods. 1205 * This is the work-queue function that handles SRCU grace periods.
1196 */ 1206 */
1197void process_srcu(struct work_struct *work) 1207static void process_srcu(struct work_struct *work)
1198{ 1208{
1199 struct srcu_struct *sp; 1209 struct srcu_struct *sp;
1200 1210
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
1203 srcu_advance_state(sp); 1213 srcu_advance_state(sp);
1204 srcu_reschedule(sp, srcu_get_delay(sp)); 1214 srcu_reschedule(sp, srcu_get_delay(sp));
1205} 1215}
1206EXPORT_SYMBOL_GPL(process_srcu);
1207 1216
1208void srcutorture_get_gp_data(enum rcutorture_type test_type, 1217void srcutorture_get_gp_data(enum rcutorture_type test_type,
1209 struct srcu_struct *sp, int *flags, 1218 struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
1217} 1226}
1218EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); 1227EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
1219 1228
1229void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
1230{
1231 int cpu;
1232 int idx;
1233 unsigned long s0 = 0, s1 = 0;
1234
1235 idx = sp->srcu_idx & 0x1;
1236 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
1237 for_each_possible_cpu(cpu) {
1238 unsigned long l0, l1;
1239 unsigned long u0, u1;
1240 long c0, c1;
1241 struct srcu_data *counts;
1242
1243 counts = per_cpu_ptr(sp->sda, cpu);
1244 u0 = counts->srcu_unlock_count[!idx];
1245 u1 = counts->srcu_unlock_count[idx];
1246
1247 /*
1248 * Make sure that a lock is always counted if the corresponding
1249 * unlock is counted.
1250 */
1251 smp_rmb();
1252
1253 l0 = counts->srcu_lock_count[!idx];
1254 l1 = counts->srcu_lock_count[idx];
1255
1256 c0 = l0 - u0;
1257 c1 = l1 - u1;
1258 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
1259 s0 += c0;
1260 s1 += c1;
1261 }
1262 pr_cont(" T(%ld,%ld)\n", s0, s1);
1263}
1264EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
1265
1220static int __init srcu_bootup_announce(void) 1266static int __init srcu_bootup_announce(void)
1221{ 1267{
1222 pr_info("Hierarchical SRCU implementation.\n"); 1268 pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
56 .curtail = &rcu_bh_ctrlblk.rcucblist, 56 .curtail = &rcu_bh_ctrlblk.rcucblist,
57}; 57};
58 58
59#include "tiny_plugin.h"
60
61void rcu_barrier_bh(void) 59void rcu_barrier_bh(void)
62{ 60{
63 wait_rcu_gp(call_rcu_bh); 61 wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic
4 * or preemptible semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, you can access it online at
18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 *
20 * Copyright (c) 2010 Linaro
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
26#include <linux/kernel_stat.h>
27
28int rcu_scheduler_active __read_mostly;
29EXPORT_SYMBOL_GPL(rcu_scheduler_active);
30
31/*
32 * During boot, we forgive RCU lockdep issues. After this function is
33 * invoked, we start taking RCU lockdep issues seriously. Note that unlike
34 * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
35 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
36 * The reason for this is that Tiny RCU does not need kthreads, so does
37 * not have to care about the fact that the scheduler is half-initialized
38 * at a certain phase of the boot process. Unless SRCU is in the mix.
39 */
40void __init rcu_scheduler_starting(void)
41{
42 WARN_ON(nr_context_switches() > 0);
43 rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
44 ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
45}
46
47#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9bb5dff50815..84fe96641b2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
97 .gp_state = RCU_GP_IDLE, \ 97 .gp_state = RCU_GP_IDLE, \
98 .gpnum = 0UL - 300UL, \ 98 .gpnum = 0UL - 300UL, \
99 .completed = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \
100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
101 .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
102 .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 100 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
104 .name = RCU_STATE_NAME(sname), \ 101 .name = RCU_STATE_NAME(sname), \
105 .abbr = sabbr, \ 102 .abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
843 */ 840 */
844void rcu_idle_enter(void) 841void rcu_idle_enter(void)
845{ 842{
846 unsigned long flags; 843 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
847
848 local_irq_save(flags);
849 rcu_eqs_enter(false); 844 rcu_eqs_enter(false);
850 local_irq_restore(flags);
851} 845}
852EXPORT_SYMBOL_GPL(rcu_idle_enter);
853 846
854#ifdef CONFIG_NO_HZ_FULL 847#ifdef CONFIG_NO_HZ_FULL
855/** 848/**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
862 */ 855 */
863void rcu_user_enter(void) 856void rcu_user_enter(void)
864{ 857{
865 rcu_eqs_enter(1); 858 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
859 rcu_eqs_enter(true);
866} 860}
867#endif /* CONFIG_NO_HZ_FULL */ 861#endif /* CONFIG_NO_HZ_FULL */
868 862
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
955 if (oldval & DYNTICK_TASK_NEST_MASK) { 949 if (oldval & DYNTICK_TASK_NEST_MASK) {
956 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 950 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
957 } else { 951 } else {
952 __this_cpu_inc(disable_rcu_irq_enter);
958 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 953 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
959 rcu_eqs_exit_common(oldval, user); 954 rcu_eqs_exit_common(oldval, user);
955 __this_cpu_dec(disable_rcu_irq_enter);
960 } 956 }
961} 957}
962 958
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
979 rcu_eqs_exit(false); 975 rcu_eqs_exit(false);
980 local_irq_restore(flags); 976 local_irq_restore(flags);
981} 977}
982EXPORT_SYMBOL_GPL(rcu_idle_exit);
983 978
984#ifdef CONFIG_NO_HZ_FULL 979#ifdef CONFIG_NO_HZ_FULL
985/** 980/**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1358 j = jiffies; 1353 j = jiffies;
1359 gpa = READ_ONCE(rsp->gp_activity); 1354 gpa = READ_ONCE(rsp->gp_activity);
1360 if (j - gpa > 2 * HZ) { 1355 if (j - gpa > 2 * HZ) {
1361 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", 1356 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
1362 rsp->name, j - gpa, 1357 rsp->name, j - gpa,
1363 rsp->gpnum, rsp->completed, 1358 rsp->gpnum, rsp->completed,
1364 rsp->gp_flags, 1359 rsp->gp_flags,
1365 gp_state_getname(rsp->gp_state), rsp->gp_state, 1360 gp_state_getname(rsp->gp_state), rsp->gp_state,
1366 rsp->gp_kthread ? rsp->gp_kthread->state : ~0); 1361 rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
1362 rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
1367 if (rsp->gp_kthread) { 1363 if (rsp->gp_kthread) {
1368 sched_show_task(rsp->gp_kthread); 1364 sched_show_task(rsp->gp_kthread);
1369 wake_up_process(rsp->gp_kthread); 1365 wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
2067} 2063}
2068 2064
2069/* 2065/*
2070 * Helper function for wait_event_interruptible_timeout() wakeup 2066 * Helper function for swait_event_idle() wakeup at force-quiescent-state
2071 * at force-quiescent-state time. 2067 * time.
2072 */ 2068 */
2073static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 2069static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
2074{ 2070{
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
2206 READ_ONCE(rsp->gpnum), 2202 READ_ONCE(rsp->gpnum),
2207 TPS("reqwait")); 2203 TPS("reqwait"));
2208 rsp->gp_state = RCU_GP_WAIT_GPS; 2204 rsp->gp_state = RCU_GP_WAIT_GPS;
2209 swait_event_interruptible(rsp->gp_wq, 2205 swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
2210 READ_ONCE(rsp->gp_flags) & 2206 RCU_GP_FLAG_INIT);
2211 RCU_GP_FLAG_INIT);
2212 rsp->gp_state = RCU_GP_DONE_GPS; 2207 rsp->gp_state = RCU_GP_DONE_GPS;
2213 /* Locking provides needed memory barrier. */ 2208 /* Locking provides needed memory barrier. */
2214 if (rcu_gp_init(rsp)) 2209 if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2239 READ_ONCE(rsp->gpnum), 2234 READ_ONCE(rsp->gpnum),
2240 TPS("fqswait")); 2235 TPS("fqswait"));
2241 rsp->gp_state = RCU_GP_WAIT_FQS; 2236 rsp->gp_state = RCU_GP_WAIT_FQS;
2242 ret = swait_event_interruptible_timeout(rsp->gp_wq, 2237 ret = swait_event_idle_timeout(rsp->gp_wq,
2243 rcu_gp_fqs_check_wake(rsp, &gf), j); 2238 rcu_gp_fqs_check_wake(rsp, &gf), j);
2244 rsp->gp_state = RCU_GP_DOING_FQS; 2239 rsp->gp_state = RCU_GP_DOING_FQS;
2245 /* Locking provides needed memory barriers. */ 2240 /* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2409 return; 2404 return;
2410 } 2405 }
2411 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2406 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2407 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
2408 rcu_preempt_blocked_readers_cgp(rnp));
2412 rnp->qsmask &= ~mask; 2409 rnp->qsmask &= ~mask;
2413 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2410 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
2414 mask, rnp->qsmask, rnp->level, 2411 mask, rnp->qsmask, rnp->level,
@@ -3476,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
3476 struct rcu_state *rsp = rdp->rsp; 3473 struct rcu_state *rsp = rdp->rsp;
3477 3474
3478 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3475 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
3479 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); 3476 _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
3477 rsp->barrier_sequence);
3480 complete(&rsp->barrier_completion); 3478 complete(&rsp->barrier_completion);
3481 } else { 3479 } else {
3482 _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); 3480 _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
3483 } 3481 }
3484} 3482}
3485 3483
@@ -3491,14 +3489,15 @@ static void rcu_barrier_func(void *type)
3491 struct rcu_state *rsp = type; 3489 struct rcu_state *rsp = type;
3492 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3490 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3493 3491
3494 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); 3492 _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
3495 rdp->barrier_head.func = rcu_barrier_callback; 3493 rdp->barrier_head.func = rcu_barrier_callback;
3496 debug_rcu_head_queue(&rdp->barrier_head); 3494 debug_rcu_head_queue(&rdp->barrier_head);
3497 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { 3495 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
3498 atomic_inc(&rsp->barrier_cpu_count); 3496 atomic_inc(&rsp->barrier_cpu_count);
3499 } else { 3497 } else {
3500 debug_rcu_head_unqueue(&rdp->barrier_head); 3498 debug_rcu_head_unqueue(&rdp->barrier_head);
3501 _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); 3499 _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
3500 rsp->barrier_sequence);
3502 } 3501 }
3503} 3502}
3504 3503
@@ -3512,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
3512 struct rcu_data *rdp; 3511 struct rcu_data *rdp;
3513 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); 3512 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
3514 3513
3515 _rcu_barrier_trace(rsp, "Begin", -1, s); 3514 _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
3516 3515
3517 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3516 /* Take mutex to serialize concurrent rcu_barrier() requests. */
3518 mutex_lock(&rsp->barrier_mutex); 3517 mutex_lock(&rsp->barrier_mutex);
3519 3518
3520 /* Did someone else do our work for us? */ 3519 /* Did someone else do our work for us? */
3521 if (rcu_seq_done(&rsp->barrier_sequence, s)) { 3520 if (rcu_seq_done(&rsp->barrier_sequence, s)) {
3522 _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); 3521 _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
3522 rsp->barrier_sequence);
3523 smp_mb(); /* caller's subsequent code after above check. */ 3523 smp_mb(); /* caller's subsequent code after above check. */
3524 mutex_unlock(&rsp->barrier_mutex); 3524 mutex_unlock(&rsp->barrier_mutex);
3525 return; 3525 return;
@@ -3527,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3527 3527
3528 /* Mark the start of the barrier operation. */ 3528 /* Mark the start of the barrier operation. */
3529 rcu_seq_start(&rsp->barrier_sequence); 3529 rcu_seq_start(&rsp->barrier_sequence);
3530 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); 3530 _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
3531 3531
3532 /* 3532 /*
3533 * Initialize the count to one rather than to zero in order to 3533 * Initialize the count to one rather than to zero in order to
@@ -3550,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
3550 rdp = per_cpu_ptr(rsp->rda, cpu); 3550 rdp = per_cpu_ptr(rsp->rda, cpu);
3551 if (rcu_is_nocb_cpu(cpu)) { 3551 if (rcu_is_nocb_cpu(cpu)) {
3552 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 3552 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3553 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, 3553 _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
3554 rsp->barrier_sequence); 3554 rsp->barrier_sequence);
3555 } else { 3555 } else {
3556 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3556 _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
3557 rsp->barrier_sequence); 3557 rsp->barrier_sequence);
3558 smp_mb__before_atomic(); 3558 smp_mb__before_atomic();
3559 atomic_inc(&rsp->barrier_cpu_count); 3559 atomic_inc(&rsp->barrier_cpu_count);
@@ -3561,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
3561 rcu_barrier_callback, rsp, cpu, 0); 3561 rcu_barrier_callback, rsp, cpu, 0);
3562 } 3562 }
3563 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { 3563 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
3564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3564 _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
3565 rsp->barrier_sequence); 3565 rsp->barrier_sequence);
3566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
3567 } else { 3567 } else {
3568 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 3568 _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
3569 rsp->barrier_sequence); 3569 rsp->barrier_sequence);
3570 } 3570 }
3571 } 3571 }
@@ -3582,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3582 wait_for_completion(&rsp->barrier_completion); 3582 wait_for_completion(&rsp->barrier_completion);
3583 3583
3584 /* Mark the end of the barrier operation. */ 3584 /* Mark the end of the barrier operation. */
3585 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); 3585 _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
3586 rcu_seq_end(&rsp->barrier_sequence); 3586 rcu_seq_end(&rsp->barrier_sequence);
3587 3587
3588 /* Other rcu_barrier() invocations can now safely proceed. */ 3588 /* Other rcu_barrier() invocations can now safely proceed. */
@@ -3684,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3684 */ 3684 */
3685 rnp = rdp->mynode; 3685 rnp = rdp->mynode;
3686 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 3686 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
3687 if (!rdp->beenonline)
3688 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
3689 rdp->beenonline = true; /* We have now been online. */ 3687 rdp->beenonline = true; /* We have now been online. */
3690 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3688 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3691 rdp->completed = rnp->completed; 3689 rdp->completed = rnp->completed;
@@ -3789,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
3789{ 3787{
3790 unsigned long flags; 3788 unsigned long flags;
3791 unsigned long mask; 3789 unsigned long mask;
3790 int nbits;
3791 unsigned long oldmask;
3792 struct rcu_data *rdp; 3792 struct rcu_data *rdp;
3793 struct rcu_node *rnp; 3793 struct rcu_node *rnp;
3794 struct rcu_state *rsp; 3794 struct rcu_state *rsp;
@@ -3799,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
3799 mask = rdp->grpmask; 3799 mask = rdp->grpmask;
3800 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3800 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3801 rnp->qsmaskinitnext |= mask; 3801 rnp->qsmaskinitnext |= mask;
3802 oldmask = rnp->expmaskinitnext;
3802 rnp->expmaskinitnext |= mask; 3803 rnp->expmaskinitnext |= mask;
3804 oldmask ^= rnp->expmaskinitnext;
3805 nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
3806 /* Allow lockless access for expedited grace periods. */
3807 smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
3803 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3808 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3804 } 3809 }
3810 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
3805} 3811}
3806 3812
3807#ifdef CONFIG_HOTPLUG_CPU 3813#ifdef CONFIG_HOTPLUG_CPU
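
The smp_store_release() of ->ncpus above is meant to pair with an acquire load on the expedited-grace-period side; the two-line kernel/rcu/tree_exp.h change in the diffstat is presumably that reader, roughly (sketch, not shown in this view):

        /* Expedited-GP side: snapshot the CPU count without holding the lock. */
        ncpus = smp_load_acquire(&rsp->ncpus);  /* Pairs with smp_store_release() in rcu_cpu_starting(). */

so that whenever the expedited code observes the incremented count, the ->expmaskinitnext bit set under the rcu_node lock above is guaranteed to be visible as well.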
@@ -3845,96 +3851,30 @@ void rcu_report_dead(unsigned int cpu)
3845 rcu_cleanup_dying_idle_cpu(cpu, rsp); 3851 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3846} 3852}
3847 3853
3848/* 3854/* Migrate the dead CPU's callbacks to the current CPU. */
3849 * Send the specified CPU's RCU callbacks to the orphanage. The
3850 * specified CPU must be offline, and the caller must hold the
3851 * ->orphan_lock.
3852 */
3853static void
3854rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
3855 struct rcu_node *rnp, struct rcu_data *rdp)
3856{
3857 lockdep_assert_held(&rsp->orphan_lock);
3858
3859 /* No-CBs CPUs do not have orphanable callbacks. */
3860 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
3861 return;
3862
3863 /*
3864 * Orphan the callbacks. First adjust the counts. This is safe
3865 * because _rcu_barrier() excludes CPU-hotplug operations, so it
3866 * cannot be running now. Thus no memory barrier is required.
3867 */
3868 rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
3869 rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
3870
3871 /*
3872 * Next, move those callbacks still needing a grace period to
3873 * the orphanage, where some other CPU will pick them up.
3874 * Some of the callbacks might have gone partway through a grace
3875 * period, but that is too bad. They get to start over because we
3876 * cannot assume that grace periods are synchronized across CPUs.
3877 */
3878 rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
3879
3880 /*
3881 * Then move the ready-to-invoke callbacks to the orphanage,
3882 * where some other CPU will pick them up. These will not be
3883 * required to pass though another grace period: They are done.
3884 */
3885 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
3886
3887 /* Finally, disallow further callbacks on this CPU. */
3888 rcu_segcblist_disable(&rdp->cblist);
3889}
3890
3891/*
3892 * Adopt the RCU callbacks from the specified rcu_state structure's
3893 * orphanage. The caller must hold the ->orphan_lock.
3894 */
3895static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
3896{
3897 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3898
3899 lockdep_assert_held(&rsp->orphan_lock);
3900
3901 /* No-CBs CPUs are handled specially. */
3902 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
3903 rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
3904 return;
3905
3906 /* Do the accounting first. */
3907 rdp->n_cbs_adopted += rsp->orphan_done.len;
3908 if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
3909 rcu_idle_count_callbacks_posted();
3910 rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
3911
3912 /*
3913 * We do not need a memory barrier here because the only way we
3914 * can get here if there is an rcu_barrier() in flight is if
3915 * we are the task doing the rcu_barrier().
3916 */
3917
3918 /* First adopt the ready-to-invoke callbacks, then the done ones. */
3919 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
3920 WARN_ON_ONCE(rsp->orphan_done.head);
3921 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
3922 WARN_ON_ONCE(rsp->orphan_pend.head);
3923 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
3924 !rcu_segcblist_n_cbs(&rdp->cblist));
3925}
3926
3927/* Orphan the dead CPU's callbacks, and then adopt them. */
3928static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) 3855static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
3929{ 3856{
3930 unsigned long flags; 3857 unsigned long flags;
3858 struct rcu_data *my_rdp;
3931 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3859 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3932 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 3860 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
3933 3861
3934 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 3862 if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
3935 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 3863 return; /* No callbacks to migrate. */
3936 rcu_adopt_orphan_cbs(rsp, flags); 3864
3937 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 3865 local_irq_save(flags);
3866 my_rdp = this_cpu_ptr(rsp->rda);
3867 if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
3868 local_irq_restore(flags);
3869 return;
3870 }
3871 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
3872 rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
3873 rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
3874 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
3875 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
3876 !rcu_segcblist_n_cbs(&my_rdp->cblist));
3877 raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
3938 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || 3878 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
3939 !rcu_segcblist_empty(&rdp->cblist), 3879 !rcu_segcblist_empty(&rdp->cblist),
3940 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", 3880 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..8e1f285f0a70 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
219 /* qlen at last check for QS forcing */ 219 /* qlen at last check for QS forcing */
220 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 220 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
221 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ 221 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
222 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
223 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
224 unsigned long n_force_qs_snap; 222 unsigned long n_force_qs_snap;
225 /* did other CPU force QS recently? */ 223 /* did other CPU force QS recently? */
226 long blimit; /* Upper limit on a processed batch */ 224 long blimit; /* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
268 struct rcu_head **nocb_follower_tail; 266 struct rcu_head **nocb_follower_tail;
269 struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ 267 struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
270 struct task_struct *nocb_kthread; 268 struct task_struct *nocb_kthread;
269 raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
271 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 270 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
271 struct timer_list nocb_timer; /* Enforce finite deferral. */
272 272
273 /* The following fields are used by the leader, hence own cacheline. */ 273 /* The following fields are used by the leader, hence own cacheline. */
274 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 274 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
350 350
351 /* End of fields guarded by root rcu_node's lock. */ 351 /* End of fields guarded by root rcu_node's lock. */
352 352
353 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
354 /* Protect following fields. */
355 struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
356 /* need a grace period. */
357 struct rcu_cblist orphan_done; /* Orphaned callbacks that */
358 /* are ready to invoke. */
359 /* (Contains counts.) */
360 /* End of fields guarded by orphan_lock. */
361
362 struct mutex barrier_mutex; /* Guards barrier fields. */ 353 struct mutex barrier_mutex; /* Guards barrier fields. */
363 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 354 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
364 struct completion barrier_completion; /* Wake at barrier end. */ 355 struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
495static void rcu_init_one_nocb(struct rcu_node *rnp); 486static void rcu_init_one_nocb(struct rcu_node *rnp);
496static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 487static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
497 bool lazy, unsigned long flags); 488 bool lazy, unsigned long flags);
498static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 489static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
499 struct rcu_data *rdp, 490 struct rcu_data *rdp,
500 unsigned long flags); 491 unsigned long flags);
501static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); 492static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
73 unsigned long flags; 73 unsigned long flags;
74 unsigned long mask; 74 unsigned long mask;
75 unsigned long oldmask; 75 unsigned long oldmask;
76 int ncpus = READ_ONCE(rsp->ncpus); 76 int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
77 struct rcu_node *rnp; 77 struct rcu_node *rnp;
78 struct rcu_node *rnp_up; 78 struct rcu_node *rnp_up;
79 79
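
The smp_load_acquire() above pairs with the smp_store_release() added to rcu_cpu_starting(): a reader that observes the incremented ->ncpus is also guaranteed to observe the rcu_node initialization performed before the store. In C11 userspace terms the pairing looks roughly like this (a sketch with invented names, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

static int cpu_data[64];                 /* initialized before being published */
static atomic_int ncpus;                 /* plays the role of rsp->ncpus */

/* Writer: initialize, then publish the new count with release semantics. */
static void bring_cpu_online(int cpu)
{
        cpu_data[cpu] = 1;
        atomic_store_explicit(&ncpus, cpu + 1, memory_order_release);
}

/* Reader: an acquire load of the count orders all later data reads after it. */
static void scan_cpus(void)
{
        int n = atomic_load_explicit(&ncpus, memory_order_acquire);

        for (int i = 0; i < n; i++)
                printf("cpu %d init flag: %d\n", i, cpu_data[i]);
}

int main(void)
{
        bring_cpu_online(0);
        scan_cpus();
        return 0;
}
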
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..55bde94b9572 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
180 struct task_struct *t = current; 180 struct task_struct *t = current;
181 181
182 lockdep_assert_held(&rnp->lock); 182 lockdep_assert_held(&rnp->lock);
183 WARN_ON_ONCE(rdp->mynode != rnp);
184 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
183 185
184 /* 186 /*
185 * Decide where to queue the newly blocked task. In theory, 187 * Decide where to queue the newly blocked task. In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
261 rnp->gp_tasks = &t->rcu_node_entry; 263 rnp->gp_tasks = &t->rcu_node_entry;
262 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) 264 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
263 rnp->exp_tasks = &t->rcu_node_entry; 265 rnp->exp_tasks = &t->rcu_node_entry;
266 WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
267 !(rnp->qsmask & rdp->grpmask));
268 WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
269 !(rnp->expmask & rdp->grpmask));
264 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ 270 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
265 271
266 /* 272 /*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
482 rnp = t->rcu_blocked_node; 488 rnp = t->rcu_blocked_node;
483 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 489 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
484 WARN_ON_ONCE(rnp != t->rcu_blocked_node); 490 WARN_ON_ONCE(rnp != t->rcu_blocked_node);
491 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
485 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 492 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
486 empty_exp = sync_rcu_preempt_exp_done(rnp); 493 empty_exp = sync_rcu_preempt_exp_done(rnp);
487 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 494 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
495 if (&t->rcu_node_entry == rnp->exp_tasks) 502 if (&t->rcu_node_entry == rnp->exp_tasks)
496 rnp->exp_tasks = np; 503 rnp->exp_tasks = np;
497 if (IS_ENABLED(CONFIG_RCU_BOOST)) { 504 if (IS_ENABLED(CONFIG_RCU_BOOST)) {
498 if (&t->rcu_node_entry == rnp->boost_tasks)
499 rnp->boost_tasks = np;
500 /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ 505 /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
501 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; 506 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
507 if (&t->rcu_node_entry == rnp->boost_tasks)
508 rnp->boost_tasks = np;
502 } 509 }
503 510
504 /* 511 /*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
636 */ 643 */
637static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 644static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
638{ 645{
646 struct task_struct *t;
647
639 RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); 648 RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
640 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 649 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
641 if (rcu_preempt_has_tasks(rnp)) 650 if (rcu_preempt_has_tasks(rnp)) {
642 rnp->gp_tasks = rnp->blkd_tasks.next; 651 rnp->gp_tasks = rnp->blkd_tasks.next;
652 t = container_of(rnp->gp_tasks, struct task_struct,
653 rcu_node_entry);
654 trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
655 rnp->gpnum, t->pid);
656 }
643 WARN_ON_ONCE(rnp->qsmask); 657 WARN_ON_ONCE(rnp->qsmask);
644} 658}
645 659
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
1788} 1802}
1789 1803
1790/* 1804/*
1791 * Kick the leader kthread for this NOCB group. 1805 * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
1806 * and this function releases it.
1792 */ 1807 */
1793static void wake_nocb_leader(struct rcu_data *rdp, bool force) 1808static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
1809 unsigned long flags)
1810 __releases(rdp->nocb_lock)
1794{ 1811{
1795 struct rcu_data *rdp_leader = rdp->nocb_leader; 1812 struct rcu_data *rdp_leader = rdp->nocb_leader;
1796 1813
1797 if (!READ_ONCE(rdp_leader->nocb_kthread)) 1814 lockdep_assert_held(&rdp->nocb_lock);
1815 if (!READ_ONCE(rdp_leader->nocb_kthread)) {
1816 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1798 return; 1817 return;
1799 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { 1818 }
1819 if (rdp_leader->nocb_leader_sleep || force) {
1800 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1820 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
1801 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1821 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
1822 del_timer(&rdp->nocb_timer);
1823 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1802 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ 1824 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
1803 swake_up(&rdp_leader->nocb_wq); 1825 swake_up(&rdp_leader->nocb_wq);
1826 } else {
1827 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1804 } 1828 }
1805} 1829}
1806 1830
1807/* 1831/*
1832 * Kick the leader kthread for this NOCB group, but caller has not
1833 * acquired locks.
1834 */
1835static void wake_nocb_leader(struct rcu_data *rdp, bool force)
1836{
1837 unsigned long flags;
1838
1839 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1840 __wake_nocb_leader(rdp, force, flags);
1841}
1842
1843/*
1844 * Arrange to wake the leader kthread for this NOCB group at some
1845 * future time when it is safe to do so.
1846 */
1847static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
1848 const char *reason)
1849{
1850 unsigned long flags;
1851
1852 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1853 if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
1854 mod_timer(&rdp->nocb_timer, jiffies + 1);
1855 WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
1856 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
1857 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1858}
1859
1860/*
1808 * Does the specified CPU need an RCU callback for the specified flavor 1861 * Does the specified CPU need an RCU callback for the specified flavor
1809 * of rcu_barrier()? 1862 * of rcu_barrier()?
1810 */ 1863 */
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1891 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1944 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1892 TPS("WakeEmpty")); 1945 TPS("WakeEmpty"));
1893 } else { 1946 } else {
1894 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); 1947 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
1895 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ 1948 TPS("WakeEmptyIsDeferred"));
1896 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1897 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1898 TPS("WakeEmptyIsDeferred"));
1899 } 1949 }
1900 rdp->qlen_last_fqs_check = 0; 1950 rdp->qlen_last_fqs_check = 0;
1901 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 1951 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1905 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1955 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1906 TPS("WakeOvf")); 1956 TPS("WakeOvf"));
1907 } else { 1957 } else {
1908 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); 1958 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
1909 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ 1959 TPS("WakeOvfIsDeferred"));
1910 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1911 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1912 TPS("WakeOvfIsDeferred"));
1913 } 1960 }
1914 rdp->qlen_last_fqs_check = LONG_MAX / 2; 1961 rdp->qlen_last_fqs_check = LONG_MAX / 2;
1915 } else { 1962 } else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
1961 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is 2008 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
1962 * not a no-CBs CPU. 2009 * not a no-CBs CPU.
1963 */ 2010 */
1964static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2011static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
1965 struct rcu_data *rdp, 2012 struct rcu_data *rdp,
1966 unsigned long flags) 2013 unsigned long flags)
1967{ 2014{
1968 long ql = rsp->orphan_done.len; 2015 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
1969 long qll = rsp->orphan_done.len_lazy;
1970
1971 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
1972 if (!rcu_is_nocb_cpu(smp_processor_id())) 2016 if (!rcu_is_nocb_cpu(smp_processor_id()))
1973 return false; 2017 return false; /* Not NOCBs CPU, caller must migrate CBs. */
1974 2018 __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
1975 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2019 rcu_segcblist_tail(&rdp->cblist),
1976 if (rsp->orphan_done.head) { 2020 rcu_segcblist_n_cbs(&rdp->cblist),
1977 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), 2021 rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
1978 rcu_cblist_tail(&rsp->orphan_done), 2022 rcu_segcblist_init(&rdp->cblist);
1979 ql, qll, flags); 2023 rcu_segcblist_disable(&rdp->cblist);
1980 }
1981 if (rsp->orphan_pend.head) {
1982 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
1983 rcu_cblist_tail(&rsp->orphan_pend),
1984 ql, qll, flags);
1985 }
1986 rcu_cblist_init(&rsp->orphan_done);
1987 rcu_cblist_init(&rsp->orphan_pend);
1988 return true; 2024 return true;
1989} 2025}
1990 2026
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2031static void nocb_leader_wait(struct rcu_data *my_rdp) 2067static void nocb_leader_wait(struct rcu_data *my_rdp)
2032{ 2068{
2033 bool firsttime = true; 2069 bool firsttime = true;
2070 unsigned long flags;
2034 bool gotcbs; 2071 bool gotcbs;
2035 struct rcu_data *rdp; 2072 struct rcu_data *rdp;
2036 struct rcu_head **tail; 2073 struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
2039 2076
2040 /* Wait for callbacks to appear. */ 2077 /* Wait for callbacks to appear. */
2041 if (!rcu_nocb_poll) { 2078 if (!rcu_nocb_poll) {
2042 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2079 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
2043 swait_event_interruptible(my_rdp->nocb_wq, 2080 swait_event_interruptible(my_rdp->nocb_wq,
2044 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2081 !READ_ONCE(my_rdp->nocb_leader_sleep));
2045 /* Memory barrier handled by smp_mb() calls below and repoll. */ 2082 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
2083 my_rdp->nocb_leader_sleep = true;
2084 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2085 del_timer(&my_rdp->nocb_timer);
2086 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
2046 } else if (firsttime) { 2087 } else if (firsttime) {
2047 firsttime = false; /* Don't drown trace log with "Poll"! */ 2088 firsttime = false; /* Don't drown trace log with "Poll"! */
2048 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); 2089 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
2049 } 2090 }
2050 2091
2051 /* 2092 /*
@@ -2054,7 +2095,7 @@ wait_again:
2054 * nocb_gp_head, where they await a grace period. 2095 * nocb_gp_head, where they await a grace period.
2055 */ 2096 */
2056 gotcbs = false; 2097 gotcbs = false;
2057 smp_mb(); /* wakeup before ->nocb_head reads. */ 2098 smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
2058 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2099 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2059 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); 2100 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
2060 if (!rdp->nocb_gp_head) 2101 if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
2066 gotcbs = true; 2107 gotcbs = true;
2067 } 2108 }
2068 2109
2069 /* 2110 /* No callbacks? Sleep a bit if polling, and go retry. */
2070 * If there were no callbacks, sleep a bit, rescan after a
2071 * memory barrier, and go retry.
2072 */
2073 if (unlikely(!gotcbs)) { 2111 if (unlikely(!gotcbs)) {
2074 if (!rcu_nocb_poll)
2075 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2076 "WokeEmpty");
2077 WARN_ON(signal_pending(current)); 2112 WARN_ON(signal_pending(current));
2078 schedule_timeout_interruptible(1); 2113 if (rcu_nocb_poll) {
2079 2114 schedule_timeout_interruptible(1);
2080 /* Rescan in case we were a victim of memory ordering. */ 2115 } else {
2081 my_rdp->nocb_leader_sleep = true; 2116 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2082 smp_mb(); /* Ensure _sleep true before scan. */ 2117 TPS("WokeEmpty"));
2083 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) 2118 }
2084 if (READ_ONCE(rdp->nocb_head)) {
2085 /* Found CB, so short-circuit next wait. */
2086 my_rdp->nocb_leader_sleep = false;
2087 break;
2088 }
2089 goto wait_again; 2119 goto wait_again;
2090 } 2120 }
2091 2121
2092 /* Wait for one grace period. */ 2122 /* Wait for one grace period. */
2093 rcu_nocb_wait_gp(my_rdp); 2123 rcu_nocb_wait_gp(my_rdp);
2094 2124
2095 /*
2096 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
2097 * We set it now, but recheck for new callbacks while
2098 * traversing our follower list.
2099 */
2100 my_rdp->nocb_leader_sleep = true;
2101 smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
2102
2103 /* Each pass through the following loop wakes a follower, if needed. */ 2125 /* Each pass through the following loop wakes a follower, if needed. */
2104 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2126 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2105 if (READ_ONCE(rdp->nocb_head)) 2127 if (!rcu_nocb_poll &&
2128 READ_ONCE(rdp->nocb_head) &&
2129 READ_ONCE(my_rdp->nocb_leader_sleep)) {
2130 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
2106 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ 2131 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
2132 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
2133 }
2107 if (!rdp->nocb_gp_head) 2134 if (!rdp->nocb_gp_head)
2108 continue; /* No CBs, so no need to wake follower. */ 2135 continue; /* No CBs, so no need to wake follower. */
2109 2136
2110 /* Append callbacks to follower's "done" list. */ 2137 /* Append callbacks to follower's "done" list. */
2111 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2138 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2139 tail = rdp->nocb_follower_tail;
2140 rdp->nocb_follower_tail = rdp->nocb_gp_tail;
2112 *tail = rdp->nocb_gp_head; 2141 *tail = rdp->nocb_gp_head;
2113 smp_mb__after_atomic(); /* Store *tail before wakeup. */ 2142 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2114 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2143 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2115 /* 2144 /* List was empty, so wake up the follower. */
2116 * List was empty, wake up the follower.
2117 * Memory barriers supplied by atomic_long_add().
2118 */
2119 swake_up(&rdp->nocb_wq); 2145 swake_up(&rdp->nocb_wq);
2120 } 2146 }
2121 } 2147 }
@@ -2131,28 +2157,16 @@ wait_again:
2131 */ 2157 */
2132static void nocb_follower_wait(struct rcu_data *rdp) 2158static void nocb_follower_wait(struct rcu_data *rdp)
2133{ 2159{
2134 bool firsttime = true;
2135
2136 for (;;) { 2160 for (;;) {
2137 if (!rcu_nocb_poll) { 2161 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
2138 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2162 swait_event_interruptible(rdp->nocb_wq,
2139 "FollowerSleep"); 2163 READ_ONCE(rdp->nocb_follower_head));
2140 swait_event_interruptible(rdp->nocb_wq,
2141 READ_ONCE(rdp->nocb_follower_head));
2142 } else if (firsttime) {
2143 /* Don't drown trace log with "Poll"! */
2144 firsttime = false;
2145 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
2146 }
2147 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2164 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2148 /* ^^^ Ensure CB invocation follows _head test. */ 2165 /* ^^^ Ensure CB invocation follows _head test. */
2149 return; 2166 return;
2150 } 2167 }
2151 if (!rcu_nocb_poll)
2152 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2153 "WokeEmpty");
2154 WARN_ON(signal_pending(current)); 2168 WARN_ON(signal_pending(current));
2155 schedule_timeout_interruptible(1); 2169 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
2156 } 2170 }
2157} 2171}
2158 2172
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2165static int rcu_nocb_kthread(void *arg) 2179static int rcu_nocb_kthread(void *arg)
2166{ 2180{
2167 int c, cl; 2181 int c, cl;
2182 unsigned long flags;
2168 struct rcu_head *list; 2183 struct rcu_head *list;
2169 struct rcu_head *next; 2184 struct rcu_head *next;
2170 struct rcu_head **tail; 2185 struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
2179 nocb_follower_wait(rdp); 2194 nocb_follower_wait(rdp);
2180 2195
2181 /* Pull the ready-to-invoke callbacks onto local list. */ 2196 /* Pull the ready-to-invoke callbacks onto local list. */
2182 list = READ_ONCE(rdp->nocb_follower_head); 2197 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2198 list = rdp->nocb_follower_head;
2199 rdp->nocb_follower_head = NULL;
2200 tail = rdp->nocb_follower_tail;
2201 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2202 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2183 BUG_ON(!list); 2203 BUG_ON(!list);
2184 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2204 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
2185 WRITE_ONCE(rdp->nocb_follower_head, NULL);
2186 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2187 2205
2188 /* Each pass through the following loop invokes a callback. */ 2206 /* Each pass through the following loop invokes a callback. */
2189 trace_rcu_batch_start(rdp->rsp->name, 2207 trace_rcu_batch_start(rdp->rsp->name,
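
The kthread now detaches the whole ->nocb_follower_head list while holding ->nocb_lock instead of using READ_ONCE()/xchg(). A minimal pthread sketch of that detach-under-lock pattern (userspace analogue; all names invented):

#include <pthread.h>
#include <stdio.h>

struct cb { struct cb *next; int id; };

static pthread_mutex_t nocb_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cb *follower_head;
static struct cb **follower_tail = &follower_head;

static void enqueue(struct cb *c)
{
        pthread_mutex_lock(&nocb_lock);
        c->next = NULL;
        *follower_tail = c;
        follower_tail = &c->next;
        pthread_mutex_unlock(&nocb_lock);
}

/* Detach the entire pending list in one short critical section. */
static struct cb *detach_all(void)
{
        struct cb *list;

        pthread_mutex_lock(&nocb_lock);
        list = follower_head;
        follower_head = NULL;
        follower_tail = &follower_head;
        pthread_mutex_unlock(&nocb_lock);
        return list;                     /* callbacks then invoked locklessly */
}

int main(void)
{
        struct cb a = { .id = 1 }, b = { .id = 2 };

        enqueue(&a);
        enqueue(&b);
        for (struct cb *c = detach_all(); c; c = c->next)
                printf("invoking callback %d\n", c->id);
        return 0;
}
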
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2226} 2244}
2227 2245
2228/* Do a deferred wakeup of rcu_nocb_kthread(). */ 2246/* Do a deferred wakeup of rcu_nocb_kthread(). */
2229static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2247static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
2230{ 2248{
2249 unsigned long flags;
2231 int ndw; 2250 int ndw;
2232 2251
2233 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2252 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2253 if (!rcu_nocb_need_deferred_wakeup(rdp)) {
2254 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2234 return; 2255 return;
2256 }
2235 ndw = READ_ONCE(rdp->nocb_defer_wakeup); 2257 ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2236 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2258 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2237 wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); 2259 __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
2238 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); 2260 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2239} 2261}
2240 2262
2263/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
2264static void do_nocb_deferred_wakeup_timer(unsigned long x)
2265{
2266 do_nocb_deferred_wakeup_common((struct rcu_data *)x);
2267}
2268
2269/*
2270 * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
2271 * This means we do an inexact common-case check. Note that if
2272 * we miss, ->nocb_timer will eventually clean things up.
2273 */
2274static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2275{
2276 if (rcu_nocb_need_deferred_wakeup(rdp))
2277 do_nocb_deferred_wakeup_common(rdp);
2278}
2279
2241void __init rcu_init_nohz(void) 2280void __init rcu_init_nohz(void)
2242{ 2281{
2243 int cpu; 2282 int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2287 rdp->nocb_tail = &rdp->nocb_head; 2326 rdp->nocb_tail = &rdp->nocb_head;
2288 init_swait_queue_head(&rdp->nocb_wq); 2327 init_swait_queue_head(&rdp->nocb_wq);
2289 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2328 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2329 raw_spin_lock_init(&rdp->nocb_lock);
2330 setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
2331 (unsigned long)rdp);
2290} 2332}
2291 2333
2292/* 2334/*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2459 return false; 2501 return false;
2460} 2502}
2461 2503
2462static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2504static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
2463 struct rcu_data *rdp, 2505 struct rcu_data *rdp,
2464 unsigned long flags) 2506 unsigned long flags)
2465{ 2507{
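
The new ->nocb_timer gives deferred leader wakeups a backstop: the fast path records the deferral under ->nocb_lock, and if nothing else issues the wakeup, the one-jiffy timer does. A rough userspace analogue using a mutex, a condition variable and a short-sleeping thread in place of the kernel timer (a sketch of the idea only; all names are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t nocb_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t leader_wq = PTHREAD_COND_INITIALIZER;
static bool deferred_wake;               /* roughly ->nocb_defer_wakeup */

/* Fast path could not wake the leader; record that and rely on the backstop. */
static void wake_leader_defer(void)
{
        pthread_mutex_lock(&nocb_lock);
        deferred_wake = true;
        pthread_mutex_unlock(&nocb_lock);
}

/* Backstop "timer": if a wakeup is still pending, deliver it now. */
static void *backstop_timer(void *arg)
{
        (void)arg;
        usleep(1000);                    /* roughly mod_timer(..., jiffies + 1) */
        pthread_mutex_lock(&nocb_lock);
        if (deferred_wake) {
                deferred_wake = false;
                pthread_cond_signal(&leader_wq);        /* roughly swake_up() */
        }
        pthread_mutex_unlock(&nocb_lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        wake_leader_defer();
        pthread_create(&t, NULL, backstop_timer, NULL);

        pthread_mutex_lock(&nocb_lock);          /* the "leader" sleeps here */
        while (deferred_wake)
                pthread_cond_wait(&leader_wq, &nocb_lock);
        pthread_mutex_unlock(&nocb_lock);

        pthread_join(t, NULL);
        printf("deferred wakeup delivered by the backstop timer\n");
        return 0;
}
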
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..5033b66d2753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
568static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); 568static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
569 569
570/* Track exiting tasks in order to allow them to be waited for. */ 570/* Track exiting tasks in order to allow them to be waited for. */
571DEFINE_SRCU(tasks_rcu_exit_srcu); 571DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
572 572
573/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ 573/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
574#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) 574#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
875 mutex_unlock(&rcu_tasks_kthread_mutex); 875 mutex_unlock(&rcu_tasks_kthread_mutex);
876} 876}
877 877
878/* Do the srcu_read_lock() for the above synchronize_srcu(). */
879void exit_tasks_rcu_start(void)
880{
881 preempt_disable();
882 current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
883 preempt_enable();
884}
885
886/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
887void exit_tasks_rcu_finish(void)
888{
889 preempt_disable();
890 __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
891 preempt_enable();
892}
893
878#endif /* #ifdef CONFIG_TASKS_RCU */ 894#endif /* #ifdef CONFIG_TASKS_RCU */
879 895
880#ifndef CONFIG_TINY_RCU 896#ifndef CONFIG_TINY_RCU
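
exit_tasks_rcu_start() and exit_tasks_rcu_finish() bracket the late exit path in an SRCU read-side critical section so that synchronize_srcu(&tasks_rcu_exit_srcu) can wait out exiting tasks. Reduced to a single atomic reader count, the bracketing has roughly this shape (real SRCU keeps per-CPU counter pairs and sleeps rather than spins; everything below is an invented userspace sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int exiting_readers;       /* crude stand-in for SRCU state */

/* Called early in the exit path (cf. exit_tasks_rcu_start()). */
static void exit_tasks_start(void)
{
        atomic_fetch_add_explicit(&exiting_readers, 1, memory_order_acq_rel);
}

/* Called late in the exit path (cf. exit_tasks_rcu_finish()). */
static void exit_tasks_finish(void)
{
        atomic_fetch_sub_explicit(&exiting_readers, 1, memory_order_acq_rel);
}

/* The tasks-RCU grace period waits until no task is inside the exit window. */
static void wait_for_exiting_tasks(void)
{
        while (atomic_load_explicit(&exiting_readers, memory_order_acquire))
                ;                        /* real SRCU sleeps, never spins */
}

int main(void)
{
        exit_tasks_start();
        /* ... exit_notify() and friends would run here ... */
        exit_tasks_finish();
        wait_for_exiting_tasks();
        printf("no tasks in the exit window\n");
        return 0;
}
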
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
25obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o 25obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
26obj-$(CONFIG_CPU_FREQ) += cpufreq.o 26obj-$(CONFIG_CPU_FREQ) += cpufreq.o
27obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o 27obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
28obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..c9524d2d9316 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion);
300 */ 300 */
301bool completion_done(struct completion *x) 301bool completion_done(struct completion *x)
302{ 302{
303 unsigned long flags;
304
303 if (!READ_ONCE(x->done)) 305 if (!READ_ONCE(x->done))
304 return false; 306 return false;
305 307
@@ -307,14 +309,9 @@ bool completion_done(struct completion *x)
307 * If ->done, we need to wait for complete() to release ->wait.lock 309 * If ->done, we need to wait for complete() to release ->wait.lock
308 * otherwise we can end up freeing the completion before complete() 310 * otherwise we can end up freeing the completion before complete()
309 * is done referencing it. 311 * is done referencing it.
310 *
311 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
312 * the loads of ->done and ->wait.lock such that we cannot observe
313 * the lock before complete() acquires it while observing the ->done
314 * after it's acquired the lock.
315 */ 312 */
316 smp_rmb(); 313 spin_lock_irqsave(&x->wait.lock, flags);
317 spin_unlock_wait(&x->wait.lock); 314 spin_unlock_irqrestore(&x->wait.lock, flags);
318 return true; 315 return true;
319} 316}
320EXPORT_SYMBOL(completion_done); 317EXPORT_SYMBOL(completion_done);
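
With spin_unlock_wait() gone, completion_done() takes and immediately drops ->wait.lock, which guarantees that a complete() still holding the lock has finished with the structure before the caller may free it. A pthread sketch of the same handshake (userspace analogue; the kernel reads ->done with READ_ONCE() and uses a spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t wait_lock;
        bool done;
};

/* Completer: set ->done while holding the lock (cf. complete()). */
static void complete(struct completion *x)
{
        pthread_mutex_lock(&x->wait_lock);
        x->done = true;
        pthread_mutex_unlock(&x->wait_lock);
}

/*
 * Is the completion done, with no completer still inside it?  Taking and
 * dropping the lock ensures complete() has released it before we return
 * true, so the caller may then safely free the structure.
 */
static bool completion_done(struct completion *x)
{
        if (!x->done)                    /* kernel uses READ_ONCE() here */
                return false;
        pthread_mutex_lock(&x->wait_lock);
        pthread_mutex_unlock(&x->wait_lock);
        return true;
}

int main(void)
{
        struct completion c = { PTHREAD_MUTEX_INITIALIZER, false };

        printf("done? %d\n", completion_done(&c));      /* 0 */
        complete(&c);
        printf("done? %d\n", completion_done(&c));      /* 1 */
        return 0;
}
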
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0869b20fba81..e053c31d96da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
951static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, 951static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
952 struct task_struct *p, int dest_cpu) 952 struct task_struct *p, int dest_cpu)
953{ 953{
954 if (unlikely(!cpu_active(dest_cpu))) 954 if (p->flags & PF_KTHREAD) {
955 return rq; 955 if (unlikely(!cpu_online(dest_cpu)))
956 return rq;
957 } else {
958 if (unlikely(!cpu_active(dest_cpu)))
959 return rq;
960 }
956 961
957 /* Affinity changed (again). */ 962 /* Affinity changed (again). */
958 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 963 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2635 prev_state = prev->state; 2640 prev_state = prev->state;
2636 vtime_task_switch(prev); 2641 vtime_task_switch(prev);
2637 perf_event_task_sched_in(prev, current); 2642 perf_event_task_sched_in(prev, current);
2643 /*
2644 * The membarrier system call requires a full memory barrier
2645 * after storing to rq->curr, before going back to user-space.
2646 *
2647 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
2648 * up adding a full barrier to switch_mm(), or we should figure
2649 * out if a smp_mb__after_unlock_lock is really the proper API
2650 * to use.
2651 */
2652 smp_mb__after_unlock_lock();
2638 finish_lock_switch(rq, prev); 2653 finish_lock_switch(rq, prev);
2639 finish_arch_post_lock_switch(); 2654 finish_arch_post_lock_switch();
2640 2655
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
3324 if (likely(prev != next)) { 3339 if (likely(prev != next)) {
3325 rq->nr_switches++; 3340 rq->nr_switches++;
3326 rq->curr = next; 3341 rq->curr = next;
3342 /*
3343 * The membarrier system call requires each architecture
3344 * to have a full memory barrier after updating
3345 * rq->curr, before returning to user-space. For TSO
3346 * (e.g. x86), the architecture must provide its own
3347 * barrier in switch_mm(). For weakly ordered machines
3348 * for which spin_unlock() acts as a full memory
3349 * barrier, finish_lock_switch() in common code takes
3350 * care of this barrier. For weakly ordered machines for
3351 * which spin_unlock() acts as a RELEASE barrier (only
3352 * arm64 and PowerPC), arm64 has a full barrier in
3353 * switch_to(), and PowerPC has
3354 * smp_mb__after_unlock_lock() before
3355 * finish_lock_switch().
3356 */
3327 ++*switch_count; 3357 ++*switch_count;
3328 3358
3329 trace_sched_switch(preempt, prev, next); 3359 trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
3352 * To avoid it, we have to wait for releasing tsk->pi_lock which 3382 * To avoid it, we have to wait for releasing tsk->pi_lock which
3353 * is held by try_to_wake_up() 3383 * is held by try_to_wake_up()
3354 */ 3384 */
3355 smp_mb(); 3385 raw_spin_lock_irq(&current->pi_lock);
3356 raw_spin_unlock_wait(&current->pi_lock); 3386 raw_spin_unlock_irq(&current->pi_lock);
3357 3387
3358 /* Causes final put_task_struct in finish_task_switch(): */ 3388 /* Causes final put_task_struct in finish_task_switch(): */
3359 __set_current_state(TASK_DEAD); 3389 __set_current_state(TASK_DEAD);
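
The comments added above pair a full barrier after the rq->curr update with the smp_mb() calls in membarrier_private_expedited(): a CPU that is skipped because it appears to be running another mm must nevertheless observe everything the target thread did before the context switch. In C11 terms the two-sided pairing looks roughly like this (a structural sketch with made-up names, not the scheduler's real data structures):

#include <stdatomic.h>
#include <stdio.h>

struct task { int mm_id; };

static struct task tA = { .mm_id = 1 }, tB = { .mm_id = 2 };
static _Atomic(struct task *) rq_curr = &tA;     /* plays the role of rq->curr */

/* Scheduler side: publish the new current task, then a full barrier. */
static void context_switch_to(struct task *next)
{
        atomic_store_explicit(&rq_curr, next, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);  /* cf. smp_mb__after_unlock_lock() */
        /* ... return to user space ... */
}

/* membarrier side: full barrier, then decide whether this CPU needs an IPI. */
static int cpu_needs_ipi(int my_mm_id)
{
        struct task *p;

        atomic_thread_fence(memory_order_seq_cst);  /* smp_mb() on syscall entry */
        p = atomic_load_explicit(&rq_curr, memory_order_relaxed);
        return p && p->mm_id == my_mm_id;
}

int main(void)
{
        context_switch_to(&tB);
        printf("IPI needed for mm 2? %d\n", cpu_needs_ipi(2));  /* 1 */
        printf("IPI needed for mm 1? %d\n", cpu_needs_ipi(1));  /* 0 */
        return 0;
}
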
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
1/*
2 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 *
4 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21
22#include "sched.h" /* for cpu_rq(). */
23
24/*
25 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
26 * except MEMBARRIER_CMD_QUERY.
27 */
28#define MEMBARRIER_CMD_BITMASK \
29 (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
30
31static void ipi_mb(void *info)
32{
33 smp_mb(); /* IPIs should be serializing but paranoid. */
34}
35
36static void membarrier_private_expedited(void)
37{
38 int cpu;
39 bool fallback = false;
40 cpumask_var_t tmpmask;
41
42 if (num_online_cpus() == 1)
43 return;
44
45 /*
46 * Matches memory barriers around rq->curr modification in
47 * scheduler.
48 */
49 smp_mb(); /* system call entry is not a mb. */
50
51 /*
52 * Expedited membarrier commands guarantee that they won't
53 * block, hence the GFP_NOWAIT allocation flag and fallback
54 * implementation.
55 */
56 if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
57 /* Fallback for OOM. */
58 fallback = true;
59 }
60
61 cpus_read_lock();
62 for_each_online_cpu(cpu) {
63 struct task_struct *p;
64
65 /*
65 * Skipping the current CPU is OK even though we can be
67 * migrated at any point. The current CPU, at the point
68 * where we read raw_smp_processor_id(), is ensured to
69 * be in program order with respect to the caller
70 * thread. Therefore, we can skip this CPU from the
71 * iteration.
72 */
73 if (cpu == raw_smp_processor_id())
74 continue;
75 rcu_read_lock();
76 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
77 if (p && p->mm == current->mm) {
78 if (!fallback)
79 __cpumask_set_cpu(cpu, tmpmask);
80 else
81 smp_call_function_single(cpu, ipi_mb, NULL, 1);
82 }
83 rcu_read_unlock();
84 }
85 if (!fallback) {
86 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
87 free_cpumask_var(tmpmask);
88 }
89 cpus_read_unlock();
90
91 /*
92 * Memory barrier on the caller thread _after_ we finished
93 * waiting for the last IPI. Matches memory barriers around
94 * rq->curr modification in scheduler.
95 */
96 smp_mb(); /* exit from system call is not a mb */
97}
98
99/**
100 * sys_membarrier - issue memory barriers on a set of threads
101 * @cmd: Takes command values defined in enum membarrier_cmd.
102 * @flags: Currently needs to be 0. For future extensions.
103 *
104 * If this system call is not implemented, -ENOSYS is returned. If the
105 * command specified does not exist, is not available on the running
106 * kernel, or if the command argument is invalid, this system call
107 * returns -EINVAL. For a given command, with flags argument set to 0,
108 * this system call is guaranteed to always return the same value until
109 * reboot.
110 *
111 * All memory accesses performed in program order from each targeted thread
112 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
113 * the semantic "barrier()" to represent a compiler barrier forcing memory
114 * accesses to be performed in program order across the barrier, and
115 * smp_mb() to represent explicit memory barriers forcing full memory
116 * ordering across the barrier, we have the following ordering table for
117 * each pair of barrier(), sys_membarrier() and smp_mb():
118 *
119 * The pair ordering is detailed as (O: ordered, X: not ordered):
120 *
121 * barrier() smp_mb() sys_membarrier()
122 * barrier() X X O
123 * smp_mb() X O O
124 * sys_membarrier() O O O
125 */
126SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
127{
128 if (unlikely(flags))
129 return -EINVAL;
130 switch (cmd) {
131 case MEMBARRIER_CMD_QUERY:
132 {
133 int cmd_mask = MEMBARRIER_CMD_BITMASK;
134
135 if (tick_nohz_full_enabled())
136 cmd_mask &= ~MEMBARRIER_CMD_SHARED;
137 return cmd_mask;
138 }
139 case MEMBARRIER_CMD_SHARED:
140 /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
141 if (tick_nohz_full_enabled())
142 return -EINVAL;
143 if (num_online_cpus() > 1)
144 synchronize_sched();
145 return 0;
146 case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
147 membarrier_private_expedited();
148 return 0;
149 default:
150 return -EINVAL;
151 }
152}
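
User space invokes the new file's system call directly; no glibc wrapper is assumed here. The sketch below queries the supported commands and then issues a private expedited barrier; it assumes the installed uapi <linux/membarrier.h> exports MEMBARRIER_CMD_PRIVATE_EXPEDITED (that header change is outside this kernel/ diffstat):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Raw system call; membarrier() has historically had no libc wrapper. */
static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0) {
                perror("membarrier");
                return 1;
        }
        if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED) {
                /* Expedited barrier across this process's running threads. */
                if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
                        perror("MEMBARRIER_CMD_PRIVATE_EXPEDITED");
                else
                        printf("private expedited membarrier issued\n");
        }
        return 0;
}

Note that later kernels additionally require a MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED step before the expedited command; that registration is not part of this patch.
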
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..836a72a66fba 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
96 * work->func() can do task_work_add(), do not set 96 * work->func() can do task_work_add(), do not set
97 * work_exited unless the list is empty. 97 * work_exited unless the list is empty.
98 */ 98 */
99 raw_spin_lock_irq(&task->pi_lock);
99 do { 100 do {
100 work = READ_ONCE(task->task_works); 101 work = READ_ONCE(task->task_works);
101 head = !work && (task->flags & PF_EXITING) ? 102 head = !work && (task->flags & PF_EXITING) ?
102 &work_exited : NULL; 103 &work_exited : NULL;
103 } while (cmpxchg(&task->task_works, work, head) != work); 104 } while (cmpxchg(&task->task_works, work, head) != work);
105 raw_spin_unlock_irq(&task->pi_lock);
104 106
105 if (!work) 107 if (!work)
106 break; 108 break;
107 /*
108 * Synchronize with task_work_cancel(). It can't remove
109 * the first entry == work, cmpxchg(task_works) should
110 * fail, but it can play with *work and other entries.
111 */
112 raw_spin_unlock_wait(&task->pi_lock);
113 109
114 do { 110 do {
115 next = work->next; 111 next = work->next;
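
The cmpxchg() loop above atomically takes ownership of the entire ->task_works list (substituting work_exited when the task is exiting); ->pi_lock is now held across it only to synchronize with task_work_cancel(). The list-claiming part, reduced to C11 atomics, looks roughly like this (a standalone sketch, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

struct work { struct work *next; int id; };

static _Atomic(struct work *) task_works;        /* LIFO list of pending work */

/* Push one item (cf. task_work_add()). */
static void work_add(struct work *w)
{
        struct work *head = atomic_load_explicit(&task_works,
                                                 memory_order_relaxed);
        do {
                w->next = head;
        } while (!atomic_compare_exchange_weak(&task_works, &head, w));
}

/* Claim the whole list atomically (cf. the cmpxchg() loop in task_work_run()). */
static struct work *work_claim_all(void)
{
        struct work *head = atomic_load_explicit(&task_works,
                                                 memory_order_relaxed);
        while (!atomic_compare_exchange_weak(&task_works, &head, NULL))
                ;                        /* head is reloaded on failure */
        return head;
}

int main(void)
{
        struct work a = { .id = 1 }, b = { .id = 2 };

        work_add(&a);
        work_add(&b);
        for (struct work *w = work_claim_all(); w; w = w->next)
                printf("running work %d\n", w->id);      /* 2 then 1 */
        return 0;
}
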
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
117 torture_type, cpu); 117 torture_type, cpu);
118 (*n_offl_successes)++; 118 (*n_offl_successes)++;
119 delta = jiffies - starttime; 119 delta = jiffies - starttime;
120 sum_offl += delta; 120 *sum_offl += delta;
121 if (*min_offl < 0) { 121 if (*min_offl < 0) {
122 *min_offl = delta; 122 *min_offl = delta;
123 *max_offl = delta; 123 *max_offl = delta;