29 files changed, 540 insertions, 597 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
       def_bool y
       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+config LOCK_SPIN_ON_OWNER
+       def_bool y
+       depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
 config ARCH_USE_QUEUE_RWLOCK
        bool
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..1972b161c61e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;
 static struct {
        struct task_struct *active_writer;
-        struct mutex lock; /* Synchronizes accesses to refcount, */
+        /* wait queue to wake up the active_writer */
+        wait_queue_head_t wq;
+        /* verifies that no writer will get active while readers are active */
+        struct mutex lock;
        /*
         * Also blocks the new readers during
         * an ongoing cpu hotplug operation.
         */
-        int refcount;
+        atomic_t refcount;
-        /* And allows lockless put_online_cpus(). */
-        atomic_t puts_pending;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map dep_map;
 #endif
 } cpu_hotplug = {
        .active_writer = NULL,
+        .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
        .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-        .refcount = 0,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        .dep_map = {.name = "cpu_hotplug.lock" },
 #endif
@@ -86,15 +87,6 @@ static struct {
 #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
 #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
-static void apply_puts_pending(int max)
-{
-        int delta;
-        if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
-                delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
-                cpu_hotplug.refcount -= delta;
-        }
-}
 void get_online_cpus(void)
 {
@@ -103,8 +95,7 @@ void get_online_cpus(void)
                return;
        cpuhp_lock_acquire_read();
        mutex_lock(&cpu_hotplug.lock);
-        apply_puts_pending(65536);
+        atomic_inc(&cpu_hotplug.refcount);
-        cpu_hotplug.refcount++;
        mutex_unlock(&cpu_hotplug.lock);
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
        if (!mutex_trylock(&cpu_hotplug.lock))
                return false;
        cpuhp_lock_acquire_tryread();
-        apply_puts_pending(65536);
+        atomic_inc(&cpu_hotplug.refcount);
-        cpu_hotplug.refcount++;
        mutex_unlock(&cpu_hotplug.lock);
        return true;
 }
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
 void put_online_cpus(void)
 {
+        int refcount;
        if (cpu_hotplug.active_writer == current)
                return;
-        if (!mutex_trylock(&cpu_hotplug.lock)) {
-                atomic_inc(&cpu_hotplug.puts_pending);
-                cpuhp_lock_release();
-                return;
-        }
-        if (WARN_ON(!cpu_hotplug.refcount))
+        refcount = atomic_dec_return(&cpu_hotplug.refcount);
-                cpu_hotplug.refcount++; /* try to fix things up */
+        if (WARN_ON(refcount < 0)) /* try to fix things up */
+                atomic_inc(&cpu_hotplug.refcount);
+        if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+                wake_up(&cpu_hotplug.wq);
-        if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
-                wake_up_process(cpu_hotplug.active_writer);
-        mutex_unlock(&cpu_hotplug.lock);
        cpuhp_lock_release();
 }
@@ -168,18 +156,24 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
 */
 void cpu_hotplug_begin(void)
 {
-        cpu_hotplug.active_writer = current;
+        DEFINE_WAIT(wait);
+        cpu_hotplug.active_writer = current;
        cpuhp_lock_acquire();
        for (;;) {
                mutex_lock(&cpu_hotplug.lock);
-                apply_puts_pending(1);
+                /*
+                 * Queue on cpu_hotplug.wq before reading refcount so that a
+                 * wake_up() issued by put_online_cpus() after the check
+                 * below is not missed.
+                 */
+                prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
-                if (likely(!cpu_hotplug.refcount))
+                if (likely(!atomic_read(&cpu_hotplug.refcount)))
                        break;
-                __set_current_state(TASK_UNINTERRUPTIBLE);
                mutex_unlock(&cpu_hotplug.lock);
                schedule();
        }
+        finish_wait(&cpu_hotplug.wq, &wait);
 }
 void cpu_hotplug_done(void)
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..4eeb63de7e54 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                         ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
        case FUTEX_WAKE_OP:
                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
        case FUTEX_LOCK_PI:
-                return futex_lock_pi(uaddr, flags, val, timeout, 0);
+                return futex_lock_pi(uaddr, flags, timeout, 0);
        case FUTEX_UNLOCK_PI:
                return futex_unlock_pi(uaddr, flags);
        case FUTEX_TRYLOCK_PI:
-                return futex_lock_pi(uaddr, flags, 0, timeout, 1);
+                return futex_lock_pi(uaddr, flags, NULL, 1);
        case FUTEX_WAIT_REQUEUE_PI:
                val3 = FUTEX_BITSET_MATCH_ANY;
                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..4ca8eb151975 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+obj-y += mutex.o semaphore.o rwsem.o
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
 endif
 obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..d1fe2ba5bac9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
        arch_mcs_spin_unlock_contended(&next->locked);
 }
-/*
- * Cancellable version of the MCS lock above.
- *
- * Intended for adaptive spinning of sleeping locks:
- * mutex_lock()/rwsem_down_{read,write}() etc.
- */
-struct optimistic_spin_node {
-        struct optimistic_spin_node *next, *prev;
-        int locked; /* 1 if lock acquired */
-        int cpu; /* encoded CPU # value */
-};
-extern bool osq_lock(struct optimistic_spin_queue *lock);
-extern void osq_unlock(struct optimistic_spin_queue *lock);
 #endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..57407062e209 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
 }
 /*
- * after acquiring lock with fastpath or when we lost out in contested
+ * After acquiring lock with fastpath or when we lost out in contested
 * slowpath, set ctx and wake up any waiters so they can recheck.
 *
 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
        spin_unlock_mutex(&lock->base.wait_lock, flags);
 }
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 /*
- * In order to avoid a stampede of mutex spinners from acquiring the mutex
+ * After acquiring lock in the slowpath set ctx and wake up any
- * more or less simultaneously, the spinners need to acquire a MCS lock
+ * waiters so they can recheck.
- * first before spinning on the owner field.
 *
+ * Callers must hold the mutex wait_lock.
 */
+static __always_inline void
+ww_mutex_set_context_slowpath(struct ww_mutex *lock,
+                              struct ww_acquire_ctx *ctx)
+{
+        struct mutex_waiter *cur;
-/*
+        ww_mutex_lock_acquired(lock, ctx);
- * Mutex spinning code migrated from kernel/sched/core.c
+        lock->ctx = ctx;
- */
+        /*
+         * Give any possible sleeping processes the chance to wake up,
+         * so they can recheck if they have to back off.
+         */
+        list_for_each_entry(cur, &lock->base.wait_list, list) {
+                debug_mutex_wake_waiter(&lock->base, cur);
+                wake_up_process(cur->task);
+        }
+}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
        if (lock->owner != owner)
@@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock,
        if (!mutex_can_spin_on_owner(lock))
                goto done;
+        /*
+         * In order to avoid a stampede of mutex spinners trying to
+         * acquire the mutex all at once, the spinners need to take an
+         * MCS (queued) lock first, before spinning on the owner field.
+         */
        if (!osq_lock(&lock->osq))
                goto done;
@@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
 EXPORT_SYMBOL(ww_mutex_unlock);
 static inline int __sched
-__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 {
        struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
        struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
@@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                }
                if (use_ww_ctx && ww_ctx->acquired > 0) {
-                        ret = __mutex_lock_check_stamp(lock, ww_ctx);
+                        ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
                        if (ret)
                                goto err;
                }
@@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                schedule_preempt_disabled();
                spin_lock_mutex(&lock->wait_lock, flags);
        }
+        __set_task_state(task, TASK_RUNNING);
        mutex_remove_waiter(lock, &waiter, current_thread_info());
        /* set it to 0 if there are no waiters left: */
        if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +602,7 @@ skip_wait:
        if (use_ww_ctx) {
                struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-                struct mutex_waiter *cur;
+                ww_mutex_set_context_slowpath(ww, ww_ctx);
-                /*
-                 * This branch gets optimized out for the common case,
-                 * and is only important for ww_mutex_lock.
-                 */
-                ww_mutex_lock_acquired(ww, ww_ctx);
-                ww->ctx = ww_ctx;
-                /*
-                 * Give any possible sleeping processes the chance to wake up,
-                 * so they can recheck if they have to back off.
-                 */
-                list_for_each_entry(cur, &lock->wait_list, list) {
-                        debug_mutex_wake_waiter(lock, cur);
-                        wake_up_process(cur->task);
-                }
        }
        spin_unlock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..c112d00341b0 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
-#ifdef CONFIG_SMP
 /*
 * An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
         * cmpxchg in an attempt to undo our queueing.
         */
-        while (!smp_load_acquire(&node->locked)) {
+        while (!ACCESS_ONCE(node->locked)) {
                /*
                 * If we need to reschedule bail... so we can block.
                 */
@@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock)
        if (next)
                ACCESS_ONCE(next->locked) = 1;
 }
-#endif
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..3059bc2f022d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
                set_current_state(state);
        }
+        __set_current_state(TASK_RUNNING);
        return ret;
 }
@@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
        ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
        if (likely(!ret))
+                /* sleep on the mutex */
                ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-        set_current_state(TASK_RUNNING);
        if (unlikely(ret)) {
                remove_waiter(lock, &waiter);
                rt_mutex_handle_deadlock(ret, chwalk, &waiter);
@@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
        set_current_state(TASK_INTERRUPTIBLE);
+        /* sleep on the mutex */
        ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
-        set_current_state(TASK_RUNNING);
        if (unlikely(ret))
                remove_waiter(lock, waiter);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..2555ae15ec14 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem)
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
        }
-        tsk->state = TASK_RUNNING;
+        __set_task_state(tsk, TASK_RUNNING);
 out:
        ;
 }
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..2f7cc4076f50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
                schedule();
        }
-        tsk->state = TASK_RUNNING;
+        __set_task_state(tsk, TASK_RUNNING);
        return sem;
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
 }
 EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+#ifdef CONFIG_SRCU
 /*
 *      SRCU notifier chain routines.    Registration and unregistration
 *      use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
 }
 EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+#endif /* CONFIG_SRCU */
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 48b28d387c7f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -251,6 +251,7 @@ config APM_EMULATION
 config PM_OPP
        bool
+        select SRCU
        ---help---
          SOCs have a standard set of tuples consisting of frequency and
          voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
-obj-y += update.o srcu.o
+obj-y += update.o
+obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
 void rcu_early_boot_tests(void);
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
        int (*readlock)(void);
        void (*read_delay)(struct torture_random_state *rrsp);
        void (*readunlock)(int idx);
-        int (*completed)(void);
+        unsigned long (*started)(void);
+        unsigned long (*completed)(void);
        void (*deferred_free)(struct rcu_torture *p);
        void (*sync)(void);
        void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
        rcu_read_unlock();
 }
-static int rcu_torture_completed(void)
-{
-        return rcu_batches_completed();
-}
 /*
 * Update callback in the pipe.  This should be invoked after a grace period.
 */
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
                cur_ops->deferred_free(rp);
 }
-static int rcu_no_completed(void)
+static unsigned long rcu_no_completed(void)
 {
        return 0;
 }
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
        .readlock       = rcu_torture_read_lock,
        .read_delay     = rcu_read_delay,
        .readunlock     = rcu_torture_read_unlock,
-        .completed      = rcu_torture_completed,
+        .started        = rcu_batches_started,
+        .completed      = rcu_batches_completed,
        .deferred_free  = rcu_torture_deferred_free,
        .sync           = synchronize_rcu,
        .exp_sync       = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
        rcu_read_unlock_bh();
 }
-static int rcu_bh_torture_completed(void)
-{
-        return rcu_batches_completed_bh();
-}
 static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
 {
        call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
        .readlock       = rcu_bh_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = rcu_bh_torture_read_unlock,
-        .completed      = rcu_bh_torture_completed,
+        .started        = rcu_batches_started_bh,
+        .completed      = rcu_batches_completed_bh,
        .deferred_free  = rcu_bh_torture_deferred_free,
        .sync           = synchronize_rcu_bh,
        .exp_sync       = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
        .readlock       = rcu_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = rcu_torture_read_unlock,
+        .started        = rcu_no_completed,
        .completed      = rcu_no_completed,
        .deferred_free  = rcu_busted_torture_deferred_free,
        .sync           = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
        srcu_read_unlock(&srcu_ctl, idx);
 }
-static int srcu_torture_completed(void)
+static unsigned long srcu_torture_completed(void)
 {
        return srcu_batches_completed(&srcu_ctl);
 }
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
        .readlock       = srcu_torture_read_lock,
        .read_delay     = srcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+        .started        = NULL,
        .completed      = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-        .completed      = rcu_no_completed,
+        .started        = rcu_batches_started_sched,
+        .completed      = rcu_batches_completed_sched,
        .deferred_free  = rcu_sched_torture_deferred_free,
        .sync           = synchronize_sched,
        .exp_sync       = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
        .readlock       = tasks_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = tasks_torture_read_unlock,
+        .started        = rcu_no_completed,
        .completed      = rcu_no_completed,
        .deferred_free  = rcu_tasks_torture_deferred_free,
        .sync           = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
 static void rcu_torture_timer(unsigned long unused)
 {
        int idx;
-        int completed;
+        unsigned long started;
-        int completed_end;
+        unsigned long completed;
        static DEFINE_TORTURE_RANDOM(rand);
        static DEFINE_SPINLOCK(rand_lock);
        struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
        unsigned long long ts;
        idx = cur_ops->readlock();
-        completed = cur_ops->completed();
+        if (cur_ops->started)
+                started = cur_ops->started();
+        else
+                started = cur_ops->completed();
        ts = rcu_trace_clock_local();
        p = rcu_dereference_check(rcu_torture_current,
                                  rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
                /* Should not happen, but... */
                pipe_count = RCU_TORTURE_PIPE_LEN;
        }
-        completed_end = cur_ops->completed();
+        completed = cur_ops->completed();
        if (pipe_count > 1) {
                do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
-                                          completed, completed_end);
+                                          started, completed);
                rcutorture_trace_dump();
        }
        __this_cpu_inc(rcu_torture_count[pipe_count]);
-        completed = completed_end - completed;
+        completed = completed - started;
+        if (cur_ops->started)
+                completed++;
        if (completed > RCU_TORTURE_PIPE_LEN) {
                /* Should not happen, but... */
                completed = RCU_TORTURE_PIPE_LEN;
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
 static int
 rcu_torture_reader(void *arg)
 {
-        int completed;
+        unsigned long started;
-        int completed_end;
+        unsigned long completed;
        int idx;
        DEFINE_TORTURE_RANDOM(rand);
        struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
                                mod_timer(&t, jiffies + 1);
                }
                idx = cur_ops->readlock();
-                completed = cur_ops->completed();
+                if (cur_ops->started)
+                        started = cur_ops->started();
+                else
+                        started = cur_ops->completed();
                ts = rcu_trace_clock_local();
                p = rcu_dereference_check(rcu_torture_current,
                                          rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
                        /* Should not happen, but... */
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
-                completed_end = cur_ops->completed();
+                completed = cur_ops->completed();
                if (pipe_count > 1) {
                        do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
-                                                  ts, completed, completed_end);
+                                                  ts, started, completed);
                        rcutorture_trace_dump();
                }
                __this_cpu_inc(rcu_torture_count[pipe_count]);
-                completed = completed_end - completed;
+                completed = completed - started;
+                if (cur_ops->started)
+                        completed++;
                if (completed > RCU_TORTURE_PIPE_LEN) {
                        /* Should not happen, but... */
                        completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
                cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
                if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
                        n_rcu_torture_barrier_error++;
+                        pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
+                               atomic_read(&barrier_cbs_invoked),
+                               n_barrier_cbs);
                        WARN_ON_ONCE(1);
                }
                n_barrier_successes++;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
 * Report the number of batches, correlated with, but not necessarily
 * precisely the same as, the number of grace periods that have elapsed.
 */
-long srcu_batches_completed(struct srcu_struct *sp)
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
 {
        return sp->completed;
 }
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
                       void (*func)(struct rcu_head *rcu),
                       struct rcu_ctrlblk *rcp);
-static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 #include "tiny_plugin.h"
-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
-static void rcu_idle_enter_common(long long newval)
-{
-        if (newval) {
-                RCU_TRACE(trace_rcu_dyntick(TPS("--="),
-                                            rcu_dynticks_nesting, newval));
-                rcu_dynticks_nesting = newval;
-                return;
-        }
-        RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
-                                    rcu_dynticks_nesting, newval));
-        if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
-                struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-                RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
-                                            rcu_dynticks_nesting, newval));
-                ftrace_dump(DUMP_ALL);
-                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
-                          current->pid, current->comm,
-                          idle->pid, idle->comm); /* must be idle task! */
-        }
-        rcu_sched_qs(); /* implies rcu_bh_inc() */
-        barrier();
-        rcu_dynticks_nesting = newval;
-}
 /*
 * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode (i.e., if the new value of dynticks_nesting is zero).
+ * entered that mode.
 */
 void rcu_idle_enter(void)
 {
-        unsigned long flags;
-        long long newval;
-        local_irq_save(flags);
-        WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
-        if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
-            DYNTICK_TASK_NEST_VALUE)
-                newval = 0;
-        else
-                newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
-        rcu_idle_enter_common(newval);
-        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
 */
 void rcu_irq_exit(void)
 {
-        unsigned long flags;
-        long long newval;
-        local_irq_save(flags);
-        newval = rcu_dynticks_nesting - 1;
-        WARN_ON_ONCE(newval < 0);
-        rcu_idle_enter_common(newval);
-        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_irq_exit);
-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
-static void rcu_idle_exit_common(long long oldval)
-{
-        if (oldval) {
-                RCU_TRACE(trace_rcu_dyntick(TPS("++="),
-                                            oldval, rcu_dynticks_nesting));
-                return;
-        }
-        RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
-        if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
-                struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-                RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
-                          oldval, rcu_dynticks_nesting));
-                ftrace_dump(DUMP_ALL);
-                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
-                          current->pid, current->comm,
-                          idle->pid, idle->comm); /* must be idle task! */
-        }
-}
 /*
 * Exit idle, so that we are no longer in an extended quiescent state.
 */
 void rcu_idle_exit(void)
 {
-        unsigned long flags;
-        long long oldval;
-        local_irq_save(flags);
-        oldval = rcu_dynticks_nesting;
-        WARN_ON_ONCE(rcu_dynticks_nesting < 0);
-        if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
-                rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
-        else
-                rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-        rcu_idle_exit_common(oldval);
-        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
 */
 void rcu_irq_enter(void)
 {
-        unsigned long flags;
-        long long oldval;
-        local_irq_save(flags);
-        oldval = rcu_dynticks_nesting;
-        rcu_dynticks_nesting++;
-        WARN_ON_ONCE(rcu_dynticks_nesting == 0);
-        rcu_idle_exit_common(oldval);
-        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_irq_enter);
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
 */
 bool notrace __rcu_is_watching(void)
 {
-        return rcu_dynticks_nesting;
+        return true;
 }
 EXPORT_SYMBOL(__rcu_is_watching);
 #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
 /*
- * Test whether the current CPU was interrupted from idle.  Nested
- * interrupts don't count, we must be running at the first interrupt
- * level.
- */
-static int rcu_is_cpu_rrupt_from_idle(void)
-{
-        return rcu_dynticks_nesting <= 1;
-}
-/*
 * Helper function for rcu_sched_qs() and rcu_bh_qs().
 * Also irqs are disabled to avoid confusion due to interrupt handlers
 * invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
 void rcu_check_callbacks(int user)
 {
        RCU_TRACE(check_cpu_stalls());
-        if (user || rcu_is_cpu_rrupt_from_idle())
+        if (user)
                rcu_sched_qs();
        else if (!in_softirq())
                rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
        rcp->curtail = &head->next;
        RCU_TRACE(rcp->qlen++);
        local_irq_restore(flags);
+        if (unlikely(is_idle_task(current))) {
+                /* force scheduling for rcu_sched_qs() */
+                resched_cpu(0);
+        }
 }
 /*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
 void __init rcu_init(void)
 {
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+        RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
+        RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
        rcu_early_boot_tests();
 }
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
        rcp->ticks_this_gp++;
        j = jiffies;
        js = ACCESS_ONCE(rcp->jiffies_stall);
-        if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+        if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
                pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
-                       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+                       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
                       jiffies - rcp->gp_start, rcp->qlen);
                dump_stack();
-        }
-        if (*rcp->curtail && ULONG_CMP_GE(j, js))
                ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
                        3 * rcu_jiffies_till_stall_check() + 3;
-        else if (ULONG_CMP_GE(j, js))
+        } else if (ULONG_CMP_GE(j, js)) {
                ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+        }
 }
 static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
 /*
 * Track the rcutorture test sequence number and the update version
 * number within a given test.  The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 };
+DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
 /*
 * Let the RCU core know that this CPU has gone through the scheduler,
 * which is a quiescent state.  This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+/*
+ * Register a quiescent state for all RCU flavors.  If there is an
+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs (but only for those
+ * RCU flavors in desperate need of a quiescent state, which will normally
+ * be none of them).  Either way, do a lightweight quiescent state for
+ * all RCU flavors.
+ */
+void rcu_all_qs(void)
+{
+        if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+                rcu_momentary_dyntick_idle();
+        this_cpu_inc(rcu_qs_ctr);
+}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
 static long blimit = 10;        /* Maximum callbacks per rcu_do_batch. */
 static long qhimark = 10000;    /* If this many pending, ignore blimit. */
 static long qlowmark = 100;     /* Once only this many pending, use blimit. */
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(void);
 /*
- * Return the number of RCU-sched batches processed thus far for debug & stats.
+ * Return the number of RCU batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started(void)
+{
+        return rcu_state_p->gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started);
+/*
+ * Return the number of RCU-sched batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_sched(void)
+{
+        return rcu_sched_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
+/*
+ * Return the number of RCU BH batches started thus far for debug & stats.
 */
-long rcu_batches_completed_sched(void)
+unsigned long rcu_batches_started_bh(void)
+{
+        return rcu_bh_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+/*
+ * Return the number of RCU batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed(void)
+{
+        return rcu_state_p->completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+/*
+ * Return the number of RCU-sched batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_sched(void)
 {
        return rcu_sched_state.completed;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
 /*
- * Return the number of RCU BH batches processed thus far for debug & stats.
+ * Return the number of RCU BH batches completed thus far for debug & stats.
 */
-long rcu_batches_completed_bh(void)
+unsigned long rcu_batches_completed_bh(void)
 {
        return rcu_bh_state.completed;
 }
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
 /**
 * rcu_nmi_enter - inform RCU of entry to NMI context
 *
- * If the CPU was idle with dynamic ticks active, and there is no
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
- * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
- * RCU grace-period handling know that the CPU is active.
+ * that the CPU is active.  This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int.  (You will probably
+ * run out of stack space first.)
 */
 void rcu_nmi_enter(void)
 {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+        int incby = 2;
-        if (rdtp->dynticks_nmi_nesting == 0 &&
+        /* Complain about underflow. */
-            (atomic_read(&rdtp->dynticks) & 0x1))
+        WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
-                return;
-        rdtp->dynticks_nmi_nesting++;
+        /*
-        smp_mb__before_atomic();  /* Force delay from prior write. */
+         * If idle from RCU viewpoint, atomically increment ->dynticks
-        atomic_inc(&rdtp->dynticks);
+         * to mark non-idle and increment ->dynticks_nmi_nesting by one.
-        /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+         * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
-        smp_mb__after_atomic();  /* See above. */
+         * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
-        WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+         * to be in the outermost NMI handler that interrupted an RCU-idle
+         * period (observation due to Andy Lutomirski).
+         */
+        if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+                smp_mb__before_atomic();  /* Force delay from prior write. */
+                atomic_inc(&rdtp->dynticks);
+                /* atomic_inc() before later RCU read-side crit sects */
+                smp_mb__after_atomic();  /* See above. */
+                WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+                incby = 1;
+        }
+        rdtp->dynticks_nmi_nesting += incby;
+        barrier();
 }
 /**
 * rcu_nmi_exit - inform RCU of exit from NMI context
 *
- * If the CPU was idle with dynamic ticks active, and there is no
+ * If we are returning from the outermost NMI handler that interrupted an
- * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
- * RCU grace-period handling know that the CPU is no longer active.
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
 */
 void rcu_nmi_exit(void)
 {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-        if (rdtp->dynticks_nmi_nesting == 0 ||
+        /*
-            --rdtp->dynticks_nmi_nesting != 0)
+         * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+         * (We are exiting an NMI handler, so RCU better be paying attention
+         * to us!)
+         */
+        WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+        WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+        /*
+         * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+         * leave it in non-RCU-idle state.
+         */
+        if (rdtp->dynticks_nmi_nesting != 1) {
+                rdtp->dynticks_nmi_nesting -= 2;
                return;
+        }
+        /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+        rdtp->dynticks_nmi_nesting = 0;
        /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
        smp_mb__before_atomic();  /* See above. */
        atomic_inc(&rdtp->dynticks);
@@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
                trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
                return 1;
        } else {
+                if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+                                 rdp->mynode->gpnum))
+                        ACCESS_ONCE(rdp->gpwrap) = true;
                return 0;
        }
 }
 /*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-/*
 * Return true if the specified CPU has passed through a quiescent
 * state by virtue of being in or having passed through an dynticks
 * idle state since the last call to dyntick_save_progress_counter()
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
        j1 = rcu_jiffies_till_stall_check();
        ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
        rsp->jiffies_resched = j + j1 / 2;
+        rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+}
+/*
+ * Complain if rsp->gp_activity is more than 2 * HZ jiffies behind jiffies.
+ */
+static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+{
+        unsigned long gpa;
+        unsigned long j;
+        j = jiffies;
+        gpa = ACCESS_ONCE(rsp->gp_activity);
+        if (j - gpa > 2 * HZ)
+                pr_err("%s kthread starved for %lu jiffies!\n",
+                       rsp->name, j - gpa);
 }
 /*
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
        }
 }
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 {
        int cpu;
        long delta;
        unsigned long flags;
+        unsigned long gpa;
+        unsigned long j;
        int ndetected = 0;
        struct rcu_node *rnp = rcu_get_root(rsp);
        long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
-        /*
-         * Now rat on any tasks that got kicked up to the root rcu_node
-         * due to CPU offlining.
-         */
-        rnp = rcu_get_root(rsp);
-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        ndetected += rcu_print_task_stall(rnp);
-        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        print_cpu_stall_info_end();
        for_each_possible_cpu(cpu)
                totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
        pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
               smp_processor_id(), (long)(jiffies - rsp->gp_start),
               (long)rsp->gpnum, (long)rsp->completed, totqlen);
-        if (ndetected == 0)
+        if (ndetected) {
-                pr_err("INFO: Stall ended before state dump start\n");
-        else
                rcu_dump_cpu_stacks(rsp);
+        } else {
+                if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+                    ACCESS_ONCE(rsp->completed) == gpnum) {
+                        pr_err("INFO: Stall ended before state dump start\n");
+                } else {
+                        j = jiffies;
+                        gpa = ACCESS_ONCE(rsp->gp_activity);
+                        pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+                               rsp->name, j - gpa, j, gpa,
+                               jiffies_till_next_fqs);
+                        /* In this case, the current CPU might be at fault. */
+                        sched_show_task(current);
+                }
+        }
        /* Complain about tasks blocking the grace period. */
        rcu_print_detail_task_stall(rsp);
+        rcu_check_gp_kthread_starvation(rsp);
        force_quiescent_state(rsp);  /* Kick them all. */
 }
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
        pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
                jiffies - rsp->gp_start,
                (long)rsp->gpnum, (long)rsp->completed, totqlen);
+        rcu_check_gp_kthread_starvation(rsp);
        rcu_dump_cpu_stacks(rsp);
        raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
                   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
                /* They had a few time units to dump stack, so complain. */
-                print_other_cpu_stall(rsp);
+                print_other_cpu_stall(rsp, gpnum);
        }
 }
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
        bool ret;
        /* Handle the ends of any preceding grace periods first. */
-        if (rdp->completed == rnp->completed) {
+        if (rdp->completed == rnp->completed &&
+            !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
                /* No grace period end, so just accelerate recent callbacks. */
                ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
        }
-        if (rdp->gpnum != rnp->gpnum) {
+        if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
                /*
                 * If the current grace period is waiting for this CPU,
                 * set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                rdp->gpnum = rnp->gpnum;
                trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
                rdp->passed_quiesce = 0;
+                rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
                zero_cpu_stall_ticks(rdp);
+                ACCESS_ONCE(rdp->gpwrap) = false;
        }
        return ret;
 }
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
        local_irq_save(flags);
        rnp = rdp->mynode;
        if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-             rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+             rdp->completed == ACCESS_ONCE(rnp->completed) &&
+             !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
            !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
                local_irq_restore(flags);
                return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
+        ACCESS_ONCE(rsp->gp_activity) = jiffies;
        rcu_bind_gp_kthread();
        raw_spin_lock_irq(&rnp->lock);
        smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
                                            rnp->grphi, rnp->qsmask);
                raw_spin_unlock_irq(&rnp->lock);
                cond_resched_rcu_qs();
+                ACCESS_ONCE(rsp->gp_activity) = jiffies;
        }
        mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
        unsigned long maxj;
        struct rcu_node *rnp = rcu_get_root(rsp);
+        ACCESS_ONCE(rsp->gp_activity) = jiffies;
        rsp->n_force_qs++;
        if (fqs_state == RCU_SAVE_DYNTICK) {
                /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
+        ACCESS_ONCE(rsp->gp_activity) = jiffies;
        raw_spin_lock_irq(&rnp->lock);
        smp_mb__after_unlock_lock();
        gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                nocb += rcu_future_gp_cleanup(rsp, rnp);
                raw_spin_unlock_irq(&rnp->lock);
                cond_resched_rcu_qs();
+                ACCESS_ONCE(rsp->gp_activity) = jiffies;
        }
        rnp = rcu_get_root(rsp);
        raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                        if (rcu_gp_init(rsp))
                                break;
                        cond_resched_rcu_qs();
+                        ACCESS_ONCE(rsp->gp_activity) = jiffies;
                        WARN_ON(signal_pending(current));
                        trace_rcu_grace_period(rsp->name,
                                               ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                                       ACCESS_ONCE(rsp->gpnum),
                                                       TPS("fqsend"));
                                cond_resched_rcu_qs();
+                                ACCESS_ONCE(rsp->gp_activity) = jiffies;
                        } else {
                                /* Deal with stray signal. */
                                cond_resched_rcu_qs();
+                                ACCESS_ONCE(rsp->gp_activity) = jiffies;
                                WARN_ON(signal_pending(current));
                                trace_rcu_grace_period(rsp->name,
                                                       ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
        rnp = rdp->mynode;
        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
-        if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
+        if ((rdp->passed_quiesce == 0 &&
-            rnp->completed == rnp->gpnum) {
+             rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
+            rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+            rdp->gpwrap) {
                /*
                 * The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
                 * within the current grace period.
                 */
                rdp->passed_quiesce = 0;        /* need qs for new gp. */
+                rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
         * Was there a quiescent state since the beginning of the grace
         * period? If no, then exit and wait for the next call.
         */
-        if (!rdp->passed_quiesce)
+        if (!rdp->passed_quiesce &&
+            rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
                return;
        /*
@@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 }
 /*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section.  Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely.  That said, invoking it after the fact will cost you
+ * a needless lock acquisition.  So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+        unsigned long mask;
+        struct rcu_node *rnp = rnp_leaf;
+        if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+                return;
+        for (;;) {
+                mask = rnp->grpmask;
+                rnp = rnp->parent;
+                if (!rnp)
+                        break;
+                raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                smp_mb__after_unlock_lock(); /* GP memory ordering. */
+                rnp->qsmaskinit &= ~mask;
+                if (rnp->qsmaskinit) {
+                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                        return;
+                }
+                raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+        }
+}
+/*
 * The CPU has been completely removed, and some other CPU is reporting
 * this fact from process context.  Do the remainder of the cleanup,
 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
-        unsigned long mask;
-        int need_report = 0;
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
        /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
        rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
        rcu_adopt_orphan_cbs(rsp, flags);
+        raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-        /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+        /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-        mask = rdp->grpmask;    /* rnp->grplo is constant. */
+        raw_spin_lock_irqsave(&rnp->lock, flags);
-        do {
+        smp_mb__after_unlock_lock();    /* Enforce GP memory-order guarantee. */
-                raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
+        rnp->qsmaskinit &= ~rdp->grpmask;
-                smp_mb__after_unlock_lock();
+        if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
-                rnp->qsmaskinit &= ~mask;
+                rcu_cleanup_dead_rnp(rnp);
-                if (rnp->qsmaskinit != 0) {
+        rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
-                        if (rnp != rdp->mynode)
-                                raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-                        break;
-                }
-                if (rnp == rdp->mynode)
-                        need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-                else
-                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-                mask = rnp->grpmask;
-                rnp = rnp->parent;
-        } while (rnp != NULL);
-        /*
-         * We still hold the leaf rcu_node structure lock here, and
-         * irqs are still disabled.  The reason for this subterfuge is
-         * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
-         * held leads to deadlock.
-         */
-        raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
-        rnp = rdp->mynode;
-        if (need_report & RCU_OFL_TASKS_NORM_GP)
-                rcu_report_unblock_qs_rnp(rnp, flags);
-        else
-                raw_spin_unlock_irqrestore(&rnp->lock, flags);
-        if (need_report & RCU_OFL_TASKS_EXP_GP)
-                rcu_report_exp_rnp(rsp, rnp, true);
        WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
                  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
                  cpu, rdp->qlen, rdp->nxtlist);
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }
+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
                }
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
-        rnp = rcu_get_root(rsp);
-        if (rnp->qsmask == 0) {
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
-                rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
-        }
 }
 /*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 * Schedule RCU callback invocation.  If the specified type of RCU
 * does not support RCU priority boosting, just do a direct call,
 * otherwise wake up the per-CPU kernel kthread.  Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
 * rcu_cpu_kthread_task cannot disappear out from under us.
 */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        /* Is the RCU core waiting for a quiescent state from this CPU? */
        if (rcu_scheduler_fully_active &&
-            rdp->qs_pending && !rdp->passed_quiesce) {
+            rdp->qs_pending && !rdp->passed_quiesce &&
+            rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
                rdp->n_rp_qs_pending++;
-        } else if (rdp->qs_pending && rdp->passed_quiesce) {
+        } else if (rdp->qs_pending &&
+                   (rdp->passed_quiesce ||
+                    rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
                rdp->n_rp_report_qs++;
                return 1;
        }
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        }
        /* Has a new RCU grace period started? */
-        if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+        if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+            unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
                rdp->n_rp_gp_started++;
                return 1;
        }
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
                        } else {
                                _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
                                                   rsp->n_barrier_done);
+                                smp_mb__before_atomic();
                                atomic_inc(&rsp->barrier_cpu_count);
                                __call_rcu(&rdp->barrier_head,
                                           rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        /* Set up local state, ensuring consistent view of global state. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
-        init_callback_list(rdp);
-        rdp->qlen_lazy = 0;
-        ACCESS_ONCE(rdp->qlen) = 0;
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
        WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
        WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
                        rdp->gpnum = rnp->completed;
                        rdp->completed = rnp->completed;
                        rdp->passed_quiesce = 0;
+                        rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                        rdp->qs_pending = 0;
                        trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
                }
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
 static int __init rcu_spawn_gp_kthread(void)
 {
        unsigned long flags;
+        int kthread_prio_in = kthread_prio;
        struct rcu_node *rnp;
        struct rcu_state *rsp;
+        struct sched_param sp;
        struct task_struct *t;
+        /* Force priority into range. */
+        if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+                kthread_prio = 1;
+        else if (kthread_prio < 0)
+                kthread_prio = 0;
+        else if (kthread_prio > 99)
+                kthread_prio = 99;
+        if (kthread_prio != kthread_prio_in)
+                pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+                         kthread_prio, kthread_prio_in);
        rcu_scheduler_fully_active = 1;
        for_each_rcu_flavor(rsp) {
-                t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+                t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
                BUG_ON(IS_ERR(t));
                rnp = rcu_get_root(rsp);
                raw_spin_lock_irqsave(&rnp->lock, flags);
                rsp->gp_kthread = t;
+                if (kthread_prio) {
+                        sp.sched_priority = kthread_prio;
+                        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+                }
+                wake_up_process(t);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
        rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
-#include <linux/irq_work.h>
 /*
 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
                                /*  queued on this rcu_node structure that */
                                /*  are blocking the current grace period, */
                                /*  there can be no such task. */
-        struct completion boost_completion;
-                                /* Used to ensure that the rt_mutex used */
-                                /*  to carry out the boosting is fully */
-                                /*  released with no future boostee accesses */
-                                /*  before that rt_mutex is re-initialized. */
        struct rt_mutex boost_mtx;
                                /* Used only for the priority-boosting */
                                /*  side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
                                        /*  in order to detect GP end. */
        unsigned long   gpnum;          /* Highest gp number that this CPU */
                                        /*  is aware of having started. */
+        unsigned long   rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
+                                        /*  for rcu_all_qs() invocations. */
        bool            passed_quiesce; /* User-mode/idle loop etc. */
        bool            qs_pending;     /* Core waits for quiesc state. */
        bool            beenonline;     /* CPU online at least once. */
+        bool            gpwrap;         /* Possible gpnum/completed wrap. */
        struct rcu_node *mynode;        /* This CPU's leaf of hierarchy */
        unsigned long grpmask;          /* Mask to apply to leaf qsmask. */
 #ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
 #ifdef CONFIG_RCU_NOCB_CPU
        struct rcu_head *nocb_head;     /* CBs waiting for kthread. */
        struct rcu_head **nocb_tail;
-        atomic_long_t nocb_q_count;     /* # CBs waiting for kthread */
+        atomic_long_t nocb_q_count;     /* # CBs waiting for nocb */
-        atomic_long_t nocb_q_count_lazy; /*  (approximate). */
+        atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
        struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
        struct rcu_head **nocb_follower_tail;
-        atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
-        atomic_long_t nocb_follower_count_lazy; /*  (approximate). */
-        int nocb_p_count;               /* # CBs being invoked by kthread */
-        int nocb_p_count_lazy;          /*  (approximate). */
        wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
        struct task_struct *nocb_kthread;
        int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
        struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
                                        /* CBs waiting for GP. */
        struct rcu_head **nocb_gp_tail;
-        long nocb_gp_count;
-        long nocb_gp_count_lazy;
        bool nocb_leader_sleep;         /* Is the nocb leader thread asleep? */
        struct rcu_data *nocb_next_follower;
                                        /* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
                                                /*  due to no GP active. */
        unsigned long gp_start;                 /* Time at which GP started, */
                                                /*  but in jiffies. */
+        unsigned long gp_activity;              /* Time of last GP kthread */
+                                                /*  activity in jiffies. */
        unsigned long jiffies_stall;            /* Time at which to check */
                                                /*  for CPU stalls. */
        unsigned long jiffies_resched;          /* Time at which to resched */
                                                /*  a reluctant CPU. */
+        unsigned long n_force_qs_gpstart;       /* Snapshot of n_force_qs at */
+                                                /*  GP start. */
        unsigned long gp_max;                   /* Maximum GP duration in */
                                                /*  jiffies. */
        const char *name;                       /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
 #define for_each_rcu_flavor(rsp) \
        list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
-/* Return values for rcu_preempt_offline_tasks(). */
-#define RCU_OFL_TASKS_NORM_GP   0x1             /* Tasks blocking normal */
-                                                /*  GP were moved to root. */
-#define RCU_OFL_TASKS_EXP_GP    0x2             /* Tasks blocking expedited */
-                                                /*  GP were moved to root. */
 /*
 * RCU implementation internal declarations:
 */
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
 /* Forward declarations for rcutree_plugin.h */
 static void rcu_bootup_announce(void);
-long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(void);
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
-                                      unsigned long flags);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-                               bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
 static void __init __rcu_init_preempt(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
 #endif /* #ifndef RCU_TREE_NONCORE */
 #ifdef CONFIG_RCU_TRACE
-#ifdef CONFIG_RCU_NOCB_CPU
+/* Read out queue lengths for tracing. */
-/* Sum up queue lengths for tracing. */
 static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 {
-        *ql = atomic_long_read(&rdp->nocb_q_count) +
+#ifdef CONFIG_RCU_NOCB_CPU
-              rdp->nocb_p_count +
+        *ql = atomic_long_read(&rdp->nocb_q_count);
-              atomic_long_read(&rdp->nocb_follower_count) +
+        *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
-              rdp->nocb_p_count + rdp->nocb_gp_count;
-        *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
-               rdp->nocb_p_count_lazy +
-               atomic_long_read(&rdp->nocb_follower_count_lazy) +
-               rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
-}
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
        *ql = 0;
        *qll = 0;
-}
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
 #endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..2e850a51bb8f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
 #include "../locking/rtmutex_common.h"
-/* rcuc/rcub kthread realtime priority */
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-module_param(kthread_prio, int, 0644);
 /*
 * Control variables for per-CPU and per-rcu_node kthreads.  These
 * handle all flavors of RCU.
@@ -103,6 +99,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 static struct rcu_state *rcu_state_p = &rcu_preempt_state;
 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+                               bool wake);
 /*
 * Tell them what RCU they are running.
@@ -114,25 +112,6 @@ static void __init rcu_bootup_announce(void)
 }
 /*
- * Return the number of RCU-preempt batches processed thus far
- * for debug and statistics.
- */
-static long rcu_batches_completed_preempt(void)
-{
-        return rcu_preempt_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
-        return rcu_batches_completed_preempt();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-/*
 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
 * that this just means that the task currently running on the CPU is
 * not in a quiescent state.  There might be any number of tasks blocked
@@ -307,15 +286,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
 }
 /*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+        return !list_empty(&rnp->blkd_tasks);
+}
+/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or task having blocked during the RCU
 * read-side critical section.
 */
 void rcu_read_unlock_special(struct task_struct *t)
 {
-        int empty;
+        bool empty;
-        int empty_exp;
+        bool empty_exp;
-        int empty_exp_now;
+        bool empty_norm;
+        bool empty_exp_now;
        unsigned long flags;
        struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
@@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t)
                                break;
                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                }
-                empty = !rcu_preempt_blocked_readers_cgp(rnp);
+                empty = !rcu_preempt_has_tasks(rnp);
+                empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
                empty_exp = !rcu_preempted_readers_exp(rnp);
                smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
                np = rcu_next_node_entry(t, rnp);
@@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t)
 #endif /* #ifdef CONFIG_RCU_BOOST */
                /*
+                 * If this was the last task on the list, go see if we
+                 * need to propagate ->qsmaskinit bit clearing up the
+                 * rcu_node tree.
+                 */
+                if (!empty && !rcu_preempt_has_tasks(rnp))
+                        rcu_cleanup_dead_rnp(rnp);
+                /*
                 * If this was the last task on the current list, and if
                 * we aren't waiting on any CPUs, report the quiescent state.
                 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
                 * so we must take a snapshot of the expedited state.
                 */
                empty_exp_now = !rcu_preempted_readers_exp(rnp);
-                if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+                if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
                        trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
                                                         rnp->gpnum,
                                                         0, rnp->qsmask,
@@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
                /* Unboost if we were boosted. */
-                if (drop_boost_mutex) {
+                if (drop_boost_mutex)
                        rt_mutex_unlock(&rnp->boost_mtx);
-                        complete(&rnp->boost_completion);
-                }
 #endif /* #ifdef CONFIG_RCU_BOOST */
                /*
@@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 {
        WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
-        if (!list_empty(&rnp->blkd_tasks))
+        if (rcu_preempt_has_tasks(rnp))
                rnp->gp_tasks = rnp->blkd_tasks.next;
        WARN_ON_ONCE(rnp->qsmask);
 }
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline.  Move them up to the root
- * rcu_node.  The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp)
-{
-        struct list_head *lp;
-        struct list_head *lp_root;
-        int retval = 0;
-        struct rcu_node *rnp_root = rcu_get_root(rsp);
-        struct task_struct *t;
-        if (rnp == rnp_root) {
-                WARN_ONCE(1, "Last CPU thought to be offlined?");
-                return 0;  /* Shouldn't happen: at least one CPU online. */
-        }
-        /* If we are on an internal node, complain bitterly. */
-        WARN_ON_ONCE(rnp != rdp->mynode);
-        /*
-         * Move tasks up to root rcu_node.  Don't try to get fancy for
-         * this corner-case operation -- just put this node's tasks
-         * at the head of the root node's list, and update the root node's
-         * ->gp_tasks and ->exp_tasks pointers to those of this node's,
-         * if non-NULL.  This might result in waiting for more tasks than
-         * absolutely necessary, but this is a good performance/complexity
-         * tradeoff.
-         */
-        if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
-                retval |= RCU_OFL_TASKS_NORM_GP;
-        if (rcu_preempted_readers_exp(rnp))
-                retval |= RCU_OFL_TASKS_EXP_GP;
-        lp = &rnp->blkd_tasks;
-        lp_root = &rnp_root->blkd_tasks;
-        while (!list_empty(lp)) {
-                t = list_entry(lp->next, typeof(*t), rcu_node_entry);
-                raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
-                smp_mb__after_unlock_lock();
-                list_del(&t->rcu_node_entry);
-                t->rcu_blocked_node = rnp_root;
-                list_add(&t->rcu_node_entry, lp_root);
-                if (&t->rcu_node_entry == rnp->gp_tasks)
-                        rnp_root->gp_tasks = rnp->gp_tasks;
-                if (&t->rcu_node_entry == rnp->exp_tasks)
-                        rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
-                if (&t->rcu_node_entry == rnp->boost_tasks)
-                        rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-                raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-        }
-        rnp->gp_tasks = NULL;
-        rnp->exp_tasks = NULL;
-#ifdef CONFIG_RCU_BOOST
-        rnp->boost_tasks = NULL;
-        /*
-         * In case root is being boosted and leaf was not.  Make sure
-         * that we boost the tasks blocking the current grace period
-         * in this case.
-         */
-        raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
-        smp_mb__after_unlock_lock();
-        if (rnp_root->boost_tasks != NULL &&
-            rnp_root->boost_tasks != rnp_root->gp_tasks &&
-            rnp_root->boost_tasks != rnp_root->exp_tasks)
-                rnp_root->boost_tasks = rnp_root->gp_tasks;
-        raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-        return retval;
-}
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
@@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
-        if (list_empty(&rnp->blkd_tasks)) {
+        if (!rcu_preempt_has_tasks(rnp)) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        } else {
                rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void)
 }
 /*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
-        return rcu_batches_completed_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-/*
 * Because preemptible RCU does not exist, we never have to check for
 * CPUs being in quiescent states.
 */
@@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 #ifdef CONFIG_HOTPLUG_CPU
-/* Because preemptible RCU does not exist, no quieting of tasks. */
+/*
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+ * Because there is no preemptible RCU, there can be no readers blocked.
-        __releases(rnp->lock)
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
 {
-        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+        return false;
 }
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
        WARN_ON_ONCE(rnp->qsmask);
 }
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp)
-{
-        return 0;
-}
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
 * Because preemptible RCU does not exist, it never has any callbacks
 * to check.
@@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-                               bool wake)
-{
-}
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
 * Because preemptible RCU does not exist, rcu_barrier() is just
 * another name for rcu_barrier_sched().
@@ -1080,7 +951,7 @@ void exit_rcu(void)
 static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 {
-        if (list_empty(&rnp->blkd_tasks))
+        if (!rcu_preempt_has_tasks(rnp))
                rnp->n_balk_blkd_tasks++;
        else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
                rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp)
        struct task_struct *t;
        struct list_head *tb;
-        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+        if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
+            ACCESS_ONCE(rnp->boost_tasks) == NULL)
                return 0;  /* Nothing left to boost. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp)
         */
        t = container_of(tb, struct task_struct, rcu_node_entry);
        rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
-        init_completion(&rnp->boost_completion);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        /* Lock only for side effect: boosts task t's priority. */
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
-        /* Wait for boostee to be done w/boost_mtx before reinitializing. */
-        wait_for_completion(&rnp->boost_completion);
        return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
               ACCESS_ONCE(rnp->boost_tasks) != NULL;
 }
@@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
        for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
                if ((mask & 0x1) && cpu != outgoingcpu)
                        cpumask_set_cpu(cpu, cm);
-        if (cpumask_weight(cm) == 0) {
+        if (cpumask_weight(cm) == 0)
                cpumask_setall(cm);
-                for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
-                        cpumask_clear_cpu(cpu, cm);
-                WARN_ON_ONCE(cpumask_weight(cm) == 0);
-        }
        set_cpus_allowed_ptr(t, cm);
        free_cpumask_var(cm);
 }
@@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void)
        for_each_possible_cpu(cpu)
                per_cpu(rcu_cpu_has_work, cpu) = 0;
        BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
-        rnp = rcu_get_root(rcu_state_p);
+        rcu_for_each_leaf_node(rcu_state_p, rnp)
-        (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
+                (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
-        if (NUM_RCU_NODES > 1) {
-                rcu_for_each_leaf_node(rcu_state_p, rnp)
-                        (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
-        }
 }
 static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
                 * completed since we last checked and there are
                 * callbacks not yet ready to invoke.
                 */
-                if (rdp->completed != rnp->completed &&
+                if ((rdp->completed != rnp->completed ||
+                     unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
                    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
                        note_gp_changes(rsp, rdp);
@@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
                ticks_value = rsp->gpnum - rdp->gpnum;
        }
        print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
-        pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
+        pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
               cpu, ticks_value, ticks_title,
               atomic_read(&rdtp->dynticks) & 0xfff,
               rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
               rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+               ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
               fast_no_hz);
 }
@@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 {
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+        unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
        struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
-        /* No-CBs CPUs might have callbacks on any of three lists. */
+        /*
+         * Check count of all no-CBs callbacks awaiting invocation.
+         * There needs to be a barrier before this function is called,
+         * but associated with a prior determination that no more
+         * callbacks would be posted.  In the worst case, the first
+         * barrier in _rcu_barrier() suffices (but the caller cannot
+         * necessarily rely on this, not a substitute for the caller
+         * getting the concurrency design right!).  There must also be
+         * a barrier between the following load and posting of a callback
+         * (if a callback is in fact needed).  This is associated with an
+         * atomic_inc() in the caller.
+         */
+        ret = atomic_long_read(&rdp->nocb_q_count);
+#ifdef CONFIG_PROVE_RCU
        rhp = ACCESS_ONCE(rdp->nocb_head);
        if (!rhp)
                rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
                       cpu, rhp->func);
                WARN_ON_ONCE(1);
        }
+#endif /* #ifdef CONFIG_PROVE_RCU */
-        return !!rhp;
+        return !!ret;
 }
 /*
@@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
        struct task_struct *t;
        /* Enqueue the callback on the nocb list and update counts. */
+        atomic_long_add(rhcount, &rdp->nocb_q_count);
+        /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
        old_rhpp = xchg(&rdp->nocb_tail, rhtp);
        ACCESS_ONCE(*old_rhpp) = rhp;
-        atomic_long_add(rhcount, &rdp->nocb_q_count);
        atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
        smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
@@ -2288,9 +2169,6 @@ wait_again:
                /* Move callbacks to wait-for-GP list, which is empty. */
                ACCESS_ONCE(rdp->nocb_head) = NULL;
                rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
-                rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
-                rdp->nocb_gp_count_lazy =
-                        atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
                gotcbs = true;
        }
@@ -2338,9 +2216,6 @@ wait_again:
                /* Append callbacks to follower's "done" list. */
                tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
                *tail = rdp->nocb_gp_head;
-                atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
-                atomic_long_add(rdp->nocb_gp_count_lazy,
-                                &rdp->nocb_follower_count_lazy);
                smp_mb__after_atomic(); /* Store *tail before wakeup. */
                if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
                        /*
@@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg)
                trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
                ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
                tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
-                c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
-                cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
-                rdp->nocb_p_count += c;
-                rdp->nocb_p_count_lazy += cl;
                /* Each pass through the following loop invokes a callback. */
-                trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+                trace_rcu_batch_start(rdp->rsp->name,
+                                      atomic_long_read(&rdp->nocb_q_count_lazy),
+                                      atomic_long_read(&rdp->nocb_q_count), -1);
                c = cl = 0;
                while (list) {
                        next = list->next;
@@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg)
                        list = next;
                }
                trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
-                ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
+                smp_mb__before_atomic();  /* _add after CB invocation. */
-                ACCESS_ONCE(rdp->nocb_p_count_lazy) =
+                atomic_long_add(-c, &rdp->nocb_q_count);
-                                                rdp->nocb_p_count_lazy - cl;
+                atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
                rdp->n_nocbs_invoked += c;
        }
        return 0;
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
 #define RCU_TREE_NONCORE
 #include "tree.h"
+DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
 static int r_open(struct inode *inode, struct file *file,
                                        const struct seq_operations *op)
 {
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
        if (!rdp->beenonline)
                return;
-        seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
+        seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
                   rdp->cpu,
                   cpu_is_offline(rdp->cpu) ? '!' : ' ',
                   ulong2long(rdp->completed), ulong2long(rdp->gpnum),
-                   rdp->passed_quiesce, rdp->qs_pending);
+                   rdp->passed_quiesce,
+                   rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+                   rdp->qs_pending);
        seq_printf(m, " dt=%d/%llx/%d df=%lu",
                   atomic_read(&rdp->dynticks->dynticks),
                   rdp->dynticks->dynticks_nesting,
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..7052d3fd4e7b 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
        unsigned long flags;
        int ret = 1;
+        /*
+         * The lock is only needed in the non-blocking case, i.e.
+         * when x->done is nonzero.  Check x->done locklessly first
+         * so that the case where a wait would have to block can
+         * return early without taking the lock.
+         */
+        if (!ACCESS_ONCE(x->done))
+                return 0;
        spin_lock_irqsave(&x->wait.lock, flags);
        if (!x->done)
                ret = 0;
@@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion);
 */
 bool completion_done(struct completion *x)
 {
-        unsigned long flags;
+        return !!ACCESS_ONCE(x->done);
-        int ret = 1;
-        spin_lock_irqsave(&x->wait.lock, flags);
-        if (!x->done)
-                ret = 0;
-        spin_unlock_irqrestore(&x->wait.lock, flags);
-        return ret;
 }
 EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ae1188f62693..1612578a5b7a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1814,6 +1814,10 @@ void __dl_clear_params(struct task_struct *p)
        dl_se->dl_period = 0;
        dl_se->flags = 0;
        dl_se->dl_bw = 0;
+        dl_se->dl_throttled = 0;
+        dl_se->dl_new = 1;
+        dl_se->dl_yielded = 0;
 }
 /*
@@ -1839,7 +1843,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
        RB_CLEAR_NODE(&p->dl.rb_node);
-        hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        init_dl_task_timer(&p->dl);
        __dl_clear_params(p);
        INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2053,9 @@ static inline int dl_bw_cpus(int i)
 * allocated bandwidth to reflect the new situation.
 *
 * This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
 */
 static int dl_overflow(struct task_struct *p, int policy,
                       const struct sched_attr *attr)
@@ -3251,15 +3258,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
        struct sched_dl_entity *dl_se = &p->dl;
-        init_dl_task_timer(dl_se);
        dl_se->dl_runtime = attr->sched_runtime;
        dl_se->dl_deadline = attr->sched_deadline;
        dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
        dl_se->flags = attr->sched_flags;
        dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-        dl_se->dl_throttled = 0;
-        dl_se->dl_new = 1;
+        /*
-        dl_se->dl_yielded = 0;
+         * Changing the parameters of a task is 'tricky' and we're not doing
+         * the correct thing -- also see task_dead_dl() and switched_from_dl().
+         *
+         * What we SHOULD do is delay the bandwidth release until the 0-lag
+         * point. This would include retaining the task_struct until that time
+         * and change dl_overflow() to not immediately decrement the current
+         * amount.
+         *
+         * Instead we retain the current runtime/deadline and let the new
+         * parameters take effect after the current reservation period lapses.
+         * This is safe (albeit pessimistic) because the 0-lag point is always
+         * before the current scheduling deadline.
+         *
+         * We can still have temporary overloads because we do not delay the
+         * change in bandwidth until that time; so admission control is
+         * not on the safe side. It does however guarantee tasks will never
+         * consume more than promised.
+         */
 }
 /*
@@ -4642,6 +4665,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
        struct dl_bw *cur_dl_b;
        unsigned long flags;
+        if (!cpumask_weight(cur))
+                return ret;
        rcu_read_lock_sched();
        cur_dl_b = dl_bw_of(cpumask_any(cur));
        trial_cpus = cpumask_weight(trial);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b52092f2636d..726470d47f87 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1094,6 +1094,7 @@ static void task_dead_dl(struct task_struct *p)
         * Since we are TASK_DEAD we won't slip out of the domain!
         */
        raw_spin_lock_irq(&dl_b->lock);
+        /* XXX we should retain the bw until 0-lag */
        dl_b->total_bw -= p->dl.dl_bw;
        raw_spin_unlock_irq(&dl_b->lock);
@@ -1614,8 +1615,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
+        /* XXX we should retain the bw until 0-lag */
        cancel_dl_timer(rq, p);
        __dl_clear_params(p);
        /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cbf371b..fe331fc391f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1730,7 +1730,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
        nodes = node_online_map;
        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
                unsigned long max_faults = 0;
-                nodemask_t max_group;
+                nodemask_t max_group = NODE_MASK_NONE;
                int a, b;
                /* Are there nodes at this distance from each other? */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f032fb5284e3..40190f28db35 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
        unsigned int cpu;
        int ret = 0;
+        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        for_each_online_cpu(cpu) {
                ret = __smpboot_create_thread(plug_thread, cpu);
@@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
        list_add(&plug_thread->list, &hotplug_threads);
 out:
        mutex_unlock(&smpboot_threads_lock);
+        put_online_cpus();
        return ret;
 }
 EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
                trace_softirqs_off(ip);
        raw_local_irq_restore(flags);
-        if (preempt_count() == cnt)
+        if (preempt_count() == cnt) {
+#ifdef CONFIG_DEBUG_PREEMPT
+                current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+#endif
                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+        }
 }
 EXPORT_SYMBOL(__local_bh_disable_ip);
 #endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
                 * in the task stack here.
                 */
                __do_softirq();
-                rcu_note_context_switch();
                local_irq_enable();
-                cond_resched();
+                cond_resched_rcu_qs();
                return;
        }
        local_irq_enable();
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 37e50aadd471..d8c724cda37b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
        mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
        boot = ktime_add(mono, off_boot);
        xtim = ktime_add(mono, off_real);
-        tai = ktime_add(xtim, off_tai);
+        tai = ktime_add(mono, off_tai);
        base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
        base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;