 Documentation/RCU/RTFP.txt                | 149
 Documentation/RCU/checklist.txt           |  18
 Documentation/kernel-per-CPU-kthreads.txt |  13
 Documentation/memory-barriers.txt         | 137
 include/linux/rcupdate.h                  |  14
 include/linux/rcutiny.h                   |   6
 include/linux/rcutree.h                   |   2
 kernel/rcu/tree.c                         |   2
 kernel/rcu/tree_plugin.h                  |  13
 9 files changed, 275 insertions(+), 79 deletions(-)
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 273e654d7d08..2f0fcb2112d2 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -31,6 +31,14 @@ has lapsed, so this approach may be used in non-GPL software, if desired.
 (In contrast, implementation of RCU is permitted only in software licensed
 under either GPL or LGPL. Sorry!!!)
 
+In 1987, Rashid et al. described lazy TLB-flush [RichardRashid87a].
+At first glance, this has nothing to do with RCU, but nevertheless
+this paper helped inspire the update-side batching used in the later
+RCU implementation in DYNIX/ptx. In 1988, Barbara Liskov published
+a description of Argus that noted that use of out-of-date values can
+be tolerated in some situations. Thus, this paper provides some early
+theoretical justification for use of stale data.
+
 In 1990, Pugh [Pugh90] noted that explicitly tracking which threads
 were reading a given data structure permitted deferred free to operate
 in the presence of non-terminating threads. However, this explicit
@@ -41,11 +49,11 @@ providing a fine-grained locking design, however, it would be interesting
 to see how much of the performance advantage reported in 1990 remains
 today.
 
-At about this same time, Adams [Adams91] described ``chaotic relaxation'',
-where the normal barriers between successive iterations of convergent
-numerical algorithms are relaxed, so that iteration $n$ might use
-data from iteration $n-1$ or even $n-2$. This introduces error,
-which typically slows convergence and thus increases the number of
+At about this same time, Andrews [Andrews91textbook] described ``chaotic
+relaxation'', where the normal barriers between successive iterations
+of convergent numerical algorithms are relaxed, so that iteration $n$
+might use data from iteration $n-1$ or even $n-2$. This introduces
+error, which typically slows convergence and thus increases the number of
 iterations required. However, this increase is sometimes more than made
 up for by a reduction in the number of expensive barrier operations,
 which are otherwise required to synchronize the threads at the end
@@ -55,7 +63,8 @@ is thus inapplicable to most data structures in operating-system kernels.
 
 In 1992, Henry (now Alexia) Massalin completed a dissertation advising
 parallel programmers to defer processing when feasible to simplify
-synchronization. RCU makes extremely heavy use of this advice.
+synchronization [HMassalinPhD]. RCU makes extremely heavy use of
+this advice.
 
 In 1993, Jacobson [Jacobson93] verbally described what is perhaps the
 simplest deferred-free technique: simply waiting a fixed amount of time
@@ -90,27 +99,29 @@ mechanism, which is quite similar to RCU [Gamsa99]. These operating
 systems made pervasive use of RCU in place of "existence locks", which
 greatly simplifies locking hierarchies and helps avoid deadlocks.
 
-2001 saw the first RCU presentation involving Linux [McKenney01a]
-at OLS. The resulting abundance of RCU patches was presented the
-following year [McKenney02a], and use of RCU in dcache was first
-described that same year [Linder02a].
+The year 2000 saw an email exchange that would likely have
+led to yet another independent invention of something like RCU
+[RustyRussell2000a,RustyRussell2000b]. Instead, 2001 saw the first
+RCU presentation involving Linux [McKenney01a] at OLS. The resulting
+abundance of RCU patches was presented the following year [McKenney02a],
+and use of RCU in dcache was first described that same year [Linder02a].
 
 Also in 2002, Michael [Michael02b,Michael02a] presented "hazard-pointer"
 techniques that defer the destruction of data structures to simplify
 non-blocking synchronization (wait-free synchronization, lock-free
 synchronization, and obstruction-free synchronization are all examples of
-non-blocking synchronization). In particular, this technique eliminates
-locking, reduces contention, reduces memory latency for readers, and
-parallelizes pipeline stalls and memory latency for writers. However,
-these techniques still impose significant read-side overhead in the
-form of memory barriers. Researchers at Sun worked along similar lines
-in the same timeframe [HerlihyLM02]. These techniques can be thought
-of as inside-out reference counts, where the count is represented by the
-number of hazard pointers referencing a given data structure rather than
-the more conventional counter field within the data structure itself.
-The key advantage of inside-out reference counts is that they can be
-stored in immortal variables, thus allowing races between access and
-deletion to be avoided.
+non-blocking synchronization). The corresponding journal article appeared
+in 2004 [MagedMichael04a]. This technique eliminates locking, reduces
+contention, reduces memory latency for readers, and parallelizes pipeline
+stalls and memory latency for writers. However, these techniques still
+impose significant read-side overhead in the form of memory barriers.
+Researchers at Sun worked along similar lines in the same timeframe
+[HerlihyLM02]. These techniques can be thought of as inside-out reference
+counts, where the count is represented by the number of hazard pointers
+referencing a given data structure rather than the more conventional
+counter field within the data structure itself. The key advantage
+of inside-out reference counts is that they can be stored in immortal
+variables, thus allowing races between access and deletion to be avoided.
 
 By the same token, RCU can be thought of as a "bulk reference count",
 where some form of reference counter covers all reference by a given CPU
@@ -123,8 +134,10 @@ can be thought of in other terms as well.
 
 In 2003, the K42 group described how RCU could be used to create
 hot-pluggable implementations of operating-system functions [Appavoo03a].
-Later that year saw a paper describing an RCU implementation of System
-V IPC [Arcangeli03], and an introduction to RCU in Linux Journal
+Later that year saw a paper describing an RCU implementation
+of System V IPC [Arcangeli03] (following up on a suggestion by
+Hugh Dickins [Dickins02a] and an implementation by Mingming Cao
+[MingmingCao2002IPCRCU]), and an introduction to RCU in Linux Journal
 [McKenney03a].
 
 2004 has seen a Linux-Journal article on use of RCU in dcache
@@ -383,6 +396,21 @@ for Programming Languages and Operating Systems}"
 }
 }
 
+@phdthesis{HMassalinPhD
+,author="H. Massalin"
+,title="Synthesis: An Efficient Implementation of Fundamental Operating
+System Services"
+,school="Columbia University"
+,address="New York, NY"
+,year="1992"
+,annotation={
+	Mondo optimizing compiler.
+	Wait-free stuff.
+	Good advice: defer work to avoid synchronization. See page 90
+	(PDF page 106), Section 5.4, fourth bullet point.
+}
+}
+
 @unpublished{Jacobson93
 ,author="Van Jacobson"
 ,title="Avoid Read-Side Locking Via Delayed Free"
@@ -671,6 +699,20 @@ Orran Krieger and Rusty Russell and Dipankar Sarma and Maneesh Soni"
 [Viewed October 18, 2004]"
 }
 
+@conference{Michael02b
+,author="Maged M. Michael"
+,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM
+Symposium on Parallel
+Algorithms and Architecture}"
+,pages="73-82"
+,annotation={
+	Like the title says...
+}
+}
+
 @Conference{Linder02a
 ,Author="Hanna Linder and Dipankar Sarma and Maneesh Soni"
 ,Title="Scalability of the Directory Entry Cache"
@@ -727,6 +769,24 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 }
 }
 
+@conference{Michael02a
+,author="Maged M. Michael"
+,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic
+Reads and Writes"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM
+Symposium on Principles of Distributed Computing}"
+,pages="21-30"
+,annotation={
+	Each thread keeps an array of pointers to items that it is
+	currently referencing. Sort of an inside-out garbage collection
+	mechanism, but one that requires the accessing code to explicitly
+	state its needs. Also requires read-side memory barriers on
+	most architectures.
+}
+}
+
 @unpublished{Dickins02a
 ,author="Hugh Dickins"
 ,title="Use RCU for System-V IPC"
@@ -735,6 +795,17 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 ,note="private communication"
 }
 
+@InProceedings{HerlihyLM02
+,author={Maurice Herlihy and Victor Luchangco and Mark Moir}
+,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized,
+Lock-Free Data Structures"
+,booktitle={Proceedings of 16\textsuperscript{th} International
+Symposium on Distributed Computing}
+,year=2002
+,month="October"
+,pages="339-353"
+}
+
 @unpublished{Sarma02b
 ,Author="Dipankar Sarma"
 ,Title="Some dcache\_rcu benchmark numbers"
@@ -749,6 +820,19 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 }
 }
 
+@unpublished{MingmingCao2002IPCRCU
+,Author="Mingming Cao"
+,Title="[PATCH]updated ipc lock patch"
+,month="October"
+,year="2002"
+,note="Available:
+\url{https://lkml.org/lkml/2002/10/24/262}
+[Viewed February 15, 2014]"
+,annotation={
+	Mingming Cao's patch to introduce RCU to SysV IPC.
+}
+}
+
 @unpublished{LinusTorvalds2003a
 ,Author="Linus Torvalds"
 ,Title="Re: {[PATCH]} small fixes in brlock.h"
@@ -982,6 +1066,23 @@ Realtime Applications"
 }
 }
 
+@article{MagedMichael04a
+,author="Maged M. Michael"
+,title="Hazard Pointers: Safe Memory Reclamation for Lock-Free Objects"
+,Year="2004"
+,Month="June"
+,journal="IEEE Transactions on Parallel and Distributed Systems"
+,volume="15"
+,number="6"
+,pages="491-504"
+,url="Available:
+\url{http://www.research.ibm.com/people/m/michael/ieeetpds-2004.pdf}
+[Viewed March 1, 2005]"
+,annotation={
+	New canonical hazard-pointer citation.
+}
+}
+
 @phdthesis{PaulEdwardMcKenneyPhD
 ,author="Paul E. McKenney"
 ,title="Exploiting Deferred Destruction:
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 91266193b8f4..9d10d1db16a5 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -256,10 +256,10 @@ over a rather long period of time, but improvements are always welcome!
 		variations on this theme.
 
 	b.	Limiting update rate. For example, if updates occur only
-		once per hour, then no explicit rate limiting is required,
-		unless your system is already badly broken. The dcache
-		subsystem takes this approach -- updates are guarded
-		by a global lock, limiting their rate.
+		once per hour, then no explicit rate limiting is
+		required, unless your system is already badly broken.
+		Older versions of the dcache subsystem take this approach,
+		guarding updates with a global lock, limiting their rate.
 
 	c.	Trusted update -- if updates can only be done manually by
 		superuser or some other trusted user, then it might not
@@ -268,7 +268,8 @@ over a rather long period of time, but improvements are always welcome!
 		the machine.
 
 	d.	Use call_rcu_bh() rather than call_rcu(), in order to take
-		advantage of call_rcu_bh()'s faster grace periods.
+		advantage of call_rcu_bh()'s faster grace periods. (This
+		is only a partial solution, though.)
 
 	e.	Periodically invoke synchronize_rcu(), permitting a limited
 		number of updates per grace period.
@@ -276,6 +277,13 @@ over a rather long period of time, but improvements are always welcome!
 	The same cautions apply to call_rcu_bh(), call_rcu_sched(),
 	call_srcu(), and kfree_rcu().
 
+	Note that although these primitives do take action to avoid memory
+	exhaustion when any given CPU has too many callbacks, a determined
+	user could still exhaust memory. This is especially the case
+	if a system with a large number of CPUs has been configured to
+	offload all of its RCU callbacks onto a single CPU, or if the
+	system has relatively little free memory.
+
 9.	All RCU list-traversal primitives, which include
 	rcu_dereference(), list_for_each_entry_rcu(), and
 	list_for_each_safe_rcu(), must be either within an RCU read-side
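
As a concrete illustration of approach (e) above, here is a minimal sketch of an updater that invokes synchronize_rcu() after every UPDATE_BATCH removals, bounding the number of call_rcu() callbacks that can be outstanding at any time. All names here (struct my_entry, my_lock, UPDATE_BATCH, and the helpers) are invented for this sketch; only the RCU and list primitives are real kernel APIs.

	#include <linux/kernel.h>
	#include <linux/rculist.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	#define UPDATE_BATCH 16		/* assumed tuning knob, not from the patch */

	struct my_entry {
		struct list_head list;
		struct rcu_head rcu;
		int key;
	};

	static LIST_HEAD(my_list);
	static DEFINE_SPINLOCK(my_lock);
	static int my_updates_since_gp;

	static void my_entry_free_cb(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct my_entry, rcu));
	}

	/* Remove an entry; every UPDATE_BATCH removals, wait out a grace period. */
	static void my_remove_entry(struct my_entry *e)
	{
		bool throttle;

		spin_lock(&my_lock);
		list_del_rcu(&e->list);
		throttle = ++my_updates_since_gp >= UPDATE_BATCH;
		if (throttle)
			my_updates_since_gp = 0;
		spin_unlock(&my_lock);

		if (throttle) {
			synchronize_rcu();	/* self-limits the update rate */
			kfree(e);
		} else {
			call_rcu(&e->rcu, my_entry_free_cb);
		}
	}

Where the structure contains nothing beyond the rcu_head and its payload, kfree_rcu() would be simpler still, but it is the periodic synchronize_rcu() that supplies the self-limiting property discussed above.
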
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index 827104fb9364..f3cd299fcc41 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -162,7 +162,18 @@ Purpose: Execute workqueue requests
 To reduce its OS jitter, do any of the following:
 1.	Run your workload at a real-time priority, which will allow
 	preempting the kworker daemons.
-2.	Do any of the following needed to avoid jitter that your
+2.	A given workqueue can be made visible in the sysfs filesystem
+	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
+	Such a workqueue can be confined to a given subset of the
+	CPUs using the /sys/devices/virtual/workqueue/*/cpumask sysfs
+	files. The set of WQ_SYSFS workqueues can be displayed using
+	"ls sys/devices/virtual/workqueue". That said, the workqueues
+	maintainer would like to caution people against indiscriminately
+	sprinkling WQ_SYSFS across all the workqueues. The reason for
+	caution is that it is easy to add WQ_SYSFS, but because sysfs is
+	part of the formal user/kernel API, it can be nearly impossible
+	to remove it, even if its addition was a mistake.
+3.	Do any of the following needed to avoid jitter that your
 	application cannot tolerate:
 	a.	Build your kernel with CONFIG_SLUB=y rather than
 		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
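
For reference, a hedged sketch of the WQ_SYSFS usage described in item 2 above follows. The workqueue name "my_driver_wq" and the work function are invented for illustration; alloc_workqueue(), WQ_SYSFS, and WQ_UNBOUND are the real kernel APIs, and the cpumask file applies to unbound workqueues.

	#include <linux/workqueue.h>

	static struct workqueue_struct *my_wq;

	static void my_work_fn(struct work_struct *work)
	{
		/* ... the actual deferred work would go here ... */
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static int __init my_driver_init(void)
	{
		/*
		 * WQ_SYSFS creates /sys/devices/virtual/workqueue/my_driver_wq/,
		 * whose "cpumask" file lets an administrator confine this
		 * workqueue's workers to a chosen subset of CPUs, for example:
		 *	echo 3 > /sys/devices/virtual/workqueue/my_driver_wq/cpumask
		 */
		my_wq = alloc_workqueue("my_driver_wq", WQ_SYSFS | WQ_UNBOUND, 0);
		if (!my_wq)
			return -ENOMEM;
		queue_work(my_wq, &my_work);
		return 0;
	}

Per the caution above, WQ_SYSFS becomes part of the user/kernel API once added, so it is best reserved for workqueues whose CPU placement genuinely needs administrator control.
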
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 102dc19c4119..11c1d2049662 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -608,26 +608,30 @@ as follows:
 	b = p;  /* BUG: Compiler can reorder!!! */
 	do_something();
 
-The solution is again ACCESS_ONCE(), which preserves the ordering between
-the load from variable 'a' and the store to variable 'b':
+The solution is again ACCESS_ONCE() and barrier(), which preserves the
+ordering between the load from variable 'a' and the store to variable 'b':
 
 	q = ACCESS_ONCE(a);
 	if (q) {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something();
 	} else {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something_else();
 	}
 
-You could also use barrier() to prevent the compiler from moving
-the stores to variable 'b', but barrier() would not prevent the
-compiler from proving to itself that a==1 always, so ACCESS_ONCE()
-is also needed.
+The initial ACCESS_ONCE() is required to prevent the compiler from
+proving the value of 'a', and the pair of barrier() invocations are
+required to prevent the compiler from pulling the two identical stores
+to 'b' out from the legs of the "if" statement.
 
 It is important to note that control dependencies absolutely require a
 a conditional. For example, the following "optimized" version of
-the above example breaks ordering:
+the above example breaks ordering, which is why the barrier() invocations
+are absolutely required if you have identical stores in both legs of
+the "if" statement:
 
 	q = ACCESS_ONCE(a);
 	ACCESS_ONCE(b) = p;  /* BUG: No ordering vs. load from a!!! */
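
To see why both ACCESS_ONCE() and barrier() are needed, here is a stand-alone user-space sketch of the failure mode just described. The two macros are defined locally the same way the kernel defines them; the variables and functions are invented for illustration.

	#define ACCESS_ONCE(x)	(*(volatile typeof(x) *)&(x))
	#define barrier()	__asm__ __volatile__("" : : : "memory")

	int a, b;

	void broken(int p)
	{
		int q = ACCESS_ONCE(a);

		/*
		 * The two legs contain identical stores, so the compiler may
		 * hoist a single "b = p" above the branch. Once the store no
		 * longer depends on the branch, the CPU is free to reorder it
		 * before the load from "a".
		 */
		if (q)
			b = p;
		else
			b = p;
	}

	void fixed(int p)
	{
		int q = ACCESS_ONCE(a);

		if (q) {
			barrier();		/* keeps the store inside this leg */
			ACCESS_ONCE(b) = p;
		} else {
			barrier();
			ACCESS_ONCE(b) = p;
		}
	}
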
@@ -643,9 +647,11 @@ It is of course legal for the prior load to be part of the conditional,
 for example, as follows:
 
 	if (ACCESS_ONCE(a) > 0) {
+		barrier();
 		ACCESS_ONCE(b) = q / 2;
 		do_something();
 	} else {
+		barrier();
 		ACCESS_ONCE(b) = q / 3;
 		do_something_else();
 	}
@@ -659,9 +665,11 @@ the needed conditional. For example:
 
 	q = ACCESS_ONCE(a);
 	if (q % MAX) {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something();
 	} else {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something_else();
 	}
@@ -723,8 +731,13 @@ In summary:
      use smb_rmb(), smp_wmb(), or, in the case of prior stores and
      later loads, smp_mb().
 
+ (*) If both legs of the "if" statement begin with identical stores
+     to the same variable, a barrier() statement is required at the
+     beginning of each leg of the "if" statement.
+
  (*) Control dependencies require at least one run-time conditional
-     between the prior load and the subsequent store. If the compiler
+     between the prior load and the subsequent store, and this
+     conditional must involve the prior load. If the compiler
      is able to optimize the conditional away, it will have also
      optimized away the ordering. Careful use of ACCESS_ONCE() can
      help to preserve the needed conditional.
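
A hedged sketch of that last point, the compiler discarding the conditional, is shown here; as before, ACCESS_ONCE() is defined locally and the variables are invented.

	#define ACCESS_ONCE(x)	(*(volatile typeof(x) *)&(x))

	int a, b;

	void lost_ordering(int p)
	{
		int q = ACCESS_ONCE(a);

		/*
		 * "q || 1" is always true, so the compiler may delete the
		 * branch and emit an unconditional store. The CPU then sees
		 * no dependency between the load from "a" and the store to
		 * "b", and may reorder the store ahead of the load.
		 */
		if (q || 1)
			ACCESS_ONCE(b) = p;
	}
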
@@ -1249,6 +1262,23 @@ The ACCESS_ONCE() function can prevent any number of optimizations that,
 while perfectly safe in single-threaded code, can be fatal in concurrent
 code. Here are some examples of these sorts of optimizations:
 
+ (*) The compiler is within its rights to reorder loads and stores
+     to the same variable, and in some cases, the CPU is within its
+     rights to reorder loads to the same variable. This means that
+     the following code:
+
+	a[0] = x;
+	a[1] = x;
+
+     Might result in an older value of x stored in a[1] than in a[0].
+     Prevent both the compiler and the CPU from doing this as follows:
+
+	a[0] = ACCESS_ONCE(x);
+	a[1] = ACCESS_ONCE(x);
+
+     In short, ACCESS_ONCE() provides cache coherence for accesses from
+     multiple CPUs to a single variable.
+
  (*) The compiler is within its rights to merge successive loads from
      the same variable. Such merging can cause the compiler to "optimize"
      the following code:
@@ -1644,12 +1674,12 @@ for each construct. These operations all imply certain barriers:
      Memory operations issued after the ACQUIRE will be completed after the
      ACQUIRE operation has completed.
 
-     Memory operations issued before the ACQUIRE may be completed after the
-     ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined
-     with a following ACQUIRE, orders prior loads against subsequent stores and
-     stores and prior stores against subsequent stores. Note that this is
-     weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on
-     many architectures.
+     Memory operations issued before the ACQUIRE may be completed after
+     the ACQUIRE operation has completed. An smp_mb__before_spinlock(),
+     combined with a following ACQUIRE, orders prior loads against
+     subsequent loads and stores and also orders prior stores against
+     subsequent stores. Note that this is weaker than smp_mb()! The
+     smp_mb__before_spinlock() primitive is free on many architectures.
 
  (2) RELEASE operation implication:
 
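
As a minimal sketch of the smp_mb__before_spinlock() guarantee just described, consider the following; the lock, the flag and data variables, and the function are all invented, and only the ordering claim comes from the text above.

	#include <linux/compiler.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(my_lock);
	static int flag, data;

	void publish_then_update(void)
	{
		ACCESS_ONCE(flag) = 1;		/* prior store */
		smp_mb__before_spinlock();	/* weaker than smp_mb(), often free */
		spin_lock(&my_lock);		/* ACQUIRE */
		ACCESS_ONCE(data) = 42;		/* store ordered after the store to flag */
		spin_unlock(&my_lock);
	}
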
@@ -1694,24 +1724,21 @@ may occur as:
 
 	ACQUIRE M, STORE *B, STORE *A, RELEASE M
 
-This same reordering can of course occur if the lock's ACQUIRE and RELEASE are
-to the same lock variable, but only from the perspective of another CPU not
-holding that lock.
-
-In short, a RELEASE followed by an ACQUIRE may -not- be assumed to be a full
-memory barrier because it is possible for a preceding RELEASE to pass a
-later ACQUIRE from the viewpoint of the CPU, but not from the viewpoint
-of the compiler. Note that deadlocks cannot be introduced by this
-interchange because if such a deadlock threatened, the RELEASE would
-simply complete.
-
-If it is necessary for a RELEASE-ACQUIRE pair to produce a full barrier, the
-ACQUIRE can be followed by an smp_mb__after_unlock_lock() invocation. This
-will produce a full barrier if either (a) the RELEASE and the ACQUIRE are
-executed by the same CPU or task, or (b) the RELEASE and ACQUIRE act on the
-same variable. The smp_mb__after_unlock_lock() primitive is free on many
-architectures. Without smp_mb__after_unlock_lock(), the critical sections
-corresponding to the RELEASE and the ACQUIRE can cross:
+When the ACQUIRE and RELEASE are a lock acquisition and release,
+respectively, this same reordering can occur if the lock's ACQUIRE and
+RELEASE are to the same lock variable, but only from the perspective of
+another CPU not holding that lock. In short, a ACQUIRE followed by an
+RELEASE may -not- be assumed to be a full memory barrier.
+
+Similarly, the reverse case of a RELEASE followed by an ACQUIRE does not
+imply a full memory barrier. If it is necessary for a RELEASE-ACQUIRE
+pair to produce a full barrier, the ACQUIRE can be followed by an
+smp_mb__after_unlock_lock() invocation. This will produce a full barrier
+if either (a) the RELEASE and the ACQUIRE are executed by the same
+CPU or task, or (b) the RELEASE and ACQUIRE act on the same variable.
+The smp_mb__after_unlock_lock() primitive is free on many architectures.
+Without smp_mb__after_unlock_lock(), the CPU's execution of the critical
+sections corresponding to the RELEASE and the ACQUIRE can cross, so that:
 
 	*A = a;
 	RELEASE M
@@ -1722,7 +1749,36 @@ could occur as:
 
 	ACQUIRE N, STORE *B, STORE *A, RELEASE M
 
-With smp_mb__after_unlock_lock(), they cannot, so that:
+It might appear that this reordering could introduce a deadlock.
+However, this cannot happen because if such a deadlock threatened,
+the RELEASE would simply complete, thereby avoiding the deadlock.
+
+	Why does this work?
+
+	One key point is that we are only talking about the CPU doing
+	the reordering, not the compiler. If the compiler (or, for
+	that matter, the developer) switched the operations, deadlock
+	-could- occur.
+
+	But suppose the CPU reordered the operations. In this case,
+	the unlock precedes the lock in the assembly code. The CPU
+	simply elected to try executing the later lock operation first.
+	If there is a deadlock, this lock operation will simply spin (or
+	try to sleep, but more on that later). The CPU will eventually
+	execute the unlock operation (which preceded the lock operation
+	in the assembly code), which will unravel the potential deadlock,
+	allowing the lock operation to succeed.
+
+	But what if the lock is a sleeplock? In that case, the code will
+	try to enter the scheduler, where it will eventually encounter
+	a memory barrier, which will force the earlier unlock operation
+	to complete, again unraveling the deadlock. There might be
+	a sleep-unlock race, but the locking primitive needs to resolve
+	such races properly in any case.
+
+With smp_mb__after_unlock_lock(), the two critical sections cannot overlap.
+For example, with the following code, the store to *A will always be
+seen by other CPUs before the store to *B:
 
 	*A = a;
 	RELEASE M
@@ -1730,13 +1786,18 @@ With smp_mb__after_unlock_lock(), they cannot, so that:
 	smp_mb__after_unlock_lock();
 	*B = b;
 
-will always occur as either of the following:
+The operations will always occur in one of the following orders:
 
-	STORE *A, RELEASE, ACQUIRE, STORE *B
-	STORE *A, ACQUIRE, RELEASE, STORE *B
+	STORE *A, RELEASE, ACQUIRE, smp_mb__after_unlock_lock(), STORE *B
+	STORE *A, ACQUIRE, RELEASE, smp_mb__after_unlock_lock(), STORE *B
+	ACQUIRE, STORE *A, RELEASE, smp_mb__after_unlock_lock(), STORE *B
 
 If the RELEASE and ACQUIRE were instead both operating on the same lock
-variable, only the first of these two alternatives can occur.
+variable, only the first of these alternatives can occur. In addition,
+the more strongly ordered systems may rule out some of the above orders.
+But in any case, as noted earlier, the smp_mb__after_unlock_lock()
+ensures that the store to *A will always be seen as happening before
+the store to *B.
 
 Locks and semaphores may not provide any guarantee of ordering on UP compiled
 systems, and so cannot be counted on in such a situation to actually achieve
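
Expressed with spinlocks, the RELEASE-ACQUIRE recipe above looks like the following sketch. The lock names and stored values are invented; only smp_mb__after_unlock_lock() and its guarantee, that the store to A is seen by all CPUs before the store to B, come from the text.

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(m);
	static DEFINE_SPINLOCK(n);
	static int A, B;

	void ordered_writer(void)
	{
		spin_lock(&m);
		A = 1;				/* store inside critical section M */
		spin_unlock(&m);		/* RELEASE M */
		spin_lock(&n);			/* ACQUIRE N */
		smp_mb__after_unlock_lock();	/* upgrades RELEASE+ACQUIRE to a full barrier */
		B = 1;				/* seen by other CPUs only after A */
		spin_unlock(&n);
	}
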
@@ -2757,7 +2818,7 @@ in that order, but, without intervention, the sequence may have almost any
 combination of elements combined or discarded, provided the program's view of
 the world remains consistent. Note that ACCESS_ONCE() is -not- optional
 in the above example, as there are architectures where a given CPU might
-interchange successive loads to the same location. On such architectures,
+reorder successive loads to the same location. On such architectures,
 ACCESS_ONCE() does whatever is necessary to prevent this, for example, on
 Itanium the volatile casts used by ACCESS_ONCE() cause GCC to emit the
 special ld.acq and st.rel instructions that prevent such reordering.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f3706c6b2e21..cda2583d67e9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1007,11 +1007,21 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define kfree_rcu(ptr, rcu_head)					\
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
-#ifdef CONFIG_RCU_NOCB_CPU
+#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
+static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+{
+	*delta_jiffies = ULONG_MAX;
+	return 0;
+}
+#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
+
+#if defined(CONFIG_RCU_NOCB_CPU_ALL)
+static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
+#elif defined(CONFIG_RCU_NOCB_CPU)
 bool rcu_is_nocb_cpu(int cpu);
 #else
 static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+#endif
 
 
 /* Only for use by adaptive-ticks code. */
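
To show what these stubs buy, here is a hedged, simplified sketch of the kind of check a NO_HZ idle-entry path performs; the function name and surrounding logic are invented, and only rcu_needs_cpu()'s contract, returning zero and setting *delta_jiffies when RCU does not need the CPU, comes from the code above.

	static bool my_can_stop_tick(int cpu)
	{
		unsigned long rcu_delta_jiffies;

		if (rcu_needs_cpu(cpu, &rcu_delta_jiffies))
			return false;	/* RCU still needs this CPU's tick soon */

		/*
		 * Under CONFIG_TINY_RCU or CONFIG_RCU_NOCB_CPU_ALL, the static
		 * inline stub above always returns 0 with *delta_jiffies set
		 * to ULONG_MAX, so RCU never prevents stopping the tick here.
		 */
		return true;
	}
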
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index c364e9148de2..e8cb6e3b52a7 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -68,12 +68,6 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 	call_rcu(head, func);
 }
 
-static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
-{
-	*delta_jiffies = ULONG_MAX;
-	return 0;
-}
-
 static inline void rcu_note_context_switch(int cpu)
 {
 	rcu_sched_qs(cpu);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 08b084068967..e9c63884df0a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -31,7 +31,9 @@
 #define __LINUX_RCUTREE_H
 
 void rcu_note_context_switch(int cpu);
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies);
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 void rcu_cpu_stall_reset(void);
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7ed5db2dd79..351faba48b91 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2878,7 +2878,7 @@ static int rcu_pending(int cpu)
  * non-NULL, store an indication of whether all callbacks are lazy.
  * (If there are no callbacks, all of them are deemed to be lazy.)
  */
-static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
 {
 	bool al = true;
 	bool hc = false;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fffe4178a23d..962d1d589929 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
 	return rcu_cpu_has_callbacks(cpu, NULL);
 }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1658,7 @@ extern int tick_nohz_active;
  * only if it has been awhile since the last time we did so. Afterwards,
  * if there are any callbacks ready for immediate invocation, return true.
  */
-static bool rcu_try_advance_all_cbs(void)
+static bool __maybe_unused rcu_try_advance_all_cbs(void)
 {
 	bool cbs_ready = false;
 	struct rcu_data *rdp;
@@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void)
  *
  * The caller must have disabled interrupts.
  */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(int cpu, unsigned long *dj)
 {
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
 	}
 	return 0;
 }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
  */
 static void rcu_prepare_for_idle(int cpu)
 {
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 	struct rcu_node *rnp;
@@ -1790,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu)
 		rcu_accelerate_cbs(rsp, rnp, rdp);
 		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 	}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1799,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu)
  */
 static void rcu_cleanup_after_idle(int cpu)
 {
-
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	if (rcu_is_nocb_cpu(cpu))
 		return;
 	if (rcu_try_advance_all_cbs())
 		invoke_rcu_core();
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -2101,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 	init_waitqueue_head(&rnp->nocb_gp_wq[1]);
 }
 
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 /* Is the specified CPU a no-CPUs CPU? */
 bool rcu_is_nocb_cpu(int cpu)
 {
@@ -2108,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu)
 		return cpumask_test_cpu(cpu, rcu_nocb_mask);
 	return false;
 }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Enqueue the specified string of rcu_head structures onto the specified