 Documentation/RCU/RTFP.txt                | 149
 Documentation/RCU/checklist.txt           |  18
 Documentation/kernel-per-CPU-kthreads.txt |  13
 Documentation/memory-barriers.txt         | 137
 include/linux/rcupdate.h                  |  14
 include/linux/rcutiny.h                   |   6
 include/linux/rcutree.h                   |   2
 kernel/rcu/tree.c                         |   2
 kernel/rcu/tree_plugin.h                  |  13
 9 files changed, 275 insertions(+), 79 deletions(-)
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 273e654d7d08..2f0fcb2112d2 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -31,6 +31,14 @@ has lapsed, so this approach may be used in non-GPL software, if desired.
 (In contrast, implementation of RCU is permitted only in software licensed
 under either GPL or LGPL. Sorry!!!)
 
+In 1987, Rashid et al. described lazy TLB-flush [RichardRashid87a].
+At first glance, this has nothing to do with RCU, but nevertheless
+this paper helped inspire the update-side batching used in the later
+RCU implementation in DYNIX/ptx. In 1988, Barbara Liskov published
+a description of Argus that noted that use of out-of-date values can
+be tolerated in some situations. Thus, this paper provides some early
+theoretical justification for use of stale data.
+
 In 1990, Pugh [Pugh90] noted that explicitly tracking which threads
 were reading a given data structure permitted deferred free to operate
 in the presence of non-terminating threads. However, this explicit
@@ -41,11 +49,11 @@ providing a fine-grained locking design, however, it would be interesting
 to see how much of the performance advantage reported in 1990 remains
 today.
 
-At about this same time, Adams [Adams91] described ``chaotic relaxation'',
-where the normal barriers between successive iterations of convergent
-numerical algorithms are relaxed, so that iteration $n$ might use
-data from iteration $n-1$ or even $n-2$. This introduces error,
-which typically slows convergence and thus increases the number of
+At about this same time, Andrews [Andrews91textbook] described ``chaotic
+relaxation'', where the normal barriers between successive iterations
+of convergent numerical algorithms are relaxed, so that iteration $n$
+might use data from iteration $n-1$ or even $n-2$. This introduces
+error, which typically slows convergence and thus increases the number of
 iterations required. However, this increase is sometimes more than made
 up for by a reduction in the number of expensive barrier operations,
 which are otherwise required to synchronize the threads at the end
@@ -55,7 +63,8 @@ is thus inapplicable to most data structures in operating-system kernels.
 
 In 1992, Henry (now Alexia) Massalin completed a dissertation advising
 parallel programmers to defer processing when feasible to simplify
-synchronization. RCU makes extremely heavy use of this advice.
+synchronization [HMassalinPhD]. RCU makes extremely heavy use of
+this advice.
 
 In 1993, Jacobson [Jacobson93] verbally described what is perhaps the
 simplest deferred-free technique: simply waiting a fixed amount of time
@@ -90,27 +99,29 @@ mechanism, which is quite similar to RCU [Gamsa99]. These operating
 systems made pervasive use of RCU in place of "existence locks", which
 greatly simplifies locking hierarchies and helps avoid deadlocks.
 
-2001 saw the first RCU presentation involving Linux [McKenney01a]
-at OLS. The resulting abundance of RCU patches was presented the
-following year [McKenney02a], and use of RCU in dcache was first
-described that same year [Linder02a].
+The year 2000 saw an email exchange that would likely have
+led to yet another independent invention of something like RCU
+[RustyRussell2000a,RustyRussell2000b]. Instead, 2001 saw the first
+RCU presentation involving Linux [McKenney01a] at OLS. The resulting
+abundance of RCU patches was presented the following year [McKenney02a],
+and use of RCU in dcache was first described that same year [Linder02a].
 
 Also in 2002, Michael [Michael02b,Michael02a] presented "hazard-pointer"
 techniques that defer the destruction of data structures to simplify
 non-blocking synchronization (wait-free synchronization, lock-free
 synchronization, and obstruction-free synchronization are all examples of
-non-blocking synchronization). In particular, this technique eliminates
-locking, reduces contention, reduces memory latency for readers, and
-parallelizes pipeline stalls and memory latency for writers. However,
-these techniques still impose significant read-side overhead in the
-form of memory barriers. Researchers at Sun worked along similar lines
-in the same timeframe [HerlihyLM02]. These techniques can be thought
-of as inside-out reference counts, where the count is represented by the
-number of hazard pointers referencing a given data structure rather than
-the more conventional counter field within the data structure itself.
-The key advantage of inside-out reference counts is that they can be
-stored in immortal variables, thus allowing races between access and
-deletion to be avoided.
+non-blocking synchronization). The corresponding journal article appeared
+in 2004 [MagedMichael04a]. This technique eliminates locking, reduces
+contention, reduces memory latency for readers, and parallelizes pipeline
+stalls and memory latency for writers. However, these techniques still
+impose significant read-side overhead in the form of memory barriers.
+Researchers at Sun worked along similar lines in the same timeframe
+[HerlihyLM02]. These techniques can be thought of as inside-out reference
+counts, where the count is represented by the number of hazard pointers
+referencing a given data structure rather than the more conventional
+counter field within the data structure itself. The key advantage
+of inside-out reference counts is that they can be stored in immortal
+variables, thus allowing races between access and deletion to be avoided.
 
 By the same token, RCU can be thought of as a "bulk reference count",
 where some form of reference counter covers all reference by a given CPU
@@ -123,8 +134,10 @@ can be thought of in other terms as well.
 
 In 2003, the K42 group described how RCU could be used to create
 hot-pluggable implementations of operating-system functions [Appavoo03a].
-Later that year saw a paper describing an RCU implementation of System
-V IPC [Arcangeli03], and an introduction to RCU in Linux Journal
+Later that year saw a paper describing an RCU implementation
+of System V IPC [Arcangeli03] (following up on a suggestion by
+Hugh Dickins [Dickins02a] and an implementation by Mingming Cao
+[MingmingCao2002IPCRCU]), and an introduction to RCU in Linux Journal
 [McKenney03a].
 
 2004 has seen a Linux-Journal article on use of RCU in dcache
@@ -383,6 +396,21 @@ for Programming Languages and Operating Systems}"
 }
 }
 
+@phdthesis{HMassalinPhD
+,author="H. Massalin"
+,title="Synthesis: An Efficient Implementation of Fundamental Operating
+System Services"
+,school="Columbia University"
+,address="New York, NY"
+,year="1992"
+,annotation={
+	Mondo optimizing compiler.
+	Wait-free stuff.
+	Good advice: defer work to avoid synchronization. See page 90
+	(PDF page 106), Section 5.4, fourth bullet point.
+}
+}
+
 @unpublished{Jacobson93
 ,author="Van Jacobson"
 ,title="Avoid Read-Side Locking Via Delayed Free"
@@ -671,6 +699,20 @@ Orran Krieger and Rusty Russell and Dipankar Sarma and Maneesh Soni"
 [Viewed October 18, 2004]"
 }
 
+@conference{Michael02b
+,author="Maged M. Michael"
+,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM
+Symposium on Parallel
+Algorithms and Architecture}"
+,pages="73-82"
+,annotation={
+	Like the title says...
+}
+}
+
 @Conference{Linder02a
 ,Author="Hanna Linder and Dipankar Sarma and Maneesh Soni"
 ,Title="Scalability of the Directory Entry Cache"
@@ -727,6 +769,24 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 }
 }
 
+@conference{Michael02a
+,author="Maged M. Michael"
+,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic
+Reads and Writes"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM
+Symposium on Principles of Distributed Computing}"
+,pages="21-30"
+,annotation={
+	Each thread keeps an array of pointers to items that it is
+	currently referencing. Sort of an inside-out garbage collection
+	mechanism, but one that requires the accessing code to explicitly
+	state its needs. Also requires read-side memory barriers on
+	most architectures.
+}
+}
+
 @unpublished{Dickins02a
 ,author="Hugh Dickins"
 ,title="Use RCU for System-V IPC"
@@ -735,6 +795,17 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 ,note="private communication"
 }
 
+@InProceedings{HerlihyLM02
+,author={Maurice Herlihy and Victor Luchangco and Mark Moir}
+,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized,
+Lock-Free Data Structures"
+,booktitle={Proceedings of 16\textsuperscript{th} International
+Symposium on Distributed Computing}
+,year=2002
+,month="October"
+,pages="339-353"
+}
+
 @unpublished{Sarma02b
 ,Author="Dipankar Sarma"
 ,Title="Some dcache\_rcu benchmark numbers"
@@ -749,6 +820,19 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 }
 }
 
+@unpublished{MingmingCao2002IPCRCU
+,Author="Mingming Cao"
+,Title="[PATCH]updated ipc lock patch"
+,month="October"
+,year="2002"
+,note="Available:
+\url{https://lkml.org/lkml/2002/10/24/262}
+[Viewed February 15, 2014]"
+,annotation={
+	Mingming Cao's patch to introduce RCU to SysV IPC.
+}
+}
+
 @unpublished{LinusTorvalds2003a
 ,Author="Linus Torvalds"
 ,Title="Re: {[PATCH]} small fixes in brlock.h"
@@ -982,6 +1066,23 @@ Realtime Applications"
 }
 }
 
+@article{MagedMichael04a
+,author="Maged M. Michael"
+,title="Hazard Pointers: Safe Memory Reclamation for Lock-Free Objects"
+,Year="2004"
+,Month="June"
+,journal="IEEE Transactions on Parallel and Distributed Systems"
+,volume="15"
+,number="6"
+,pages="491-504"
+,url="Available:
+\url{http://www.research.ibm.com/people/m/michael/ieeetpds-2004.pdf}
+[Viewed March 1, 2005]"
+,annotation={
+	New canonical hazard-pointer citation.
+}
+}
+
 @phdthesis{PaulEdwardMcKenneyPhD
 ,author="Paul E. McKenney"
 ,title="Exploiting Deferred Destruction:
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 91266193b8f4..9d10d1db16a5 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -256,10 +256,10 @@ over a rather long period of time, but improvements are always welcome!
 		variations on this theme.
 
 	b.	Limiting update rate. For example, if updates occur only
-		once per hour, then no explicit rate limiting is required,
-		unless your system is already badly broken. The dcache
-		subsystem takes this approach -- updates are guarded
-		by a global lock, limiting their rate.
+		once per hour, then no explicit rate limiting is
+		required, unless your system is already badly broken.
+		Older versions of the dcache subsystem take this approach,
+		guarding updates with a global lock, limiting their rate.
 
 	c.	Trusted update -- if updates can only be done manually by
 		superuser or some other trusted user, then it might not
@@ -268,7 +268,8 @@ over a rather long period of time, but improvements are always welcome!
 		the machine.
 
 	d.	Use call_rcu_bh() rather than call_rcu(), in order to take
-		advantage of call_rcu_bh()'s faster grace periods.
+		advantage of call_rcu_bh()'s faster grace periods. (This
+		is only a partial solution, though.)
 
 	e.	Periodically invoke synchronize_rcu(), permitting a limited
 		number of updates per grace period.
@@ -276,6 +277,13 @@ over a rather long period of time, but improvements are always welcome!
 	The same cautions apply to call_rcu_bh(), call_rcu_sched(),
 	call_srcu(), and kfree_rcu().
 
+	Note that although these primitives do take action to avoid memory
+	exhaustion when any given CPU has too many callbacks, a determined
+	user could still exhaust memory. This is especially the case
+	if a system with a large number of CPUs has been configured to
+	offload all of its RCU callbacks onto a single CPU, or if the
+	system has relatively little free memory.
+
 9.	All RCU list-traversal primitives, which include
 	rcu_dereference(), list_for_each_entry_rcu(), and
 	list_for_each_safe_rcu(), must be either within an RCU read-side
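
As a concrete illustration of approach (e) above, here is a minimal sketch of an updater that invokes synchronize_rcu() after every UPDATE_BATCH removals, bounding the number of call_rcu() callbacks that can be outstanding at any time. All names here (struct my_entry, my_lock, UPDATE_BATCH, and the helpers) are invented for this sketch; only the RCU and list primitives are real kernel APIs.

	#include <linux/kernel.h>
	#include <linux/rculist.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	#define UPDATE_BATCH 16		/* assumed tuning knob, not from the patch */

	struct my_entry {
		struct list_head list;
		struct rcu_head rcu;
		int key;
	};

	static LIST_HEAD(my_list);
	static DEFINE_SPINLOCK(my_lock);
	static int my_updates_since_gp;

	static void my_entry_free_cb(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct my_entry, rcu));
	}

	/* Remove an entry; every UPDATE_BATCH removals, wait out a grace period. */
	static void my_remove_entry(struct my_entry *e)
	{
		bool throttle;

		spin_lock(&my_lock);
		list_del_rcu(&e->list);
		throttle = ++my_updates_since_gp >= UPDATE_BATCH;
		if (throttle)
			my_updates_since_gp = 0;
		spin_unlock(&my_lock);

		if (throttle) {
			synchronize_rcu();	/* self-limits the update rate */
			kfree(e);
		} else {
			call_rcu(&e->rcu, my_entry_free_cb);
		}
	}

Where the structure contains nothing beyond the rcu_head and its payload, kfree_rcu() would be simpler still, but it is the periodic synchronize_rcu() that supplies the self-limiting property discussed above.
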
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index 827104fb9364..f3cd299fcc41 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -162,7 +162,18 @@ Purpose: Execute workqueue requests
 To reduce its OS jitter, do any of the following:
 1.	Run your workload at a real-time priority, which will allow
 	preempting the kworker daemons.
-2.	Do any of the following needed to avoid jitter that your
+2.	A given workqueue can be made visible in the sysfs filesystem
+	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
+	Such a workqueue can be confined to a given subset of the
+	CPUs using the /sys/devices/virtual/workqueue/*/cpumask sysfs
+	files. The set of WQ_SYSFS workqueues can be displayed using
+	"ls sys/devices/virtual/workqueue". That said, the workqueues
+	maintainer would like to caution people against indiscriminately
+	sprinkling WQ_SYSFS across all the workqueues. The reason for
+	caution is that it is easy to add WQ_SYSFS, but because sysfs is
+	part of the formal user/kernel API, it can be nearly impossible
+	to remove it, even if its addition was a mistake.
+3.	Do any of the following needed to avoid jitter that your
 	application cannot tolerate:
 	a.	Build your kernel with CONFIG_SLUB=y rather than
 		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
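
For reference, a hedged sketch of the WQ_SYSFS usage described in item 2 above follows. The workqueue name "my_driver_wq" and the work function are invented for illustration; alloc_workqueue(), WQ_SYSFS, and WQ_UNBOUND are the real kernel APIs, and the cpumask file applies to unbound workqueues.

	#include <linux/workqueue.h>

	static struct workqueue_struct *my_wq;

	static void my_work_fn(struct work_struct *work)
	{
		/* ... the actual deferred work would go here ... */
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static int __init my_driver_init(void)
	{
		/*
		 * WQ_SYSFS creates /sys/devices/virtual/workqueue/my_driver_wq/,
		 * whose "cpumask" file lets an administrator confine this
		 * workqueue's workers to a chosen subset of CPUs, for example:
		 *	echo 3 > /sys/devices/virtual/workqueue/my_driver_wq/cpumask
		 */
		my_wq = alloc_workqueue("my_driver_wq", WQ_SYSFS | WQ_UNBOUND, 0);
		if (!my_wq)
			return -ENOMEM;
		queue_work(my_wq, &my_work);
		return 0;
	}

Per the caution above, WQ_SYSFS becomes part of the user/kernel API once added, so it is best reserved for workqueues whose CPU placement genuinely needs administrator control.
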
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 102dc19c4119..11c1d2049662 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -608,26 +608,30 @@ as follows:
 	b = p;  /* BUG: Compiler can reorder!!! */
 	do_something();
 
-The solution is again ACCESS_ONCE(), which preserves the ordering between
-the load from variable 'a' and the store to variable 'b':
+The solution is again ACCESS_ONCE() and barrier(), which preserves the
+ordering between the load from variable 'a' and the store to variable 'b':
 
 	q = ACCESS_ONCE(a);
 	if (q) {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something();
 	} else {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something_else();
 	}
 
-You could also use barrier() to prevent the compiler from moving
-the stores to variable 'b', but barrier() would not prevent the
-compiler from proving to itself that a==1 always, so ACCESS_ONCE()
-is also needed.
+The initial ACCESS_ONCE() is required to prevent the compiler from
+proving the value of 'a', and the pair of barrier() invocations are
+required to prevent the compiler from pulling the two identical stores
+to 'b' out from the legs of the "if" statement.
 
 It is important to note that control dependencies absolutely require a
 a conditional. For example, the following "optimized" version of
-the above example breaks ordering:
+the above example breaks ordering, which is why the barrier() invocations
+are absolutely required if you have identical stores in both legs of
+the "if" statement:
 
 	q = ACCESS_ONCE(a);
 	ACCESS_ONCE(b) = p;  /* BUG: No ordering vs. load from a!!! */
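
To see why both ACCESS_ONCE() and barrier() are needed, here is a stand-alone user-space sketch of the failure mode just described. The two macros are defined locally the same way the kernel defines them; the variables and functions are invented for illustration.

	#define ACCESS_ONCE(x)	(*(volatile typeof(x) *)&(x))
	#define barrier()	__asm__ __volatile__("" : : : "memory")

	int a, b;

	void broken(int p)
	{
		int q = ACCESS_ONCE(a);

		/*
		 * The two legs contain identical stores, so the compiler may
		 * hoist a single "b = p" above the branch. Once the store no
		 * longer depends on the branch, the CPU is free to reorder it
		 * before the load from "a".
		 */
		if (q)
			b = p;
		else
			b = p;
	}

	void fixed(int p)
	{
		int q = ACCESS_ONCE(a);

		if (q) {
			barrier();		/* keeps the store inside this leg */
			ACCESS_ONCE(b) = p;
		} else {
			barrier();
			ACCESS_ONCE(b) = p;
		}
	}
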
@@ -643,9 +647,11 @@ It is of course legal for the prior load to be part of the conditional,
 for example, as follows:
 
 	if (ACCESS_ONCE(a) > 0) {
+		barrier();
 		ACCESS_ONCE(b) = q / 2;
 		do_something();
 	} else {
+		barrier();
 		ACCESS_ONCE(b) = q / 3;
 		do_something_else();
 	}
@@ -659,9 +665,11 @@ the needed conditional. For example:
 
 	q = ACCESS_ONCE(a);
 	if (q % MAX) {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something();
 	} else {
+		barrier();
 		ACCESS_ONCE(b) = p;
 		do_something_else();
 	}
@@ -723,8 +731,13 @@ In summary:
      use smb_rmb(), smp_wmb(), or, in the case of prior stores and
      later loads, smp_mb().
 
+ (*) If both legs of the "if" statement begin with identical stores
+     to the same variable, a barrier() statement is required at the
+     beginning of each leg of the "if" statement.
+
  (*) Control dependencies require at least one run-time conditional
-     between the prior load and the subsequent store. If the compiler
+     between the prior load and the subsequent store, and this
+     conditional must involve the prior load. If the compiler
      is able to optimize the conditional away, it will have also
      optimized away the ordering. Careful use of ACCESS_ONCE() can
      help to preserve the needed conditional.
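
A hedged sketch of that last point, the compiler discarding the conditional, is shown here; as before, ACCESS_ONCE() is defined locally and the variables are invented.

	#define ACCESS_ONCE(x)	(*(volatile typeof(x) *)&(x))

	int a, b;

	void lost_ordering(int p)
	{
		int q = ACCESS_ONCE(a);

		/*
		 * "q || 1" is always true, so the compiler may delete the
		 * branch and emit an unconditional store. The CPU then sees
		 * no dependency between the load from "a" and the store to
		 * "b", and may reorder the store ahead of the load.
		 */
		if (q || 1)
			ACCESS_ONCE(b) = p;
	}
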
@@ -1249,6 +1262,23 @@ The ACCESS_ONCE() function can prevent any number of optimizations that,
 while perfectly safe in single-threaded code, can be fatal in concurrent
 code. Here are some examples of these sorts of optimizations:
 
+ (*) The compiler is within its rights to reorder loads and stores
+     to the same variable, and in some cases, the CPU is within its
+     rights to reorder loads to the same variable. This means that
+     the following code:
+
+	a[0] = x;
+	a[1] = x;
+
+     Might result in an older value of x stored in a[1] than in a[0].
+     Prevent both the compiler and the CPU from doing this as follows:
+
+	a[0] = ACCESS_ONCE(x);
+	a[1] = ACCESS_ONCE(x);
+
+     In short, ACCESS_ONCE() provides cache coherence for accesses from
+     multiple CPUs to a single variable.
+
  (*) The compiler is within its rights to merge successive loads from
      the same variable. Such merging can cause the compiler to "optimize"
      the following code:
@@ -1644,12 +1674,12 @@ for each construct. These operations all imply certain barriers:
      Memory operations issued after the ACQUIRE will be completed after the
      ACQUIRE operation has completed.
 
-     Memory operations issued before the ACQUIRE may be completed after the
-     ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined
-     with a following ACQUIRE, orders prior loads against subsequent stores and
-     stores and prior stores against subsequent stores. Note that this is
-     weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on
-     many architectures.
+     Memory operations issued before the ACQUIRE may be completed after
+     the ACQUIRE operation has completed. An smp_mb__before_spinlock(),
+     combined with a following ACQUIRE, orders prior loads against
+     subsequent loads and stores and also orders prior stores against
+     subsequent stores. Note that this is weaker than smp_mb()! The
+     smp_mb__before_spinlock() primitive is free on many architectures.
 
  (2) RELEASE operation implication:
 
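
As a minimal sketch of the smp_mb__before_spinlock() guarantee just described, consider the following; the lock, the flag and data variables, and the function are all invented, and only the ordering claim comes from the text above.

	#include <linux/compiler.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(my_lock);
	static int flag, data;

	void publish_then_update(void)
	{
		ACCESS_ONCE(flag) = 1;		/* prior store */
		smp_mb__before_spinlock();	/* weaker than smp_mb(), often free */
		spin_lock(&my_lock);		/* ACQUIRE */
		ACCESS_ONCE(data) = 42;		/* store ordered after the store to flag */
		spin_unlock(&my_lock);
	}
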
@@ -1694,24 +1724,21 @@ may occur as:
 
 	ACQUIRE M, STORE *B, STORE *A, RELEASE M
 
-This same reordering can of course occur if the lock's ACQUIRE and RELEASE are
-to the same lock variable, but only from the perspective of another CPU not
-holding that lock.
-
-In short, a RELEASE followed by an ACQUIRE may -not- be assumed to be a full
-memory barrier because it is possible for a preceding RELEASE to pass a
-later ACQUIRE from the viewpoint of the CPU, but not from the viewpoint
-of the compiler. Note that deadlocks cannot be introduced by this
-interchange because if such a deadlock threatened, the RELEASE would
-simply complete.
-
-If it is necessary for a RELEASE-ACQUIRE pair to produce a full barrier, the
-ACQUIRE can be followed by an smp_mb__after_unlock_lock() invocation. This
-will produce a full barrier if either (a) the RELEASE and the ACQUIRE are
-executed by the same CPU or task, or (b) the RELEASE and ACQUIRE act on the
-same variable. The smp_mb__after_unlock_lock() primitive is free on many
-architectures. Without smp_mb__after_unlock_lock(), the critical sections
-corresponding to the RELEASE and the ACQUIRE can cross:
+When the ACQUIRE and RELEASE are a lock acquisition and release,
+respectively, this same reordering can occur if the lock's ACQUIRE and
+RELEASE are to the same lock variable, but only from the perspective of
+another CPU not holding that lock. In short, a ACQUIRE followed by an
+RELEASE may -not- be assumed to be a full memory barrier.
+
+Similarly, the reverse case of a RELEASE followed by an ACQUIRE does not
+imply a full memory barrier. If it is necessary for a RELEASE-ACQUIRE
+pair to produce a full barrier, the ACQUIRE can be followed by an
+smp_mb__after_unlock_lock() invocation. This will produce a full barrier
+if either (a) the RELEASE and the ACQUIRE are executed by the same
+CPU or task, or (b) the RELEASE and ACQUIRE act on the same variable.
+The smp_mb__after_unlock_lock() primitive is free on many architectures.
+Without smp_mb__after_unlock_lock(), the CPU's execution of the critical
+sections corresponding to the RELEASE and the ACQUIRE can cross, so that:
 
 	*A = a;
 	RELEASE M
@@ -1722,7 +1749,36 @@ could occur as:
 
 	ACQUIRE N, STORE *B, STORE *A, RELEASE M
 
-With smp_mb__after_unlock_lock(), they cannot, so that:
+It might appear that this reordering could introduce a deadlock.
+However, this cannot happen because if such a deadlock threatened,
+the RELEASE would simply complete, thereby avoiding the deadlock.
+
+	Why does this work?
+
+	One key point is that we are only talking about the CPU doing
+	the reordering, not the compiler. If the compiler (or, for
+	that matter, the developer) switched the operations, deadlock
+	-could- occur.
+
+	But suppose the CPU reordered the operations. In this case,
+	the unlock precedes the lock in the assembly code. The CPU
+	simply elected to try executing the later lock operation first.
+	If there is a deadlock, this lock operation will simply spin (or
+	try to sleep, but more on that later). The CPU will eventually
+	execute the unlock operation (which preceded the lock operation
+	in the assembly code), which will unravel the potential deadlock,
+	allowing the lock operation to succeed.
+
+	But what if the lock is a sleeplock? In that case, the code will
+	try to enter the scheduler, where it will eventually encounter
+	a memory barrier, which will force the earlier unlock operation
+	to complete, again unraveling the deadlock. There might be
+	a sleep-unlock race, but the locking primitive needs to resolve
+	such races properly in any case.
+
+With smp_mb__after_unlock_lock(), the two critical sections cannot overlap.
+For example, with the following code, the store to *A will always be
+seen by other CPUs before the store to *B:
 
 	*A = a;
 	RELEASE M
@@ -1730,13 +1786,18 @@ With smp_mb__after_unlock_lock(), they cannot, so that:
 	smp_mb__after_unlock_lock();
 	*B = b;
 
-will always occur as either of the following:
+The operations will always occur in one of the following orders:
 
-	STORE *A, RELEASE, ACQUIRE, STORE *B
-	STORE *A, ACQUIRE, RELEASE, STORE *B
+	STORE *A, RELEASE, ACQUIRE, smp_mb__after_unlock_lock(), STORE *B
+	STORE *A, ACQUIRE, RELEASE, smp_mb__after_unlock_lock(), STORE *B
+	ACQUIRE, STORE *A, RELEASE, smp_mb__after_unlock_lock(), STORE *B
 
 If the RELEASE and ACQUIRE were instead both operating on the same lock
-variable, only the first of these two alternatives can occur.
+variable, only the first of these alternatives can occur. In addition,
+the more strongly ordered systems may rule out some of the above orders.
+But in any case, as noted earlier, the smp_mb__after_unlock_lock()
+ensures that the store to *A will always be seen as happening before
+the store to *B.
 
 Locks and semaphores may not provide any guarantee of ordering on UP compiled
 systems, and so cannot be counted on in such a situation to actually achieve
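
Expressed with spinlocks, the RELEASE-ACQUIRE recipe above looks like the following sketch. The lock names and stored values are invented; only smp_mb__after_unlock_lock() and its guarantee, that the store to A is seen by all CPUs before the store to B, come from the text.

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(m);
	static DEFINE_SPINLOCK(n);
	static int A, B;

	void ordered_writer(void)
	{
		spin_lock(&m);
		A = 1;				/* store inside critical section M */
		spin_unlock(&m);		/* RELEASE M */
		spin_lock(&n);			/* ACQUIRE N */
		smp_mb__after_unlock_lock();	/* upgrades RELEASE+ACQUIRE to a full barrier */
		B = 1;				/* seen by other CPUs only after A */
		spin_unlock(&n);
	}
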
@@ -2757,7 +2818,7 @@ in that order, but, without intervention, the sequence may have almost any
 combination of elements combined or discarded, provided the program's view of
 the world remains consistent. Note that ACCESS_ONCE() is -not- optional
 in the above example, as there are architectures where a given CPU might
-interchange successive loads to the same location. On such architectures,
+reorder successive loads to the same location. On such architectures,
 ACCESS_ONCE() does whatever is necessary to prevent this, for example, on
 Itanium the volatile casts used by ACCESS_ONCE() cause GCC to emit the
 special ld.acq and st.rel instructions that prevent such reordering.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f3706c6b2e21..cda2583d67e9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1007,11 +1007,21 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define kfree_rcu(ptr, rcu_head)					\
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
-#ifdef CONFIG_RCU_NOCB_CPU
+#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
+static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+{
+	*delta_jiffies = ULONG_MAX;
+	return 0;
+}
+#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
+
+#if defined(CONFIG_RCU_NOCB_CPU_ALL)
+static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
+#elif defined(CONFIG_RCU_NOCB_CPU)
 bool rcu_is_nocb_cpu(int cpu);
 #else
 static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+#endif
 
 
 /* Only for use by adaptive-ticks code. */
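
To show what these stubs buy, here is a hedged, simplified sketch of the kind of check a NO_HZ idle-entry path performs; the function name and surrounding logic are invented, and only rcu_needs_cpu()'s contract, returning zero and setting *delta_jiffies when RCU does not need the CPU, comes from the code above.

	static bool my_can_stop_tick(int cpu)
	{
		unsigned long rcu_delta_jiffies;

		if (rcu_needs_cpu(cpu, &rcu_delta_jiffies))
			return false;	/* RCU still needs this CPU's tick soon */

		/*
		 * Under CONFIG_TINY_RCU or CONFIG_RCU_NOCB_CPU_ALL, the static
		 * inline stub above always returns 0 with *delta_jiffies set
		 * to ULONG_MAX, so RCU never prevents stopping the tick here.
		 */
		return true;
	}
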
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index c364e9148de2..e8cb6e3b52a7 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -68,12 +68,6 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 	call_rcu(head, func);
 }
 
-static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
-{
-	*delta_jiffies = ULONG_MAX;
-	return 0;
-}
-
 static inline void rcu_note_context_switch(int cpu)
 {
 	rcu_sched_qs(cpu);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 08b084068967..e9c63884df0a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -31,7 +31,9 @@
 #define __LINUX_RCUTREE_H
 
 void rcu_note_context_switch(int cpu);
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies);
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 void rcu_cpu_stall_reset(void);
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7ed5db2dd79..351faba48b91 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2878,7 +2878,7 @@ static int rcu_pending(int cpu)
  * non-NULL, store an indication of whether all callbacks are lazy.
  * (If there are no callbacks, all of them are deemed to be lazy.)
  */
-static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
 {
 	bool al = true;
 	bool hc = false;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fffe4178a23d..962d1d589929 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
 	return rcu_cpu_has_callbacks(cpu, NULL);
 }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1658,7 @@ extern int tick_nohz_active;
  * only if it has been awhile since the last time we did so. Afterwards,
  * if there are any callbacks ready for immediate invocation, return true.
  */
-static bool rcu_try_advance_all_cbs(void)
+static bool __maybe_unused rcu_try_advance_all_cbs(void)
 {
 	bool cbs_ready = false;
 	struct rcu_data *rdp;
@@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void)
  *
  * The caller must have disabled interrupts.
  */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(int cpu, unsigned long *dj)
 {
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
 	}
 	return 0;
 }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
  */
 static void rcu_prepare_for_idle(int cpu)
 {
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 	struct rcu_node *rnp;
@@ -1790,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu)
 		rcu_accelerate_cbs(rsp, rnp, rdp);
 		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 	}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1799,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu)
  */
 static void rcu_cleanup_after_idle(int cpu)
 {
-
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	if (rcu_is_nocb_cpu(cpu))
 		return;
 	if (rcu_try_advance_all_cbs())
 		invoke_rcu_core();
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -2101,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 	init_waitqueue_head(&rnp->nocb_gp_wq[1]);
 }
 
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
 /* Is the specified CPU a no-CPUs CPU? */
 bool rcu_is_nocb_cpu(int cpu)
 {
@@ -2108,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu)
 		return cpumask_test_cpu(cpu, rcu_nocb_mask);
 	return false;
 }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Enqueue the specified string of rcu_head structures onto the specified