-rw-r--r--  Documentation/RCU/trace.txt       254
-rw-r--r--  Documentation/RCU/whatisRCU.txt     2
-rw-r--r--  include/linux/hardirq.h            24
-rw-r--r--  include/linux/rcupdate.h           10
-rw-r--r--  include/linux/rcutiny.h           104
-rw-r--r--  include/linux/rcutree.h             7
-rw-r--r--  include/linux/srcu.h                1
-rw-r--r--  init/Kconfig                        9
-rw-r--r--  kernel/Makefile                     1
-rw-r--r--  kernel/rcupdate.c                 122
-rw-r--r--  kernel/rcutiny.c                  282
-rw-r--r--  kernel/rcutorture.c                65
-rw-r--r--  kernel/rcutree.c                  465
-rw-r--r--  kernel/rcutree.h                   69
-rw-r--r--  kernel/rcutree_plugin.h           309
-rw-r--r--  kernel/rcutree_trace.c             12
-rw-r--r--  kernel/sched.c                      1
-rw-r--r--  kernel/softirq.c                    2
-rw-r--r--  kernel/srcu.c                      74
-rw-r--r--  lib/Kconfig.debug                   2
20 files changed, 1234 insertions(+), 581 deletions(-)
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 187bbf10c923..8608fd85e921 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,185 +1,10 @@
1CONFIG_RCU_TRACE debugfs Files and Formats 1CONFIG_RCU_TRACE debugfs Files and Formats
2 2
3 3
4The rcupreempt and rcutree implementations of RCU provide debugfs trace 4The rcutree implementation of RCU provides debugfs trace output that
5output that summarizes counters and state. This information is useful for 5summarizes counters and state. This information is useful for debugging
6debugging RCU itself, and can sometimes also help to debug abuses of RCU. 6RCU itself, and can sometimes also help to debug abuses of RCU.
7Note that the rcuclassic implementation of RCU does not provide debugfs 7The following sections describe the debugfs files and formats.
8trace output.
9
10The following sections describe the debugfs files and formats for
11preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
12
13
14Preemptable RCU debugfs Files and Formats
15
16This implementation of RCU provides three debugfs files under the
17top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
18counters used by preemptable RCU), rcu/rcugp (which displays grace-period
19counters), and rcu/rcustats (which displays internal counters for debugging RCU).
20
21The output of "cat rcu/rcuctrs" looks as follows:
22
23CPU last cur F M
24 0 5 -5 0 0
25 1 -1 0 0 0
26 2 0 1 0 0
27 3 0 1 0 0
28 4 0 1 0 0
29 5 0 1 0 0
30 6 0 2 0 0
31 7 0 -1 0 0
32 8 0 1 0 0
33ggp = 26226, state = waitzero
34
35The per-CPU fields are as follows:
36
37o "CPU" gives the CPU number. Offline CPUs are not displayed.
38
39o "last" gives the value of the counter that is being decremented
40 for the current grace period phase. In the example above,
41 the counters sum to 4, indicating that there are four
42 RCU read-side critical sections still running that started
43 before the last counter flip.
44
45o "cur" gives the value of the counter that is currently being
46 both incremented (by rcu_read_lock()) and decremented (by
47 rcu_read_unlock()). In the example above, the counters sum to
48 1, indicating that there is only one RCU read-side critical section
49 still running that started after the last counter flip.
50
51o "F" indicates whether RCU is waiting for this CPU to acknowledge
52 a counter flip. In the above example, RCU is not waiting on any,
53 which is consistent with the state being "waitzero" rather than
54 "waitack".
55
56o "M" indicates whether RCU is waiting for this CPU to execute a
57 memory barrier. In the above example, RCU is not waiting on any,
58 which is consistent with the state being "waitzero" rather than
59 "waitmb".
60
61o "ggp" is the global grace-period counter.
62
63o "state" is the RCU state, which can be one of the following:
64
65 o "idle": there is no grace period in progress.
66
67 o "waitack": RCU just incremented the global grace-period
68 counter, which has the effect of reversing the roles of
69 the "last" and "cur" counters above, and is waiting for
70 all the CPUs to acknowledge the flip. Once the flip has
71 been acknowledged, CPUs will no longer be incrementing
72 what are now the "last" counters, so that their sum will
73 decrease monotonically down to zero.
74
75 o "waitzero": RCU is waiting for the sum of the "last" counters
76 to decrease to zero.
77
78 o "waitmb": RCU is waiting for each CPU to execute a memory
79 barrier, which ensures that instructions from a given CPU's
80 last RCU read-side critical section cannot be reordered
81 with instructions following the memory-barrier instruction.
82
83The output of "cat rcu/rcugp" looks as follows:
84
85oldggp=48870 newggp=48873
86
87Note that reading from this file provokes a synchronize_rcu(). The
88"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
89executing the synchronize_rcu(), and the "newggp" value is also the
90"ggp" value, but taken after the synchronize_rcu() command returns.
91
92
93The output of "cat rcu/rcustats" looks as follows:
94
95na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
961=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
97z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
98
99These are counters tracking internal preemptable-RCU events; however,
100some of them may be useful for debugging algorithms using RCU. In
101particular, the "nl", "wl", and "dl" values track the number of RCU
102callbacks in various states. The fields are as follows:
103
104o "na" is the total number of RCU callbacks that have been enqueued
105 since boot.
106
107o "nl" is the number of RCU callbacks waiting for the previous
108 grace period to end so that they can start waiting on the next
109 grace period.
110
111o "wa" is the total number of RCU callbacks that have started waiting
112 for a grace period since boot. "na" should be roughly equal to
113 "nl" plus "wa".
114
115o "wl" is the number of RCU callbacks currently waiting for their
116 grace period to end.
117
118o "da" is the total number of RCU callbacks whose grace periods
119 have completed since boot. "wa" should be roughly equal to
120 "wl" plus "da".
121
122o "dr" is the total number of RCU callbacks that have been removed
123 from the list of callbacks ready to invoke. "dr" should be roughly
124 equal to "da".
125
126o "di" is the total number of RCU callbacks that have been invoked
127 since boot. "di" should be roughly equal to "da", though some
128 early versions of preemptable RCU had a bug so that only the
129 last CPU's count of invocations was displayed, rather than the
130 sum of all CPUs' counts.
131
132o "1" is the number of calls to rcu_try_flip(). This should be
133 roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1"
134 described below. In other words, the number of times that
135 the state machine is visited should be equal to the sum of the
136 number of times that each state is visited plus the number of
137 times that the state-machine lock acquisition failed.
138
139o "e1" is the number of times that rcu_try_flip() was unable to
140 acquire the fliplock.
141
142o "i1" is the number of calls to rcu_try_flip_idle().
143
144o "ie1" is the number of times rcu_try_flip_idle() exited early
145 due to the calling CPU having no work for RCU.
146
147o "g1" is the number of times that rcu_try_flip_idle() decided
148 to start a new grace period. "i1" should be roughly equal to
149 "ie1" plus "g1".
150
151o "a1" is the number of calls to rcu_try_flip_waitack().
152
153o "ae1" is the number of times that rcu_try_flip_waitack() found
154 that at least one CPU had not yet acknowledged the new grace period
155 (AKA "counter flip").
156
157o "a2" is the number of time rcu_try_flip_waitack() found that
158 all CPUs had acknowledged. "a1" should be roughly equal to
159 "ae1" plus "a2". (This particular output was collected on
160 a 128-CPU machine, hence the smaller-than-usual fraction of
161 calls to rcu_try_flip_waitack() finding all CPUs having already
162 acknowledged.)
163
164o "z1" is the number of calls to rcu_try_flip_waitzero().
165
166o "ze1" is the number of times that rcu_try_flip_waitzero() found
167 that not all of the old RCU read-side critical sections had
168 completed.
169
170o "z2" is the number of times that rcu_try_flip_waitzero() finds
171 the sum of the counters equal to zero, in other words, that
172 all of the old RCU read-side critical sections had completed.
173 The value of "z1" should be roughly equal to "ze1" plus
174 "z2".
175
176o "m1" is the number of calls to rcu_try_flip_waitmb().
177
178o "me1" is the number of times that rcu_try_flip_waitmb() finds
179 that at least one CPU has not yet executed a memory barrier.
180
181o "m2" is the number of times that rcu_try_flip_waitmb() finds that
182 all CPUs have executed a memory barrier.
183 8
184 9
185Hierarchical RCU debugfs Files and Formats 10Hierarchical RCU debugfs Files and Formats
@@ -210,9 +35,10 @@ rcu_bh:
210 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 35 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
211 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 36 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
212 37
213The first section lists the rcu_data structures for rcu, the second for 38The first section lists the rcu_data structures for rcu_sched, the second
214rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. 39for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
215The fields are as follows: 40additional section for rcu_preempt. Each section has one line per CPU,
41or eight for this 8-CPU system. The fields are as follows:
216 42
217o The number at the beginning of each line is the CPU number. 43o The number at the beginning of each line is the CPU number.
218 CPU numbers followed by an exclamation mark are offline, 44 CPU numbers followed by an exclamation mark are offline,
@@ -223,9 +49,9 @@ o The number at the beginning of each line is the CPU number.
223 49
224o "c" is the count of grace periods that this CPU believes have 50o "c" is the count of grace periods that this CPU believes have
225 completed. CPUs in dynticks idle mode may lag quite a ways 51 completed. CPUs in dynticks idle mode may lag quite a ways
226 behind, for example, CPU 4 under "rcu" above, which has slept 52 behind, for example, CPU 4 under "rcu_sched" above, which has
227 through the past 25 RCU grace periods. It is not unusual to 53 slept through the past 25 RCU grace periods. It is not unusual
228 see CPUs lagging by thousands of grace periods. 54 to see CPUs lagging by thousands of grace periods.
229 55
230o "g" is the count of grace periods that this CPU believes have 56o "g" is the count of grace periods that this CPU believes have
231 started. Again, CPUs in dynticks idle mode may lag behind. 57 started. Again, CPUs in dynticks idle mode may lag behind.
@@ -308,8 +134,10 @@ The output of "cat rcu/rcugp" looks as follows:
308rcu_sched: completed=33062 gpnum=33063 134rcu_sched: completed=33062 gpnum=33063
309rcu_bh: completed=464 gpnum=464 135rcu_bh: completed=464 gpnum=464
310 136
311Again, this output is for both "rcu" and "rcu_bh". The fields are 137Again, this output is for both "rcu_sched" and "rcu_bh". Note that
312taken from the rcu_state structure, and are as follows: 138kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional
139"rcu_preempt" line. The fields are taken from the rcu_state structure,
140and are as follows:
313 141
314o "completed" is the number of grace periods that have completed. 142o "completed" is the number of grace periods that have completed.
315 It is comparable to the "c" field from rcu/rcudata in that a 143 It is comparable to the "c" field from rcu/rcudata in that a
@@ -324,23 +152,24 @@ o "gpnum" is the number of grace periods that have started. It is
324 If these two fields are equal (as they are for "rcu_bh" above), 152 If these two fields are equal (as they are for "rcu_bh" above),
325 then there is no grace period in progress, in other words, RCU 153 then there is no grace period in progress, in other words, RCU
326 is idle. On the other hand, if the two fields differ (as they 154 is idle. On the other hand, if the two fields differ (as they
327 do for "rcu" above), then an RCU grace period is in progress. 155 do for "rcu_sched" above), then an RCU grace period is in progress.
328 156
329 157
330The output of "cat rcu/rcuhier" looks as follows, with very long lines: 158The output of "cat rcu/rcuhier" looks as follows, with very long lines:
331 159
332c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 160c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
3331/1 0:127 ^0 1611/1 .>. 0:127 ^0
3343/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 1623/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3353/3f 0:5 ^0 2/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 1633/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
336rcu_bh: 164rcu_bh:
337c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 165c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
3380/1 0:127 ^0 1660/1 .>. 0:127 ^0
3390/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 1670/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3400/3f 0:5 ^0 0/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 1680/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
341 169
342This is once again split into "rcu" and "rcu_bh" portions. The fields are 170This is once again split into "rcu_sched" and "rcu_bh" portions,
343as follows: 171and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
172"rcu_preempt" section. The fields are as follows:
344 173
345o "c" is exactly the same as "completed" under rcu/rcugp. 174o "c" is exactly the same as "completed" under rcu/rcugp.
346 175
@@ -372,6 +201,11 @@ o "fqlh" is the number of calls to force_quiescent_state() that
372 exited immediately (without even being counted in nfqs above) 201 exited immediately (without even being counted in nfqs above)
373 due to contention on ->fqslock. 202 due to contention on ->fqslock.
374 203
204o "oqlen" is the number of callbacks on the "orphan" callback
205 list. RCU callbacks are placed on this list by CPUs going
206 offline, and are "adopted" either by the CPU helping the outgoing
207 CPU or by the next rcu_barrier*() call, whichever comes first.
208
375o Each element of the form "1/1 0:127 ^0" represents one struct 209o Each element of the form "1/1 0:127 ^0" represents one struct
376 rcu_node. Each line represents one level of the hierarchy, from 210 rcu_node. Each line represents one level of the hierarchy, from
377 root to leaves. It is best to think of the rcu_data structures 211 root to leaves. It is best to think of the rcu_data structures
@@ -379,7 +213,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
379 might be either one, two, or three levels of rcu_node structures, 213 might be either one, two, or three levels of rcu_node structures,
380 depending on the relationship between CONFIG_RCU_FANOUT and 214 depending on the relationship between CONFIG_RCU_FANOUT and
381 CONFIG_NR_CPUS. 215 CONFIG_NR_CPUS.
382 216
383 o The numbers separated by the "/" are the qsmask followed 217 o The numbers separated by the "/" are the qsmask followed
384 by the qsmaskinit. The qsmask will have one bit 218 by the qsmaskinit. The qsmask will have one bit
385 set for each entity in the next lower level that 219 set for each entity in the next lower level that
@@ -389,10 +223,19 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
389 The value of qsmaskinit is assigned to that of qsmask 223 The value of qsmaskinit is assigned to that of qsmask
390 at the beginning of each grace period. 224 at the beginning of each grace period.
391 225
392 For example, for "rcu", the qsmask of the first entry 226 For example, for "rcu_sched", the qsmask of the first
393 of the lowest level is 0x14, meaning that we are still 227 entry of the lowest level is 0x14, meaning that we
394 waiting for CPUs 2 and 4 to check in for the current 228 are still waiting for CPUs 2 and 4 to check in for the
395 grace period. 229 current grace period.
230
231 o The characters separated by the ">" indicate the state
232 of the blocked-tasks lists. A "T" preceding the ">"
233 indicates that at least one task blocked in an RCU
234 read-side critical section blocks the current grace
235 period, while a "." preceding the ">" indicates otherwise.
236 The character following the ">" indicates similarly for
237 the next grace period. A "T" should appear in this
238 field only for rcu-preempt.
396 239
397 o The numbers separated by the ":" are the range of CPUs 240 o The numbers separated by the ":" are the range of CPUs
398 served by this struct rcu_node. This can be helpful 241 served by this struct rcu_node. This can be helpful
@@ -431,8 +274,9 @@ rcu_bh:
431 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 274 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
432 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 275 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
433 276
434As always, this is once again split into "rcu" and "rcu_bh" portions. 277As always, this is once again split into "rcu_sched" and "rcu_bh"
435The fields are as follows: 278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
279"rcu_preempt" section. The fields are as follows:
436 280
437o "np" is the number of times that __rcu_pending() has been invoked 281o "np" is the number of times that __rcu_pending() has been invoked
438 for the corresponding flavor of RCU. 282 for the corresponding flavor of RCU.
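
As a worked reading of the new rcu_node element format (a sketch based
only on the field descriptions above), consider the entry "2/3 .>. 6:11 ^1"
from the rcu_sched output: qsmask is 0x2 and qsmaskinit is 0x3, so one of
the two entities reporting to this rcu_node structure has yet to check in
for the current grace period; the ".>." says that no task blocked in an
RCU read-side critical section is holding up either the current or the
next grace period (a "T" could appear there only for rcu-preempt); and
"6:11" says that this rcu_node structure serves CPUs 6 through 11.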
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index e41a7fecf0d3..d542ca243b80 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -830,7 +830,7 @@ sched: Critical sections Grace period Barrier
830SRCU: Critical sections Grace period Barrier 830SRCU: Critical sections Grace period Barrier
831 831
832 srcu_read_lock synchronize_srcu N/A 832 srcu_read_lock synchronize_srcu N/A
833 srcu_read_unlock 833 srcu_read_unlock synchronize_srcu_expedited
834 834
835SRCU: Initialization/cleanup 835SRCU: Initialization/cleanup
836 init_srcu_struct 836 init_srcu_struct
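
The SRCU rows above map onto code as follows. This is a minimal sketch;
the srcu_struct named example_srcu and the two functions are hypothetical,
and the structure must first be set up with init_srcu_struct():

	#include <linux/srcu.h>

	static struct srcu_struct example_srcu;	/* hypothetical domain */

	void example_reader(void)
	{
		int idx;

		/* srcu_read_lock() returns an index that must be
		 * handed back to the matching srcu_read_unlock(). */
		idx = srcu_read_lock(&example_srcu);
		/* ... read SRCU-protected data ... */
		srcu_read_unlock(&example_srcu, idx);
	}

	void example_updater(void)
	{
		/* ... unpublish the old data ... */

		/* Either grace-period primitive works here; the new
		 * synchronize_srcu_expedited() trades extra CPU time
		 * for shorter grace-period latency. */
		synchronize_srcu_expedited(&example_srcu);

		/* ... now safe to reclaim the old data ... */
	}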
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 6d527ee82b2b..d5b387669dab 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -139,10 +139,34 @@ static inline void account_system_vtime(struct task_struct *tsk)
139#endif 139#endif
140 140
141#if defined(CONFIG_NO_HZ) 141#if defined(CONFIG_NO_HZ)
142#if defined(CONFIG_TINY_RCU)
143extern void rcu_enter_nohz(void);
144extern void rcu_exit_nohz(void);
145
146static inline void rcu_irq_enter(void)
147{
148 rcu_exit_nohz();
149}
150
151static inline void rcu_irq_exit(void)
152{
153 rcu_enter_nohz();
154}
155
156static inline void rcu_nmi_enter(void)
157{
158}
159
160static inline void rcu_nmi_exit(void)
161{
162}
163
164#else
142extern void rcu_irq_enter(void); 165extern void rcu_irq_enter(void);
143extern void rcu_irq_exit(void); 166extern void rcu_irq_exit(void);
144extern void rcu_nmi_enter(void); 167extern void rcu_nmi_enter(void);
145extern void rcu_nmi_exit(void); 168extern void rcu_nmi_exit(void);
169#endif
146#else 170#else
147# define rcu_irq_enter() do { } while (0) 171# define rcu_irq_enter() do { } while (0)
148# define rcu_irq_exit() do { } while (0) 172# define rcu_irq_exit() do { } while (0)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3ebd0b7bcb08..24440f4bf476 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,6 @@ struct rcu_head {
52}; 52};
53 53
54/* Exported common interfaces */ 54/* Exported common interfaces */
55#ifdef CONFIG_TREE_PREEMPT_RCU
56extern void synchronize_rcu(void);
57#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
58#define synchronize_rcu synchronize_sched
59#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
60extern void synchronize_rcu_bh(void); 55extern void synchronize_rcu_bh(void);
61extern void synchronize_sched(void); 56extern void synchronize_sched(void);
62extern void rcu_barrier(void); 57extern void rcu_barrier(void);
@@ -67,12 +62,11 @@ extern int sched_expedited_torture_stats(char *page);
67 62
68/* Internal to kernel */ 63/* Internal to kernel */
69extern void rcu_init(void); 64extern void rcu_init(void);
70extern void rcu_scheduler_starting(void);
71extern int rcu_needs_cpu(int cpu);
72extern int rcu_scheduler_active;
73 65
74#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) 66#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
75#include <linux/rcutree.h> 67#include <linux/rcutree.h>
68#elif defined(CONFIG_TINY_RCU)
69#include <linux/rcutiny.h>
76#else 70#else
77#error "Unknown RCU implementation specified to kernel configuration" 71#error "Unknown RCU implementation specified to kernel configuration"
78#endif 72#endif
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
new file mode 100644
index 000000000000..c4ba9a78721e
--- /dev/null
+++ b/include/linux/rcutiny.h
@@ -0,0 +1,104 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#ifndef __LINUX_TINY_H
26#define __LINUX_TINY_H
27
28#include <linux/cache.h>
29
30void rcu_sched_qs(int cpu);
31void rcu_bh_qs(int cpu);
32
33#define __rcu_read_lock() preempt_disable()
34#define __rcu_read_unlock() preempt_enable()
35#define __rcu_read_lock_bh() local_bh_disable()
36#define __rcu_read_unlock_bh() local_bh_enable()
37#define call_rcu_sched call_rcu
38
39#define rcu_init_sched() do { } while (0)
40extern void rcu_check_callbacks(int cpu, int user);
41
42static inline int rcu_needs_cpu(int cpu)
43{
44 return 0;
45}
46
47/*
48 * Return the number of grace periods.
49 */
50static inline long rcu_batches_completed(void)
51{
52 return 0;
53}
54
55/*
56 * Return the number of bottom-half grace periods.
57 */
58static inline long rcu_batches_completed_bh(void)
59{
60 return 0;
61}
62
63extern int rcu_expedited_torture_stats(char *page);
64
65#define synchronize_rcu synchronize_sched
66
67static inline void synchronize_rcu_expedited(void)
68{
69 synchronize_sched();
70}
71
72static inline void synchronize_rcu_bh_expedited(void)
73{
74 synchronize_sched();
75}
76
77struct notifier_block;
78
79#ifdef CONFIG_NO_HZ
80
81extern void rcu_enter_nohz(void);
82extern void rcu_exit_nohz(void);
83
84#else /* #ifdef CONFIG_NO_HZ */
85
86static inline void rcu_enter_nohz(void)
87{
88}
89
90static inline void rcu_exit_nohz(void)
91{
92}
93
94#endif /* #else #ifdef CONFIG_NO_HZ */
95
96static inline void rcu_scheduler_starting(void)
97{
98}
99
100static inline void exit_rcu(void)
101{
102}
103
104#endif /* __LINUX_TINY_H */
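
Because the read-side primitives above reduce to preemption and
bottom-half control, RCU client code runs unchanged under TINY_RCU.
A minimal sketch of typical usage follows; struct foo, foo_reclaim(),
and foo_retire() are hypothetical:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		int data;
		struct rcu_head rcu;
	};

	static void foo_reclaim(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu));
	}

	void foo_retire(struct foo *fp)
	{
		/* Under TINY_RCU, call_rcu_sched is #defined to
		 * call_rcu, so either name posts to the same list. */
		call_rcu(&fp->rcu, foo_reclaim);
	}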
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9642c6bcb399..c93eee5911b0 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,15 +34,15 @@ struct notifier_block;
34 34
35extern void rcu_sched_qs(int cpu); 35extern void rcu_sched_qs(int cpu);
36extern void rcu_bh_qs(int cpu); 36extern void rcu_bh_qs(int cpu);
37extern int rcu_cpu_notify(struct notifier_block *self,
38 unsigned long action, void *hcpu);
39extern int rcu_needs_cpu(int cpu); 37extern int rcu_needs_cpu(int cpu);
38extern void rcu_scheduler_starting(void);
40extern int rcu_expedited_torture_stats(char *page); 39extern int rcu_expedited_torture_stats(char *page);
41 40
42#ifdef CONFIG_TREE_PREEMPT_RCU 41#ifdef CONFIG_TREE_PREEMPT_RCU
43 42
44extern void __rcu_read_lock(void); 43extern void __rcu_read_lock(void);
45extern void __rcu_read_unlock(void); 44extern void __rcu_read_unlock(void);
45extern void synchronize_rcu(void);
46extern void exit_rcu(void); 46extern void exit_rcu(void);
47 47
48#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 48#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
@@ -57,7 +57,7 @@ static inline void __rcu_read_unlock(void)
57 preempt_enable(); 57 preempt_enable();
58} 58}
59 59
60#define __synchronize_sched() synchronize_rcu() 60#define synchronize_rcu synchronize_sched
61 61
62static inline void exit_rcu(void) 62static inline void exit_rcu(void)
63{ 63{
@@ -83,7 +83,6 @@ static inline void synchronize_rcu_bh_expedited(void)
83 synchronize_sched_expedited(); 83 synchronize_sched_expedited();
84} 84}
85 85
86extern void __rcu_init(void);
87extern void rcu_check_callbacks(int cpu, int user); 86extern void rcu_check_callbacks(int cpu, int user);
88 87
89extern long rcu_batches_completed(void); 88extern long rcu_batches_completed(void);
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index aca0eee53930..4765d97dcafb 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -48,6 +48,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp);
48int srcu_read_lock(struct srcu_struct *sp) __acquires(sp); 48int srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
49void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); 49void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
50void synchronize_srcu(struct srcu_struct *sp); 50void synchronize_srcu(struct srcu_struct *sp);
51void synchronize_srcu_expedited(struct srcu_struct *sp);
51long srcu_batches_completed(struct srcu_struct *sp); 52long srcu_batches_completed(struct srcu_struct *sp);
52 53
53#endif 54#endif
diff --git a/init/Kconfig b/init/Kconfig
index 2e9a1457132c..9ee778294756 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -334,6 +334,15 @@ config TREE_PREEMPT_RCU
334 is also required. It also scales down nicely to 334 is also required. It also scales down nicely to
335 smaller systems. 335 smaller systems.
336 336
337config TINY_RCU
338 bool "UP-only small-memory-footprint RCU"
339 depends on !SMP
340 help
341 This option selects the RCU implementation that is
342 designed for UP systems from which real-time response
343 is not required. This option greatly reduces the
344 memory footprint of RCU.
345
337endchoice 346endchoice
338 347
339config RCU_TRACE 348config RCU_TRACE
diff --git a/kernel/Makefile b/kernel/Makefile
index d7c13d249b2d..dcf6789bf547 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
82obj-$(CONFIG_TREE_RCU) += rcutree.o 82obj-$(CONFIG_TREE_RCU) += rcutree.o
83obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 83obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
85obj-$(CONFIG_TINY_RCU) += rcutiny.o
85obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48 47
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 48#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 49static struct lock_class_key rcu_lock_key;
@@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map =
53EXPORT_SYMBOL_GPL(rcu_lock_map); 52EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif 53#endif
55 54
56int rcu_scheduler_active __read_mostly;
57
58/* 55/*
59 * Awaken the corresponding synchronize_rcu() instance now that a 56 * Awaken the corresponding synchronize_rcu() instance now that a
60 * grace period has elapsed. 57 * grace period has elapsed.
@@ -66,122 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head)
66 rcu = container_of(head, struct rcu_synchronize, head); 63 rcu = container_of(head, struct rcu_synchronize, head);
67 complete(&rcu->completion); 64 complete(&rcu->completion);
68} 65}
69
70#ifdef CONFIG_TREE_PREEMPT_RCU
71
72/**
73 * synchronize_rcu - wait until a grace period has elapsed.
74 *
75 * Control will return to the caller some time after a full grace
76 * period has elapsed, in other words after all currently executing RCU
77 * read-side critical sections have completed. RCU read-side critical
78 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
79 * and may be nested.
80 */
81void synchronize_rcu(void)
82{
83 struct rcu_synchronize rcu;
84
85 if (!rcu_scheduler_active)
86 return;
87
88 init_completion(&rcu.completion);
89 /* Will wake me after RCU finished. */
90 call_rcu(&rcu.head, wakeme_after_rcu);
91 /* Wait for it. */
92 wait_for_completion(&rcu.completion);
93}
94EXPORT_SYMBOL_GPL(synchronize_rcu);
95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97
98/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 *
109 * This means that all preempt_disable code sequences, including NMI and
110 * hardware-interrupt handlers, in progress on entry will have completed
111 * before this primitive returns. However, this does not guarantee that
112 * softirq handlers will have completed, since in some kernels, these
113 * handlers can run in process context, and can block.
114 *
115 * This primitive provides the guarantees made by the (now removed)
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */
121void synchronize_sched(void)
122{
123 struct rcu_synchronize rcu;
124
125 if (rcu_blocking_is_gp())
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133}
134EXPORT_SYMBOL_GPL(synchronize_sched);
135
136/**
137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
138 *
139 * Control will return to the caller some time after a full rcu_bh grace
140 * period has elapsed, in other words after all currently executing rcu_bh
141 * read-side critical sections have completed. RCU read-side critical
142 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
143 * and may be nested.
144 */
145void synchronize_rcu_bh(void)
146{
147 struct rcu_synchronize rcu;
148
149 if (rcu_blocking_is_gp())
150 return;
151
152 init_completion(&rcu.completion);
153 /* Will wake me after RCU finished. */
154 call_rcu_bh(&rcu.head, wakeme_after_rcu);
155 /* Wait for it. */
156 wait_for_completion(&rcu.completion);
157}
158EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
159
160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
161 unsigned long action, void *hcpu)
162{
163 return rcu_cpu_notify(self, action, hcpu);
164}
165
166void __init rcu_init(void)
167{
168 int i;
169
170 __rcu_init();
171 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
172
173 /*
174 * We don't need protection against CPU-hotplug here because
175 * this is called early in boot, before either interrupts
176 * or the scheduler are operational.
177 */
178 for_each_online_cpu(i)
179 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
180}
181
182void rcu_scheduler_starting(void)
183{
184 WARN_ON(num_online_cpus() != 1);
185 WARN_ON(nr_context_switches() > 0);
186 rcu_scheduler_active = 1;
187}
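
The synchronize_*() implementations moved out of this file all follow
the one pattern that wakeme_after_rcu() supports, sketched here under
the hypothetical name wait_for_grace_period():

	static void wait_for_grace_period(void)
	{
		struct rcu_synchronize rcu;

		init_completion(&rcu.completion);
		/* Post a callback that completes &rcu.completion
		 * once a grace period has elapsed. */
		call_rcu(&rcu.head, wakeme_after_rcu);
		/* Block until that callback has run. */
		wait_for_completion(&rcu.completion);
	}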
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h>
27#include <linux/interrupt.h>
28#include <linux/notifier.h>
29#include <linux/rcupdate.h>
30#include <linux/kernel.h>
31#include <linux/module.h>
32#include <linux/mutex.h>
33#include <linux/sched.h>
34#include <linux/types.h>
35#include <linux/init.h>
36#include <linux/time.h>
37#include <linux/cpu.h>
38
39/* Global control variables for rcupdate callback mechanism. */
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_NO_HZ
58
59static long rcu_dynticks_nesting = 1;
60
61/*
62 * Enter dynticks-idle mode, which is an extended quiescent state
63 * if we have fully entered that mode (i.e., if the new value of
64 * dynticks_nesting is zero).
65 */
66void rcu_enter_nohz(void)
67{
68 if (--rcu_dynticks_nesting == 0)
69 rcu_sched_qs(0); /* implies rcu_bh_qs(0) */
70}
71
72/*
73 * Exit dynticks-idle mode, so that we are no longer in an extended
74 * quiescent state.
75 */
76void rcu_exit_nohz(void)
77{
78 rcu_dynticks_nesting++;
79}
80
81#endif /* #ifdef CONFIG_NO_HZ */
82
83/*
84 * Helper function for rcu_sched_qs() and rcu_bh_qs().
85 * Also disable irqs to avoid confusion due to interrupt handlers
86 * invoking call_rcu().
87 */
88static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
89{
90 unsigned long flags;
91
92 local_irq_save(flags);
93 if (rcp->rcucblist != NULL &&
94 rcp->donetail != rcp->curtail) {
95 rcp->donetail = rcp->curtail;
96 local_irq_restore(flags);
97 return 1;
98 }
99 local_irq_restore(flags);
100
101 return 0;
102}
103
104/*
105 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
106 * are at it, given that any rcu quiescent state is also an rcu_bh
107 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
108 */
109void rcu_sched_qs(int cpu)
110{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ);
113}
114
115/*
116 * Record an rcu_bh quiescent state.
117 */
118void rcu_bh_qs(int cpu)
119{
120 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
121 raise_softirq(RCU_SOFTIRQ);
122}
123
124/*
125 * Check to see if the scheduling-clock interrupt came from an extended
126 * quiescent state, and, if so, tell RCU about it.
127 */
128void rcu_check_callbacks(int cpu, int user)
129{
130 if (user ||
131 (idle_cpu(cpu) &&
132 !in_softirq() &&
133 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
134 rcu_sched_qs(cpu);
135 else if (!in_softirq())
136 rcu_bh_qs(cpu);
137}
138
139/*
140 * Helper function for rcu_process_callbacks() that operates on the
141 * specified rcu_ctrlblk structure.
142 */
143static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
144{
145 struct rcu_head *next, *list;
146 unsigned long flags;
147
148 /* If no RCU callbacks ready to invoke, just return. */
149 if (&rcp->rcucblist == rcp->donetail)
150 return;
151
152 /* Move the ready-to-invoke callbacks to a local list. */
153 local_irq_save(flags);
154 list = rcp->rcucblist;
155 rcp->rcucblist = *rcp->donetail;
156 *rcp->donetail = NULL;
157 if (rcp->curtail == rcp->donetail)
158 rcp->curtail = &rcp->rcucblist;
159 rcp->donetail = &rcp->rcucblist;
160 local_irq_restore(flags);
161
162 /* Invoke the callbacks on the local list. */
163 while (list) {
164 next = list->next;
165 prefetch(next);
166 list->func(list);
167 list = next;
168 }
169}
170
171/*
172 * Invoke any callbacks whose grace period has completed.
173 */
174static void rcu_process_callbacks(struct softirq_action *unused)
175{
176 __rcu_process_callbacks(&rcu_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178}
179
180/*
181 * Wait for a grace period to elapse. But it is illegal to invoke
182 * synchronize_sched() from within an RCU read-side critical section.
183 * Therefore, any legal call to synchronize_sched() is a quiescent
184 * state, and so on a UP system, synchronize_sched() need do nothing.
185 * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
186 * benefits of doing might_sleep() to reduce latency.)
187 *
188 * Cool, huh? (Due to Josh Triplett.)
189 *
190 * But we want to make this a static inline later.
191 */
192void synchronize_sched(void)
193{
194 cond_resched();
195}
196EXPORT_SYMBOL_GPL(synchronize_sched);
197
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/*
205 * Helper function for call_rcu() and call_rcu_bh().
206 */
207static void __call_rcu(struct rcu_head *head,
208 void (*func)(struct rcu_head *rcu),
209 struct rcu_ctrlblk *rcp)
210{
211 unsigned long flags;
212
213 head->func = func;
214 head->next = NULL;
215
216 local_irq_save(flags);
217 *rcp->curtail = head;
218 rcp->curtail = &head->next;
219 local_irq_restore(flags);
220}
221
222/*
223 * Post an RCU callback to be invoked after the end of an RCU grace
224 * period. But since we have but one CPU, that would be after any
225 * quiescent state.
226 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{
229 __call_rcu(head, func, &rcu_ctrlblk);
230}
231EXPORT_SYMBOL_GPL(call_rcu);
232
233/*
234 * Post an RCU bottom-half callback to be invoked after any subsequent
235 * quiescent state.
236 */
237void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
238{
239 __call_rcu(head, func, &rcu_bh_ctrlblk);
240}
241EXPORT_SYMBOL_GPL(call_rcu_bh);
242
243void rcu_barrier(void)
244{
245 struct rcu_synchronize rcu;
246
247 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */
251 wait_for_completion(&rcu.completion);
252}
253EXPORT_SYMBOL_GPL(rcu_barrier);
254
255void rcu_barrier_bh(void)
256{
257 struct rcu_synchronize rcu;
258
259 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */
263 wait_for_completion(&rcu.completion);
264}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266
267void rcu_barrier_sched(void)
268{
269 struct rcu_synchronize rcu;
270
271 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */
275 wait_for_completion(&rcu.completion);
276}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278
279void __init rcu_init(void)
280{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282}
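
The whole of the callback machinery above rests on the two tail
pointers in rcu_ctrlblk. The possible list states are sketched below
(an illustration, not additional code):

	Empty list:	rcucblist == NULL,
			donetail == curtail == &rcucblist.

	After call_rcu(A) and call_rcu(B):
			rcucblist -> A -> B -> NULL,
			donetail == &rcucblist (nothing done yet),
			curtail  == &B->next.

	After a quiescent state:
			rcu_qsctr_help() sets donetail = curtail, so A
			and B become ready for __rcu_process_callbacks();
			callbacks posted later are appended at curtail
			and must wait for the next quiescent state.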
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d4..a621a67ef4e3 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p)
327 cur_ops->deferred_free(rp); 327 cur_ops->deferred_free(rp);
328} 328}
329 329
330static int rcu_no_completed(void)
331{
332 return 0;
333}
334
330static void rcu_torture_deferred_free(struct rcu_torture *p) 335static void rcu_torture_deferred_free(struct rcu_torture *p)
331{ 336{
332 call_rcu(&p->rtort_rcu, rcu_torture_cb); 337 call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .name = "rcu_sync" 393 .name = "rcu_sync"
389}; 394};
390 395
396static struct rcu_torture_ops rcu_expedited_ops = {
397 .init = rcu_sync_torture_init,
398 .cleanup = NULL,
399 .readlock = rcu_torture_read_lock,
400 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
401 .readunlock = rcu_torture_read_unlock,
402 .completed = rcu_no_completed,
403 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL,
406 .stats = NULL,
407 .irq_capable = 1,
408 .name = "rcu_expedited"
409};
410
391/* 411/*
392 * Definitions for rcu_bh torture testing. 412 * Definitions for rcu_bh torture testing.
393 */ 413 */
@@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = {
547 .name = "srcu" 567 .name = "srcu"
548}; 568};
549 569
570static void srcu_torture_synchronize_expedited(void)
571{
572 synchronize_srcu_expedited(&srcu_ctl);
573}
574
575static struct rcu_torture_ops srcu_expedited_ops = {
576 .init = srcu_torture_init,
577 .cleanup = srcu_torture_cleanup,
578 .readlock = srcu_torture_read_lock,
579 .read_delay = srcu_read_delay,
580 .readunlock = srcu_torture_read_unlock,
581 .completed = srcu_torture_completed,
582 .deferred_free = rcu_sync_torture_deferred_free,
583 .sync = srcu_torture_synchronize_expedited,
584 .cb_barrier = NULL,
585 .stats = srcu_torture_stats,
586 .name = "srcu_expedited"
587};
588
550/* 589/*
551 * Definitions for sched torture testing. 590 * Definitions for sched torture testing.
552 */ 591 */
@@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx)
562 preempt_enable(); 601 preempt_enable();
563} 602}
564 603
565static int sched_torture_completed(void)
566{
567 return 0;
568}
569
570static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 604static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
571{ 605{
572 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 606 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = {
583 .readlock = sched_torture_read_lock, 617 .readlock = sched_torture_read_lock,
584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 618 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
585 .readunlock = sched_torture_read_unlock, 619 .readunlock = sched_torture_read_unlock,
586 .completed = sched_torture_completed, 620 .completed = rcu_no_completed,
587 .deferred_free = rcu_sched_torture_deferred_free, 621 .deferred_free = rcu_sched_torture_deferred_free,
588 .sync = sched_torture_synchronize, 622 .sync = sched_torture_synchronize,
589 .cb_barrier = rcu_barrier_sched, 623 .cb_barrier = rcu_barrier_sched,
@@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = {
592 .name = "sched" 626 .name = "sched"
593}; 627};
594 628
595static struct rcu_torture_ops sched_ops_sync = { 629static struct rcu_torture_ops sched_sync_ops = {
596 .init = rcu_sync_torture_init, 630 .init = rcu_sync_torture_init,
597 .cleanup = NULL, 631 .cleanup = NULL,
598 .readlock = sched_torture_read_lock, 632 .readlock = sched_torture_read_lock,
599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 633 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
600 .readunlock = sched_torture_read_unlock, 634 .readunlock = sched_torture_read_unlock,
601 .completed = sched_torture_completed, 635 .completed = rcu_no_completed,
602 .deferred_free = rcu_sync_torture_deferred_free, 636 .deferred_free = rcu_sync_torture_deferred_free,
603 .sync = sched_torture_synchronize, 637 .sync = sched_torture_synchronize,
604 .cb_barrier = NULL, 638 .cb_barrier = NULL,
@@ -612,7 +646,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
612 .readlock = sched_torture_read_lock, 646 .readlock = sched_torture_read_lock,
613 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 647 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
614 .readunlock = sched_torture_read_unlock, 648 .readunlock = sched_torture_read_unlock,
615 .completed = sched_torture_completed, 649 .completed = rcu_no_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 650 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = synchronize_sched_expedited, 651 .sync = synchronize_sched_expedited,
618 .cb_barrier = NULL, 652 .cb_barrier = NULL,
@@ -1097,9 +1131,10 @@ rcu_torture_init(void)
1097 int cpu; 1131 int cpu;
1098 int firsterr = 0; 1132 int firsterr = 0;
1099 static struct rcu_torture_ops *torture_ops[] = 1133 static struct rcu_torture_ops *torture_ops[] =
1100 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1134 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1101 &sched_expedited_ops, 1135 &rcu_bh_ops, &rcu_bh_sync_ops,
1102 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1136 &srcu_ops, &srcu_expedited_ops,
1137 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1103 1138
1104 mutex_lock(&fullstop_mutex); 1139 mutex_lock(&fullstop_mutex);
1105 1140
@@ -1110,8 +1145,12 @@ rcu_torture_init(void)
1110 break; 1145 break;
1111 } 1146 }
1112 if (i == ARRAY_SIZE(torture_ops)) { 1147 if (i == ARRAY_SIZE(torture_ops)) {
1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1148 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1114 torture_type); 1149 torture_type);
1150 printk(KERN_ALERT "rcu-torture types:");
1151 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1152 printk(KERN_ALERT " %s", torture_ops[i]->name);
1153 printk(KERN_ALERT "\n");
1115 mutex_unlock(&fullstop_mutex); 1154 mutex_unlock(&fullstop_mutex);
1116 return -EINVAL; 1155 return -EINVAL;
1117 } 1156 }
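
The two new flavors are selected the same way as the existing ones, via
the torture_type module parameter that the lookup loop above matches
against each ops->name, for example (a sketch; other module parameters
are unchanged):

	modprobe rcutorture torture_type=srcu_expedited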
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f3077c0ab181..53ae9598f798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,18 +46,22 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
52/* Data structures. */ 53/* Data structures. */
53 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56
54#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(name) { \
55 .level = { &name.node[0] }, \ 58 .level = { &name.node[0] }, \
56 .levelcnt = { \ 59 .levelcnt = { \
57 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
58 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
59 NUM_RCU_LVL_2, \ 62 NUM_RCU_LVL_2, \
60 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ 63 NUM_RCU_LVL_3, \
64 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
61 }, \ 65 }, \
62 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
63 .gpnum = -300, \ 67 .gpnum = -300, \
@@ -77,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79 83
84static int rcu_scheduler_active __read_mostly;
85
80 86
81/* 87/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
@@ -98,7 +104,7 @@ void rcu_sched_qs(int cpu)
98 struct rcu_data *rdp; 104 struct rcu_data *rdp;
99 105
100 rdp = &per_cpu(rcu_sched_data, cpu); 106 rdp = &per_cpu(rcu_sched_data, cpu);
101 rdp->passed_quiesc_completed = rdp->completed; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
102 barrier(); 108 barrier();
103 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu); 110 rcu_preempt_note_context_switch(cpu);
@@ -109,7 +115,7 @@ void rcu_bh_qs(int cpu)
109 struct rcu_data *rdp; 115 struct rcu_data *rdp;
110 116
111 rdp = &per_cpu(rcu_bh_data, cpu); 117 rdp = &per_cpu(rcu_bh_data, cpu);
112 rdp->passed_quiesc_completed = rdp->completed; 118 rdp->passed_quiesc_completed = rdp->gpnum - 1;
113 barrier(); 119 barrier();
114 rdp->passed_quiesc = 1; 120 rdp->passed_quiesc = 1;
115} 121}
@@ -335,28 +341,9 @@ void rcu_irq_exit(void)
335 set_need_resched(); 341 set_need_resched();
336} 342}
337 343
338/*
339 * Record the specified "completed" value, which is later used to validate
340 * dynticks counter manipulations. Specify "rsp->completed - 1" to
341 * unconditionally invalidate any future dynticks manipulations (which is
342 * useful at the beginning of a grace period).
343 */
344static void dyntick_record_completed(struct rcu_state *rsp, long comp)
345{
346 rsp->dynticks_completed = comp;
347}
348
349#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
350 345
351/* 346/*
352 * Recall the previously recorded value of the completion for dynticks.
353 */
354static long dyntick_recall_completed(struct rcu_state *rsp)
355{
356 return rsp->dynticks_completed;
357}
358
359/*
360 * Snapshot the specified CPU's dynticks counter so that we can later 347 * Snapshot the specified CPU's dynticks counter so that we can later
361 * credit them with an implicit quiescent state. Return 1 if this CPU 348 * credit them with an implicit quiescent state. Return 1 if this CPU
362 * is in dynticks idle mode, which is an extended quiescent state. 349 * is in dynticks idle mode, which is an extended quiescent state.
@@ -419,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
419 406
420#else /* #ifdef CONFIG_NO_HZ */ 407#else /* #ifdef CONFIG_NO_HZ */
421 408
422static void dyntick_record_completed(struct rcu_state *rsp, long comp)
423{
424}
425
426#ifdef CONFIG_SMP 409#ifdef CONFIG_SMP
427 410
428/*
429 * If there are no dynticks, then the only way that a CPU can passively
430 * be in a quiescent state is to be offline. Unlike dynticks idle, which
431 * is a point in time during the prior (already finished) grace period,
432 * an offline CPU is always in a quiescent state, and thus can be
433 * unconditionally applied. So just return the current value of completed.
434 */
435static long dyntick_recall_completed(struct rcu_state *rsp)
436{
437 return rsp->completed;
438}
439
440static int dyntick_save_progress_counter(struct rcu_data *rdp) 411static int dyntick_save_progress_counter(struct rcu_data *rdp)
441{ 412{
442 return 0; 413 return 0;
@@ -553,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
553/* 524/*
554 * Update CPU-local rcu_data state to record the newly noticed grace period. 525 * Update CPU-local rcu_data state to record the newly noticed grace period.
555 * This is used both when we started the grace period and when we notice 526 * This is used both when we started the grace period and when we notice
556 * that someone else started the grace period. 527 * that someone else started the grace period. The caller must hold the
528 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
529 * and must have irqs disabled.
557 */ 530 */
531static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
532{
533 if (rdp->gpnum != rnp->gpnum) {
534 rdp->qs_pending = 1;
535 rdp->passed_quiesc = 0;
536 rdp->gpnum = rnp->gpnum;
537 }
538}
539
558static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) 540static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
559{ 541{
560 rdp->qs_pending = 1; 542 unsigned long flags;
561 rdp->passed_quiesc = 0; 543 struct rcu_node *rnp;
562 rdp->gpnum = rsp->gpnum; 544
545 local_irq_save(flags);
546 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
549 local_irq_restore(flags);
550 return;
551 }
552 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags);
563} 554}
564 555
565/* 556/*
@@ -583,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
583} 574}
584 575
585/* 576/*
577 * Advance this CPU's callbacks, but only if the current grace period
578 * has ended. This may be called only from the CPU to whom the rdp
579 * belongs. In addition, the corresponding leaf rcu_node structure's
580 * ->lock must be held by the caller, with irqs disabled.
581 */
582static void
583__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
584{
585 /* Did another grace period end? */
586 if (rdp->completed != rnp->completed) {
587
588 /* Advance callbacks. No harm if list empty. */
589 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
590 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
591 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
592
593 /* Remember that we saw this grace-period completion. */
594 rdp->completed = rnp->completed;
595 }
596}
597
598/*
599 * Advance this CPU's callbacks, but only if the current grace period
600 * has ended. This may be called only from the CPU to whom the rdp
601 * belongs.
602 */
603static void
604rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
605{
606 unsigned long flags;
607 struct rcu_node *rnp;
608
609 local_irq_save(flags);
610 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
613 local_irq_restore(flags);
614 return;
615 }
616 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags);
618}
619
620/*
621 * Do per-CPU grace-period initialization for running CPU. The caller
622 * must hold the lock of the leaf rcu_node structure corresponding to
623 * this CPU.
624 */
625static void
626rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
627{
628 /* Prior grace period ended, so advance callbacks for current CPU. */
629 __rcu_process_gp_end(rsp, rnp, rdp);
630
631 /*
632 * Because this CPU just now started the new grace period, we know
633 * that all of its callbacks will be covered by this upcoming grace
634 * period, even the ones that were registered arbitrarily recently.
635 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
636 *
637 * Other CPUs cannot be sure exactly when the grace period started.
638 * Therefore, their recently registered callbacks must pass through
639 * an additional RCU_NEXT_READY stage, so that they will be handled
640 * by the next RCU grace period.
641 */
642 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
643 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
644
645 /* Set state so that this CPU will detect the next quiescent state. */
646 __note_new_gpnum(rsp, rnp, rdp);
647}
648
649/*
586 * Start a new RCU grace period if warranted, re-initializing the hierarchy 650 * Start a new RCU grace period if warranted, re-initializing the hierarchy
587 * in preparation for detecting the next grace period. The caller must hold 651 * in preparation for detecting the next grace period. The caller must hold
588 * the root node's ->lock, which is released before return. Hard irqs must 652 * the root node's ->lock, which is released before return. Hard irqs must
@@ -596,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
596 struct rcu_node *rnp = rcu_get_root(rsp); 660 struct rcu_node *rnp = rcu_get_root(rsp);
597 661
598 if (!cpu_needs_another_gp(rsp, rdp)) { 662 if (!cpu_needs_another_gp(rsp, rdp)) {
599 spin_unlock_irqrestore(&rnp->lock, flags); 663 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags);
665 return;
666 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */
668
669 /*
670 * Propagate new ->completed value to rcu_node structures
671 * so that other CPUs don't have to wait until the start
672 * of the next grace period to process their callbacks.
673 */
674 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 }
679 local_irq_restore(flags);
600 return; 680 return;
601 } 681 }
602 682
@@ -606,29 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 686 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 687 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
608 record_gp_stall_check_time(rsp); 688 record_gp_stall_check_time(rsp);
609 dyntick_record_completed(rsp, rsp->completed - 1);
610 note_new_gpnum(rsp, rdp);
611
612 /*
613 * Because this CPU just now started the new grace period, we know
614 * that all of its callbacks will be covered by this upcoming grace
615 * period, even the ones that were registered arbitrarily recently.
616 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
617 *
618 * Other CPUs cannot be sure exactly when the grace period started.
619 * Therefore, their recently registered callbacks must pass through
620 * an additional RCU_NEXT_READY stage, so that they will be handled
621 * by the next RCU grace period.
622 */
623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
625 689
626 /* Special-case the common single-level case. */ 690 /* Special-case the common single-level case. */
627 if (NUM_RCU_NODES == 1) { 691 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp); 692 rcu_preempt_check_blocked_tasks(rnp);
629 rnp->qsmask = rnp->qsmaskinit; 693 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum; 694 rnp->gpnum = rsp->gpnum;
695 rnp->completed = rsp->completed;
631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp);
632 spin_unlock_irqrestore(&rnp->lock, flags); 698 spin_unlock_irqrestore(&rnp->lock, flags);
633 return; 699 return;
634 } 700 }
@@ -661,6 +727,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
661 rcu_preempt_check_blocked_tasks(rnp); 727 rcu_preempt_check_blocked_tasks(rnp);
662 rnp->qsmask = rnp->qsmaskinit; 728 rnp->qsmask = rnp->qsmaskinit;
663 rnp->gpnum = rsp->gpnum; 729 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp);
664 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 733 spin_unlock(&rnp->lock); /* irqs remain disabled. */
665 } 734 }
666 735
@@ -672,58 +741,32 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672} 741}
673 742
674/* 743/*
675 * Advance this CPU's callbacks, but only if the current grace period 744 * Report a full set of quiescent states to the specified rcu_state
 676 * has ended. This may be called only from the CPU to which the rdp 745 * data structure. This involves cleaning up after the prior grace
677 * belongs. 746 * period and letting rcu_start_gp() start up the next grace period
747 * if one is needed. Note that the caller must hold rnp->lock, as
748 * required by rcu_start_gp(), which will release it.
678 */ 749 */
679static void 750static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
680rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
681{
682 long completed_snap;
683 unsigned long flags;
684
685 local_irq_save(flags);
686 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
687
688 /* Did another grace period end? */
689 if (rdp->completed != completed_snap) {
690
691 /* Advance callbacks. No harm if list empty. */
692 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
693 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
694 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
695
696 /* Remember that we saw this grace-period completion. */
697 rdp->completed = completed_snap;
698 }
699 local_irq_restore(flags);
700}
701
702/*
703 * Clean up after the prior grace period and let rcu_start_gp() start up
704 * the next grace period if one is needed. Note that the caller must
705 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
706 */
707static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
708 __releases(rcu_get_root(rsp)->lock) 751 __releases(rcu_get_root(rsp)->lock)
709{ 752{
710 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 753 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
711 rsp->completed = rsp->gpnum; 754 rsp->completed = rsp->gpnum;
712 rsp->signaled = RCU_GP_IDLE; 755 rsp->signaled = RCU_GP_IDLE;
713 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
714 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 756 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
715} 757}
716 758
717/* 759/*
718 * Similar to cpu_quiet(), for which it is a helper function. Allows 760 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
719 * a group of CPUs to be quieted at one go, though all the CPUs in the 761 * Allows quiescent states for a group of CPUs to be reported at one go
720 * group must be represented by the same leaf rcu_node structure. 762 * to the specified rcu_node structure, though all the CPUs in the group
721 * That structure's lock must be held upon entry, and it is released 763 * must be represented by the same rcu_node structure (which need not be
722 * before return. 764 * a leaf rcu_node structure, though it often will be). That structure's
765 * lock must be held upon entry, and it is released before return.
723 */ 766 */
724static void 767static void
725cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, 768rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
726 unsigned long flags) 769 struct rcu_node *rnp, unsigned long flags)
727 __releases(rnp->lock) 770 __releases(rnp->lock)
728{ 771{
729 struct rcu_node *rnp_c; 772 struct rcu_node *rnp_c;
@@ -759,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
759 802
760 /* 803 /*
761 * Get here if we are the last CPU to pass through a quiescent 804 * Get here if we are the last CPU to pass through a quiescent
762 * state for this grace period. Invoke cpu_quiet_msk_finish() 805 * state for this grace period. Invoke rcu_report_qs_rsp()
763 * to clean up and start the next grace period if one is needed. 806 * to clean up and start the next grace period if one is needed.
764 */ 807 */
765 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ 808 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
766} 809}
767 810
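The loop body of the renamed rcu_report_qs_rnp() falls above this hunk and is not shown, but its walk up the tree has a simple shape. A standalone miniature (illustrative only; locking and the preempted-readers check are omitted):

#include <stdio.h>

struct mini_rnp {
	unsigned long qsmask;		/* children still owing a QS */
	unsigned long grpmask;		/* our bit in the parent's qsmask */
	struct mini_rnp *parent;
};

/* Clear our bit at this level; stop if siblings are still pending,
 * otherwise carry one bit up to the parent and repeat.  Returns 1 if
 * this report completed the grace period at the root. */
static int mini_report_qs(struct mini_rnp *rnp, unsigned long mask)
{
	for (;;) {
		rnp->qsmask &= ~mask;		/* this group quiesced */
		if (rnp->qsmask != 0)
			return 0;		/* siblings still pending */
		if (rnp->parent == NULL)
			return 1;		/* root: full set of QSes */
		mask = rnp->grpmask;
		rnp = rnp->parent;
	}
}

int main(void)
{
	struct mini_rnp root = { .qsmask = 0x1, .parent = NULL };
	struct mini_rnp leaf = { .qsmask = 0x3, .grpmask = 0x1,
				 .parent = &root };

	printf("%d\n", mini_report_qs(&leaf, 0x1));	/* 0: CPU 1 pending */
	printf("%d\n", mini_report_qs(&leaf, 0x2));	/* 1: GP complete */
	return 0;
}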
768/* 811/*
769 * Record a quiescent state for the specified CPU, which must either be 812 * Record a quiescent state for the specified CPU to that CPU's rcu_data
770 * the current CPU. The lastcomp argument is used to make sure we are 813 * structure. This must be either called from the specified CPU, or
771 * still in the grace period of interest. We don't want to end the current 814 * called when the specified CPU is known to be offline (and when it is
772 * grace period based on quiescent states detected in an earlier grace 815 * also known that no other CPU is concurrently trying to help the offline
773 * period! 816 * CPU). The lastcomp argument is used to make sure we are still in the
817 * grace period of interest. We don't want to end the current grace period
818 * based on quiescent states detected in an earlier grace period!
774 */ 819 */
775static void 820static void
776cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 821rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
777{ 822{
778 unsigned long flags; 823 unsigned long flags;
779 unsigned long mask; 824 unsigned long mask;
@@ -781,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
781 826
782 rnp = rdp->mynode; 827 rnp = rdp->mynode;
783 spin_lock_irqsave(&rnp->lock, flags); 828 spin_lock_irqsave(&rnp->lock, flags);
784 if (lastcomp != ACCESS_ONCE(rsp->completed)) { 829 if (lastcomp != rnp->completed) {
785 830
786 /* 831 /*
787 * Someone beat us to it for this grace period, so leave. 832 * Someone beat us to it for this grace period, so leave.
788 * The race with GP start is resolved by the fact that we 833 * The race with GP start is resolved by the fact that we
789 * hold the leaf rcu_node lock, so that the per-CPU bits 834 * hold the leaf rcu_node lock, so that the per-CPU bits
790 * cannot yet be initialized -- so we would simply find our 835 * cannot yet be initialized -- so we would simply find our
791 * CPU's bit already cleared in cpu_quiet_msk() if this race 836 * CPU's bit already cleared in rcu_report_qs_rnp() if this
792 * occurred. 837 * race occurred.
793 */ 838 */
794 rdp->passed_quiesc = 0; /* try again later! */ 839 rdp->passed_quiesc = 0; /* try again later! */
795 spin_unlock_irqrestore(&rnp->lock, flags); 840 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -807,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
807 */ 852 */
808 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 853 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
809 854
 810 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 855 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* releases rnp->lock */
811 } 856 }
812} 857}
813 858
@@ -838,8 +883,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
838 if (!rdp->passed_quiesc) 883 if (!rdp->passed_quiesc)
839 return; 884 return;
840 885
841 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ 886 /*
842 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 887 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
888 * judge of that).
889 */
890 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
843} 891}
844 892
845#ifdef CONFIG_HOTPLUG_CPU 893#ifdef CONFIG_HOTPLUG_CPU
@@ -899,8 +947,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
899static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 947static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
900{ 948{
901 unsigned long flags; 949 unsigned long flags;
902 long lastcomp;
903 unsigned long mask; 950 unsigned long mask;
951 int need_report = 0;
904 struct rcu_data *rdp = rsp->rda[cpu]; 952 struct rcu_data *rdp = rsp->rda[cpu];
905 struct rcu_node *rnp; 953 struct rcu_node *rnp;
906 954
@@ -914,30 +962,32 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
914 spin_lock(&rnp->lock); /* irqs already disabled. */ 962 spin_lock(&rnp->lock); /* irqs already disabled. */
915 rnp->qsmaskinit &= ~mask; 963 rnp->qsmaskinit &= ~mask;
916 if (rnp->qsmaskinit != 0) { 964 if (rnp->qsmaskinit != 0) {
917 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 965 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */
918 break; 967 break;
919 } 968 }
920 969 if (rnp == rdp->mynode)
921 /* 970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
922 * If there was a task blocking the current grace period, 971 else
923 * and if all CPUs have checked in, we need to propagate 972 spin_unlock(&rnp->lock); /* irqs remain disabled. */
924 * the quiescent state up the rcu_node hierarchy. But that
925 * is inconvenient at the moment due to deadlock issues if
926 * this should end the current grace period. So set the
927 * offlined CPU's bit in ->qsmask in order to force the
928 * next force_quiescent_state() invocation to clean up this
929 * mess in a deadlock-free manner.
930 */
931 if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
932 rnp->qsmask |= mask;
933
934 mask = rnp->grpmask; 973 mask = rnp->grpmask;
935 spin_unlock(&rnp->lock); /* irqs remain disabled. */
936 rnp = rnp->parent; 974 rnp = rnp->parent;
937 } while (rnp != NULL); 975 } while (rnp != NULL);
938 lastcomp = rsp->completed;
939 976
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 977 /*
978 * We still hold the leaf rcu_node structure lock here, and
979 * irqs are still disabled. The reason for this subterfuge is
 980 * that invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock.
982 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags);
987 else
988 spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp);
941 991
942 rcu_adopt_orphan_cbs(rsp); 992 rcu_adopt_orphan_cbs(rsp);
943} 993}
@@ -1109,7 +1159,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1109 rcu_for_each_leaf_node(rsp, rnp) { 1159 rcu_for_each_leaf_node(rsp, rnp) {
1110 mask = 0; 1160 mask = 0;
1111 spin_lock_irqsave(&rnp->lock, flags); 1161 spin_lock_irqsave(&rnp->lock, flags);
1112 if (rsp->completed != lastcomp) { 1162 if (rnp->completed != lastcomp) {
1113 spin_unlock_irqrestore(&rnp->lock, flags); 1163 spin_unlock_irqrestore(&rnp->lock, flags);
1114 return 1; 1164 return 1;
1115 } 1165 }
@@ -1123,10 +1173,10 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1123 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1124 mask |= bit; 1174 mask |= bit;
1125 } 1175 }
1126 if (mask != 0 && rsp->completed == lastcomp) { 1176 if (mask != 0 && rnp->completed == lastcomp) {
1127 1177
1128 /* cpu_quiet_msk() releases rnp->lock. */ 1178 /* rcu_report_qs_rnp() releases rnp->lock. */
1129 cpu_quiet_msk(mask, rsp, rnp, flags); 1179 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1130 continue; 1180 continue;
1131 } 1181 }
1132 spin_unlock_irqrestore(&rnp->lock, flags); 1182 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1144,6 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1144 long lastcomp; 1194 long lastcomp;
1145 struct rcu_node *rnp = rcu_get_root(rsp); 1195 struct rcu_node *rnp = rcu_get_root(rsp);
1146 u8 signaled; 1196 u8 signaled;
1197 u8 forcenow;
1147 1198
1148 if (!rcu_gp_in_progress(rsp)) 1199 if (!rcu_gp_in_progress(rsp))
1149 return; /* No grace period in progress, nothing to force. */ 1200 return; /* No grace period in progress, nothing to force. */
@@ -1156,10 +1207,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1156 goto unlock_ret; /* no emergency and done recently. */ 1207 goto unlock_ret; /* no emergency and done recently. */
1157 rsp->n_force_qs++; 1208 rsp->n_force_qs++;
1158 spin_lock(&rnp->lock); 1209 spin_lock(&rnp->lock);
1159 lastcomp = rsp->completed; 1210 lastcomp = rsp->gpnum - 1;
1160 signaled = rsp->signaled; 1211 signaled = rsp->signaled;
1161 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1162 if (lastcomp == rsp->gpnum) { 1213 if (!rcu_gp_in_progress(rsp)) {
1163 rsp->n_force_qs_ngp++; 1214 rsp->n_force_qs_ngp++;
1164 spin_unlock(&rnp->lock); 1215 spin_unlock(&rnp->lock);
1165 goto unlock_ret; /* no GP in progress, time updated. */ 1216 goto unlock_ret; /* no GP in progress, time updated. */
@@ -1180,21 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1180 if (rcu_process_dyntick(rsp, lastcomp, 1231 if (rcu_process_dyntick(rsp, lastcomp,
1181 dyntick_save_progress_counter)) 1232 dyntick_save_progress_counter))
1182 goto unlock_ret; 1233 goto unlock_ret;
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1183 1237
1184 /* Update state, record completion counter. */ 1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1185 spin_lock(&rnp->lock); 1240 spin_lock(&rnp->lock);
1186 if (lastcomp == rsp->completed && 1241 if (lastcomp + 1 == rsp->gpnum &&
1187 rsp->signaled == RCU_SAVE_DYNTICK) { 1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1188 rsp->signaled = RCU_FORCE_QS; 1244 rsp->signaled = RCU_FORCE_QS;
1189 dyntick_record_completed(rsp, lastcomp); 1245 rsp->completed_fqs = lastcomp;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1190 } 1247 }
1191 spin_unlock(&rnp->lock); 1248 spin_unlock(&rnp->lock);
1192 break; 1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1193 1252
1194 case RCU_FORCE_QS: 1253 case RCU_FORCE_QS:
1195 1254
1196 /* Check dyntick-idle state, send IPI to laggards. */ 1255 /* Check dyntick-idle state, send IPI to laggards. */
1197 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), 1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs,
1198 rcu_implicit_dynticks_qs)) 1257 rcu_implicit_dynticks_qs))
1199 goto unlock_ret; 1258 goto unlock_ret;
1200 1259
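Taken together, the hunks above turn force_quiescent_state() into a three-stage machine; in outline (stage names from the patch):

/*
 * RCU_SAVE_DYNTICK	snapshot per-CPU dynticks counters; done early
 *	  |		if the snapshot alone proves all CPUs quiescent
 *	  v
 * RCU_SAVE_COMPLETED	latch the grace-period number into
 *	  |		rsp->completed_fqs, falling straight through to
 *	  v		the next stage when entered directly (forcenow)
 * RCU_FORCE_QS		re-check dynticks state, IPI remaining laggards
 *
 * Without CONFIG_NO_HZ there is no dyntick state to scan, so
 * RCU_SIGNAL_INIT becomes RCU_SAVE_COMPLETED and the first stage
 * is skipped entirely.
 */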
@@ -1351,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1351} 1410}
1352EXPORT_SYMBOL_GPL(call_rcu_bh); 1411EXPORT_SYMBOL_GPL(call_rcu_bh);
1353 1412
1413/**
1414 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1415 *
1416 * Control will return to the caller some time after a full rcu-sched
1417 * grace period has elapsed, in other words after all currently executing
1418 * rcu-sched read-side critical sections have completed. These read-side
1419 * critical sections are delimited by rcu_read_lock_sched() and
1420 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
1421 * local_irq_disable(), and so on may be used in place of
1422 * rcu_read_lock_sched().
1423 *
1424 * This means that all preempt_disable code sequences, including NMI and
1425 * hardware-interrupt handlers, in progress on entry will have completed
1426 * before this primitive returns. However, this does not guarantee that
1427 * softirq handlers will have completed, since in some kernels, these
1428 * handlers can run in process context, and can block.
1429 *
1430 * This primitive provides the guarantees made by the (now removed)
1431 * synchronize_kernel() API. In contrast, synchronize_rcu() only
1432 * guarantees that rcu_read_lock() sections will have completed.
1433 * In "classic RCU", these two guarantees happen to be one and
1434 * the same, but can differ in realtime RCU implementations.
1435 */
1436void synchronize_sched(void)
1437{
1438 struct rcu_synchronize rcu;
1439
1440 if (rcu_blocking_is_gp())
1441 return;
1442
1443 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */
1447 wait_for_completion(&rcu.completion);
1448}
1449EXPORT_SYMBOL_GPL(synchronize_sched);
1450
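A usage illustration of the unlink/wait/free pattern this primitive supports (sketch in kernel style; struct my_entry with its list_head member, my_lock, and the call site are all hypothetical):

/* Hypothetical updater: unlink, wait out every pre-existing
 * preempt-disabled reader, then free. */
static void remove_entry(struct my_entry *p)
{
	spin_lock(&my_lock);
	list_del_rcu(&p->list);		/* readers may still hold p */
	spin_unlock(&my_lock);
	synchronize_sched();		/* all preempt-off sections done */
	kfree(p);			/* nobody can reach p now */
}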
1451/**
1452 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
1453 *
1454 * Control will return to the caller some time after a full rcu_bh grace
1455 * period has elapsed, in other words after all currently executing rcu_bh
1456 * read-side critical sections have completed. RCU read-side critical
1457 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
1458 * and may be nested.
1459 */
1460void synchronize_rcu_bh(void)
1461{
1462 struct rcu_synchronize rcu;
1463
1464 if (rcu_blocking_is_gp())
1465 return;
1466
1467 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */
1471 wait_for_completion(&rcu.completion);
1472}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474
1354/* 1475/*
1355 * Check to see if there is any immediate RCU-related work to be done 1476 * Check to see if there is any immediate RCU-related work to be done
1356 * by the current CPU, for the specified type of RCU, returning 1 if so. 1477 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1360,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1360 */ 1481 */
1361static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 1482static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1362{ 1483{
1484 struct rcu_node *rnp = rdp->mynode;
1485
1363 rdp->n_rcu_pending++; 1486 rdp->n_rcu_pending++;
1364 1487
1365 /* Check for CPU stalls, if enabled. */ 1488 /* Check for CPU stalls, if enabled. */
@@ -1384,13 +1507,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1384 } 1507 }
1385 1508
1386 /* Has another RCU grace period completed? */ 1509 /* Has another RCU grace period completed? */
1387 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ 1510 if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
1388 rdp->n_rp_gp_completed++; 1511 rdp->n_rp_gp_completed++;
1389 return 1; 1512 return 1;
1390 } 1513 }
1391 1514
1392 /* Has a new RCU grace period started? */ 1515 /* Has a new RCU grace period started? */
1393 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ 1516 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
1394 rdp->n_rp_gp_started++; 1517 rdp->n_rp_gp_started++;
1395 return 1; 1518 return 1;
1396 } 1519 }
@@ -1433,6 +1556,21 @@ int rcu_needs_cpu(int cpu)
1433 rcu_preempt_needs_cpu(cpu); 1556 rcu_preempt_needs_cpu(cpu);
1434} 1557}
1435 1558
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1436static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1437static atomic_t rcu_barrier_cpu_count; 1575static atomic_t rcu_barrier_cpu_count;
1438static DEFINE_MUTEX(rcu_barrier_mutex); 1576static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1544,21 +1682,16 @@ static void __cpuinit
1544rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1682rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1545{ 1683{
1546 unsigned long flags; 1684 unsigned long flags;
1547 long lastcomp;
1548 unsigned long mask; 1685 unsigned long mask;
1549 struct rcu_data *rdp = rsp->rda[cpu]; 1686 struct rcu_data *rdp = rsp->rda[cpu];
1550 struct rcu_node *rnp = rcu_get_root(rsp); 1687 struct rcu_node *rnp = rcu_get_root(rsp);
1551 1688
1552 /* Set up local state, ensuring consistent view of global state. */ 1689 /* Set up local state, ensuring consistent view of global state. */
1553 spin_lock_irqsave(&rnp->lock, flags); 1690 spin_lock_irqsave(&rnp->lock, flags);
1554 lastcomp = rsp->completed;
1555 rdp->completed = lastcomp;
1556 rdp->gpnum = lastcomp;
1557 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1558 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1559 rdp->beenonline = 1; /* We have now been online. */ 1693 rdp->beenonline = 1; /* We have now been online. */
1560 rdp->preemptable = preemptable; 1694 rdp->preemptable = preemptable;
1561 rdp->passed_quiesc_completed = lastcomp - 1;
1562 rdp->qlen_last_fqs_check = 0; 1695 rdp->qlen_last_fqs_check = 0;
1563 rdp->n_force_qs_snap = rsp->n_force_qs; 1696 rdp->n_force_qs_snap = rsp->n_force_qs;
1564 rdp->blimit = blimit; 1697 rdp->blimit = blimit;
@@ -1580,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1580 spin_lock(&rnp->lock); /* irqs already disabled. */ 1713 spin_lock(&rnp->lock); /* irqs already disabled. */
1581 rnp->qsmaskinit |= mask; 1714 rnp->qsmaskinit |= mask;
1582 mask = rnp->grpmask; 1715 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) {
1717 rdp->gpnum = rnp->completed; /* if GP in progress... */
1718 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 }
1583 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1721 spin_unlock(&rnp->lock); /* irqs already disabled. */
1584 rnp = rnp->parent; 1722 rnp = rnp->parent;
1585 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
@@ -1597,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
1597/* 1735/*
1598 * Handle CPU online/offline notification events. 1736 * Handle CPU online/offline notification events.
1599 */ 1737 */
1600int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1738static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1601 unsigned long action, void *hcpu) 1739 unsigned long action, void *hcpu)
1602{ 1740{
1603 long cpu = (long)hcpu; 1741 long cpu = (long)hcpu;
1604 1742
@@ -1685,8 +1823,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1685 cpustride *= rsp->levelspread[i]; 1823 cpustride *= rsp->levelspread[i];
1686 rnp = rsp->level[i]; 1824 rnp = rsp->level[i];
1687 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1688 if (rnp != rcu_get_root(rsp)) 1826 spin_lock_init(&rnp->lock);
1689 spin_lock_init(&rnp->lock); 1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1690 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1691 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1692 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1707,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1707 rnp->level = i; 1845 rnp->level = i;
1708 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 1846 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1709 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1847 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1848 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1710 } 1850 }
1711 } 1851 }
1712 spin_lock_init(&rcu_get_root(rsp)->lock);
1713} 1852}
1714 1853
1715/* 1854/*
@@ -1735,16 +1874,30 @@ do { \
1735 } \ 1874 } \
1736} while (0) 1875} while (0)
1737 1876
1738void __init __rcu_init(void) 1877void __init rcu_init(void)
1739{ 1878{
1879 int i;
1880
1740 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1741#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1742 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1743#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1744 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1745 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1746 __rcu_init_preempt(); 1890 __rcu_init_preempt();
1747 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1891 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1892
1893 /*
1894 * We don't need protection against CPU-hotplug here because
1895 * this is called early in boot, before either interrupts
1896 * or the scheduler are operational.
1897 */
1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
1748} 1901}
1749 1902
1750#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1899023b0962..d2a0046f63b2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere. 35 * bug somewhere.
36 */ 36 */
37#define MAX_RCU_LVLS 3 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
41 42
42#if NR_CPUS <= RCU_FANOUT 43#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1 44# define NUM_RCU_LVLS 1
@@ -45,23 +46,33 @@
45# define NUM_RCU_LVL_1 (NR_CPUS) 46# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0 47# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0 48# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0
48#elif NR_CPUS <= RCU_FANOUT_SQ 50#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 51# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 52# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 54# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 55# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 57#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 58# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 59# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 62# define NUM_RCU_LVL_3 NR_CPUS
63# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH
65# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
70# define NUM_RCU_LVL_4 NR_CPUS
60#else 71#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 73#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63 74
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) 75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66 77
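A worked example of the new four-level case (hypothetical configuration, not from the patch):

/*
 * With RCU_FANOUT = 32 and NR_CPUS = 65536: 32^3 = 32768 < 65536 and
 * 65536 <= 32^4 = 1048576, so NUM_RCU_LVLS = 4 and:
 *
 *	NUM_RCU_LVL_0 = 1				(root)
 *	NUM_RCU_LVL_1 = DIV_ROUND_UP(65536, 32768) =	   2
 *	NUM_RCU_LVL_2 = DIV_ROUND_UP(65536,  1024) =	  64
 *	NUM_RCU_LVL_3 = DIV_ROUND_UP(65536,    32) =	2048
 *	NUM_RCU_LVL_4 = 65536		(leaves, one per rcu_data)
 *
 *	NUM_RCU_NODES = RCU_SUM - NR_CPUS = 1 + 2 + 64 + 2048 = 2115
 */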
67/* 78/*
@@ -84,14 +95,21 @@ struct rcu_node {
84 long gpnum; /* Current grace period for this node. */ 95 long gpnum; /* Current grace period for this node. */
85 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
86 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */
99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */
87 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
88 /* order for current grace period to proceed.*/ 102 /* order for current grace period to proceed.*/
89 /* In leaf rcu_node, each bit corresponds to */ 103 /* In leaf rcu_node, each bit corresponds to */
90 /* an rcu_data structure, otherwise, each */ 104 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */ 105 /* bit corresponds to a child rcu_node */
92 /* structure. */ 106 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */
108 /* elements that need to drain to allow the */
109 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */
93 unsigned long qsmaskinit; 111 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */ 112 /* Per-GP initial value for qsmask & expmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 113 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */ 114 /* Only one bit will be set in this mask. */
97 int grplo; /* lowest-numbered CPU or group here. */ 115 int grplo; /* lowest-numbered CPU or group here. */
@@ -99,7 +117,7 @@ struct rcu_node {
99 u8 grpnum; /* CPU/group number for next level up. */ 117 u8 grpnum; /* CPU/group number for next level up. */
100 u8 level; /* root is at level 0. */ 118 u8 level; /* root is at level 0. */
101 struct rcu_node *parent; 119 struct rcu_node *parent;
102 struct list_head blocked_tasks[2]; 120 struct list_head blocked_tasks[4];
103 /* Tasks blocked in RCU read-side critsect. */ 121 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */ 122 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */ 123 /* by tasks on the (x & 0x1) element of the */
@@ -114,6 +132,21 @@ struct rcu_node {
114 for ((rnp) = &(rsp)->node[0]; \ 132 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 133 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116 134
135/*
136 * Do a breadth-first scan of the non-leaf rcu_node structures for the
137 * specified rcu_state structure. Note that if there is a singleton
138 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
139 */
140#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
141 for ((rnp) = &(rsp)->node[0]; \
142 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
143
144/*
145 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
146 * structure. Note that if there is a singleton rcu_node tree with but
147 * one rcu_node structure, this loop -will- visit the rcu_node structure.
148 * It is still a leaf node, even if it is also the root node.
149 */
117#define rcu_for_each_leaf_node(rsp, rnp) \ 150#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 151 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 152 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
@@ -204,11 +237,12 @@ struct rcu_data {
204#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
205#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
206#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
207#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
208#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
209#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
210#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
211#define RCU_SIGNAL_INIT RCU_FORCE_QS 245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED
212#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
213 247
214#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -246,7 +280,7 @@ struct rcu_state {
246 long gpnum; /* Current gp number. */ 280 long gpnum; /* Current gp number. */
247 long completed; /* # of last completed gp. */ 281 long completed; /* # of last completed gp. */
248 282
249 /* End of fields guarded by root rcu_node's lock. */ 283 /* End of fields guarded by root rcu_node's lock. */
250 284
251 spinlock_t onofflock; /* exclude on/offline and */ 285 spinlock_t onofflock; /* exclude on/offline and */
252 /* starting new GP. Also */ 286 /* starting new GP. Also */
@@ -260,6 +294,8 @@ struct rcu_state {
260 long orphan_qlen; /* Number of orphaned cbs. */ 294 long orphan_qlen; /* Number of orphaned cbs. */
261 spinlock_t fqslock; /* Only one task forcing */ 295 spinlock_t fqslock; /* Only one task forcing */
262 /* quiescent states. */ 296 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
263 unsigned long jiffies_force_qs; /* Time at which to invoke */ 299 unsigned long jiffies_force_qs; /* Time at which to invoke */
264 /* force_quiescent_state(). */ 300 /* force_quiescent_state(). */
265 unsigned long n_force_qs; /* Number of calls to */ 301 unsigned long n_force_qs; /* Number of calls to */
@@ -274,11 +310,15 @@ struct rcu_state {
274 unsigned long jiffies_stall; /* Time at which to check */ 310 unsigned long jiffies_stall; /* Time at which to check */
275 /* for CPU stalls. */ 311 /* for CPU stalls. */
276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
277#ifdef CONFIG_NO_HZ
278 long dynticks_completed; /* Value of completed @ snap. */
279#endif /* #ifdef CONFIG_NO_HZ */
280}; 313};
281 314
315/* Return values for rcu_preempt_offline_tasks(). */
316
317#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
318 /* GP were moved to root. */
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */
321
282#ifdef RCU_TREE_NONCORE 322#ifdef RCU_TREE_NONCORE
283 323
284/* 324/*
@@ -298,10 +338,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
298#else /* #ifdef RCU_TREE_NONCORE */ 338#else /* #ifdef RCU_TREE_NONCORE */
299 339
300/* Forward declarations for rcutree_plugin.h */ 340/* Forward declarations for rcutree_plugin.h */
301static inline void rcu_bootup_announce(void); 341static void rcu_bootup_announce(void);
302long rcu_batches_completed(void); 342long rcu_batches_completed(void);
303static void rcu_preempt_note_context_switch(int cpu); 343static void rcu_preempt_note_context_switch(int cpu);
304static int rcu_preempted_readers(struct rcu_node *rnp); 344static int rcu_preempted_readers(struct rcu_node *rnp);
345#ifdef CONFIG_HOTPLUG_CPU
346static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */
305#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
306static void rcu_print_task_stall(struct rcu_node *rnp); 350static void rcu_print_task_stall(struct rcu_node *rnp);
307#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -315,6 +359,9 @@ static void rcu_preempt_offline_cpu(int cpu);
315static void rcu_preempt_check_callbacks(int cpu); 359static void rcu_preempt_check_callbacks(int cpu);
316static void rcu_preempt_process_callbacks(void); 360static void rcu_preempt_process_callbacks(void);
317void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 361void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
362#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
363static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
364#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
318static int rcu_preempt_pending(int cpu); 365static int rcu_preempt_pending(int cpu);
319static int rcu_preempt_needs_cpu(int cpu); 366static int rcu_preempt_needs_cpu(int cpu);
320static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 367static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ef2a58c2b9d5..37fbccdf41d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> 24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */ 25 */
26 26
27#include <linux/delay.h>
27 28
28#ifdef CONFIG_TREE_PREEMPT_RCU 29#ifdef CONFIG_TREE_PREEMPT_RCU
29 30
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 32DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32 33
34static int rcu_preempted_readers_exp(struct rcu_node *rnp);
35
33/* 36/*
34 * Tell them what RCU they are running. 37 * Tell them what RCU they are running.
35 */ 38 */
36static inline void rcu_bootup_announce(void) 39static void __init rcu_bootup_announce(void)
37{ 40{
38 printk(KERN_INFO 41 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n"); 42 "Experimental preemptable hierarchical RCU implementation.\n");
@@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
67static void rcu_preempt_qs(int cpu) 70static void rcu_preempt_qs(int cpu)
68{ 71{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed; 73 rdp->passed_quiesc_completed = rdp->gpnum - 1;
71 barrier(); 74 barrier();
72 rdp->passed_quiesc = 1; 75 rdp->passed_quiesc = 1;
73} 76}
@@ -157,14 +160,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
157 */ 160 */
158static int rcu_preempted_readers(struct rcu_node *rnp) 161static int rcu_preempted_readers(struct rcu_node *rnp)
159{ 162{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 163 int phase = rnp->gpnum & 0x1;
164
165 return !list_empty(&rnp->blocked_tasks[phase]) ||
166 !list_empty(&rnp->blocked_tasks[phase + 2]);
167}
168
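A map of the four ->blocked_tasks[] lists consulted above (summarizing this patch; phase == rnp->gpnum & 0x1):

/*
 *	[phase]		blocking the current normal grace period
 *	[!phase]	queued toward the next normal grace period
 *	[phase + 2]	as [phase], but also blocking the current
 *			expedited grace period (tasks arrive here via
 *			list_splice_init() in sync_rcu_preempt_exp_init())
 *	[!phase + 2]	as [!phase], likewise captured by the current
 *			expedited grace period
 *
 * Hence rcu_preempted_readers() checks both [phase] and [phase + 2]:
 * moving a task onto an expedited list must not hide it from the
 * normal grace-period machinery.
 */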
169/*
170 * Record a quiescent state for all tasks that were previously queued
171 * on the specified rcu_node structure and that were blocking the current
172 * RCU grace period. The caller must hold the specified rnp->lock with
173 * irqs disabled, and this lock is released upon return, but irqs remain
174 * disabled.
175 */
176static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
177 __releases(rnp->lock)
178{
179 unsigned long mask;
180 struct rcu_node *rnp_p;
181
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */
185 }
186
187 rnp_p = rnp->parent;
188 if (rnp_p == NULL) {
189 /*
190 * Either there is only one rcu_node in the tree,
191 * or tasks were kicked up to root rcu_node due to
192 * CPUs going offline.
193 */
194 rcu_report_qs_rsp(&rcu_preempt_state, flags);
195 return;
196 }
197
198 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
161} 203}
162 204
205/*
206 * Handle special cases during rcu_read_unlock(), such as needing to
 207 * notify RCU core processing or the task having blocked during the RCU
208 * read-side critical section.
209 */
163static void rcu_read_unlock_special(struct task_struct *t) 210static void rcu_read_unlock_special(struct task_struct *t)
164{ 211{
165 int empty; 212 int empty;
213 int empty_exp;
166 unsigned long flags; 214 unsigned long flags;
167 unsigned long mask;
168 struct rcu_node *rnp; 215 struct rcu_node *rnp;
169 int special; 216 int special;
170 217
@@ -207,36 +254,30 @@ static void rcu_read_unlock_special(struct task_struct *t)
207 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 254 spin_unlock(&rnp->lock); /* irqs remain disabled. */
208 } 255 }
209 empty = !rcu_preempted_readers(rnp); 256 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp);
258 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
210 list_del_init(&t->rcu_node_entry); 259 list_del_init(&t->rcu_node_entry);
211 t->rcu_blocked_node = NULL; 260 t->rcu_blocked_node = NULL;
212 261
213 /* 262 /*
214 * If this was the last task on the current list, and if 263 * If this was the last task on the current list, and if
215 * we aren't waiting on any CPUs, report the quiescent state. 264 * we aren't waiting on any CPUs, report the quiescent state.
216 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
217 * drop rnp->lock and restore irq.
218 */ 266 */
219 if (!empty && rnp->qsmask == 0 && 267 if (empty)
220 !rcu_preempted_readers(rnp)) {
221 struct rcu_node *rnp_p;
222
223 if (rnp->parent == NULL) {
224 /* Only one rcu_node in the tree. */
225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
226 return;
227 }
228 /* Report up the rest of the hierarchy. */
229 mask = rnp->grpmask;
230 spin_unlock_irqrestore(&rnp->lock, flags); 268 spin_unlock_irqrestore(&rnp->lock, flags);
231 rnp_p = rnp->parent; 269 else
232 spin_lock_irqsave(&rnp_p->lock, flags); 270 rcu_report_unblock_qs_rnp(rnp, flags);
233 WARN_ON_ONCE(rnp->qsmask); 271
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 272 /*
235 return; 273 * If this was the last task on the expedited lists,
236 } 274 * then we need to report up the rcu_node hierarchy.
237 spin_unlock(&rnp->lock); 275 */
276 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
277 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
278 } else {
279 local_irq_restore(flags);
238 } 280 }
239 local_irq_restore(flags);
240} 281}
241 282
242/* 283/*
@@ -303,6 +344,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 344 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 345 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 346 * make more than two attempts to acquire the target rcu_node's lock.
347 * Returns true if there were tasks blocking the current RCU grace
348 * period.
306 * 349 *
307 * Returns 1 if there was previously a task blocking the current grace 350 * Returns 1 if there was previously a task blocking the current grace
308 * period on the specified rcu_node structure. 351 * period on the specified rcu_node structure.
@@ -316,7 +359,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
316 int i; 359 int i;
317 struct list_head *lp; 360 struct list_head *lp;
318 struct list_head *lp_root; 361 struct list_head *lp_root;
319 int retval = rcu_preempted_readers(rnp); 362 int retval = 0;
320 struct rcu_node *rnp_root = rcu_get_root(rsp); 363 struct rcu_node *rnp_root = rcu_get_root(rsp);
321 struct task_struct *tp; 364 struct task_struct *tp;
322 365
@@ -326,7 +369,9 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
326 } 369 }
327 WARN_ON_ONCE(rnp != rdp->mynode && 370 WARN_ON_ONCE(rnp != rdp->mynode &&
328 (!list_empty(&rnp->blocked_tasks[0]) || 371 (!list_empty(&rnp->blocked_tasks[0]) ||
329 !list_empty(&rnp->blocked_tasks[1]))); 372 !list_empty(&rnp->blocked_tasks[1]) ||
373 !list_empty(&rnp->blocked_tasks[2]) ||
374 !list_empty(&rnp->blocked_tasks[3])));
330 375
331 /* 376 /*
332 * Move tasks up to root rcu_node. Rely on the fact that the 377 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -334,7 +379,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 334 * rcu_nodes in terms of gpnum value. This fact allows us to 379 * rcu_nodes in terms of gpnum value. This fact allows us to
335 * move the blocked_tasks[] array directly, element by element. 380 * move the blocked_tasks[] array directly, element by element.
336 */ 381 */
337 for (i = 0; i < 2; i++) { 382 if (rcu_preempted_readers(rnp))
383 retval |= RCU_OFL_TASKS_NORM_GP;
384 if (rcu_preempted_readers_exp(rnp))
385 retval |= RCU_OFL_TASKS_EXP_GP;
386 for (i = 0; i < 4; i++) {
338 lp = &rnp->blocked_tasks[i]; 387 lp = &rnp->blocked_tasks[i];
339 lp_root = &rnp_root->blocked_tasks[i]; 388 lp_root = &rnp_root->blocked_tasks[i];
340 while (!list_empty(lp)) { 389 while (!list_empty(lp)) {
@@ -346,7 +395,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
346 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
347 } 396 }
348 } 397 }
349
350 return retval; 398 return retval;
351} 399}
352 400
@@ -398,14 +446,183 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
398} 446}
399EXPORT_SYMBOL_GPL(call_rcu); 447EXPORT_SYMBOL_GPL(call_rcu);
400 448
449/**
450 * synchronize_rcu - wait until a grace period has elapsed.
451 *
452 * Control will return to the caller some time after a full grace
453 * period has elapsed, in other words after all currently executing RCU
454 * read-side critical sections have completed. RCU read-side critical
455 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
456 * and may be nested.
457 */
458void synchronize_rcu(void)
459{
460 struct rcu_synchronize rcu;
461
462 if (!rcu_scheduler_active)
463 return;
464
465 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */
469 wait_for_completion(&rcu.completion);
470}
471EXPORT_SYMBOL_GPL(synchronize_rcu);
472
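For illustration (gp, struct foo, make_foo(), do_something(), and the update-side lock are hypothetical, not from the patch), the publish/retire pattern that synchronize_rcu() completes:

/* Reader: */
rcu_read_lock();
p = rcu_dereference(gp);
if (p)
	do_something(p);		/* hypothetical consumer */
rcu_read_unlock();

/* Updater (holding the update-side lock): */
old = gp;
rcu_assign_pointer(gp, make_foo());	/* publish the replacement */
synchronize_rcu();			/* wait out readers of 'old' */
kfree(old);				/* now unreachable */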
473static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
474static long sync_rcu_preempt_exp_count;
475static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
476
401/* 477/*
402 * Wait for an rcu-preempt grace period. We are supposed to expedite the 478 * Return non-zero if there are any tasks in RCU read-side critical
 403 * grace period, but this is the crude slow compatibility hack, so just 479 * sections blocking the current preemptible-RCU expedited grace period.
404 * invoke synchronize_rcu(). 480 * If there is no preemptible-RCU expedited grace period currently in
481 * progress, returns zero unconditionally.
482 */
483static int rcu_preempted_readers_exp(struct rcu_node *rnp)
484{
485 return !list_empty(&rnp->blocked_tasks[2]) ||
486 !list_empty(&rnp->blocked_tasks[3]);
487}
488
489/*
 490 * Return non-zero if there is no RCU expedited grace period in progress
491 * for the specified rcu_node structure, in other words, if all CPUs and
492 * tasks covered by the specified rcu_node structure have done their bit
493 * for the current expedited grace period. Works only for preemptible
 494 * RCU -- other RCU implementations use other means.
495 *
496 * Caller must hold sync_rcu_preempt_exp_mutex.
497 */
498static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
499{
500 return !rcu_preempted_readers_exp(rnp) &&
501 ACCESS_ONCE(rnp->expmask) == 0;
502}
503
504/*
505 * Report the exit from RCU read-side critical section for the last task
506 * that queued itself during or before the current expedited preemptible-RCU
507 * grace period. This event is reported either to the rcu_node structure on
508 * which the task was queued or to one of that rcu_node structure's ancestors,
509 * recursively up the tree. (Calm down, calm down, we do the recursion
510 * iteratively!)
511 *
512 * Caller must hold sync_rcu_preempt_exp_mutex.
513 */
514static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
515{
516 unsigned long flags;
517 unsigned long mask;
518
519 spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp))
522 break;
523 if (rnp->parent == NULL) {
524 wake_up(&sync_rcu_preempt_exp_wq);
525 break;
526 }
527 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask;
532 }
533 spin_unlock_irqrestore(&rnp->lock, flags);
534}
535
536/*
537 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
538 * grace period for the specified rcu_node structure. If there are no such
539 * tasks, report it up the rcu_node hierarchy.
540 *
541 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
542 */
543static void
544sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{
546 int must_wait;
547
548 spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp);
555}
556
557/*
558 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
559 * is to invoke synchronize_sched_expedited() to push all the tasks to
560 * the ->blocked_tasks[] lists, move all entries from the first set of
561 * ->blocked_tasks[] lists to the second set, and finally wait for this
562 * second set to drain.
405 */ 563 */
406void synchronize_rcu_expedited(void) 564void synchronize_rcu_expedited(void)
407{ 565{
408 synchronize_rcu(); 566 unsigned long flags;
567 struct rcu_node *rnp;
568 struct rcu_state *rsp = &rcu_preempt_state;
569 long snap;
570 int trycount = 0;
571
572 smp_mb(); /* Caller's modifications seen first by other CPUs. */
573 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
574 smp_mb(); /* Above access cannot bleed into critical section. */
575
576 /*
577 * Acquire lock, falling back to synchronize_rcu() if too many
578 * lock-acquisition failures. Of course, if someone does the
579 * expedited grace period for us, just leave.
580 */
581 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
582 if (trycount++ < 10)
583 udelay(trycount * num_online_cpus());
584 else {
585 synchronize_rcu();
586 return;
587 }
588 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
589 goto mb_ret; /* Others did our work for us. */
590 }
591 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
592 goto unlock_mb_ret; /* Others did our work for us. */
593
594 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited();
596
597 spin_lock_irqsave(&rsp->onofflock, flags);
598
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 }
605
606 /* Snapshot current state of ->blocked_tasks[] lists. */
607 rcu_for_each_leaf_node(rsp, rnp)
608 sync_rcu_preempt_exp_init(rsp, rnp);
609 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611
612 spin_unlock_irqrestore(&rsp->onofflock, flags);
613
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp);
616 wait_event(sync_rcu_preempt_exp_wq,
617 sync_rcu_preempt_exp_done(rnp));
618
619 /* Clean up and exit. */
620 smp_mb(); /* ensure expedited GP seen before counter increment. */
621 ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
622unlock_mb_ret:
623 mutex_unlock(&sync_rcu_preempt_exp_mutex);
624mb_ret:
625 smp_mb(); /* ensure subsequent action seen after grace period. */
409} 626}
410EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 627EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
411 628
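In outline, the fallback logic in synchronize_rcu_expedited() above:

/*
 * 1. Snapshot sync_rcu_preempt_exp_count + 1; whenever the counter is
 *    later seen to have passed the snapshot, some other caller's
 *    expedited grace period already covered us, so just leave
 *    (the mb_ret / unlock_mb_ret paths).
 * 2. mutex_trylock() up to ten times, backing off with udelay()
 *    scaled by num_online_cpus(), since contention grows with CPUs.
 * 3. After ten failures, stop trying to expedite and fall back to
 *    plain synchronize_rcu(), which is always correct, merely slower.
 */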
@@ -481,7 +698,7 @@ void exit_rcu(void)
481/* 698/*
482 * Tell them what RCU they are running. 699 * Tell them what RCU they are running.
483 */ 700 */
484static inline void rcu_bootup_announce(void) 701static void __init rcu_bootup_announce(void)
485{ 702{
486 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 703 printk(KERN_INFO "Hierarchical RCU implementation.\n");
487} 704}
@@ -512,6 +729,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
512 return 0; 729 return 0;
513} 730}
514 731
732#ifdef CONFIG_HOTPLUG_CPU
733
734/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{
737 spin_unlock_irqrestore(&rnp->lock, flags);
738}
739
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */
741
515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 742#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
516 743
517/* 744/*
@@ -594,6 +821,20 @@ void synchronize_rcu_expedited(void)
594} 821}
595EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 822EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
596 823
824#ifdef CONFIG_HOTPLUG_CPU
825
826/*
827 * Because preemptible RCU does not exist, there is never any need to
828 * report on tasks preempted in RCU read-side critical sections during
829 * expedited RCU grace periods.
830 */
831static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
832{
833 return;
834}
835
836#endif /* #ifdef CONFIG_HOTPLUG_CPU */
837
597/* 838/*
598 * Because preemptable RCU does not exist, it never has any work to do. 839 * Because preemptable RCU does not exist, it never has any work to do.
599 */ 840 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62e..9d2c88423b31 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum;
158 int level = 0; 159 int level = 0;
160 int phase;
159 struct rcu_node *rnp; 161 struct rcu_node *rnp;
160 162
163 gpnum = rsp->gpnum;
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 168 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 169 rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 seq_puts(m, "\n"); 174 seq_puts(m, "\n");
172 level = rnp->level; 175 level = rnp->level;
173 } 176 }
174 seq_printf(m, "%lx/%lx %d:%d ^%d ", 177 phase = gpnum & 0x1;
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
175 rnp->qsmask, rnp->qsmaskinit, 179 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
176 rnp->grplo, rnp->grphi, rnp->grpnum); 184 rnp->grplo, rnp->grphi, rnp->grpnum);
177 } 185 }
178 seq_puts(m, "\n"); 186 seq_puts(m, "\n");
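For illustration, a hypothetical leaf-node line in the new format, with one
task blocking the other-phase normal grace period and all remaining lists
empty:

	3/3 ..>T. 0:1 ^0

Here 3/3 is qsmask/qsmaskinit; the two characters before ">" flag non-empty
blocked_tasks[] lists for the phase selected by the grace-period number's
low-order bit ("T" normal, "E" expedited, "." empty), and the two after ">"
do the same for the other phase; 0:1 is the grplo:grphi CPU range, and ^0 is
this node's bit in its parent.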
diff --git a/kernel/sched.c b/kernel/sched.c
index ec0af1fcb195..6ae2739b8f19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10901,6 +10901,7 @@ void synchronize_sched_expedited(void)
10901 spin_unlock_irqrestore(&rq->lock, flags); 10901 spin_unlock_irqrestore(&rq->lock, flags);
10902 } 10902 }
10903 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10903 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10904 synchronize_sched_expedited_count++;
10904 mutex_unlock(&rcu_sched_expedited_mutex); 10905 mutex_unlock(&rcu_sched_expedited_mutex);
10905 put_online_cpus(); 10906 put_online_cpus();
10906 if (need_full_sync) 10907 if (need_full_sync)
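For illustration, the counter bumped above is what synchronize_rcu_expedited()
snapshots and re-checks to detect that some other task has already driven an
expedited grace period on its behalf; a simplified stand-alone sketch of that
pattern, with hypothetical names, a blocking lock in place of the trylock
loop, and the memory barriers elided for brevity:

	#include <linux/compiler.h>
	#include <linux/mutex.h>

	static long exp_count;		/* bumped under exp_mutex after each expedited GP */
	static DEFINE_MUTEX(exp_mutex);

	static void expedite_or_piggyback(void (*do_expedited_gp)(void))
	{
		long snap = ACCESS_ONCE(exp_count) + 1;

		mutex_lock(&exp_mutex);
		if ((ACCESS_ONCE(exp_count) - snap) > 0) {
			/* A full expedited GP began and ended after our
			 * snapshot, so it covers our pre-existing readers. */
			mutex_unlock(&exp_mutex);
			return;
		}
		do_expedited_gp();	/* actually drive the grace period */
		ACCESS_ONCE(exp_count)++;
		mutex_unlock(&exp_mutex);
	}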
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..21939d9e830e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
302 if (!in_interrupt() && local_softirq_pending()) 302 if (!in_interrupt() && local_softirq_pending())
303 invoke_softirq(); 303 invoke_softirq();
304 304
305 rcu_irq_exit();
305#ifdef CONFIG_NO_HZ 306#ifdef CONFIG_NO_HZ
306 /* Make sure that timer wheel updates are propagated */ 307 /* Make sure that timer wheel updates are propagated */
307 rcu_irq_exit();
308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 308 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
309 tick_nohz_stop_sched_tick(0); 309 tick_nohz_stop_sched_tick(0);
310#endif 310#endif
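For illustration, a hedged sketch of how the now-unconditional pairing is
driven from an arch-level interrupt path; example_do_IRQ() is hypothetical:

	#include <linux/hardirq.h>
	#include <linux/irq.h>

	void example_do_IRQ(unsigned int irq)
	{
		irq_enter();		 /* calls rcu_irq_enter(), among other things */
		generic_handle_irq(irq); /* run the registered handler(s) */
		irq_exit();		 /* now calls rcu_irq_exit() even without CONFIG_NO_HZ */
	}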
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM); 50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 51}
52EXPORT_SYMBOL_GPL(init_srcu_struct);
52 53
53/* 54/*
54 * srcu_readers_active_idx -- returns approximate number of readers 55 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
97 free_percpu(sp->per_cpu_ref); 98 free_percpu(sp->per_cpu_ref);
98 sp->per_cpu_ref = NULL; 99 sp->per_cpu_ref = NULL;
99} 100}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
100 102
101/** 103/**
102 * srcu_read_lock - register a new reader for an SRCU-protected structure. 104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
118 preempt_enable(); 120 preempt_enable();
119 return idx; 121 return idx;
120} 122}
123EXPORT_SYMBOL_GPL(srcu_read_lock);
121 124
122/** 125/**
123 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure. 126 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
137 preempt_enable(); 140 preempt_enable();
138} 141}
142EXPORT_SYMBOL_GPL(srcu_read_unlock);
139 143
140/** 144/*
141 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
142 * @sp: srcu_struct with which to synchronize.
143 *
144 * Flip the completed counter, and wait for the old count to drain to zero.
145 * As with classic RCU, the updater must use some separate means of
146 * synchronizing concurrent updates. Can block; must be called from
147 * process context.
148 *
149 * Note that it is illegal to call synchronize_srcu() from the corresponding
150 * SRCU read-side critical section; doing so will result in deadlock.
151 * However, it is perfectly legal to call synchronize_srcu() on one
152 * srcu_struct from some other srcu_struct's read-side critical section.
153 */ 146 */
154void synchronize_srcu(struct srcu_struct *sp) 147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
155{ 148{
156 int idx; 149 int idx;
157 150
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
173 return; 166 return;
174 } 167 }
175 168
176 synchronize_sched(); /* Force memory barrier on all CPUs. */ 169 sync_func(); /* Force memory barrier on all CPUs. */
177 170
178 /* 171 /*
179 * The preceding synchronize_sched() ensures that any CPU that 172 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
190 idx = sp->completed & 0x1; 183 idx = sp->completed & 0x1;
191 sp->completed++; 184 sp->completed++;
192 185
193 synchronize_sched(); /* Force memory barrier on all CPUs. */ 186 sync_func(); /* Force memory barrier on all CPUs. */
194 187
195 /* 188 /*
196 * At this point, because of the preceding synchronize_sched(), 189 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
203 while (srcu_readers_active_idx(sp, idx)) 196 while (srcu_readers_active_idx(sp, idx))
204 schedule_timeout_interruptible(1); 197 schedule_timeout_interruptible(1);
205 198
206 synchronize_sched(); /* Force memory barrier on all CPUs. */ 199 sync_func(); /* Force memory barrier on all CPUs. */
207 200
208 /* 201 /*
209 * The preceding synchronize_sched() forces all srcu_read_unlock() 202 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp)
237} 230}
238 231
239/** 232/**
233 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
234 * @sp: srcu_struct with which to synchronize.
235 *
236 * Flip the completed counter, and wait for the old count to drain to zero.
237 * As with classic RCU, the updater must use some separate means of
238 * synchronizing concurrent updates. Can block; must be called from
239 * process context.
240 *
241 * Note that it is illegal to call synchronize_srcu() from the corresponding
242 * SRCU read-side critical section; doing so will result in deadlock.
243 * However, it is perfectly legal to call synchronize_srcu() on one
244 * srcu_struct from some other srcu_struct's read-side critical section.
245 */
246void synchronize_srcu(struct srcu_struct *sp)
247{
248 __synchronize_srcu(sp, synchronize_sched);
249}
250EXPORT_SYMBOL_GPL(synchronize_srcu);
251
252/**
253 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
254 * @sp: srcu_struct with which to synchronize.
255 *
256 * Flip the completed counter, and wait for the old count to drain to zero.
257 * As with classic RCU, the updater must use some separate means of
258 * synchronizing concurrent updates. Can block; must be called from
259 * process context.
260 *
261 * Note that it is illegal to call synchronize_srcu_expedited()
262 * from the corresponding SRCU read-side critical section; doing so
263 * will result in deadlock. However, it is perfectly legal to call
264 * synchronize_srcu_expedited() on one srcu_struct from some other
265 * srcu_struct's read-side critical section.
266 */
267void synchronize_srcu_expedited(struct srcu_struct *sp)
268{
269 __synchronize_srcu(sp, synchronize_sched_expedited);
270}
271EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
272
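For illustration, a hedged sketch contrasting an SRCU reader with an updater
using the new expedited primitive; blob_srcu, cur_blob, and both functions are
hypothetical, init_srcu_struct(&blob_srcu) is assumed to have run, and the
updater is assumed to hold its own update-side lock:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct blob { int val; };
	static struct blob *cur_blob;
	static struct srcu_struct blob_srcu;

	static int read_blob(void)
	{
		int idx, v;

		idx = srcu_read_lock(&blob_srcu);	/* sleeping is legal in here */
		v = rcu_dereference(cur_blob)->val;
		srcu_read_unlock(&blob_srcu, idx);
		return v;
	}

	static void replace_blob(struct blob *newp)
	{
		struct blob *oldp = cur_blob;

		rcu_assign_pointer(cur_blob, newp);
		synchronize_srcu_expedited(&blob_srcu);	/* or synchronize_srcu() if latency is no concern */
		kfree(oldp);
	}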
273/**
240 * srcu_batches_completed - return batches completed. 274 * srcu_batches_completed - return batches completed.
241 * @sp: srcu_struct on which to report batch completion. 275 * @sp: srcu_struct on which to report batch completion.
242 * 276 *
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
248{ 282{
249 return sp->completed; 283 return sp->completed;
250} 284}
251
252EXPORT_SYMBOL_GPL(init_srcu_struct);
253EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
254EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 285EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 234ceb10861f..a79c4d0407ab 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -750,7 +750,7 @@ config RCU_TORTURE_TEST_RUNNABLE
750config RCU_CPU_STALL_DETECTOR 750config RCU_CPU_STALL_DETECTOR
751 bool "Check for stalled CPUs delaying RCU grace periods" 751 bool "Check for stalled CPUs delaying RCU grace periods"
752 depends on TREE_RCU || TREE_PREEMPT_RCU 752 depends on TREE_RCU || TREE_PREEMPT_RCU
753 default n 753 default y
754 help 754 help
755 This option causes RCU to printk information on which 755 This option causes RCU to printk information on which
756 CPUs are delaying the current grace period, but only when 756 CPUs are delaying the current grace period, but only when