-rw-r--r--  Documentation/RCU/RTFP.txt           2
-rw-r--r--  Documentation/RCU/checklist.txt     17
-rw-r--r--  Documentation/RCU/listRCU.txt        2
-rw-r--r--  Documentation/RCU/rcuref.txt        61
-rw-r--r--  Documentation/RCU/trace.txt        396
-rw-r--r--  Documentation/RCU/whatisRCU.txt     17
-rw-r--r--  Documentation/memory-barriers.txt    9
-rw-r--r--  arch/um/drivers/mconsole_kern.c      2
-rw-r--r--  include/linux/rculist.h             17
-rw-r--r--  include/linux/rcupdate.h            27
-rw-r--r--  include/linux/sched.h               10
-rw-r--r--  include/linux/srcu.h                34
-rw-r--r--  init/Kconfig                        19
-rw-r--r--  kernel/ksysfs.c                     18
-rw-r--r--  kernel/rcu.h                         2
-rw-r--r--  kernel/rcupdate.c                    3
-rw-r--r--  kernel/rcutiny.c                     2
-rw-r--r--  kernel/rcutiny_plugin.h              5
-rw-r--r--  kernel/rcutorture.c                 54
-rw-r--r--  kernel/rcutree.c                   222
-rw-r--r--  kernel/rcutree.h                    19
-rw-r--r--  kernel/rcutree_plugin.h             18
-rw-r--r--  kernel/rcutree_trace.c             318
-rw-r--r--  kernel/sched/core.c                  8
-rw-r--r--  kernel/srcu.c                       16
-rw-r--r--  lib/Kconfig.debug                    2
26 files changed, 795 insertions, 505 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 7c1dfb19fc40..7f40c72a9c51 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -186,7 +186,7 @@ Bibtex Entries
186 186
187@article{Kung80 187@article{Kung80
188,author="H. T. Kung and Q. Lehman" 188,author="H. T. Kung and Q. Lehman"
189,title="Concurrent Maintenance of Binary Search Trees" 189,title="Concurrent Manipulation of Binary Search Trees"
190,Year="1980" 190,Year="1980"
191,Month="September" 191,Month="September"
192,journal="ACM Transactions on Database Systems" 192,journal="ACM Transactions on Database Systems"
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index cdb20d41a44a..31ef8fe07f82 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -271,15 +271,14 @@ over a rather long period of time, but improvements are always welcome!
271 The same cautions apply to call_rcu_bh() and call_rcu_sched(). 271 The same cautions apply to call_rcu_bh() and call_rcu_sched().
272 272
2739. All RCU list-traversal primitives, which include 2739. All RCU list-traversal primitives, which include
274 rcu_dereference(), list_for_each_entry_rcu(), 274 rcu_dereference(), list_for_each_entry_rcu(), and
275 list_for_each_continue_rcu(), and list_for_each_safe_rcu(), 275 list_for_each_safe_rcu(), must be either within an RCU read-side
276 must be either within an RCU read-side critical section or 276 critical section or must be protected by appropriate update-side
277 must be protected by appropriate update-side locks. RCU 277 locks. RCU read-side critical sections are delimited by
278 read-side critical sections are delimited by rcu_read_lock() 278 rcu_read_lock() and rcu_read_unlock(), or by similar primitives
279 and rcu_read_unlock(), or by similar primitives such as 279 such as rcu_read_lock_bh() and rcu_read_unlock_bh(), in which
280 rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case 280 case the matching rcu_dereference() primitive must be used in
281 the matching rcu_dereference() primitive must be used in order 281 order to keep lockdep happy, in this case, rcu_dereference_bh().
282 to keep lockdep happy, in this case, rcu_dereference_bh().
283 282
284 The reason that it is permissible to use RCU list-traversal 283 The reason that it is permissible to use RCU list-traversal
285 primitives when the update-side lock is held is that doing so 284 primitives when the update-side lock is held is that doing so
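As a concrete rendering of rule 9, a minimal sketch showing both legal contexts for the traversal primitives (struct foo, foo_head, and foo_lock are hypothetical):

	struct foo {
		struct list_head list;
		int key;
		int data;
	};

	static LIST_HEAD(foo_head);
	static DEFINE_SPINLOCK(foo_lock);

	/* Legal context #1: an RCU read-side critical section. */
	static int foo_find(int key)
	{
		struct foo *p;
		int ret = -1;

		rcu_read_lock();
		list_for_each_entry_rcu(p, &foo_head, list) {
			if (p->key == key) {
				ret = p->data;
				break;
			}
		}
		rcu_read_unlock();
		return ret;
	}

	/* Legal context #2: holding the update-side lock. */
	static void foo_del(int key)
	{
		struct foo *p;

		spin_lock(&foo_lock);
		list_for_each_entry_rcu(p, &foo_head, list) {
			if (p->key == key) {
				list_del_rcu(&p->list);
				/* deferred free via call_rcu() omitted */
				break;
			}
		}
		spin_unlock(&foo_lock);
	}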
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
index 4349c1487e91..adb5a3782846 100644
--- a/Documentation/RCU/listRCU.txt
+++ b/Documentation/RCU/listRCU.txt
@@ -205,7 +205,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:
205 audit_copy_rule(&ne->rule, &e->rule); 205 audit_copy_rule(&ne->rule, &e->rule);
206 ne->rule.action = newaction; 206 ne->rule.action = newaction;
207 ne->rule.file_count = newfield_count; 207 ne->rule.file_count = newfield_count;
208 list_replace_rcu(e, ne); 208 list_replace_rcu(&e->list, &ne->list);
209 call_rcu(&e->rcu, audit_free_rule); 209 call_rcu(&e->rcu, audit_free_rule);
210 return 0; 210 return 0;
211 } 211 }
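The corrected call passes the embedded list_head rather than the enclosing structure. The same replace-and-reclaim pattern, sketched with a hypothetical struct:

	struct entry {
		struct list_head list;	/* linkage handed to list_replace_rcu() */
		struct rcu_head rcu;	/* linkage handed to call_rcu() */
		int data;
	};

	static void entry_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct entry, rcu));
	}

	/* Caller holds the update-side lock protecting the list. */
	static void entry_replace(struct entry *old, struct entry *new)
	{
		list_replace_rcu(&old->list, &new->list);
		call_rcu(&old->rcu, entry_free_rcu);
	}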
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 4202ad093130..141d531aa14b 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -20,7 +20,7 @@ release_referenced() delete()
20{ { 20{ {
21 ... write_lock(&list_lock); 21 ... write_lock(&list_lock);
22 atomic_dec(&el->rc, relfunc) ... 22 atomic_dec(&el->rc, relfunc) ...
23 ... delete_element 23 ... remove_element
24} write_unlock(&list_lock); 24} write_unlock(&list_lock);
25 ... 25 ...
26 if (atomic_dec_and_test(&el->rc)) 26 if (atomic_dec_and_test(&el->rc))
@@ -52,7 +52,7 @@ release_referenced() delete()
52{ { 52{ {
53 ... spin_lock(&list_lock); 53 ... spin_lock(&list_lock);
54 if (atomic_dec_and_test(&el->rc)) ... 54 if (atomic_dec_and_test(&el->rc)) ...
55 call_rcu(&el->head, el_free); delete_element 55 call_rcu(&el->head, el_free); remove_element
56 ... spin_unlock(&list_lock); 56 ... spin_unlock(&list_lock);
57} ... 57} ...
58 if (atomic_dec_and_test(&el->rc)) 58 if (atomic_dec_and_test(&el->rc))
@@ -64,3 +64,60 @@ Sometimes, a reference to the element needs to be obtained in the
64update (write) stream. In such cases, atomic_inc_not_zero() might be 64update (write) stream. In such cases, atomic_inc_not_zero() might be
65overkill, since we hold the update-side spinlock. One might instead 65overkill, since we hold the update-side spinlock. One might instead
66use atomic_inc() in such cases. 66use atomic_inc() in such cases.
67
68It is not always convenient to deal with "FAIL" in the
69search_and_reference() code path. In such cases, the
70atomic_dec_and_test() may be moved from delete() to el_free()
71as follows:
72
731. 2.
74add() search_and_reference()
75{ {
76 alloc_object rcu_read_lock();
77 ... search_for_element
78 atomic_set(&el->rc, 1); atomic_inc(&el->rc);
79 spin_lock(&list_lock); ...
80
81 add_element rcu_read_unlock();
82 ... }
83 spin_unlock(&list_lock); 4.
84} delete()
853. {
86release_referenced() spin_lock(&list_lock);
87{ ...
88 ... remove_element
89 if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock);
90 kfree(el); ...
91 ... call_rcu(&el->head, el_free);
92} ...
935. }
94void el_free(struct rcu_head *rhp)
95{
96 release_referenced();
97}
98
99The key point is that the initial reference added by add() is not removed
100until after a grace period has elapsed following removal. This means that
101search_and_reference() cannot find this element, which means that the value
102of el->rc cannot increase. Thus, once it reaches zero, there are no
103readers that can or ever will be able to reference the element. The
104element can therefore safely be freed. This in turn guarantees that if
105any reader finds the element, that reader may safely acquire a reference
106without checking the value of the reference counter.
107
108In cases where delete() can sleep, synchronize_rcu() can be called from
109delete(), so that el_free() can be subsumed into delete as follows:
110
1114.
112delete()
113{
114 spin_lock(&list_lock);
115 ...
116 remove_element
117 spin_unlock(&list_lock);
118 ...
119 synchronize_rcu();
120 if (atomic_dec_and_test(&el->rc))
121 kfree(el);
122 ...
123}
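Rendered as compilable code, steps 3 through 5 of the first pattern above might look as follows (struct element is hypothetical; add() and search_and_reference() follow the listings above):

	struct element {
		struct list_head list;
		struct rcu_head head;
		atomic_t rc;
	};

	static DEFINE_SPINLOCK(list_lock);

	/* 3. */
	static void release_referenced(struct element *el)
	{
		if (atomic_dec_and_test(&el->rc))
			kfree(el);
	}

	/* 5. Drops add()'s initial reference only after a grace period. */
	static void el_free(struct rcu_head *rhp)
	{
		release_referenced(container_of(rhp, struct element, head));
	}

	/* 4. */
	static void delete(struct element *el)
	{
		spin_lock(&list_lock);
		list_del_rcu(&el->list);
		spin_unlock(&list_lock);
		call_rcu(&el->head, el_free);
	}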
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 672d19083252..c776968f4463 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -10,51 +10,63 @@ for rcutree and next for rcutiny.
10 10
11CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats 11CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
12 12
13These implementations of RCU provides several debugfs files under the 13These implementations of RCU provide several debugfs directories under the
14top-level directory "rcu": 14top-level directory "rcu":
15 15
16rcu/rcudata: 16rcu/rcu_bh
17rcu/rcu_preempt
18rcu/rcu_sched
19
20Each directory contains files for the corresponding flavor of RCU.
21Note that rcu/rcu_preempt is only present for CONFIG_TREE_PREEMPT_RCU.
22For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
23so that activity for both appears in rcu/rcu_sched.
24
25In addition, the following file appears in the top-level directory:
26rcu/rcutorture. This file displays rcutorture test progress. The output
27of "cat rcu/rcutorture" looks as follows:
28
29rcutorture test sequence: 0 (test in progress)
30rcutorture update version number: 615
31
32The first line shows the number of rcutorture tests that have completed
33since boot. If a test is currently running, the "(test in progress)"
34string will appear as shown above. The second line shows the number of
35update cycles that the current test has started, or zero if there is
36no test in progress.
37
38
39Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
40also rcu/rcu_preempt) the following files will be present:
41
42rcudata:
17 Displays fields in struct rcu_data. 43 Displays fields in struct rcu_data.
18rcu/rcudata.csv: 44rcuexp:
19 Comma-separated values spreadsheet version of rcudata. 45 Displays statistics for expedited grace periods.
20rcu/rcugp: 46rcugp:
21 Displays grace-period counters. 47 Displays grace-period counters.
22rcu/rcuhier: 48rcuhier:
23 Displays the struct rcu_node hierarchy. 49 Displays the struct rcu_node hierarchy.
24rcu/rcu_pending: 50rcu_pending:
25 Displays counts of the reasons rcu_pending() decided that RCU had 51 Displays counts of the reasons rcu_pending() decided that RCU had
26 work to do. 52 work to do.
27rcu/rcutorture: 53rcuboost:
28 Displays rcutorture test progress.
29rcu/rcuboost:
30 Displays RCU boosting statistics. Only present if 54 Displays RCU boosting statistics. Only present if
31 CONFIG_RCU_BOOST=y. 55 CONFIG_RCU_BOOST=y.
32 56
33The output of "cat rcu/rcudata" looks as follows: 57The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
34 58
35rcu_sched: 59 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
36 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 60 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
37 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 61 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
38 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 62 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
39 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 63 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
40 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 64 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
41 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 65 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
42 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 66 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
43 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 67
44rcu_bh: 68This file has one line per CPU, or eight for this 8-CPU system.
45 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 69The fields are as follows:
46 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
47 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
48 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
49 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
50 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
51 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
52 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
53
54The first section lists the rcu_data structures for rcu_sched, the second
55for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
56additional section for rcu_preempt. Each section has one line per CPU,
57or eight for this 8-CPU system. The fields are as follows:
58 70
59o The number at the beginning of each line is the CPU number. 71o The number at the beginning of each line is the CPU number.
60 CPU numbers followed by an exclamation mark are offline,
@@ -64,11 +76,13 @@ o The number at the beginning of each line is the CPU number.
64 substantially larger than the number of actual CPUs. 76 substantially larger than the number of actual CPUs.
65 77
66o "c" is the count of grace periods that this CPU believes have 78o "c" is the count of grace periods that this CPU believes have
67 completed. Offlined CPUs and CPUs in dynticks idle mode may 79 completed. Offlined CPUs and CPUs in dynticks idle mode may lag
68 lag quite a ways behind, for example, CPU 6 under "rcu_sched" 80 quite a ways behind, for example, CPU 4 under "rcu_sched" above,
69 above, which has been offline through not quite 40,000 RCU grace 81 which has been offline through 16 RCU grace periods. It is not
70 periods. It is not unusual to see CPUs lagging by thousands of 82 unusual to see offline CPUs lagging by thousands of grace periods.
71 grace periods. 83 Note that although the grace-period number is an unsigned long,
84 it is printed out as a signed long to allow more human-friendly
85 representation near boot time.
72 86
73o "g" is the count of grace periods that this CPU believes have 87o "g" is the count of grace periods that this CPU believes have
74 started. Again, offlined CPUs and CPUs in dynticks idle mode 88 started. Again, offlined CPUs and CPUs in dynticks idle mode
@@ -84,30 +98,25 @@ o "pq" indicates that this CPU has passed through a quiescent state
84 CPU has not yet reported that fact, (2) some other CPU has not 98 CPU has not yet reported that fact, (2) some other CPU has not
85 yet reported for this grace period, or (3) both. 99 yet reported for this grace period, or (3) both.
86 100
87o "pgp" indicates which grace period the last-observed quiescent
88 state for this CPU corresponds to. This is important for handling
89 the race between CPU 0 reporting an extended dynticks-idle
90 quiescent state for CPU 1 and CPU 1 suddenly waking up and
91 reporting its own quiescent state. If CPU 1 was the last CPU
92 for the current grace period, then the CPU that loses this race
93 will attempt to incorrectly mark CPU 1 as having checked in for
94 the next grace period!
95
96o "qp" indicates that RCU still expects a quiescent state from 101o "qp" indicates that RCU still expects a quiescent state from
97 this CPU. Offlined CPUs and CPUs in dyntick idle mode might 102 this CPU. Offlined CPUs and CPUs in dyntick idle mode might
98 well have qp=1, which is OK: RCU is still ignoring them. 103 well have qp=1, which is OK: RCU is still ignoring them.
99 104
100o "dt" is the current value of the dyntick counter that is incremented 105o "dt" is the current value of the dyntick counter that is incremented
101 when entering or leaving dynticks idle state, either by the 106 when entering or leaving idle, either due to a context switch or
102 scheduler or by irq. This number is even if the CPU is in 107 due to an interrupt. This number is even if the CPU is in idle
103 dyntick idle mode and odd otherwise. The number after the first 108 from RCU's viewpoint and odd otherwise. The number after the
104 "/" is the interrupt nesting depth when in dyntick-idle state, 109 first "/" is the interrupt nesting depth when in idle state,
105 or one greater than the interrupt-nesting depth otherwise. 110 or a large number added to the interrupt-nesting depth when
106 The number after the second "/" is the NMI nesting depth. 111 running a non-idle task. Some architectures do not accurately
112 count interrupt nesting when running in non-idle kernel context,
113 which can result in interesting anomalies such as negative
114 interrupt-nesting levels. The number after the second "/"
115 is the NMI nesting depth.
107 116
108o "df" is the number of times that some other CPU has forced a 117o "df" is the number of times that some other CPU has forced a
109 quiescent state on behalf of this CPU due to this CPU being in 118 quiescent state on behalf of this CPU due to this CPU being in
110 dynticks-idle state. 119 idle state.
111 120
112o "of" is the number of times that some other CPU has forced a 121o "of" is the number of times that some other CPU has forced a
113 quiescent state on behalf of this CPU due to this CPU being 122 quiescent state on behalf of this CPU due to this CPU being
@@ -120,9 +129,13 @@ o "of" is the number of times that some other CPU has forced a
120 error, so it makes sense to err conservatively. 129 error, so it makes sense to err conservatively.
121 130
122o "ql" is the number of RCU callbacks currently residing on 131o "ql" is the number of RCU callbacks currently residing on
123 this CPU. This is the total number of callbacks, regardless 132 this CPU. The first number is the number of "lazy" callbacks
124 of what state they are in (new, waiting for grace period to 133 that are known to RCU to only be freeing memory, and the number
125 start, waiting for grace period to end, ready to invoke). 134 after the "/" is the total number of callbacks, lazy or not.
135 These counters count callbacks regardless of what phase of
136 grace-period processing that they are in (new, waiting for
137 grace period to start, waiting for grace period to end, ready
138 to invoke).
126 139
127o "qs" gives an indication of the state of the callback queue 140o "qs" gives an indication of the state of the callback queue
128 with four characters: 141 with four characters:
@@ -150,6 +163,43 @@ o "qs" gives an indication of the state of the callback queue
150 If there are no callbacks in a given one of the above states, 163 If there are no callbacks in a given one of the above states,
151 the corresponding character is replaced by ".". 164 the corresponding character is replaced by ".".
152 165
166o "b" is the batch limit for this CPU. If more than this number
167 of RCU callbacks is ready to invoke, then the remainder will
168 be deferred.
169
170o "ci" is the number of RCU callbacks that have been invoked for
171 this CPU. Note that ci+nci+ql is the number of callbacks that have
172 been registered in absence of CPU-hotplug activity.
173
174o "nci" is the number of RCU callbacks that have been offloaded from
175 this CPU. This will always be zero unless the kernel was built
176 with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
177 parameter was specified.
178
179o "co" is the number of RCU callbacks that have been orphaned due to
180 this CPU going offline. These orphaned callbacks have been moved
181 to an arbitrarily chosen online CPU.
182
183o "ca" is the number of RCU callbacks that have been adopted by this
184 CPU due to other CPUs going offline. Note that ci+co-ca+ql is
185 the number of RCU callbacks registered on this CPU.
186
187
188Kernels compiled with CONFIG_RCU_BOOST=y display the following from
189/debug/rcu/rcu_preempt/rcudata:
190
191 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
192 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
193 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
194 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
195 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
196 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
197 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
198 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
199
200This is similar to the output discussed above, but contains the following
201additional fields:
202
153o "kt" is the per-CPU kernel-thread state. The digit preceding 203o "kt" is the per-CPU kernel-thread state. The digit preceding
154 the first slash is zero if there is no work pending and 1 204 the first slash is zero if there is no work pending and 1
155 otherwise. The character between the first pair of slashes is 205 otherwise. The character between the first pair of slashes is
@@ -184,35 +234,51 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
184 234
185 This field is displayed only for CONFIG_RCU_BOOST kernels. 235 This field is displayed only for CONFIG_RCU_BOOST kernels.
186 236
187o "b" is the batch limit for this CPU. If more than this number
188 of RCU callbacks is ready to invoke, then the remainder will
189 be deferred.
190 237
191o "ci" is the number of RCU callbacks that have been invoked for 238The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
192 this CPU. Note that ci+ql is the number of callbacks that have
193 been registered in absence of CPU-hotplug activity.
194 239
195o "co" is the number of RCU callbacks that have been orphaned due to 240s=21872 d=21872 w=0 tf=0 wd1=0 wd2=0 n=0 sc=21872 dt=21872 dl=0 dx=21872
196 this CPU going offline. These orphaned callbacks have been moved 241
197 to an arbitrarily chosen online CPU. 242These fields are as follows:
243
244o "s" is the starting sequence number.
198 245
199o "ca" is the number of RCU callbacks that have been adopted due to 246o "d" is the ending sequence number. When the starting and ending
200 other CPUs going offline. Note that ci+co-ca+ql is the number of 247 numbers differ, there is an expedited grace period in progress.
201 RCU callbacks registered on this CPU.
202 248
203There is also an rcu/rcudata.csv file with the same information in 249o "w" is the number of times that the sequence numbers have been
204comma-separated-variable spreadsheet format. 250 in danger of wrapping.
205 251
252o "tf" is the number of times that contention has resulted in a
253 failure to begin an expedited grace period.
206 254
207The output of "cat rcu/rcugp" looks as follows: 255o "wd1" and "wd2" are the number of times that an attempt to
256 start an expedited grace period found that someone else had
257 completed an expedited grace period that satisfies the
258 attempted request. "Our work is done."
208 259
209rcu_sched: completed=33062 gpnum=33063 260o "n" is the number of times that contention was so great that
210rcu_bh: completed=464 gpnum=464 261 the request was demoted from an expedited grace period to
262 a normal grace period.
263
264o "sc" is the number of times that the attempt to start a
265 new expedited grace period succeeded.
266
267o "dt" is the number of times that we attempted to update
268 the "d" counter.
269
270o "dl" is the number of times that we failed to update the "d"
271 counter.
272
273o "dx" is the number of times that we succeeded in updating
274 the "d" counter.
211 275
212Again, this output is for both "rcu_sched" and "rcu_bh". Note that 276
213kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional 277The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
214"rcu_preempt" line. The fields are taken from the rcu_state structure, 278
215and are as follows: 279completed=31249 gpnum=31250 age=1 max=18
280
281These fields are taken from the rcu_state structure, and are as follows:
216 282
217o "completed" is the number of grace periods that have completed. 283o "completed" is the number of grace periods that have completed.
218 It is comparable to the "c" field from rcu/rcudata in that a 284 It is comparable to the "c" field from rcu/rcudata in that a
@@ -220,44 +286,42 @@ o "completed" is the number of grace periods that have completed.
220 that the corresponding RCU grace period has completed. 286 that the corresponding RCU grace period has completed.
221 287
222o "gpnum" is the number of grace periods that have started. It is 288o "gpnum" is the number of grace periods that have started. It is
223 comparable to the "g" field from rcu/rcudata in that a CPU 289 similarly comparable to the "g" field from rcu/rcudata in that
224 whose "g" field matches the value of "gpnum" is aware that the 290 a CPU whose "g" field matches the value of "gpnum" is aware that
225 corresponding RCU grace period has started. 291 the corresponding RCU grace period has started.
292
293 If these two fields are equal, then there is no grace period
294 in progress, in other words, RCU is idle. On the other hand,
295 if the two fields differ (as they are above), then an RCU grace
296 period is in progress.
226 297
227 If these two fields are equal (as they are for "rcu_bh" above), 298o "age" is the number of jiffies that the current grace period
228 then there is no grace period in progress, in other words, RCU 299 has extended for, or zero if there is no grace period currently
229 is idle. On the other hand, if the two fields differ (as they 300 in effect.
230 do for "rcu_sched" above), then an RCU grace period is in progress.
231 301
302o "max" is the age in jiffies of the longest-duration grace period
303 thus far.
232 304
233The output of "cat rcu/rcuhier" looks as follows, with very long lines: 305The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
234 306
235c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 307c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
2361/1 ..>. 0:127 ^0 3083/3 ..>. 0:7 ^0
2373/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 309e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1
2383/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
239rcu_bh:
240c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
2410/1 ..>. 0:127 ^0
2420/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3
2430/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
244 310
245This is once again split into "rcu_sched" and "rcu_bh" portions, 311The fields are as follows:
246and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
247"rcu_preempt" section. The fields are as follows:
248 312
249o "c" is exactly the same as "completed" under rcu/rcugp. 313o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
250 314
251o "g" is exactly the same as "gpnum" under rcu/rcugp. 315o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
252 316
253o "s" is the "signaled" state that drives force_quiescent_state()'s 317o "s" is the current state of the force_quiescent_state()
254 state machine. 318 state machine.
255 319
256o "jfq" is the number of jiffies remaining for this grace period 320o "jfq" is the number of jiffies remaining for this grace period
257 before force_quiescent_state() is invoked to help push things 321 before force_quiescent_state() is invoked to help push things
258 along. Note that CPUs in dyntick-idle mode throughout the grace 322 along. Note that CPUs in idle mode throughout the grace period
259 period will not report on their own, but rather must be checked by 323 will not report on their own, but rather must be checked by some
260 some other CPU via force_quiescent_state(). 324 other CPU via force_quiescent_state().
261 325
262o "j" is the low-order four hex digits of the jiffies counter. 326o "j" is the low-order four hex digits of the jiffies counter.
263 Yes, Paul did run into a number of problems that turned out to 327 Yes, Paul did run into a number of problems that turned out to
@@ -268,7 +332,8 @@ o "nfqs" is the number of calls to force_quiescent_state() since
268 332
269o "nfqsng" is the number of useless calls to force_quiescent_state(), 333o "nfqsng" is the number of useless calls to force_quiescent_state(),
270 where there wasn't actually a grace period active. This can 334 where there wasn't actually a grace period active. This can
271 happen due to races. The number in parentheses is the difference 335 no longer happen due to grace-period processing being pushed
336 into a kthread. The number in parentheses is the difference
272 between "nfqs" and "nfqsng", or the number of times that 337 between "nfqs" and "nfqsng", or the number of times that
273 force_quiescent_state() actually did some real work. 338 force_quiescent_state() actually did some real work.
274 339
@@ -276,28 +341,27 @@ o "fqlh" is the number of calls to force_quiescent_state() that
276 exited immediately (without even being counted in nfqs above) 341 exited immediately (without even being counted in nfqs above)
277 due to contention on ->fqslock. 342 due to contention on ->fqslock.
278 343
279o Each element of the form "1/1 0:127 ^0" represents one struct 344o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
280 rcu_node. Each line represents one level of the hierarchy, from 345 structure. Each line represents one level of the hierarchy,
281 root to leaves. It is best to think of the rcu_data structures 346 from root to leaves. It is best to think of the rcu_data
282 as forming yet another level after the leaves. Note that there 347 structures as forming yet another level after the leaves.
283 might be either one, two, or three levels of rcu_node structures, 348 Note that there might be either one, two, three, or even four
284 depending on the relationship between CONFIG_RCU_FANOUT and 349 levels of rcu_node structures, depending on the relationship
285 CONFIG_NR_CPUS. 350 between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
351 adjusted using the rcu_fanout_leaf kernel boot parameter), and
352 CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
353 possible CPUs for the booting hardware).
286 354
287 o The numbers separated by the "/" are the qsmask followed 355 o The numbers separated by the "/" are the qsmask followed
288 by the qsmaskinit. The qsmask will have one bit 356 by the qsmaskinit. The qsmask will have one bit
289 set for each entity in the next lower level that 357 set for each entity in the next lower level that has
290 has not yet checked in for the current grace period. 358 not yet checked in for the current grace period ("e"
359 indicating CPUs 5, 6, and 7 in the example above).
291 The qsmaskinit will have one bit for each entity that is 360 The qsmaskinit will have one bit for each entity that is
292 currently expected to check in during each grace period. 361 currently expected to check in during each grace period.
293 The value of qsmaskinit is assigned to that of qsmask 362 The value of qsmaskinit is assigned to that of qsmask
294 at the beginning of each grace period. 363 at the beginning of each grace period.
295 364
296 For example, for "rcu_sched", the qsmask of the first
297 entry of the lowest level is 0x14, meaning that we
298 are still waiting for CPUs 2 and 4 to check in for the
299 current grace period.
300
301 o The characters separated by the ">" indicate the state 365 o The characters separated by the ">" indicate the state
302 of the blocked-tasks lists. A "G" preceding the ">" 366 of the blocked-tasks lists. A "G" preceding the ">"
303 indicates that at least one task blocked in an RCU 367 indicates that at least one task blocked in an RCU
@@ -312,48 +376,39 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
312 A "." character appears if the corresponding condition 376 A "." character appears if the corresponding condition
313 does not hold, so that "..>." indicates that no tasks 377 does not hold, so that "..>." indicates that no tasks
314 are blocked. In contrast, "GE>T" indicates maximal 378 are blocked. In contrast, "GE>T" indicates maximal
315 inconvenience from blocked tasks. 379 inconvenience from blocked tasks. CONFIG_TREE_RCU
380 builds of the kernel will always show "..>.".
316 381
317 o The numbers separated by the ":" are the range of CPUs 382 o The numbers separated by the ":" are the range of CPUs
318 served by this struct rcu_node. This can be helpful 383 served by this struct rcu_node. This can be helpful
319 in working out how the hierarchy is wired together. 384 in working out how the hierarchy is wired together.
320 385
321 For example, the first entry at the lowest level shows 386 For example, the example rcu_node structure shown above
322 "0:5", indicating that it covers CPUs 0 through 5. 387 has "0:7", indicating that it covers CPUs 0 through 7.
323 388
324 o The number after the "^" indicates the bit in the 389 o The number after the "^" indicates the bit in the
325 next higher level rcu_node structure that this 390 next higher level rcu_node structure that this rcu_node
326 rcu_node structure corresponds to. 391 structure corresponds to. For example, the "d/d ..>. 4:7
327 392 ^1" has a "1" in this position, indicating that it
328 For example, the first entry at the lowest level shows 393 corresponds to the "1" bit in the "3" shown in the
329 "^0", indicating that it corresponds to bit zero in 394 "3/3 ..>. 0:7 ^0" entry on the next level up.
330 the first entry at the middle level. 395
331 396
332 397The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
333The output of "cat rcu/rcu_pending" looks as follows: 398
334 399 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903
335rcu_sched: 400 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113
336 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741 401 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889
337 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792 402 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469
338 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629 403 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042
339 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723 404 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422
340 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110 405 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699
341 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456 406 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147
342 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834 407
343 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888 408The fields are as follows:
344rcu_bh: 409
345 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314 410o The leading number is the CPU number, with "!" indicating
346 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180 411 an offline CPU.
347 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936
348 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863
349 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671
350 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235
351 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921
352 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542
353
354As always, this is once again split into "rcu_sched" and "rcu_bh"
355portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
356"rcu_preempt" section. The fields are as follows:
357 412
358o "np" is the number of times that __rcu_pending() has been invoked 413o "np" is the number of times that __rcu_pending() has been invoked
359 for the corresponding flavor of RCU. 414 for the corresponding flavor of RCU.
@@ -377,38 +432,23 @@ o "gpc" is the number of times that an old grace period had
377o "gps" is the number of times that a new grace period had started, 432o "gps" is the number of times that a new grace period had started,
378 but this CPU was not yet aware of it. 433 but this CPU was not yet aware of it.
379 434
380o "nn" is the number of times that this CPU needed nothing. Alert 435o "nn" is the number of times that this CPU needed nothing.
381 readers will note that the rcu "nn" number for a given CPU very
382 closely matches the rcu_bh "np" number for that same CPU. This
383 is due to short-circuit evaluation in rcu_pending().
384
385
386The output of "cat rcu/rcutorture" looks as follows:
387
388rcutorture test sequence: 0 (test in progress)
389rcutorture update version number: 615
390
391The first line shows the number of rcutorture tests that have completed
392since boot. If a test is currently running, the "(test in progress)"
393string will appear as shown above. The second line shows the number of
394update cycles that the current test has started, or zero if there is
395no test in progress.
396 436
397 437
398The output of "cat rcu/rcuboost" looks as follows: 438The output of "cat rcu/rcuboost" looks as follows:
399 439
4000:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f 4400:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
401 balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16 441 balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
4026:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f 4424:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
403 balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6 443 balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
404 444
405This information is output only for rcu_preempt. Each two-line entry 445This information is output only for rcu_preempt. Each two-line entry
406corresponds to a leaf rcu_node structure. The fields are as follows: 446corresponds to a leaf rcu_node structure. The fields are as follows:
407 447
408o "n:m" is the CPU-number range for the corresponding two-line 448o "n:m" is the CPU-number range for the corresponding two-line
409 entry. In the sample output above, the first entry covers 449 entry. In the sample output above, the first entry covers
410 CPUs zero through five and the second entry covers CPUs 6 450 CPUs zero through three and the second entry covers CPUs four
411 and 7. 451 through seven.
412 452
413o "tasks=TNEB" gives the state of the various segments of the 453o "tasks=TNEB" gives the state of the various segments of the
414 rnp->blocked_tasks list: 454 rnp->blocked_tasks list:
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index bf0f6de2aa00..0cc7820967f4 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -499,6 +499,8 @@ The foo_reclaim() function might appear as follows:
499 { 499 {
500 struct foo *fp = container_of(rp, struct foo, rcu); 500 struct foo *fp = container_of(rp, struct foo, rcu);
501 501
502 foo_cleanup(fp->a);
503
502 kfree(fp); 504 kfree(fp);
503 } 505 }
504 506
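For context, a sketch of an updater that queues foo_reclaim(); gbl_foo and foo_mutex stand in for the globals defined earlier in this document:

	static DEFINE_SPINLOCK(foo_mutex);
	static struct foo __rcu *gbl_foo;

	static void foo_update(struct foo *new_fp)
	{
		struct foo *old_fp;

		spin_lock(&foo_mutex);
		old_fp = rcu_dereference_protected(gbl_foo,
						   lockdep_is_held(&foo_mutex));
		rcu_assign_pointer(gbl_foo, new_fp);
		spin_unlock(&foo_mutex);
		call_rcu(&old_fp->rcu, foo_reclaim);	/* runs after a grace period */
	}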
@@ -521,6 +523,12 @@ o Use call_rcu() -after- removing a data element from an
521 read-side critical sections that might be referencing that 523 read-side critical sections that might be referencing that
522 data item. 524 data item.
523 525
526If the callback for call_rcu() is not doing anything more than calling
527kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
528to avoid having to write your own callback:
529
530 kfree_rcu(old_fp, rcu);
531
524Again, see checklist.txt for additional rules governing the use of RCU. 532Again, see checklist.txt for additional rules governing the use of RCU.
525 533
526 534
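For example, if struct foo needed no foo_cleanup() call, the callback could be dropped entirely; a minimal sketch:

	struct foo {
		int a;
		struct rcu_head rcu;
	};

	static void foo_release(struct foo *fp)
	{
		/* The second argument names the rcu_head member of struct foo. */
		kfree_rcu(fp, rcu);
	}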
@@ -773,8 +781,8 @@ a single atomic update, converting to RCU will require special care.
773 781
774Also, the presence of synchronize_rcu() means that the RCU version of 782Also, the presence of synchronize_rcu() means that the RCU version of
775delete() can now block. If this is a problem, there is a callback-based 783delete() can now block. If this is a problem, there is a callback-based
776mechanism that never blocks, namely call_rcu(), that can be used in 784mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
777place of synchronize_rcu(). 785be used in place of synchronize_rcu().
778 786
779 787
7807. FULL LIST OF RCU APIs 7887. FULL LIST OF RCU APIs
@@ -789,9 +797,7 @@ RCU list traversal:
789 list_for_each_entry_rcu 797 list_for_each_entry_rcu
790 hlist_for_each_entry_rcu 798 hlist_for_each_entry_rcu
791 hlist_nulls_for_each_entry_rcu 799 hlist_nulls_for_each_entry_rcu
792 800 list_for_each_entry_continue_rcu
793 list_for_each_continue_rcu (to be deprecated in favor of new
794 list_for_each_entry_continue_rcu)
795 801
796RCU pointer/list update: 802RCU pointer/list update:
797 803
@@ -813,6 +819,7 @@ RCU: Critical sections Grace period Barrier
813 rcu_read_unlock synchronize_rcu 819 rcu_read_unlock synchronize_rcu
814 rcu_dereference synchronize_rcu_expedited 820 rcu_dereference synchronize_rcu_expedited
815 call_rcu 821 call_rcu
822 kfree_rcu
816 823
817 824
818bh: Critical sections Grace period Barrier 825bh: Critical sections Grace period Barrier
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 2759f7c188f0..3c4e1b3b80a1 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -251,12 +251,13 @@ And there are a number of things that _must_ or _must_not_ be assumed:
251 251
252 And for: 252 And for:
253 253
254 *A = X; Y = *A; 254 *A = X; *(A + 4) = Y;
255 255
256 we may get either of: 256 we may get any of:
257 257
258 STORE *A = X; Y = LOAD *A; 258 STORE *A = X; STORE *(A + 4) = Y;
259 STORE *A = Y = X; 259 STORE *(A + 4) = Y; STORE *A = X;
260 STORE {*A, *(A + 4) } = {X, Y};
260 261
261 262
262========================= 263=========================
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 79ccfe6c7078..49e3b49e552f 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -648,7 +648,7 @@ static void stack_proc(void *arg)
648 struct task_struct *from = current, *to = arg; 648 struct task_struct *from = current, *to = arg;
649 649
650 to->thread.saved_task = from; 650 to->thread.saved_task = from;
651 rcu_switch(from, to); 651 rcu_user_hooks_switch(from, to);
652 switch_to(from, to, from); 652 switch_to(from, to, from);
653} 653}
654 654
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e0f0fab20415..c92dd28eaa6c 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -286,23 +286,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
286 &pos->member != (head); \ 286 &pos->member != (head); \
287 pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) 287 pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
288 288
289
290/**
291 * list_for_each_continue_rcu
292 * @pos: the &struct list_head to use as a loop cursor.
293 * @head: the head for your list.
294 *
295 * Iterate over an rcu-protected list, continuing after current point.
296 *
297 * This list-traversal primitive may safely run concurrently with
298 * the _rcu list-mutation primitives such as list_add_rcu()
299 * as long as the traversal is guarded by rcu_read_lock().
300 */
301#define list_for_each_continue_rcu(pos, head) \
302 for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
303 (pos) != (head); \
304 (pos) = rcu_dereference_raw(list_next_rcu(pos)))
305
306/** 289/**
307 * list_for_each_entry_continue_rcu - continue iteration over list of given type 290 * list_for_each_entry_continue_rcu - continue iteration over list of given type
308 * @pos: the type * to use as a loop cursor. 291 * @pos: the type * to use as a loop cursor.
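Callers of the removed primitive convert to the entry-based form, which takes a typed cursor instead of a raw list_head. A hypothetical conversion (struct foo with a "list" member and process() are assumptions):

	/* Must be called within an RCU read-side critical section. */
	static void process_remaining(struct foo *known, struct list_head *head)
	{
		struct foo *pos = known;

		/* Replaces list_for_each_continue_rcu(); continues after "known". */
		list_for_each_entry_continue_rcu(pos, head, list)
			process(pos);
	}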
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 7c968e4f929e..8fe7c1840d30 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -90,6 +90,25 @@ extern void do_trace_rcu_torture_read(char *rcutorturename,
90 * that started after call_rcu() was invoked. RCU read-side critical 90 * that started after call_rcu() was invoked. RCU read-side critical
91 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 91 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
92 * and may be nested. 92 * and may be nested.
93 *
94 * Note that all CPUs must agree that the grace period extended beyond
95 * all pre-existing RCU read-side critical section. On systems with more
96 * than one CPU, this means that when "func()" is invoked, each CPU is
97 * guaranteed to have executed a full memory barrier since the end of its
98 * last RCU read-side critical section whose beginning preceded the call
99 * to call_rcu(). It also means that each CPU executing an RCU read-side
100 * critical section that continues beyond the start of "func()" must have
101 * executed a memory barrier after the call_rcu() but before the beginning
102 * of that RCU read-side critical section. Note that these guarantees
103 * include CPUs that are offline, idle, or executing in user mode, as
104 * well as CPUs that are executing in the kernel.
105 *
106 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
107 * resulting RCU callback function "func()", then both CPU A and CPU B are
108 * guaranteed to execute a full memory barrier during the time interval
109 * between the call to call_rcu() and the invocation of "func()" -- even
110 * if CPU A and CPU B are the same CPU (but again only if the system has
111 * more than one CPU).
93 */ 112 */
94extern void call_rcu(struct rcu_head *head, 113extern void call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *head)); 114 void (*func)(struct rcu_head *head));
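One practical consequence of the guarantees documented above, sketched with hypothetical names: stores performed before call_rcu() are visible to the callback with no explicit barriers in either function:

	struct blob {
		int state;
		struct rcu_head rcu;
	};

	static void blob_free(struct rcu_head *head)
	{
		struct blob *b = container_of(head, struct blob, rcu);

		/*
		 * Guaranteed to observe state == 42: full memory barriers
		 * are implied between the call_rcu() below and this
		 * invocation, even when the callback runs on another CPU.
		 */
		WARN_ON(b->state != 42);
		kfree(b);
	}

	static void blob_retire(struct blob *b)
	{
		b->state = 42;			/* ordered before blob_free() runs */
		call_rcu(&b->rcu, blob_free);
	}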
@@ -118,6 +137,9 @@ extern void call_rcu(struct rcu_head *head,
118 * OR 137 * OR
119 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. 138 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
120 * These may be nested. 139 * These may be nested.
140 *
141 * See the description of call_rcu() for more detailed information on
142 * memory ordering guarantees.
121 */ 143 */
122extern void call_rcu_bh(struct rcu_head *head, 144extern void call_rcu_bh(struct rcu_head *head,
123 void (*func)(struct rcu_head *head)); 145 void (*func)(struct rcu_head *head));
@@ -137,6 +159,9 @@ extern void call_rcu_bh(struct rcu_head *head,
137 * OR 159 * OR
138 * anything that disables preemption. 160 * anything that disables preemption.
139 * These may be nested. 161 * These may be nested.
162 *
163 * See the description of call_rcu() for more detailed information on
164 * memory ordering guarantees.
140 */ 165 */
141extern void call_rcu_sched(struct rcu_head *head, 166extern void call_rcu_sched(struct rcu_head *head,
142 void (*func)(struct rcu_head *rcu)); 167 void (*func)(struct rcu_head *rcu));
@@ -204,6 +229,8 @@ static inline void rcu_user_enter(void) { }
204static inline void rcu_user_exit(void) { } 229static inline void rcu_user_exit(void) { }
205static inline void rcu_user_enter_after_irq(void) { } 230static inline void rcu_user_enter_after_irq(void) { }
206static inline void rcu_user_exit_after_irq(void) { } 231static inline void rcu_user_exit_after_irq(void) { }
232static inline void rcu_user_hooks_switch(struct task_struct *prev,
233 struct task_struct *next) { }
207#endif /* CONFIG_RCU_USER_QS */ 234#endif /* CONFIG_RCU_USER_QS */
208 235
209extern void exit_rcu(void); 236extern void exit_rcu(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..530c52ef873e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -109,6 +109,8 @@ extern void update_cpu_load_nohz(void);
109 109
110extern unsigned long get_parent_ip(unsigned long addr); 110extern unsigned long get_parent_ip(unsigned long addr);
111 111
112extern void dump_cpu_task(int cpu);
113
112struct seq_file; 114struct seq_file;
113struct cfs_rq; 115struct cfs_rq;
114struct task_group; 116struct task_group;
@@ -1844,14 +1846,6 @@ static inline void rcu_copy_process(struct task_struct *p)
1844 1846
1845#endif 1847#endif
1846 1848
1847static inline void rcu_switch(struct task_struct *prev,
1848 struct task_struct *next)
1849{
1850#ifdef CONFIG_RCU_USER_QS
1851 rcu_user_hooks_switch(prev, next);
1852#endif
1853}
1854
1855static inline void tsk_restore_flags(struct task_struct *task, 1849static inline void tsk_restore_flags(struct task_struct *task,
1856 unsigned long orig_flags, unsigned long flags) 1850 unsigned long orig_flags, unsigned long flags)
1857{ 1851{
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 55a5c52cbb25..6eb691b08358 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -40,6 +42,8 @@ struct rcu_batch {
40 struct rcu_head *head, **tail; 42 struct rcu_head *head, **tail;
41}; 43};
42 44
45#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
46
43struct srcu_struct { 47struct srcu_struct {
44 unsigned completed; 48 unsigned completed;
45 struct srcu_struct_array __percpu *per_cpu_ref; 49 struct srcu_struct_array __percpu *per_cpu_ref;
@@ -70,12 +74,42 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
70 __init_srcu_struct((sp), #sp, &__srcu_key); \ 74 __init_srcu_struct((sp), #sp, &__srcu_key); \
71}) 75})
72 76
77#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name },
73#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 78#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 79
75int init_srcu_struct(struct srcu_struct *sp); 80int init_srcu_struct(struct srcu_struct *sp);
76 81
82#define __SRCU_DEP_MAP_INIT(srcu_name)
77#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 83#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
78 84
85void process_srcu(struct work_struct *work);
86
87#define __SRCU_STRUCT_INIT(name) \
88 { \
89 .completed = -300, \
90 .per_cpu_ref = &name##_srcu_array, \
91 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
92 .running = false, \
93 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
94 .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
95 .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
96 .batch_done = RCU_BATCH_INIT(name.batch_done), \
97 .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
98 __SRCU_DEP_MAP_INIT(name) \
99 }
100
101/*
102 * Define and initialize an srcu struct at build time.
103 * Don't call init_srcu_struct() nor cleanup_srcu_struct() on it.
104 */
105#define DEFINE_SRCU(name) \
106 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
107 struct srcu_struct name = __SRCU_STRUCT_INIT(name);
108
109#define DEFINE_STATIC_SRCU(name) \
110 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
111 static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
112
79/** 113/**
80 * call_srcu() - Queue a callback for invocation after an SRCU grace period 114 * call_srcu() - Queue a callback for invocation after an SRCU grace period
81 * @sp: srcu_struct in which to queue the callback 115 * @sp: srcu_struct in which to queue the callback
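Usage of the new build-time initializer might look as follows; the read-side and update-side bodies (do_something_protected() and friends) are hypothetical:

	DEFINE_STATIC_SRCU(my_srcu);	/* no init_srcu_struct()/cleanup_srcu_struct() */

	static int reader(void)
	{
		int idx, val;

		idx = srcu_read_lock(&my_srcu);
		val = do_something_protected();		/* read-side work */
		srcu_read_unlock(&my_srcu, idx);
		return val;
	}

	static void updater(void)
	{
		remove_element();		/* unlink from readers' view */
		synchronize_srcu(&my_srcu);	/* wait for pre-existing SRCU readers */
		free_element();			/* now safe to reclaim */
	}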
diff --git a/init/Kconfig b/init/Kconfig
index 6fdd6e339326..ec62139207d3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -494,11 +494,11 @@ config RCU_USER_QS
494 puts RCU in extended quiescent state when the CPU runs in 494 puts RCU in extended quiescent state when the CPU runs in
495 userspace. It means that when a CPU runs in userspace, it is 495 userspace. It means that when a CPU runs in userspace, it is
496 excluded from the global RCU state machine and thus doesn't 496 excluded from the global RCU state machine and thus doesn't
497 to keep the timer tick on for RCU. 497 try to keep the timer tick on for RCU.
498 498
499 Unless you want to hack and help the development of the full 499 Unless you want to hack and help the development of the full
500 tickless feature, you shouldn't enable this option. It adds 500 tickless feature, you shouldn't enable this option. It also
501 unnecessary overhead. 501 adds unnecessary overhead.
502 502
503 If unsure say N 503 If unsure say N
504 504
@@ -582,14 +582,13 @@ config RCU_FAST_NO_HZ
582 depends on NO_HZ && SMP 582 depends on NO_HZ && SMP
583 default n 583 default n
584 help 584 help
585 This option causes RCU to attempt to accelerate grace periods 585 This option causes RCU to attempt to accelerate grace periods in
586 in order to allow CPUs to enter dynticks-idle state more 586 order to allow CPUs to enter dynticks-idle state more quickly.
587 quickly. On the other hand, this option increases the overhead 587 On the other hand, this option increases the overhead of the
588 of the dynticks-idle checking, particularly on systems with 588 dynticks-idle checking, thus degrading scheduling latency.
589 large numbers of CPUs.
590 589
591 Say Y if energy efficiency is critically important, particularly 590 Say Y if energy efficiency is critically important, and you don't
592 if you have relatively few CPUs. 591 care about real-time response.
593 592
594 Say N if you are unsure. 593 Say N if you are unsure.
595 594
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..8715a798aa7c 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -141,6 +141,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 141}
142KERNEL_ATTR_RO(fscaps); 142KERNEL_ATTR_RO(fscaps);
143 143
144int rcu_expedited;
145static ssize_t rcu_expedited_show(struct kobject *kobj,
146 struct kobj_attribute *attr, char *buf)
147{
148 return sprintf(buf, "%d\n", rcu_expedited);
149}
150static ssize_t rcu_expedited_store(struct kobject *kobj,
151 struct kobj_attribute *attr,
152 const char *buf, size_t count)
153{
154 if (kstrtoint(buf, 0, &rcu_expedited))
155 return -EINVAL;
156
157 return count;
158}
159KERNEL_ATTR_RW(rcu_expedited);
160
144/* 161/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 162 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 163 */
@@ -182,6 +199,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 199 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 200 &vmcoreinfo_attr.attr,
184#endif 201#endif
202 &rcu_expedited_attr.attr,
185 NULL 203 NULL
186}; 204};
187 205
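From userspace, the new attribute behaves like any other sysfs file; a minimal sketch, assuming a kernel with this patch applied:

	#include <stdio.h>

	/* Request expedited grace periods via /sys/kernel/rcu_expedited. */
	int main(void)
	{
		FILE *f = fopen("/sys/kernel/rcu_expedited", "w");

		if (!f)
			return 1;
		fprintf(f, "1\n");
		fclose(f);
		return 0;
	}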
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..20dfba576c2b 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
109 } 109 }
110} 110}
111 111
112extern int rcu_expedited;
113
112#endif /* __LINUX_RCU_H */ 114#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da594..a2cf76177b44 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h>
49 50
50#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
51#include <trace/events/rcu.h> 52#include <trace/events/rcu.h>
52 53
53#include "rcu.h" 54#include "rcu.h"
54 55
56module_param(rcu_expedited, int, 0);
57
55#ifdef CONFIG_PREEMPT_RCU 58#ifdef CONFIG_PREEMPT_RCU
56 59
57/* 60/*
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f7..e7dce58f9c2a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 0; 198 return rcu_dynticks_nesting <= 1;
199} 199}
200 200
201/* 201/*
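
Editor's note: the off-by-one above is the whole fix. This helper runs from
within the scheduling-clock interrupt, and the interrupt entry path has
already run rcu_irq_enter(), bumping rcu_dynticks_nesting from the idle
value of 0 to 1. The old "<= 0" check therefore could never fire from
interrupt context; "<= 1" correctly answers "did this interrupt arrive
while the CPU was idle?".
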
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d0190282204..f85016a2309b 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -706,7 +706,10 @@ void synchronize_rcu(void)
706 return; 706 return;
707 707
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 rcu_barrier(); 709 if (rcu_expedited)
710 synchronize_rcu_expedited();
711 else
712 rcu_barrier();
710} 713}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 714EXPORT_SYMBOL_GPL(synchronize_rcu);
712 715
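
Editor's note: whichever branch is taken, the caller-visible contract of
synchronize_rcu() is unchanged: when it returns, all read-side critical
sections that began before the call have completed. A minimal updater,
sketched here for illustration (struct foo, foo_lock, and foo_del() are
hypothetical), behaves identically under both settings; only the latency
differs:

	struct foo {
		struct list_head list;
		int data;
	};
	static LIST_HEAD(foo_head);		/* published via list_add_rcu() */
	static DEFINE_SPINLOCK(foo_lock);

	static void foo_del(struct foo *fp)
	{
		spin_lock(&foo_lock);
		list_del_rcu(&fp->list);	/* unpublish */
		spin_unlock(&foo_lock);
		synchronize_rcu();		/* normal or expedited */
		kfree(fp);			/* no reader can still see fp */
	}
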
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index aaa7b9f3532a..31dea01c85fd 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title)
339 339
340struct rcu_torture_ops { 340struct rcu_torture_ops {
341 void (*init)(void); 341 void (*init)(void);
342 void (*cleanup)(void);
343 int (*readlock)(void); 342 int (*readlock)(void);
344 void (*read_delay)(struct rcu_random_state *rrsp); 343 void (*read_delay)(struct rcu_random_state *rrsp);
345 void (*readunlock)(int idx); 344 void (*readunlock)(int idx);
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
431 430
432static struct rcu_torture_ops rcu_ops = { 431static struct rcu_torture_ops rcu_ops = {
433 .init = NULL, 432 .init = NULL,
434 .cleanup = NULL,
435 .readlock = rcu_torture_read_lock, 433 .readlock = rcu_torture_read_lock,
436 .read_delay = rcu_read_delay, 434 .read_delay = rcu_read_delay,
437 .readunlock = rcu_torture_read_unlock, 435 .readunlock = rcu_torture_read_unlock,
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void)
475 473
476static struct rcu_torture_ops rcu_sync_ops = { 474static struct rcu_torture_ops rcu_sync_ops = {
477 .init = rcu_sync_torture_init, 475 .init = rcu_sync_torture_init,
478 .cleanup = NULL,
479 .readlock = rcu_torture_read_lock, 476 .readlock = rcu_torture_read_lock,
480 .read_delay = rcu_read_delay, 477 .read_delay = rcu_read_delay,
481 .readunlock = rcu_torture_read_unlock, 478 .readunlock = rcu_torture_read_unlock,
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
493 490
494static struct rcu_torture_ops rcu_expedited_ops = { 491static struct rcu_torture_ops rcu_expedited_ops = {
495 .init = rcu_sync_torture_init, 492 .init = rcu_sync_torture_init,
496 .cleanup = NULL,
497 .readlock = rcu_torture_read_lock, 493 .readlock = rcu_torture_read_lock,
498 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
499 .readunlock = rcu_torture_read_unlock, 495 .readunlock = rcu_torture_read_unlock,
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
536 532
537static struct rcu_torture_ops rcu_bh_ops = { 533static struct rcu_torture_ops rcu_bh_ops = {
538 .init = NULL, 534 .init = NULL,
539 .cleanup = NULL,
540 .readlock = rcu_bh_torture_read_lock, 535 .readlock = rcu_bh_torture_read_lock,
541 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
542 .readunlock = rcu_bh_torture_read_unlock, 537 .readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
553 548
554static struct rcu_torture_ops rcu_bh_sync_ops = { 549static struct rcu_torture_ops rcu_bh_sync_ops = {
555 .init = rcu_sync_torture_init, 550 .init = rcu_sync_torture_init,
556 .cleanup = NULL,
557 .readlock = rcu_bh_torture_read_lock, 551 .readlock = rcu_bh_torture_read_lock,
558 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
559 .readunlock = rcu_bh_torture_read_unlock, 553 .readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
570 564
571static struct rcu_torture_ops rcu_bh_expedited_ops = { 565static struct rcu_torture_ops rcu_bh_expedited_ops = {
572 .init = rcu_sync_torture_init, 566 .init = rcu_sync_torture_init,
573 .cleanup = NULL,
574 .readlock = rcu_bh_torture_read_lock, 567 .readlock = rcu_bh_torture_read_lock,
575 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
576 .readunlock = rcu_bh_torture_read_unlock, 569 .readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
589 * Definitions for srcu torture testing. 582 * Definitions for srcu torture testing.
590 */ 583 */
591 584
592static struct srcu_struct srcu_ctl; 585DEFINE_STATIC_SRCU(srcu_ctl);
593
594static void srcu_torture_init(void)
595{
596 init_srcu_struct(&srcu_ctl);
597 rcu_sync_torture_init();
598}
599
600static void srcu_torture_cleanup(void)
601{
602 synchronize_srcu(&srcu_ctl);
603 cleanup_srcu_struct(&srcu_ctl);
604}
605 586
606static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
607{ 588{
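
Editor's note: DEFINE_STATIC_SRCU() is what makes the deleted init/cleanup
hooks unnecessary: a statically defined srcu_struct is ready at boot and
never needs cleanup_srcu_struct(). A hedged sketch of the two idioms
(my_srcu and the payload variable are invented for illustration):

	/* Static: no setup or teardown required. */
	DEFINE_STATIC_SRCU(my_srcu);

	int my_reader(void)
	{
		int idx, val;

		idx = srcu_read_lock(&my_srcu);
		val = ACCESS_ONCE(my_shared_value);	/* hypothetical payload */
		srcu_read_unlock(&my_srcu, idx);
		return val;
	}

	/* Dynamic equivalent, as the old torture code did:
	 *   init_srcu_struct(&sp) before first use;
	 *   synchronize_srcu(&sp), then cleanup_srcu_struct(&sp) on teardown.
	 */
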
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page)
672} 653}
673 654
674static struct rcu_torture_ops srcu_ops = { 655static struct rcu_torture_ops srcu_ops = {
675 .init = srcu_torture_init, 656 .init = rcu_sync_torture_init,
676 .cleanup = srcu_torture_cleanup,
677 .readlock = srcu_torture_read_lock, 657 .readlock = srcu_torture_read_lock,
678 .read_delay = srcu_read_delay, 658 .read_delay = srcu_read_delay,
679 .readunlock = srcu_torture_read_unlock, 659 .readunlock = srcu_torture_read_unlock,
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = {
687}; 667};
688 668
689static struct rcu_torture_ops srcu_sync_ops = { 669static struct rcu_torture_ops srcu_sync_ops = {
690 .init = srcu_torture_init, 670 .init = rcu_sync_torture_init,
691 .cleanup = srcu_torture_cleanup,
692 .readlock = srcu_torture_read_lock, 671 .readlock = srcu_torture_read_lock,
693 .read_delay = srcu_read_delay, 672 .read_delay = srcu_read_delay,
694 .readunlock = srcu_torture_read_unlock, 673 .readunlock = srcu_torture_read_unlock,
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
712} 691}
713 692
714static struct rcu_torture_ops srcu_raw_ops = { 693static struct rcu_torture_ops srcu_raw_ops = {
715 .init = srcu_torture_init, 694 .init = rcu_sync_torture_init,
716 .cleanup = srcu_torture_cleanup,
717 .readlock = srcu_torture_read_lock_raw, 695 .readlock = srcu_torture_read_lock_raw,
718 .read_delay = srcu_read_delay, 696 .read_delay = srcu_read_delay,
719 .readunlock = srcu_torture_read_unlock_raw, 697 .readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
727}; 705};
728 706
729static struct rcu_torture_ops srcu_raw_sync_ops = { 707static struct rcu_torture_ops srcu_raw_sync_ops = {
730 .init = srcu_torture_init, 708 .init = rcu_sync_torture_init,
731 .cleanup = srcu_torture_cleanup,
732 .readlock = srcu_torture_read_lock_raw, 709 .readlock = srcu_torture_read_lock_raw,
733 .read_delay = srcu_read_delay, 710 .read_delay = srcu_read_delay,
734 .readunlock = srcu_torture_read_unlock_raw, 711 .readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void)
747} 724}
748 725
749static struct rcu_torture_ops srcu_expedited_ops = { 726static struct rcu_torture_ops srcu_expedited_ops = {
750 .init = srcu_torture_init, 727 .init = rcu_sync_torture_init,
751 .cleanup = srcu_torture_cleanup,
752 .readlock = srcu_torture_read_lock, 728 .readlock = srcu_torture_read_lock,
753 .read_delay = srcu_read_delay, 729 .read_delay = srcu_read_delay,
754 .readunlock = srcu_torture_read_unlock, 730 .readunlock = srcu_torture_read_unlock,
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
783 759
784static struct rcu_torture_ops sched_ops = { 760static struct rcu_torture_ops sched_ops = {
785 .init = rcu_sync_torture_init, 761 .init = rcu_sync_torture_init,
786 .cleanup = NULL,
787 .readlock = sched_torture_read_lock, 762 .readlock = sched_torture_read_lock,
788 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
789 .readunlock = sched_torture_read_unlock, 764 .readunlock = sched_torture_read_unlock,
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = {
799 774
800static struct rcu_torture_ops sched_sync_ops = { 775static struct rcu_torture_ops sched_sync_ops = {
801 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
802 .cleanup = NULL,
803 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
804 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
805 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = {
814 788
815static struct rcu_torture_ops sched_expedited_ops = { 789static struct rcu_torture_ops sched_expedited_ops = {
816 .init = rcu_sync_torture_init, 790 .init = rcu_sync_torture_init,
817 .cleanup = NULL,
818 .readlock = sched_torture_read_lock, 791 .readlock = sched_torture_read_lock,
819 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
820 .readunlock = sched_torture_read_unlock, 793 .readunlock = sched_torture_read_unlock,
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1397 "test_boost=%d/%d test_boost_interval=%d " 1370 "test_boost=%d/%d test_boost_interval=%d "
1398 "test_boost_duration=%d shutdown_secs=%d " 1371 "test_boost_duration=%d shutdown_secs=%d "
1372 "stall_cpu=%d stall_cpu_holdoff=%d "
1373 "n_barrier_cbs=%d "
1399 "onoff_interval=%d onoff_holdoff=%d\n", 1374 "onoff_interval=%d onoff_holdoff=%d\n",
1400 torture_type, tag, nrealreaders, nfakewriters, 1375 torture_type, tag, nrealreaders, nfakewriters,
1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1403 test_boost, cur_ops->can_boost, 1378 test_boost, cur_ops->can_boost,
1404 test_boost_interval, test_boost_duration, shutdown_secs, 1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1405 onoff_interval, onoff_holdoff); 1382 onoff_interval, onoff_holdoff);
1406} 1383}
1407 1384
@@ -1502,6 +1479,7 @@ rcu_torture_onoff(void *arg)
1502 unsigned long delta; 1479 unsigned long delta;
1503 int maxcpu = -1; 1480 int maxcpu = -1;
1504 DEFINE_RCU_RANDOM(rand); 1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1505 unsigned long starttime; 1483 unsigned long starttime;
1506 1484
1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
@@ -1522,7 +1500,13 @@ rcu_torture_onoff(void *arg)
1522 torture_type, cpu); 1500 torture_type, cpu);
1523 starttime = jiffies; 1501 starttime = jiffies;
1524 n_offline_attempts++; 1502 n_offline_attempts++;
1525 if (cpu_down(cpu) == 0) { 1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1526 if (verbose) 1510 if (verbose)
1527 pr_alert("%s" TORTURE_FLAG 1511 pr_alert("%s" TORTURE_FLAG
1528 "rcu_torture_onoff task: offlined %d\n", 1512 "rcu_torture_onoff task: offlined %d\n",
@@ -1936,8 +1920,6 @@ rcu_torture_cleanup(void)
1936 1920
1937 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1938 1922
1939 if (cur_ops->cleanup)
1940 cur_ops->cleanup();
1941 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1942 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1943 else if (n_online_successes != n_online_attempts || 1925 else if (n_online_successes != n_online_attempts ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd9204..5ffadcc3bb26 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
71 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
72 .completed = -300, \ 72 .completed = 0UL - 300UL, \
73 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
@@ -212,13 +212,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212#endif 212#endif
213}; 213};
214 214
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 215static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 216static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 217static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 218
219module_param(blimit, int, 0444); 219module_param(blimit, long, 0444);
220module_param(qhimark, int, 0444); 220module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0444); 221module_param(qlowmark, long, 0444);
222 222
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -313,7 +313,7 @@ static int
313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
314{ 314{
315 return *rdp->nxttail[RCU_DONE_TAIL + 315 return *rdp->nxttail[RCU_DONE_TAIL +
316 ACCESS_ONCE(rsp->completed) != rdp->completed] && 316 (ACCESS_ONCE(rsp->completed) != rdp->completed)] &&
317 !rcu_gp_in_progress(rsp); 317 !rcu_gp_in_progress(rsp);
318} 318}
319 319
@@ -873,6 +873,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
874} 874}
875 875
876/*
877 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
878 * for architectures that do not implement trigger_all_cpu_backtrace().
879 * The NMI-triggered stack traces are more accurate because they are
880 * printed by the target CPU.
881 */
882static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
883{
884 int cpu;
885 unsigned long flags;
886 struct rcu_node *rnp;
887
888 rcu_for_each_leaf_node(rsp, rnp) {
889 raw_spin_lock_irqsave(&rnp->lock, flags);
890 if (rnp->qsmask != 0) {
891 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
892 if (rnp->qsmask & (1UL << cpu))
893 dump_cpu_task(rnp->grplo + cpu);
894 }
895 raw_spin_unlock_irqrestore(&rnp->lock, flags);
896 }
897}
898
876static void print_other_cpu_stall(struct rcu_state *rsp) 899static void print_other_cpu_stall(struct rcu_state *rsp)
877{ 900{
878 int cpu; 901 int cpu;
@@ -880,6 +903,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
880 unsigned long flags; 903 unsigned long flags;
881 int ndetected = 0; 904 int ndetected = 0;
882 struct rcu_node *rnp = rcu_get_root(rsp); 905 struct rcu_node *rnp = rcu_get_root(rsp);
906 long totqlen = 0;
883 907
884 /* Only let one CPU complain about others per time interval. */ 908 /* Only let one CPU complain about others per time interval. */
885 909
@@ -924,12 +948,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
924 raw_spin_unlock_irqrestore(&rnp->lock, flags); 948 raw_spin_unlock_irqrestore(&rnp->lock, flags);
925 949
926 print_cpu_stall_info_end(); 950 print_cpu_stall_info_end();
927 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 951 for_each_possible_cpu(cpu)
928 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 952 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
953 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
954 smp_processor_id(), (long)(jiffies - rsp->gp_start),
955 rsp->gpnum, rsp->completed, totqlen);
929 if (ndetected == 0) 956 if (ndetected == 0)
930 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 957 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
931 else if (!trigger_all_cpu_backtrace()) 958 else if (!trigger_all_cpu_backtrace())
932 dump_stack(); 959 rcu_dump_cpu_stacks(rsp);
933 960
934 /* Complain about tasks blocking the grace period. */ 961 /* Complain about tasks blocking the grace period. */
935 962
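
Editor's note: with these changes the summary line of a stall warning also
reports the current grace-period number, the last completed grace period,
and the total callbacks queued across all possible CPUs. Reading a
hypothetical instance of the new format (all values invented for
illustration):

	(detected by 3, t=21006 jiffies, g=4009, c=4008, q=531)

CPU 3 noticed the stall 21006 jiffies after the grace period began; grace
period 4009 is in progress, 4008 was the last to complete, and 531 callbacks
are queued system-wide. The g/c pair shows whether a grace period is in
flight (here g == c + 1, so yes), and q shows how much work is backed up
behind it.
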
@@ -940,8 +967,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
940 967
941static void print_cpu_stall(struct rcu_state *rsp) 968static void print_cpu_stall(struct rcu_state *rsp)
942{ 969{
970 int cpu;
943 unsigned long flags; 971 unsigned long flags;
944 struct rcu_node *rnp = rcu_get_root(rsp); 972 struct rcu_node *rnp = rcu_get_root(rsp);
973 long totqlen = 0;
945 974
946 /* 975 /*
947 * OK, time to rat on ourselves... 976 * OK, time to rat on ourselves...
@@ -952,7 +981,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
952 print_cpu_stall_info_begin(); 981 print_cpu_stall_info_begin();
953 print_cpu_stall_info(rsp, smp_processor_id()); 982 print_cpu_stall_info(rsp, smp_processor_id());
954 print_cpu_stall_info_end(); 983 print_cpu_stall_info_end();
955 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 984 for_each_possible_cpu(cpu)
985 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
986 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
987 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
956 if (!trigger_all_cpu_backtrace()) 988 if (!trigger_all_cpu_backtrace())
957 dump_stack(); 989 dump_stack();
958 990
@@ -1404,15 +1436,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1404 !cpu_needs_another_gp(rsp, rdp)) { 1436 !cpu_needs_another_gp(rsp, rdp)) {
1405 /* 1437 /*
1406 * Either we have not yet spawned the grace-period 1438 * Either we have not yet spawned the grace-period
1407 * task or this CPU does not need another grace period. 1439 * task, this CPU does not need another grace period,
1440 * or a grace period is already in progress.
1408 * Either way, don't start a new grace period. 1441 * Either way, don't start a new grace period.
1409 */ 1442 */
1410 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1443 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1411 return; 1444 return;
1412 } 1445 }
1413 1446
1447 /*
1448 * Because there is no grace period in progress right now,
1449 * any callbacks we have up to this point will be satisfied
1450 * by the next grace period. So promote all callbacks to be
1451 * handled after the end of the next grace period. If the
1452 * CPU is not yet aware of the end of the previous grace period,
1453 * we need to allow for the callback advancement that will
1454 * occur when it does become aware. Deadlock prevents us from
1455 * making it aware at this point: We cannot acquire a leaf
1456 * rcu_node ->lock while holding the root rcu_node ->lock.
1457 */
1458 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1459 if (rdp->completed == rsp->completed)
1460 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1461
1414 rsp->gp_flags = RCU_GP_FLAG_INIT; 1462 rsp->gp_flags = RCU_GP_FLAG_INIT;
1415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1463 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1464
1465 /* Ensure that CPU is aware of completion of last grace period. */
1466 rcu_process_gp_end(rsp, rdp);
1467 local_irq_restore(flags);
1468
1469 /* Wake up rcu_gp_kthread() to start the grace period. */
1416 wake_up(&rsp->gp_wq); 1470 wake_up(&rsp->gp_wq);
1417} 1471}
1418 1472
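
Editor's note: the promotion above is pure pointer aliasing on RCU's
segmented callback list. Each rcu_data keeps one singly linked list with
tail pointers marking where each grace-period stage ends, roughly (a sketch
of the layout, not code from this patch):

	nxtlist -> [done] -> [wait] -> [next-ready] -> [next] -> NULL
	            ^nxttail[RCU_DONE_TAIL]
	                      ^nxttail[RCU_WAIT_TAIL]
	                                ^nxttail[RCU_NEXT_READY_TAIL]
	                                               ^nxttail[RCU_NEXT_TAIL]

	/* "Everything not yet assigned waits only for the next GP": */
	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
	/* And if this CPU already saw the previous GP end, the same
	 * holds for the wait segment: */
	if (rdp->completed == rsp->completed)
		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];

Setting a tail pointer equal to a later one empties the segments between
them, so callbacks are reclassified without being copied or relinked.
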
@@ -1573,7 +1627,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1573/* 1627/*
1574 * Send the specified CPU's RCU callbacks to the orphanage. The 1628 * Send the specified CPU's RCU callbacks to the orphanage. The
1575 * specified CPU must be offline, and the caller must hold the 1629 * specified CPU must be offline, and the caller must hold the
1576 * ->onofflock. 1630 * ->orphan_lock.
1577 */ 1631 */
1578static void 1632static void
1579rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1633rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
@@ -1581,8 +1635,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1581{ 1635{
1582 /* 1636 /*
1583 * Orphan the callbacks. First adjust the counts. This is safe 1637 * Orphan the callbacks. First adjust the counts. This is safe
1584 * because ->onofflock excludes _rcu_barrier()'s adoption of 1638 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1585 * the callbacks, thus no memory barrier is required. 1639 * cannot be running now. Thus no memory barrier is required.
1586 */ 1640 */
1587 if (rdp->nxtlist != NULL) { 1641 if (rdp->nxtlist != NULL) {
1588 rsp->qlen_lazy += rdp->qlen_lazy; 1642 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1623,7 +1677,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1623 1677
1624/* 1678/*
1625 * Adopt the RCU callbacks from the specified rcu_state structure's 1679 * Adopt the RCU callbacks from the specified rcu_state structure's
1626 * orphanage. The caller must hold the ->onofflock. 1680 * orphanage. The caller must hold the ->orphan_lock.
1627 */ 1681 */
1628static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1682static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1629{ 1683{
@@ -1702,7 +1756,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1702 1756
1703 /* Exclude any attempts to start a new grace period. */ 1757 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex); 1758 mutex_lock(&rsp->onoff_mutex);
1705 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1759 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1706 1760
1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1761 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1708 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1762 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1729,10 +1783,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1729 /* 1783 /*
1730 * We still hold the leaf rcu_node structure lock here, and 1784 * We still hold the leaf rcu_node structure lock here, and
1731 * irqs are still disabled. The reason for this subterfuge is 1785 * irqs are still disabled. The reason for this subterfuge is
1732 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1786 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1733 * held leads to deadlock. 1787 * held leads to deadlock.
1734 */ 1788 */
1735 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1789 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1736 rnp = rdp->mynode; 1790 rnp = rdp->mynode;
1737 if (need_report & RCU_OFL_TASKS_NORM_GP) 1791 if (need_report & RCU_OFL_TASKS_NORM_GP)
1738 rcu_report_unblock_qs_rnp(rnp, flags); 1792 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1769,7 +1823,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1769{ 1823{
1770 unsigned long flags; 1824 unsigned long flags;
1771 struct rcu_head *next, *list, **tail; 1825 struct rcu_head *next, *list, **tail;
1772 int bl, count, count_lazy, i; 1826 long bl, count, count_lazy;
1827 int i;
1773 1828
1774 /* If no callbacks are ready, just return.*/ 1829 /* If no callbacks are ready, just return.*/
1775 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1830 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -2205,10 +2260,28 @@ static inline int rcu_blocking_is_gp(void)
2205 * rcu_read_lock_sched(). 2260 * rcu_read_lock_sched().
2206 * 2261 *
2207 * This means that all preempt_disable code sequences, including NMI and 2262 * This means that all preempt_disable code sequences, including NMI and
2208 * hardware-interrupt handlers, in progress on entry will have completed 2263 * non-threaded hardware-interrupt handlers, in progress on entry will
2209 * before this primitive returns. However, this does not guarantee that 2264 * have completed before this primitive returns. However, this does not
2210 * softirq handlers will have completed, since in some kernels, these 2265 * guarantee that softirq handlers will have completed, since in some
2211 * handlers can run in process context, and can block. 2266 * kernels, these handlers can run in process context, and can block.
2267 *
2268 * Note that this guarantee implies further memory-ordering guarantees.
2269 * On systems with more than one CPU, when synchronize_sched() returns,
2270 * each CPU is guaranteed to have executed a full memory barrier since the
2271 * end of its last RCU-sched read-side critical section whose beginning
2272 * preceded the call to synchronize_sched(). In addition, each CPU having
2273 * an RCU read-side critical section that extends beyond the return from
2274 * synchronize_sched() is guaranteed to have executed a full memory barrier
2275 * after the beginning of synchronize_sched() and before the beginning of
2276 * that RCU read-side critical section. Note that these guarantees include
2277 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2278 * that are executing in the kernel.
2279 *
2280 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2281 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2282 * to have executed a full memory barrier during the execution of
2283 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2284 * again only if the system has more than one CPU).
2212 * 2285 *
2213 * This primitive provides the guarantees made by the (now removed) 2286 * This primitive provides the guarantees made by the (now removed)
2214 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2287 * synchronize_kernel() API. In contrast, synchronize_rcu() only
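
Editor's note: the new paragraphs document a guarantee strong enough to
replace explicit barriers in updater/reader pairs. A hedged sketch of the
classic pattern it enables (x, y, and both functions are invented for
illustration):

	int x, y;			/* both initially zero */

	void updater(void)		/* runs on some CPU A */
	{
		x = 1;			/* store before the grace period */
		synchronize_sched();
		y = 1;			/* store after the grace period */
	}

	void reader(void)		/* runs on any other CPU */
	{
		int r1, r2;

		preempt_disable();	/* RCU-sched read-side section */
		r1 = y;
		r2 = x;
		preempt_enable();
		/* r1 == 1 implies r2 == 1: the section cannot span the
		 * grace period, and the implied full barriers order the
		 * stores and loads with no barrier in reader() itself. */
	}
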
@@ -2224,7 +2297,10 @@ void synchronize_sched(void)
2224 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2297 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2225 if (rcu_blocking_is_gp()) 2298 if (rcu_blocking_is_gp())
2226 return; 2299 return;
2227 wait_rcu_gp(call_rcu_sched); 2300 if (rcu_expedited)
2301 synchronize_sched_expedited();
2302 else
2303 wait_rcu_gp(call_rcu_sched);
2228} 2304}
2229EXPORT_SYMBOL_GPL(synchronize_sched); 2305EXPORT_SYMBOL_GPL(synchronize_sched);
2230 2306
@@ -2236,6 +2312,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2236 * read-side critical sections have completed. RCU read-side critical 2312 * read-side critical sections have completed. RCU read-side critical
2237 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2313 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2238 * and may be nested. 2314 * and may be nested.
2315 *
2316 * See the description of synchronize_sched() for more detailed information
2317 * on memory ordering guarantees.
2239 */ 2318 */
2240void synchronize_rcu_bh(void) 2319void synchronize_rcu_bh(void)
2241{ 2320{
@@ -2245,13 +2324,13 @@ void synchronize_rcu_bh(void)
2245 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2324 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2246 if (rcu_blocking_is_gp()) 2325 if (rcu_blocking_is_gp())
2247 return; 2326 return;
2248 wait_rcu_gp(call_rcu_bh); 2327 if (rcu_expedited)
2328 synchronize_rcu_bh_expedited();
2329 else
2330 wait_rcu_gp(call_rcu_bh);
2249} 2331}
2250EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2332EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2251 2333
2252static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2253static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2254
2255static int synchronize_sched_expedited_cpu_stop(void *data) 2334static int synchronize_sched_expedited_cpu_stop(void *data)
2256{ 2335{
2257 /* 2336 /*
@@ -2308,10 +2387,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2308 */ 2387 */
2309void synchronize_sched_expedited(void) 2388void synchronize_sched_expedited(void)
2310{ 2389{
2311 int firstsnap, s, snap, trycount = 0; 2390 long firstsnap, s, snap;
2391 int trycount = 0;
2392 struct rcu_state *rsp = &rcu_sched_state;
2393
2394 /*
2395 * If we are in danger of counter wrap, just do synchronize_sched().
 2396 * If we are in danger of counter wrap, just do synchronize_sched().
 2397 * By allowing ->expedited_start to advance no more than
 2398 * ULONG_MAX/8 ahead of ->expedited_done, we are ensuring
2398 * that more than 3.5 billion CPUs would be required to force a
2399 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2400 * course be required on a 64-bit system.
2401 */
2402 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2403 (ulong)atomic_long_read(&rsp->expedited_done) +
2404 ULONG_MAX / 8)) {
2405 synchronize_sched();
2406 atomic_long_inc(&rsp->expedited_wrap);
2407 return;
2408 }
2312 2409
2313 /* Note that atomic_inc_return() implies full memory barrier. */ 2410 /*
2314 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2411 * Take a ticket. Note that atomic_inc_return() implies a
2412 * full memory barrier.
2413 */
2414 snap = atomic_long_inc_return(&rsp->expedited_start);
2415 firstsnap = snap;
2315 get_online_cpus(); 2416 get_online_cpus();
2316 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2417 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2317 2418
@@ -2323,48 +2424,65 @@ void synchronize_sched_expedited(void)
2323 synchronize_sched_expedited_cpu_stop, 2424 synchronize_sched_expedited_cpu_stop,
2324 NULL) == -EAGAIN) { 2425 NULL) == -EAGAIN) {
2325 put_online_cpus(); 2426 put_online_cpus();
2427 atomic_long_inc(&rsp->expedited_tryfail);
2428
2429 /* Check to see if someone else did our work for us. */
2430 s = atomic_long_read(&rsp->expedited_done);
2431 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2432 /* ensure test happens before caller kfree */
2433 smp_mb__before_atomic_inc(); /* ^^^ */
2434 atomic_long_inc(&rsp->expedited_workdone1);
2435 return;
2436 }
2326 2437
2327 /* No joy, try again later. Or just synchronize_sched(). */ 2438 /* No joy, try again later. Or just synchronize_sched(). */
2328 if (trycount++ < 10) { 2439 if (trycount++ < 10) {
2329 udelay(trycount * num_online_cpus()); 2440 udelay(trycount * num_online_cpus());
2330 } else { 2441 } else {
2331 synchronize_sched(); 2442 wait_rcu_gp(call_rcu_sched);
2443 atomic_long_inc(&rsp->expedited_normal);
2332 return; 2444 return;
2333 } 2445 }
2334 2446
2335 /* Check to see if someone else did our work for us. */ 2447 /* Recheck to see if someone else did our work for us. */
2336 s = atomic_read(&sync_sched_expedited_done); 2448 s = atomic_long_read(&rsp->expedited_done);
2337 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2449 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2338 smp_mb(); /* ensure test happens before caller kfree */ 2450 /* ensure test happens before caller kfree */
2451 smp_mb__before_atomic_inc(); /* ^^^ */
2452 atomic_long_inc(&rsp->expedited_workdone2);
2339 return; 2453 return;
2340 } 2454 }
2341 2455
2342 /* 2456 /*
2343 * Refetching sync_sched_expedited_started allows later 2457 * Refetching sync_sched_expedited_started allows later
2344 * callers to piggyback on our grace period. We subtract 2458 * callers to piggyback on our grace period. We retry
2345 * 1 to get the same token that the last incrementer got. 2459 * after they started, so our grace period works for them,
2346 * We retry after they started, so our grace period works 2460 * and they started after our first try, so their grace
2347 * for them, and they started after our first try, so their 2461 * period works for us.
2348 * grace period works for us.
2349 */ 2462 */
2350 get_online_cpus(); 2463 get_online_cpus();
2351 snap = atomic_read(&sync_sched_expedited_started); 2464 snap = atomic_long_read(&rsp->expedited_start);
2352 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2465 smp_mb(); /* ensure read is before try_stop_cpus(). */
2353 } 2466 }
2467 atomic_long_inc(&rsp->expedited_stoppedcpus);
2354 2468
2355 /* 2469 /*
2356 * Everyone up to our most recent fetch is covered by our grace 2470 * Everyone up to our most recent fetch is covered by our grace
2357 * period. Update the counter, but only if our work is still 2471 * period. Update the counter, but only if our work is still
2358 * relevant -- which it won't be if someone who started later 2472 * relevant -- which it won't be if someone who started later
2359 * than we did beat us to the punch. 2473 * than we did already did their update.
2360 */ 2474 */
2361 do { 2475 do {
2362 s = atomic_read(&sync_sched_expedited_done); 2476 atomic_long_inc(&rsp->expedited_done_tries);
2363 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2477 s = atomic_long_read(&rsp->expedited_done);
2364 smp_mb(); /* ensure test happens before caller kfree */ 2478 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2479 /* ensure test happens before caller kfree */
2480 smp_mb__before_atomic_inc(); /* ^^^ */
2481 atomic_long_inc(&rsp->expedited_done_lost);
2365 break; 2482 break;
2366 } 2483 }
2367 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2484 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2485 atomic_long_inc(&rsp->expedited_done_exit);
2368 2486
2369 put_online_cpus(); 2487 put_online_cpus();
2370} 2488}
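
Editor's note: the rewritten function is essentially a ticket algorithm:
->expedited_start hands out tickets, ->expedited_done publishes the highest
ticket whose grace period is already covered, and the ULONG_MAX/8 window
keeps the free-running counters from lapping each other. A heavily
simplified userspace model of the control flow (illustrative only;
try_expedite() and slow_gp() stand in for try_stop_cpus() and
wait_rcu_gp(), and the firstsnap/snap distinction is collapsed):

	#include <limits.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_ulong start, done;

	extern bool try_expedite(void);		/* models try_stop_cpus() */
	extern void slow_gp(void);		/* models wait_rcu_gp() */

	void expedited_model(void)
	{
		unsigned long snap, s;

		/* Wrap guard: fall back to a normal grace period. */
		if (atomic_load(&start) - atomic_load(&done) > ULONG_MAX / 8) {
			slow_gp();
			return;
		}
		snap = atomic_fetch_add(&start, 1) + 1;	/* take a ticket */

		while (!try_expedite()) {
			s = atomic_load(&done);
			if ((long)(s - snap) >= 0)
				return;	/* a later caller's GP covers us */
			/* The kernel also refetches ->expedited_start here
			 * so still-later callers can piggyback on ours. */
		}

		/* Advance done to our ticket unless someone beat us to it. */
		do {
			s = atomic_load(&done);
			if ((long)(s - snap) >= 0)
				break;
		} while (!atomic_compare_exchange_weak(&done, &s, snap));
	}

The atomic_long_t counters added to rcu_state in the next file both drive
this algorithm and feed the new rcuexp tracing output further below.
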
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a240f032848e..d274af357210 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -383,9 +383,8 @@ struct rcu_state {
383 383
384 /* End of fields guarded by root rcu_node's lock. */ 384 /* End of fields guarded by root rcu_node's lock. */
385 385
386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; 386 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */ 387 /* Protect following fields. */
388 /* starting new GP. */
389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 388 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
390 /* need a grace period. */ 389 /* need a grace period. */
391 struct rcu_head **orphan_nxttail; /* Tail of above. */ 390 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -394,7 +393,7 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 393 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 394 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 395 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */ 396 /* End of fields guarded by orphan_lock. */
398 397
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ 398 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400 399
@@ -405,6 +404,18 @@ struct rcu_state {
405 /* _rcu_barrier(). */ 404 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */ 405 /* End of fields guarded by barrier_mutex. */
407 406
407 atomic_long_t expedited_start; /* Starting ticket. */
408 atomic_long_t expedited_done; /* Done ticket. */
409 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
410 atomic_long_t expedited_tryfail; /* # acquisition failures. */
411 atomic_long_t expedited_workdone1; /* # done by others #1. */
412 atomic_long_t expedited_workdone2; /* # done by others #2. */
413 atomic_long_t expedited_normal; /* # fallbacks to normal. */
414 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
415 atomic_long_t expedited_done_tries; /* # tries to update _done. */
416 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
417 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
418
408 unsigned long jiffies_force_qs; /* Time at which to invoke */ 419 unsigned long jiffies_force_qs; /* Time at which to invoke */
409 /* force_quiescent_state(). */ 420 /* force_quiescent_state(). */
410 unsigned long n_force_qs; /* Number of calls to */ 421 unsigned long n_force_qs; /* Number of calls to */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f92115488187..5ce3352505e9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -670,6 +670,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
670 * concurrently with new RCU read-side critical sections that began while 670 * concurrently with new RCU read-side critical sections that began while
671 * synchronize_rcu() was waiting. RCU read-side critical sections are 671 * synchronize_rcu() was waiting. RCU read-side critical sections are
672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
673 *
674 * See the description of synchronize_sched() for more detailed information
675 * on memory ordering guarantees.
673 */ 676 */
674void synchronize_rcu(void) 677void synchronize_rcu(void)
675{ 678{
@@ -679,7 +682,10 @@ void synchronize_rcu(void)
679 "Illegal synchronize_rcu() in RCU read-side critical section"); 682 "Illegal synchronize_rcu() in RCU read-side critical section");
680 if (!rcu_scheduler_active) 683 if (!rcu_scheduler_active)
681 return; 684 return;
682 wait_rcu_gp(call_rcu); 685 if (rcu_expedited)
686 synchronize_rcu_expedited();
687 else
688 wait_rcu_gp(call_rcu);
683} 689}
684EXPORT_SYMBOL_GPL(synchronize_rcu); 690EXPORT_SYMBOL_GPL(synchronize_rcu);
685 691
@@ -757,7 +763,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
757 * grace period for the specified rcu_node structure. If there are no such 763 * grace period for the specified rcu_node structure. If there are no such
758 * tasks, report it up the rcu_node hierarchy. 764 * tasks, report it up the rcu_node hierarchy.
759 * 765 *
760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 766 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
767 * CPU hotplug operations.
761 */ 768 */
762static void 769static void
763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 770sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -831,7 +838,7 @@ void synchronize_rcu_expedited(void)
831 udelay(trycount * num_online_cpus()); 838 udelay(trycount * num_online_cpus());
832 } else { 839 } else {
833 put_online_cpus(); 840 put_online_cpus();
834 synchronize_rcu(); 841 wait_rcu_gp(call_rcu);
835 return; 842 return;
836 } 843 }
837 } 844 }
@@ -875,6 +882,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
875 882
876/** 883/**
877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 884 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
885 *
886 * Note that this primitive does not necessarily wait for an RCU grace period
887 * to complete. For example, if there are no RCU callbacks queued anywhere
888 * in the system, then rcu_barrier() is within its rights to return
889 * immediately, without waiting for anything, much less an RCU grace period.
878 */ 890 */
879void rcu_barrier(void) 891void rcu_barrier(void)
880{ 892{
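
Editor's note: this comment closes a common misreading, since rcu_barrier()
waits for callbacks, not for readers. The canonical use is module unload,
where the exit path must flush its own call_rcu() callbacks before their
code and data disappear. A hedged sketch (my_stop_updaters() and my_cache
are hypothetical):

	static void __exit my_module_exit(void)
	{
		my_stop_updaters();	/* no new call_rcu() after this */
		rcu_barrier();		/* all queued callbacks have run */
		kmem_cache_destroy(my_cache);	/* safe: nothing in flight */
	}

If the goal is instead to wait out pre-existing readers, synchronize_rcu()
remains the right tool; the two waits are independent guarantees.
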
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 693513bc50e6..f9512687a6e5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: bcc: %d nbd: %lu\n", 72{
55 rsp->name, 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
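
Editor's note: the r_open()/r_start()/r_next()/r_stop() quartet above is the
standard seq_file iterator pattern, specialized so that *pos walks
cpu_possible_mask and ->show() receives one per-CPU rcu_data at a time, with
the flavor's rcu_state carried in m->private. For readers new to the API, a
minimal free-standing analogue over an ordinary array (everything here is
invented for illustration):

	static int demo[] = { 10, 20, 30 };

	static void *demo_start(struct seq_file *m, loff_t *pos)
	{
		return *pos < ARRAY_SIZE(demo) ? &demo[*pos] : NULL;
	}

	static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;
		return demo_start(m, pos);
	}

	static void demo_stop(struct seq_file *m, void *v)
	{
	}

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "%d\n", *(int *)v);
		return 0;
	}

	static const struct seq_operations demo_op = {
		.start = demo_start,
		.next  = demo_next,
		.stop  = demo_stop,
		.show  = demo_show,
	};

seq_read() calls start/show/next repeatedly and stop at the end of each read,
which is why r_start() must be able to resume from an arbitrary *pos rather
than from a saved cursor.
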
@@ -86,10 +115,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
87 if (!rdp->beenonline) 116 if (!rdp->beenonline)
88 return; 117 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", 118 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 119 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 120 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 121 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->qs_pending); 122 rdp->passed_quiesce, rdp->qs_pending);
94 seq_printf(m, " dt=%d/%llx/%d df=%lu", 123 seq_printf(m, " dt=%d/%llx/%d df=%lu",
95 atomic_read(&rdp->dynticks->dynticks), 124 atomic_read(&rdp->dynticks->dynticks),
@@ -118,97 +147,62 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
118 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 147 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
119} 148}
120 149
121static int show_rcudata(struct seq_file *m, void *unused) 150static int show_rcudata(struct seq_file *m, void *v)
122{ 151{
123 int cpu; 152 print_one_rcu_data(m, (struct rcu_data *)v);
124 struct rcu_state *rsp;
125
126 for_each_rcu_flavor(rsp) {
127 seq_printf(m, "%s:\n", rsp->name);
128 for_each_possible_cpu(cpu)
129 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
130 }
131 return 0; 153 return 0;
132} 154}
133 155
156static const struct seq_operations rcudate_op = {
157 .start = r_start,
158 .next = r_next,
159 .stop = r_stop,
160 .show = show_rcudata,
161};
162
134static int rcudata_open(struct inode *inode, struct file *file) 163static int rcudata_open(struct inode *inode, struct file *file)
135{ 164{
136 return single_open(file, show_rcudata, NULL); 165 return r_open(inode, file, &rcudate_op);
137} 166}
138 167
139static const struct file_operations rcudata_fops = { 168static const struct file_operations rcudata_fops = {
140 .owner = THIS_MODULE, 169 .owner = THIS_MODULE,
141 .open = rcudata_open, 170 .open = rcudata_open,
142 .read = seq_read, 171 .read = seq_read,
143 .llseek = seq_lseek, 172 .llseek = no_llseek,
144 .release = single_release, 173 .release = seq_release,
145}; 174};
146 175
147static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 176static int show_rcuexp(struct seq_file *m, void *v)
148{
149 if (!rdp->beenonline)
150 return;
151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
152 rdp->cpu,
153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
154 rdp->completed, rdp->gpnum,
155 rdp->passed_quiesce, rdp->qs_pending);
156 seq_printf(m, ",%d,%llx,%d,%lu",
157 atomic_read(&rdp->dynticks->dynticks),
158 rdp->dynticks->dynticks_nesting,
159 rdp->dynticks->dynticks_nmi_nesting,
160 rdp->dynticks_fqs);
161 seq_printf(m, ",%lu", rdp->offline_fqs);
162 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
163 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
164 rdp->nxttail[RCU_NEXT_TAIL]],
165 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
166 rdp->nxttail[RCU_NEXT_READY_TAIL]],
167 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
168 rdp->nxttail[RCU_WAIT_TAIL]],
169 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
170#ifdef CONFIG_RCU_BOOST
171 seq_printf(m, ",%d,\"%c\"",
172 per_cpu(rcu_cpu_has_work, rdp->cpu),
173 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
174 rdp->cpu)));
175#endif /* #ifdef CONFIG_RCU_BOOST */
176 seq_printf(m, ",%ld", rdp->blimit);
177 seq_printf(m, ",%lu,%lu,%lu\n",
178 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
179}
180
181static int show_rcudata_csv(struct seq_file *m, void *unused)
182{ 177{
183 int cpu; 178 struct rcu_state *rsp = (struct rcu_state *)m->private;
184 struct rcu_state *rsp; 179
185 180 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); 181 atomic_long_read(&rsp->expedited_start),
187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 182 atomic_long_read(&rsp->expedited_done),
188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 183 atomic_long_read(&rsp->expedited_wrap),
189#ifdef CONFIG_RCU_BOOST 184 atomic_long_read(&rsp->expedited_tryfail),
190 seq_puts(m, "\"kt\",\"ktl\""); 185 atomic_long_read(&rsp->expedited_workdone1),
191#endif /* #ifdef CONFIG_RCU_BOOST */ 186 atomic_long_read(&rsp->expedited_workdone2),
192 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 187 atomic_long_read(&rsp->expedited_normal),
193 for_each_rcu_flavor(rsp) { 188 atomic_long_read(&rsp->expedited_stoppedcpus),
194 seq_printf(m, "\"%s:\"\n", rsp->name); 189 atomic_long_read(&rsp->expedited_done_tries),
195 for_each_possible_cpu(cpu) 190 atomic_long_read(&rsp->expedited_done_lost),
196 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); 191 atomic_long_read(&rsp->expedited_done_exit));
197 }
198 return 0; 192 return 0;
199} 193}
200 194
201static int rcudata_csv_open(struct inode *inode, struct file *file) 195static int rcuexp_open(struct inode *inode, struct file *file)
202{ 196{
203 return single_open(file, show_rcudata_csv, NULL); 197 return single_open(file, show_rcuexp, inode->i_private);
204} 198}
205 199
206static const struct file_operations rcudata_csv_fops = { 200static const struct file_operations rcuexp_fops = {
207 .owner = THIS_MODULE, 201 .owner = THIS_MODULE,
208 .open = rcudata_csv_open, 202 .open = rcuexp_open,
209 .read = seq_read, 203 .read = seq_read,
210 .llseek = seq_lseek, 204 .llseek = no_llseek,
211 .release = single_release, 205 .release = seq_release,
212}; 206};
213 207
214#ifdef CONFIG_RCU_BOOST 208#ifdef CONFIG_RCU_BOOST
@@ -254,27 +248,11 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 248 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 249 .open = rcu_node_boost_open,
256 .read = seq_read, 250 .read = seq_read,
257 .llseek = seq_lseek, 251 .llseek = no_llseek,
258 .release = single_release, 252 .release = single_release,
259}; 253};
260 254
261/* 255#endif /* #ifdef CONFIG_RCU_BOOST */
262 * Create the rcuboost debugfs entry. Standard error return.
263 */
264static int rcu_boost_trace_create_file(struct dentry *rcudir)
265{
266 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
267 &rcu_node_boost_fops);
268}
269
270#else /* #ifdef CONFIG_RCU_BOOST */
271
272static int rcu_boost_trace_create_file(struct dentry *rcudir)
273{
274 return 0; /* There cannot be an error if we didn't create it! */
275}
276
277#endif /* #else #ifdef CONFIG_RCU_BOOST */
278 256
279static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 257static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
280{ 258{
@@ -283,8 +261,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 struct rcu_node *rnp; 261 struct rcu_node *rnp;
284 262
285 gpnum = rsp->gpnum; 263 gpnum = rsp->gpnum;
286 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 264 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
287 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 265 ulong2long(rsp->completed), ulong2long(gpnum),
266 rsp->fqs_state,
288 (long)(rsp->jiffies_force_qs - jiffies), 267 (long)(rsp->jiffies_force_qs - jiffies),
289 (int)(jiffies & 0xffff)); 268 (int)(jiffies & 0xffff));
290 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 269 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -306,26 +285,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
306 seq_puts(m, "\n"); 285 seq_puts(m, "\n");
307} 286}
308 287
309static int show_rcuhier(struct seq_file *m, void *unused) 288static int show_rcuhier(struct seq_file *m, void *v)
310{ 289{
311 struct rcu_state *rsp; 290 struct rcu_state *rsp = (struct rcu_state *)m->private;
312 291 print_one_rcu_state(m, rsp);
313 for_each_rcu_flavor(rsp)
314 print_one_rcu_state(m, rsp);
315 return 0; 292 return 0;
316} 293}
317 294
318static int rcuhier_open(struct inode *inode, struct file *file) 295static int rcuhier_open(struct inode *inode, struct file *file)
319{ 296{
320 return single_open(file, show_rcuhier, NULL); 297 return single_open(file, show_rcuhier, inode->i_private);
321} 298}
322 299
323static const struct file_operations rcuhier_fops = { 300static const struct file_operations rcuhier_fops = {
324 .owner = THIS_MODULE, 301 .owner = THIS_MODULE,
325 .open = rcuhier_open, 302 .open = rcuhier_open,
326 .read = seq_read, 303 .read = seq_read,
327 .llseek = seq_lseek, 304 .llseek = no_llseek,
328 .release = single_release, 305 .release = seq_release,
329}; 306};
330 307
331static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 308static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -338,42 +315,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
338 struct rcu_node *rnp = &rsp->node[0]; 315 struct rcu_node *rnp = &rsp->node[0];
339 316
340 raw_spin_lock_irqsave(&rnp->lock, flags); 317 raw_spin_lock_irqsave(&rnp->lock, flags);
341 completed = rsp->completed; 318 completed = ACCESS_ONCE(rsp->completed);
342 gpnum = rsp->gpnum; 319 gpnum = ACCESS_ONCE(rsp->gpnum);
343 if (rsp->completed == rsp->gpnum) 320 if (completed == gpnum)
344 gpage = 0; 321 gpage = 0;
345 else 322 else
346 gpage = jiffies - rsp->gp_start; 323 gpage = jiffies - rsp->gp_start;
347 gpmax = rsp->gp_max; 324 gpmax = rsp->gp_max;
348 raw_spin_unlock_irqrestore(&rnp->lock, flags); 325 raw_spin_unlock_irqrestore(&rnp->lock, flags);
349 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 326 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
350 rsp->name, completed, gpnum, gpage, gpmax); 327 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
351} 328}
352 329
353static int show_rcugp(struct seq_file *m, void *unused) 330static int show_rcugp(struct seq_file *m, void *v)
354{ 331{
355 struct rcu_state *rsp; 332 struct rcu_state *rsp = (struct rcu_state *)m->private;
356 333 show_one_rcugp(m, rsp);
357 for_each_rcu_flavor(rsp)
358 show_one_rcugp(m, rsp);
359 return 0; 334 return 0;
360} 335}
361 336
362static int rcugp_open(struct inode *inode, struct file *file) 337static int rcugp_open(struct inode *inode, struct file *file)
363{ 338{
364 return single_open(file, show_rcugp, NULL); 339 return single_open(file, show_rcugp, inode->i_private);
365} 340}
366 341
367static const struct file_operations rcugp_fops = { 342static const struct file_operations rcugp_fops = {
368 .owner = THIS_MODULE, 343 .owner = THIS_MODULE,
369 .open = rcugp_open, 344 .open = rcugp_open,
370 .read = seq_read, 345 .read = seq_read,
371 .llseek = seq_lseek, 346 .llseek = no_llseek,
372 .release = single_release, 347 .release = seq_release,
373}; 348};
374 349
375static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 350static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
376{ 351{
352 if (!rdp->beenonline)
353 return;
377 seq_printf(m, "%3d%cnp=%ld ", 354 seq_printf(m, "%3d%cnp=%ld ",
378 rdp->cpu, 355 rdp->cpu,
379 cpu_is_offline(rdp->cpu) ? '!' : ' ', 356 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -389,34 +366,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
389 rdp->n_rp_need_nothing); 366 rdp->n_rp_need_nothing);
390} 367}
391 368
392static int show_rcu_pending(struct seq_file *m, void *unused) 369static int show_rcu_pending(struct seq_file *m, void *v)
393{ 370{
394 int cpu; 371 print_one_rcu_pending(m, (struct rcu_data *)v);
395 struct rcu_data *rdp;
396 struct rcu_state *rsp;
397
398 for_each_rcu_flavor(rsp) {
399 seq_printf(m, "%s:\n", rsp->name);
400 for_each_possible_cpu(cpu) {
401 rdp = per_cpu_ptr(rsp->rda, cpu);
402 if (rdp->beenonline)
403 print_one_rcu_pending(m, rdp);
404 }
405 }
406 return 0; 372 return 0;
407} 373}
408 374
375static const struct seq_operations rcu_pending_op = {
376 .start = r_start,
377 .next = r_next,
378 .stop = r_stop,
379 .show = show_rcu_pending,
380};
381
409static int rcu_pending_open(struct inode *inode, struct file *file) 382static int rcu_pending_open(struct inode *inode, struct file *file)
410{ 383{
411 return single_open(file, show_rcu_pending, NULL); 384 return r_open(inode, file, &rcu_pending_op);
412} 385}
413 386
414static const struct file_operations rcu_pending_fops = { 387static const struct file_operations rcu_pending_fops = {
415 .owner = THIS_MODULE, 388 .owner = THIS_MODULE,
416 .open = rcu_pending_open, 389 .open = rcu_pending_open,
417 .read = seq_read, 390 .read = seq_read,
418 .llseek = seq_lseek, 391 .llseek = no_llseek,
419 .release = single_release, 392 .release = seq_release,
420}; 393};
421 394
422static int show_rcutorture(struct seq_file *m, void *unused) 395static int show_rcutorture(struct seq_file *m, void *unused)
@@ -446,43 +419,58 @@ static struct dentry *rcudir;
 
 static int __init rcutree_trace_init(void)
 {
+	struct rcu_state *rsp;
 	struct dentry *retval;
+	struct dentry *rspdir;
 
 	rcudir = debugfs_create_dir("rcu", NULL);
 	if (!rcudir)
 		goto free_out;
 
-	retval = debugfs_create_file("rcubarrier", 0444, rcudir,
-						NULL, &rcubarrier_fops);
-	if (!retval)
-		goto free_out;
+	for_each_rcu_flavor(rsp) {
+		rspdir = debugfs_create_dir(rsp->name, rcudir);
+		if (!rspdir)
+			goto free_out;
 
-	retval = debugfs_create_file("rcudata", 0444, rcudir,
-						NULL, &rcudata_fops);
-	if (!retval)
-		goto free_out;
+		retval = debugfs_create_file("rcudata", 0444,
+				rspdir, rsp, &rcudata_fops);
+		if (!retval)
+			goto free_out;
 
-	retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
-						NULL, &rcudata_csv_fops);
-	if (!retval)
-		goto free_out;
+		retval = debugfs_create_file("rcuexp", 0444,
+				rspdir, rsp, &rcuexp_fops);
+		if (!retval)
+			goto free_out;
 
-	if (rcu_boost_trace_create_file(rcudir))
-		goto free_out;
+		retval = debugfs_create_file("rcu_pending", 0444,
+				rspdir, rsp, &rcu_pending_fops);
+		if (!retval)
+			goto free_out;
+
+		retval = debugfs_create_file("rcubarrier", 0444,
+				rspdir, rsp, &rcubarrier_fops);
+		if (!retval)
+			goto free_out;
 
-	retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
-	if (!retval)
-		goto free_out;
+#ifdef CONFIG_RCU_BOOST
+		if (rsp == &rcu_preempt_state) {
+			retval = debugfs_create_file("rcuboost", 0444,
+				rspdir, NULL, &rcu_node_boost_fops);
+			if (!retval)
+				goto free_out;
+		}
+#endif
 
-	retval = debugfs_create_file("rcuhier", 0444, rcudir,
-						NULL, &rcuhier_fops);
-	if (!retval)
-		goto free_out;
+		retval = debugfs_create_file("rcugp", 0444,
+				rspdir, rsp, &rcugp_fops);
+		if (!retval)
+			goto free_out;
 
-	retval = debugfs_create_file("rcu_pending", 0444, rcudir,
-						NULL, &rcu_pending_fops);
-	if (!retval)
-		goto free_out;
+		retval = debugfs_create_file("rcuhier", 0444,
+				rspdir, rsp, &rcuhier_fops);
+		if (!retval)
+			goto free_out;
+	}
 
 	retval = debugfs_create_file("rcutorture", 0444, rcudir,
 						NULL, &rcutorture_fops);
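
With this loop in place, each RCU flavor gets its own debugfs directory named
after rsp->name, with the per-flavor files inside it; only rcutorture stays at
the top level. On a TREE_PREEMPT_RCU kernel the expected layout would be roughly
as follows (flavor names assumed from the conventional rcu_sched, rcu_bh and
rcu_preempt naming; not shown in this hunk):

	/sys/kernel/debug/rcu/
		rcu_sched/	rcudata rcuexp rcu_pending rcubarrier rcugp rcuhier
		rcu_bh/		(same files)
		rcu_preempt/	(same files, plus rcuboost when RCU_BOOST=y)
		rcutorture
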
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..6d4569e0924d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1887,7 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 #endif
 
 	/* Here we just switch the register state and the stack. */
-	rcu_switch(prev, next);
+	rcu_user_hooks_switch(prev, next);
 	switch_to(prev, next, prev);
 
 	barrier();
@@ -8076,3 +8076,9 @@ struct cgroup_subsys cpuacct_subsys = {
 	.base_cftypes = files,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
+
+void dump_cpu_task(int cpu)
+{
+	pr_info("Task dump for CPU %d:\n", cpu);
+	sched_show_task(cpu_curr(cpu));
+}
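
dump_cpu_task() gives other subsystems a one-call way to log the task currently
running on a given CPU; the natural consumer is the RCU CPU stall-warning code
touched elsewhere in this series, though no caller appears in this hunk. A
hypothetical caller would look like this (the loop and the stalled() predicate
are illustrative only):

	/* Hypothetical: dump the current task on every suspect CPU. */
	for_each_online_cpu(cpu)
		if (stalled(cpu))	/* illustrative predicate */
			dump_cpu_task(cpu);
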
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd844..2b859828cdc3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
  *
  * Author: Paul McKenney <paulmck@us.ibm.com>
+ *	   Lai Jiangshan <laijs@cn.fujitsu.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
  *		Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
 #include <linux/delay.h>
 #include <linux/srcu.h>
 
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
+
 /*
  * Initialize an rcu_batch structure to empty.
  */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
 	}
 }
 
-/* single-thread state-machine */
-static void process_srcu(struct work_struct *work);
-
 static int init_srcu_struct_fields(struct srcu_struct *sp)
 {
 	sp->completed = 0;
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
  */
 void synchronize_srcu(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
+	__synchronize_srcu(sp, rcu_expedited
+			   ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
+			   : SYNCHRONIZE_SRCU_TRYCOUNT);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 
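
Nothing changes for synchronize_srcu() callers; the new global rcu_expedited
flag (exported via sysfs in kernel/ksysfs.c, per the diffstat) merely selects a
more aggressive retry budget inside the grace-period wait. A minimal caller
sketch, with illustrative names (my_srcu, my_lock, gp; my_srcu set up with
init_srcu_struct() at initialization time):

	/* Reader */
	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(gp, &my_srcu);
	/* ... use p ... */
	srcu_read_unlock(&my_srcu, idx);

	/* Updater, with my_lock held */
	old = rcu_dereference_protected(gp, lockdep_is_held(&my_lock));
	rcu_assign_pointer(gp, new);
	synchronize_srcu(&my_srcu);	/* expedited iff rcu_expedited is set */
	kfree(old);
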
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
 /*
  * This is the work-queue function that handles SRCU grace periods.
  */
-static void process_srcu(struct work_struct *work)
+void process_srcu(struct work_struct *work)
 {
 	struct srcu_struct *sp;
 
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work)
 	srcu_invoke_callbacks(sp);
 	srcu_reschedule(sp);
 }
+EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 28e9d6c98941..41faf0b8df1d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -972,7 +972,7 @@ config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on TREE_RCU || TREE_PREEMPT_RCU
 	range 3 300
-	default 60
+	default 21
 	help
 	  If a given RCU grace period extends more than the specified
 	  number of seconds, a CPU stall warning is printed. If the