author		Ingo Molnar <mingo@kernel.org>	2012-12-03 00:27:05 -0500
committer	Ingo Molnar <mingo@kernel.org>	2012-12-03 00:27:05 -0500
commit		630e1e0bcddfda9566462d4f9a0d58b31c29d467 (patch)
tree		b09a28cf7b9ff0fee53af2245a7e3f8d006ae091
parent		7e5530af11be68f3109672aed59243f82e1272f0 (diff)
parent		91d1aa43d30505b0b825db8898ffc80a8eca96c7 (diff)
Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Conflicts:
	arch/x86/kernel/ptrace.c

Pull the latest RCU tree from Paul E. McKenney:

"   The major features of this series are:

  1.	A first version of no-callbacks CPUs.  This version prohibits
	offlining CPU 0, but only when enabled via CONFIG_RCU_NOCB_CPU=y.
	Relaxing this constraint is in progress, but not yet ready
	for prime time.  These commits were posted to LKML at
	https://lkml.org/lkml/2012/10/30/724, and are at branch rcu/nocb.

  2.	Changes to SRCU that allow statically initialized srcu_struct
	structures.  These commits were posted to LKML at
	https://lkml.org/lkml/2012/10/30/296, and are at branch rcu/srcu.

  3.	Restructuring of RCU's debugfs output.  These commits were posted
	to LKML at https://lkml.org/lkml/2012/10/30/341, and are at
	branch rcu/tracing.

  4.	Additional CPU-hotplug/RCU improvements, posted to LKML at
	https://lkml.org/lkml/2012/10/30/327, and are at branch
	rcu/hotplug.  Note that the commit eliminating __stop_machine()
	was judged to be too high a risk, so is deferred to 3.9.

  5.	Changes to RCU's idle interface, most notably a new module
	parameter that redirects normal grace-period operations to
	their expedited equivalents.  These were posted to LKML at
	https://lkml.org/lkml/2012/10/30/739, and are at branch rcu/idle.

  6.	Additional diagnostics for RCU's CPU stall warning facility,
	posted to LKML at https://lkml.org/lkml/2012/10/30/315, and
	are at branch rcu/stall.  The most notable change reduces the
	default RCU CPU stall-warning time from 60 seconds to 21 seconds,
	so that it once again happens sooner than the softlockup timeout.

  7.	Documentation updates, which were posted to LKML at
	https://lkml.org/lkml/2012/10/30/280, and are at branch rcu/doc.
	A couple of late-breaking changes were posted at
	https://lkml.org/lkml/2012/11/16/634 and
	https://lkml.org/lkml/2012/11/16/547.

  8.	Miscellaneous fixes, which were posted to LKML at
	https://lkml.org/lkml/2012/10/30/309, along with a late-breaking
	change posted at Fri, 16 Nov 2012 11:26:25 -0800 with message-ID
	<20121116192625.GA447@linux.vnet.ibm.com>, but which lkml.org
	seems to have missed.  These are at branch rcu/fixes.

  9.	Finally, a fix for a lockdep-RCU splat was posted to LKML at
	https://lkml.org/lkml/2012/11/7/486.  This is at rcu/next.
"

Signed-off-by: Ingo Molnar <mingo@kernel.org>
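For feature 2 above, a minimal sketch of what static SRCU initialization looks like (assuming the DEFINE_STATIC_SRCU() macro added on branch rcu/srcu; illustrative only, not part of this page's diff):

	#include <linux/srcu.h>

	/* Statically initialized; no runtime init_srcu_struct() needed. */
	DEFINE_STATIC_SRCU(my_srcu);

	static void reader(void)
	{
		int idx;

		idx = srcu_read_lock(&my_srcu);
		/* ... dereference SRCU-protected data here ... */
		srcu_read_unlock(&my_srcu, idx);
	}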
-rw-r--r--	Documentation/RCU/RTFP.txt	2
-rw-r--r--	Documentation/RCU/checklist.txt	17
-rw-r--r--	Documentation/RCU/listRCU.txt	2
-rw-r--r--	Documentation/RCU/rcuref.txt	61
-rw-r--r--	Documentation/RCU/trace.txt	396
-rw-r--r--	Documentation/RCU/whatisRCU.txt	17
-rw-r--r--	Documentation/kernel-parameters.txt	21
-rw-r--r--	Documentation/memory-barriers.txt	9
-rw-r--r--	arch/Kconfig	15
-rw-r--r--	arch/um/drivers/mconsole_kern.c	2
-rw-r--r--	arch/x86/Kconfig	2
-rw-r--r--	arch/x86/include/asm/context_tracking.h (renamed from arch/x86/include/asm/rcu.h)	15
-rw-r--r--	arch/x86/kernel/entry_64.S	2
-rw-r--r--	arch/x86/kernel/ptrace.c	7
-rw-r--r--	arch/x86/kernel/signal.c	5
-rw-r--r--	arch/x86/kernel/traps.c	2
-rw-r--r--	arch/x86/mm/fault.c	2
-rw-r--r--	include/linux/context_tracking.h	18
-rw-r--r--	include/linux/rculist.h	17
-rw-r--r--	include/linux/rcupdate.h	29
-rw-r--r--	include/linux/sched.h	10
-rw-r--r--	include/linux/srcu.h	34
-rw-r--r--	include/trace/events/rcu.h	1
-rw-r--r--	init/Kconfig	67
-rw-r--r--	kernel/Makefile	1
-rw-r--r--	kernel/context_tracking.c	83
-rw-r--r--	kernel/ksysfs.c	18
-rw-r--r--	kernel/rcu.h	2
-rw-r--r--	kernel/rcupdate.c	3
-rw-r--r--	kernel/rcutiny.c	2
-rw-r--r--	kernel/rcutiny_plugin.h	5
-rw-r--r--	kernel/rcutorture.c	54
-rw-r--r--	kernel/rcutree.c	347
-rw-r--r--	kernel/rcutree.h	67
-rw-r--r--	kernel/rcutree_plugin.h	415
-rw-r--r--	kernel/rcutree_trace.c	330
-rw-r--r--	kernel/sched/core.c	23
-rw-r--r--	kernel/srcu.c	16
-rw-r--r--	lib/Kconfig.debug	2
39 files changed, 1493 insertions, 628 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 7c1dfb19fc40..7f40c72a9c51 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -186,7 +186,7 @@ Bibtex Entries
 
 @article{Kung80
 ,author="H. T. Kung and Q. Lehman"
-,title="Concurrent Maintenance of Binary Search Trees"
+,title="Concurrent Manipulation of Binary Search Trees"
 ,Year="1980"
 ,Month="September"
 ,journal="ACM Transactions on Database Systems"
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index cdb20d41a44a..31ef8fe07f82 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -271,15 +271,14 @@ over a rather long period of time, but improvements are always welcome!
 	The same cautions apply to call_rcu_bh() and call_rcu_sched().
 
 9.	All RCU list-traversal primitives, which include
-	rcu_dereference(), list_for_each_entry_rcu(),
-	list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
-	must be either within an RCU read-side critical section or
-	must be protected by appropriate update-side locks.  RCU
-	read-side critical sections are delimited by rcu_read_lock()
-	and rcu_read_unlock(), or by similar primitives such as
-	rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case
-	the matching rcu_dereference() primitive must be used in order
-	to keep lockdep happy, in this case, rcu_dereference_bh().
+	rcu_dereference(), list_for_each_entry_rcu(), and
+	list_for_each_safe_rcu(), must be either within an RCU read-side
+	critical section or must be protected by appropriate update-side
+	locks.  RCU read-side critical sections are delimited by
+	rcu_read_lock() and rcu_read_unlock(), or by similar primitives
+	such as rcu_read_lock_bh() and rcu_read_unlock_bh(), in which
+	case the matching rcu_dereference() primitive must be used in
+	order to keep lockdep happy, in this case, rcu_dereference_bh().
 
 	The reason that it is permissible to use RCU list-traversal
 	primitives when the update-side lock is held is that doing so
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
index 4349c1487e91..adb5a3782846 100644
--- a/Documentation/RCU/listRCU.txt
+++ b/Documentation/RCU/listRCU.txt
@@ -205,7 +205,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:
 		audit_copy_rule(&ne->rule, &e->rule);
 		ne->rule.action = newaction;
 		ne->rule.file_count = newfield_count;
-		list_replace_rcu(e, ne);
+		list_replace_rcu(&e->list, &ne->list);
 		call_rcu(&e->rcu, audit_free_rule);
 		return 0;
 	}
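The fix above matters because list_replace_rcu() operates on the embedded list_head, not on the containing structure. As a sketch of the layout this assumes (an audit_entry-like structure; names here are illustrative):

	struct audit_entry {
		struct list_head	list;	/* linkage in the RCU-protected list */
		struct rcu_head		rcu;	/* for the call_rcu() deferred free */
		struct audit_rule	rule;
	};

	/* Replace e with ne in the list, then free e after a grace period. */
	list_replace_rcu(&e->list, &ne->list);
	call_rcu(&e->rcu, audit_free_rule);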
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 4202ad093130..141d531aa14b 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -20,7 +20,7 @@ release_referenced() delete()
 {					{
     ...					    write_lock(&list_lock);
     atomic_dec(&el->rc, relfunc)	    ...
-    ...					    delete_element
+    ...					    remove_element
 }					    write_unlock(&list_lock);
 					    ...
 					    if (atomic_dec_and_test(&el->rc))
@@ -52,7 +52,7 @@ release_referenced() delete()
 {					{
     ...					    spin_lock(&list_lock);
     if (atomic_dec_and_test(&el->rc))	    ...
-        call_rcu(&el->head, el_free);	    delete_element
+        call_rcu(&el->head, el_free);	    remove_element
     ...					    spin_unlock(&list_lock);
 }					    ...
 					    if (atomic_dec_and_test(&el->rc))
@@ -64,3 +64,60 @@ Sometimes, a reference to the element needs to be obtained in the
 update (write) stream.  In such cases, atomic_inc_not_zero() might be
 overkill, since we hold the update-side spinlock.  One might instead
 use atomic_inc() in such cases.
+
+It is not always convenient to deal with "FAIL" in the
+search_and_reference() code path.  In such cases, the
+atomic_dec_and_test() may be moved from delete() to el_free()
+as follows:
+
+1.					2.
+add()					search_and_reference()
+{					{
+    alloc_object			    rcu_read_lock();
+    ...					    search_for_element
+    atomic_set(&el->rc, 1);		    atomic_inc(&el->rc);
+    spin_lock(&list_lock);		    ...
+
+    add_element				    rcu_read_unlock();
+    ...					}
+    spin_unlock(&list_lock);		4.
+}					delete()
+3.					{
+release_referenced()			    spin_lock(&list_lock);
+{					    ...
+    ...					    remove_element
+    if (atomic_dec_and_test(&el->rc))	    spin_unlock(&list_lock);
+        kfree(el);			    ...
+    ...					    call_rcu(&el->head, el_free);
+}					    ...
+5.					}
+void el_free(struct rcu_head *rhp)
+{
+    release_referenced();
+}
+
+The key point is that the initial reference added by add() is not removed
+until after a grace period has elapsed following removal.  This means that
+search_and_reference() cannot find this element, which means that the value
+of el->rc cannot increase.  Thus, once it reaches zero, there are no
+readers that can or ever will be able to reference the element.  The
+element can therefore safely be freed.  This in turn guarantees that if
+any reader finds the element, that reader may safely acquire a reference
+without checking the value of the reference counter.
+
+In cases where delete() can sleep, synchronize_rcu() can be called from
+delete(), so that el_free() can be subsumed into delete as follows:
+
+4.
+delete()
+{
+    spin_lock(&list_lock);
+    ...
+    remove_element
+    spin_unlock(&list_lock);
+    ...
+    synchronize_rcu();
+    if (atomic_dec_and_test(&el->rc))
+        kfree(el);
+    ...
+}
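A compilable sketch of the sleeping delete() variant added above (a hypothetical "el" element type with an embedded list_head; error handling omitted):

	struct el {
		struct list_head	list;
		atomic_t		rc;
		/* ... payload ... */
	};

	static DEFINE_SPINLOCK(list_lock);

	static void delete(struct el *el)
	{
		spin_lock(&list_lock);
		list_del_rcu(&el->list);	/* "remove_element" */
		spin_unlock(&list_lock);
		synchronize_rcu();		/* wait out all readers */
		if (atomic_dec_and_test(&el->rc))
			kfree(el);		/* drop add()'s initial reference */
	}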
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 672d19083252..c776968f4463 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -10,51 +10,63 @@ for rcutree and next for rcutiny.
 
 CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
 
-These implementations of RCU provides several debugfs files under the
+These implementations of RCU provide several debugfs directories under the
 top-level directory "rcu":
 
-rcu/rcudata:
+rcu/rcu_bh
+rcu/rcu_preempt
+rcu/rcu_sched
+
+Each directory contains files for the corresponding flavor of RCU.
+Note that rcu/rcu_preempt is only present for CONFIG_TREE_PREEMPT_RCU.
+For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
+so that activity for both appears in rcu/rcu_sched.
+
+In addition, the following file appears in the top-level directory:
+rcu/rcutorture.  This file displays rcutorture test progress.  The output
+of "cat rcu/rcutorture" looks as follows:
+
+rcutorture test sequence: 0 (test in progress)
+rcutorture update version number: 615
+
+The first line shows the number of rcutorture tests that have completed
+since boot.  If a test is currently running, the "(test in progress)"
+string will appear as shown above.  The second line shows the number of
+update cycles that the current test has started, or zero if there is
+no test in progress.
+
+
+Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
+also rcu/rcu_preempt) the following files will be present:
+
+rcudata:
 	Displays fields in struct rcu_data.
-rcu/rcudata.csv:
-	Comma-separated values spreadsheet version of rcudata.
-rcu/rcugp:
+rcuexp:
+	Displays statistics for expedited grace periods.
+rcugp:
 	Displays grace-period counters.
-rcu/rcuhier:
+rcuhier:
 	Displays the struct rcu_node hierarchy.
-rcu/rcu_pending:
+rcu_pending:
 	Displays counts of the reasons rcu_pending() decided that RCU had
 	work to do.
-rcu/rcutorture:
-	Displays rcutorture test progress.
-rcu/rcuboost:
+rcuboost:
 	Displays RCU boosting statistics.  Only present if
 	CONFIG_RCU_BOOST=y.
 
-The output of "cat rcu/rcudata" looks as follows:
+The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
 
-rcu_sched:
-  0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0
-  1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0
-  2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0
-  3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0
-  4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0
-  5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0
-  6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0
-  7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0
-rcu_bh:
-  0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0
-  1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
-  2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
-  3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
-  4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
-  5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
-  6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
-  7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
-
-The first section lists the rcu_data structures for rcu_sched, the second
-for rcu_bh.  Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
-additional section for rcu_preempt.  Each section has one line per CPU,
-or eight for this 8-CPU system.  The fields are as follows:
+  0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
+  1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
+  2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
+  3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
+  4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
+  5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
+  6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
+  7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
+
+This file has one line per CPU, or eight for this 8-CPU system.
+The fields are as follows:
 
 o	The number at the beginning of each line is the CPU number.
 	CPUs numbers followed by an exclamation mark are offline,
@@ -64,11 +76,13 @@ o The number at the beginning of each line is the CPU number.
 	substantially larger than the number of actual CPUs.
 
 o	"c" is the count of grace periods that this CPU believes have
-	completed.  Offlined CPUs and CPUs in dynticks idle mode may
-	lag quite a ways behind, for example, CPU 6 under "rcu_sched"
-	above, which has been offline through not quite 40,000 RCU grace
-	periods.  It is not unusual to see CPUs lagging by thousands of
-	grace periods.
+	completed.  Offlined CPUs and CPUs in dynticks idle mode may lag
+	quite a ways behind, for example, CPU 4 under "rcu_sched" above,
+	which has been offline through 16 RCU grace periods.  It is not
+	unusual to see offline CPUs lagging by thousands of grace periods.
+	Note that although the grace-period number is an unsigned long,
+	it is printed out as a signed long to allow more human-friendly
+	representation near boot time.
 
 o	"g" is the count of grace periods that this CPU believes have
 	started.  Again, offlined CPUs and CPUs in dynticks idle mode
@@ -84,30 +98,25 @@ o "pq" indicates that this CPU has passed through a quiescent state
 	CPU has not yet reported that fact, (2) some other CPU has not
 	yet reported for this grace period, or (3) both.
 
-o	"pgp" indicates which grace period the last-observed quiescent
-	state for this CPU corresponds to.  This is important for handling
-	the race between CPU 0 reporting an extended dynticks-idle
-	quiescent state for CPU 1 and CPU 1 suddenly waking up and
-	reporting its own quiescent state.  If CPU 1 was the last CPU
-	for the current grace period, then the CPU that loses this race
-	will attempt to incorrectly mark CPU 1 as having checked in for
-	the next grace period!
-
 o	"qp" indicates that RCU still expects a quiescent state from
 	this CPU.  Offlined CPUs and CPUs in dyntick idle mode might
 	well have qp=1, which is OK: RCU is still ignoring them.
 
 o	"dt" is the current value of the dyntick counter that is incremented
-	when entering or leaving dynticks idle state, either by the
-	scheduler or by irq.  This number is even if the CPU is in
-	dyntick idle mode and odd otherwise.  The number after the first
-	"/" is the interrupt nesting depth when in dyntick-idle state,
-	or one greater than the interrupt-nesting depth otherwise.
-	The number after the second "/" is the NMI nesting depth.
+	when entering or leaving idle, either due to a context switch or
+	due to an interrupt.  This number is even if the CPU is in idle
+	from RCU's viewpoint and odd otherwise.  The number after the
+	first "/" is the interrupt nesting depth when in idle state,
+	or a large number added to the interrupt-nesting depth when
+	running a non-idle task.  Some architectures do not accurately
+	count interrupt nesting when running in non-idle kernel context,
+	which can result in interesting anomalies such as negative
+	interrupt-nesting levels.  The number after the second "/"
+	is the NMI nesting depth.
 
 o	"df" is the number of times that some other CPU has forced a
 	quiescent state on behalf of this CPU due to this CPU being in
-	dynticks-idle state.
+	idle state.
 
 o	"of" is the number of times that some other CPU has forced a
 	quiescent state on behalf of this CPU due to this CPU being
@@ -120,9 +129,13 @@ o "of" is the number of times that some other CPU has forced a
 	error, so it makes sense to err conservatively.
 
 o	"ql" is the number of RCU callbacks currently residing on
-	this CPU.  This is the total number of callbacks, regardless
-	of what state they are in (new, waiting for grace period to
-	start, waiting for grace period to end, ready to invoke).
+	this CPU.  The first number is the number of "lazy" callbacks
+	that are known to RCU to only be freeing memory, and the number
+	after the "/" is the total number of callbacks, lazy or not.
+	These counters count callbacks regardless of what phase of
+	grace-period processing that they are in (new, waiting for
+	grace period to start, waiting for grace period to end, ready
+	to invoke).
 
 o	"qs" gives an indication of the state of the callback queue
 	with four characters:
@@ -150,6 +163,43 @@ o "qs" gives an indication of the state of the callback queue
 	If there are no callbacks in a given one of the above states,
 	the corresponding character is replaced by ".".
 
+o	"b" is the batch limit for this CPU.  If more than this number
+	of RCU callbacks is ready to invoke, then the remainder will
+	be deferred.
+
+o	"ci" is the number of RCU callbacks that have been invoked for
+	this CPU.  Note that ci+nci+ql is the number of callbacks that have
+	been registered in absence of CPU-hotplug activity.
+
+o	"nci" is the number of RCU callbacks that have been offloaded from
+	this CPU.  This will always be zero unless the kernel was built
+	with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
+	parameter was specified.
+
+o	"co" is the number of RCU callbacks that have been orphaned due to
+	this CPU going offline.  These orphaned callbacks have been moved
+	to an arbitrarily chosen online CPU.
+
+o	"ca" is the number of RCU callbacks that have been adopted by this
+	CPU due to other CPUs going offline.  Note that ci+co-ca+ql is
+	the number of RCU callbacks registered on this CPU.
+
+
+Kernels compiled with CONFIG_RCU_BOOST=y display the following from
+/debug/rcu/rcu_preempt/rcudata:
+
+  0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
+  1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
+  2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
+  3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
+  4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
+  5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
+  6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
+  7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
+
+This is similar to the output discussed above, but contains the following
+additional fields:
+
 o	"kt" is the per-CPU kernel-thread state.  The digit preceding
 	the first slash is zero if there is no work pending and 1
 	otherwise.  The character between the first pair of slashes is
@@ -184,35 +234,51 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
 
 	This field is displayed only for CONFIG_RCU_BOOST kernels.
 
-o	"b" is the batch limit for this CPU.  If more than this number
-	of RCU callbacks is ready to invoke, then the remainder will
-	be deferred.
-
-o	"ci" is the number of RCU callbacks that have been invoked for
-	this CPU.  Note that ci+ql is the number of callbacks that have
-	been registered in absence of CPU-hotplug activity.
-
-o	"co" is the number of RCU callbacks that have been orphaned due to
-	this CPU going offline.  These orphaned callbacks have been moved
-	to an arbitrarily chosen online CPU.
-
-o	"ca" is the number of RCU callbacks that have been adopted due to
-	other CPUs going offline.  Note that ci+co-ca+ql is the number of
-	RCU callbacks registered on this CPU.
-
-There is also an rcu/rcudata.csv file with the same information in
-comma-separated-variable spreadsheet format.
-
-
-The output of "cat rcu/rcugp" looks as follows:
-
-rcu_sched: completed=33062  gpnum=33063
-rcu_bh: completed=464  gpnum=464
-
-Again, this output is for both "rcu_sched" and "rcu_bh".  Note that
-kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional
-"rcu_preempt" line.  The fields are taken from the rcu_state structure,
-and are as follows:
-
+
+The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
+
+s=21872 d=21872 w=0 tf=0 wd1=0 wd2=0 n=0 sc=21872 dt=21872 dl=0 dx=21872
+
+These fields are as follows:
+
+o	"s" is the starting sequence number.
+
+o	"d" is the ending sequence number.  When the starting and ending
+	numbers differ, there is an expedited grace period in progress.
+
+o	"w" is the number of times that the sequence numbers have been
+	in danger of wrapping.
+
+o	"tf" is the number of times that contention has resulted in a
+	failure to begin an expedited grace period.
+
+o	"wd1" and "wd2" are the number of times that an attempt to
+	start an expedited grace period found that someone else had
+	completed an expedited grace period that satisfies the
+	attempted request.  "Our work is done."
+
+o	"n" is number of times that contention was so great that
+	the request was demoted from an expedited grace period to
+	a normal grace period.
+
+o	"sc" is the number of times that the attempt to start a
+	new expedited grace period succeeded.
+
+o	"dt" is the number of times that we attempted to update
+	the "d" counter.
+
+o	"dl" is the number of times that we failed to update the "d"
+	counter.
+
+o	"dx" is the number of times that we succeeded in updating
+	the "d" counter.
+
+
+The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
+
+completed=31249  gpnum=31250  age=1  max=18
+
+These fields are taken from the rcu_state structure, and are as follows:
 
 o	"completed" is the number of grace periods that have completed.
 	It is comparable to the "c" field from rcu/rcudata in that a
@@ -220,44 +286,42 @@ o "completed" is the number of grace periods that have completed.
 	that the corresponding RCU grace period has completed.
 
 o	"gpnum" is the number of grace periods that have started.  It is
-	comparable to the "g" field from rcu/rcudata in that a CPU
-	whose "g" field matches the value of "gpnum" is aware that the
-	corresponding RCU grace period has started.
+	similarly comparable to the "g" field from rcu/rcudata in that
+	a CPU whose "g" field matches the value of "gpnum" is aware that
+	the corresponding RCU grace period has started.
+
+	If these two fields are equal, then there is no grace period
+	in progress, in other words, RCU is idle.  On the other hand,
+	if the two fields differ (as they are above), then an RCU grace
+	period is in progress.
 
-	If these two fields are equal (as they are for "rcu_bh" above),
-	then there is no grace period in progress, in other words, RCU
-	is idle.  On the other hand, if the two fields differ (as they
-	do for "rcu_sched" above), then an RCU grace period is in progress.
+o	"age" is the number of jiffies that the current grace period
+	has extended for, or zero if there is no grace period currently
+	in effect.
 
+o	"max" is the age in jiffies of the longest-duration grace period
+	thus far.
 
-The output of "cat rcu/rcuhier" looks as follows, with very long lines:
+The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
 
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
-1/1 ..>. 0:127 ^0
-3/3 ..>. 0:35 ^0    0/0 ..>. 36:71 ^1    0/0 ..>. 72:107 ^2    0/0 ..>. 108:127 ^3
-3/3f ..>. 0:5 ^0    2/3 ..>. 6:11 ^1    0/0 ..>. 12:17 ^2    0/0 ..>. 18:23 ^3    0/0 ..>. 24:29 ^4    0/0 ..>. 30:35 ^5    0/0 ..>. 36:41 ^0    0/0 ..>. 42:47 ^1    0/0 ..>. 48:53 ^2    0/0 ..>. 54:59 ^3    0/0 ..>. 60:65 ^4    0/0 ..>. 66:71 ^5    0/0 ..>. 72:77 ^0    0/0 ..>. 78:83 ^1    0/0 ..>. 84:89 ^2    0/0 ..>. 90:95 ^3    0/0 ..>. 96:101 ^4    0/0 ..>. 102:107 ^5    0/0 ..>. 108:113 ^0    0/0 ..>. 114:119 ^1    0/0 ..>. 120:125 ^2    0/0 ..>. 126:127 ^3
-rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
-0/1 ..>. 0:127 ^0
-0/3 ..>. 0:35 ^0    0/0 ..>. 36:71 ^1    0/0 ..>. 72:107 ^2    0/0 ..>. 108:127 ^3
-0/3f ..>. 0:5 ^0    0/3 ..>. 6:11 ^1    0/0 ..>. 12:17 ^2    0/0 ..>. 18:23 ^3    0/0 ..>. 24:29 ^4    0/0 ..>. 30:35 ^5    0/0 ..>. 36:41 ^0    0/0 ..>. 42:47 ^1    0/0 ..>. 48:53 ^2    0/0 ..>. 54:59 ^3    0/0 ..>. 60:65 ^4    0/0 ..>. 66:71 ^5    0/0 ..>. 72:77 ^0    0/0 ..>. 78:83 ^1    0/0 ..>. 84:89 ^2    0/0 ..>. 90:95 ^3    0/0 ..>. 96:101 ^4    0/0 ..>. 102:107 ^5    0/0 ..>. 108:113 ^0    0/0 ..>. 114:119 ^1    0/0 ..>. 120:125 ^2    0/0 ..>. 126:127 ^3
+c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
+3/3 ..>. 0:7 ^0
+e/e ..>. 0:3 ^0    d/d ..>. 4:7 ^1
 
-This is once again split into "rcu_sched" and "rcu_bh" portions,
-and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
-"rcu_preempt" section.  The fields are as follows:
+The fields are as follows:
 
-o	"c" is exactly the same as "completed" under rcu/rcugp.
+o	"c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
 
-o	"g" is exactly the same as "gpnum" under rcu/rcugp.
+o	"g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
 
-o	"s" is the "signaled" state that drives force_quiescent_state()'s
+o	"s" is the current state of the force_quiescent_state()
 	state machine.
 
 o	"jfq" is the number of jiffies remaining for this grace period
 	before force_quiescent_state() is invoked to help push things
-	along.  Note that CPUs in dyntick-idle mode throughout the grace
-	period will not report on their own, but rather must be check by
-	some other CPU via force_quiescent_state().
+	along.  Note that CPUs in idle mode throughout the grace period
+	will not report on their own, but rather must be check by some
+	other CPU via force_quiescent_state().
 
 o	"j" is the low-order four hex digits of the jiffies counter.
 	Yes, Paul did run into a number of problems that turned out to
@@ -268,7 +332,8 @@ o "nfqs" is the number of calls to force_quiescent_state() since
 
 o	"nfqsng" is the number of useless calls to force_quiescent_state(),
 	where there wasn't actually a grace period active.  This can
-	happen due to races.  The number in parentheses is the difference
+	no longer happen due to grace-period processing being pushed
+	into a kthread.  The number in parentheses is the difference
 	between "nfqs" and "nfqsng", or the number of times that
 	force_quiescent_state() actually did some real work.
 
@@ -276,28 +341,27 @@ o "fqlh" is the number of calls to force_quiescent_state() that
 	exited immediately (without even being counted in nfqs above)
 	due to contention on ->fqslock.
 
-o	Each element of the form "1/1 0:127 ^0" represents one struct
-	rcu_node.  Each line represents one level of the hierarchy, from
-	root to leaves.  It is best to think of the rcu_data structures
-	as forming yet another level after the leaves.  Note that there
-	might be either one, two, or three levels of rcu_node structures,
-	depending on the relationship between CONFIG_RCU_FANOUT and
-	CONFIG_NR_CPUS.
+o	Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
+	structure.  Each line represents one level of the hierarchy,
+	from root to leaves.  It is best to think of the rcu_data
+	structures as forming yet another level after the leaves.
+	Note that there might be either one, two, three, or even four
+	levels of rcu_node structures, depending on the relationship
+	between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
+	adjusted using the rcu_fanout_leaf kernel boot parameter), and
+	CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
+	possible CPUs for the booting hardware).
 
 	o	The numbers separated by the "/" are the qsmask followed
 		by the qsmaskinit.  The qsmask will have one bit
-		set for each entity in the next lower level that
-		has not yet checked in for the current grace period.
+		set for each entity in the next lower level that has
+		not yet checked in for the current grace period ("e"
+		indicating CPUs 5, 6, and 7 in the example above).
 		The qsmaskinit will have one bit for each entity that is
 		currently expected to check in during each grace period.
 		The value of qsmaskinit is assigned to that of qsmask
 		at the beginning of each grace period.
 
-		For example, for "rcu_sched", the qsmask of the first
-		entry of the lowest level is 0x14, meaning that we
-		are still waiting for CPUs 2 and 4 to check in for the
-		current grace period.
-
 	o	The characters separated by the ">" indicate the state
 		of the blocked-tasks lists.  A "G" preceding the ">"
 		indicates that at least one task blocked in an RCU
@@ -312,48 +376,39 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
 		A "." character appears if the corresponding condition
 		does not hold, so that "..>." indicates that no tasks
 		are blocked.  In contrast, "GE>T" indicates maximal
-		inconvenience from blocked tasks.
+		inconvenience from blocked tasks.  CONFIG_TREE_RCU
+		builds of the kernel will always show "..>.".
 
 	o	The numbers separated by the ":" are the range of CPUs
 		served by this struct rcu_node.  This can be helpful
 		in working out how the hierarchy is wired together.
 
-		For example, the first entry at the lowest level shows
-		"0:5", indicating that it covers CPUs 0 through 5.
+		For example, the example rcu_node structure shown above
+		has "0:7", indicating that it covers CPUs 0 through 7.
 
 	o	The number after the "^" indicates the bit in the
-		next higher level rcu_node structure that this
-		rcu_node structure corresponds to.
-
-		For example, the first entry at the lowest level shows
-		"^0", indicating that it corresponds to bit zero in
-		the first entry at the middle level.
-
-
-The output of "cat rcu/rcu_pending" looks as follows:
-
-rcu_sched:
-  0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741
-  1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792
-  2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629
-  3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723
-  4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110
-  5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456
-  6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834
-  7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888
-rcu_bh:
-  0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314
-  1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180
-  2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936
-  3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863
-  4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671
-  5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235
-  6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921
-  7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542
-
-As always, this is once again split into "rcu_sched" and "rcu_bh"
-portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
-"rcu_preempt" section.  The fields are as follows:
+		next higher level rcu_node structure that this rcu_node
+		structure corresponds to.  For example, the "d/d ..>. 4:7
+		^1" has a "1" in this position, indicating that it
+		corresponds to the "1" bit in the "3" shown in the
+		"3/3 ..>. 0:7 ^0" entry on the next level up.
+
+
+The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
+
+  0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903
+  1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113
+  2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889
+  3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469
+  4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042
+  5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422
+  6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699
+  7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147
+
+The fields are as follows:
+
+o	The leading number is the CPU number, with "!" indicating
+	an offline CPU.
 
 o	"np" is the number of times that __rcu_pending() has been invoked
 	for the corresponding flavor of RCU.
@@ -377,38 +432,23 @@ o "gpc" is the number of times that an old grace period had
 o	"gps" is the number of times that a new grace period had started,
 	but this CPU was not yet aware of it.
 
-o	"nn" is the number of times that this CPU needed nothing.  Alert
-	readers will note that the rcu "nn" number for a given CPU very
-	closely matches the rcu_bh "np" number for that same CPU.  This
-	is due to short-circuit evaluation in rcu_pending().
-
-
-The output of "cat rcu/rcutorture" looks as follows:
-
-rcutorture test sequence: 0 (test in progress)
-rcutorture update version number: 615
-
-The first line shows the number of rcutorture tests that have completed
-since boot.  If a test is currently running, the "(test in progress)"
-string will appear as shown above.  The second line shows the number of
-update cycles that the current test has started, or zero if there is
-no test in progress.
+o	"nn" is the number of times that this CPU needed nothing.
 
 
 The output of "cat rcu/rcuboost" looks as follows:
 
-0:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f
-    balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16
-6:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f
-    balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6
+0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
+    balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
+4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
+    balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
 
 This information is output only for rcu_preempt.  Each two-line entry
 corresponds to a leaf rcu_node strcuture.  The fields are as follows:
 
 o	"n:m" is the CPU-number range for the corresponding two-line
 	entry.  In the sample output above, the first entry covers
-	CPUs zero through five and the second entry covers CPUs 6
-	and 7.
+	CPUs zero through three and the second entry covers CPUs four
+	through seven.
 
 o	"tasks=TNEB" gives the state of the various segments of the
 	rnp->blocked_tasks list:
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index bf0f6de2aa00..0cc7820967f4 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -499,6 +499,8 @@ The foo_reclaim() function might appear as follows:
     {
 	struct foo *fp = container_of(rp, struct foo, rcu);
 
+	foo_cleanup(fp->a);
+
 	kfree(fp);
     }
 
@@ -521,6 +523,12 @@ o Use call_rcu() -after- removing a data element from an
 	read-side critical sections that might be referencing that
 	data item.
 
+If the callback for call_rcu() is not doing anything more than calling
+kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
+to avoid having to write your own callback:
+
+	kfree_rcu(old_fp, rcu);
+
 Again, see checklist.txt for additional rules governing the use of RCU.
 
526 534
@@ -773,8 +781,8 @@ a single atomic update, converting to RCU will require special care.
 
 Also, the presence of synchronize_rcu() means that the RCU version of
 delete() can now block.  If this is a problem, there is a callback-based
-mechanism that never blocks, namely call_rcu(), that can be used in
-place of synchronize_rcu().
+mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
+be used in place of synchronize_rcu().
 
 
 7.  FULL LIST OF RCU APIs
@@ -789,9 +797,7 @@ RCU list traversal:
 	list_for_each_entry_rcu
 	hlist_for_each_entry_rcu
 	hlist_nulls_for_each_entry_rcu
-
-	list_for_each_continue_rcu	(to be deprecated in favor of new
-					 list_for_each_entry_continue_rcu)
+	list_for_each_entry_continue_rcu
 
 RCU pointer/list update:
 
@@ -813,6 +819,7 @@ RCU: Critical sections Grace period Barrier
 	rcu_read_unlock		synchronize_rcu
 	rcu_dereference		synchronize_rcu_expedited
 				call_rcu
+				kfree_rcu
 
 
 bh:	Critical sections	Grace period		Barrier
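For reference, a minimal sketch of the kfree_rcu() conversion described in this patch (using the struct foo example from whatisRCU.txt; applicable only when the callback does nothing beyond freeing):

	struct foo {
		int a;
		struct rcu_head rcu;
	};

	/* Before: a hand-written callback that only frees. */
	static void foo_reclaim(struct rcu_head *rp)
	{
		kfree(container_of(rp, struct foo, rcu));
	}
	/* ... */
	call_rcu(&old_fp->rcu, foo_reclaim);

	/* After: kfree_rcu() supplies the callback for you. */
	kfree_rcu(old_fp, rcu);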
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9776f068306b..9d2e5cb3a95f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2394,6 +2394,27 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
 			See Documentation/blockdev/ramdisk.txt.
 
+	rcu_nocbs=	[KNL,BOOT]
+			In kernels built with CONFIG_RCU_NOCB_CPU=y, set
+			the specified list of CPUs to be no-callback CPUs.
+			Invocation of these CPUs' RCU callbacks will
+			be offloaded to "rcuoN" kthreads created for
+			that purpose.  This reduces OS jitter on the
+			offloaded CPUs, which can be useful for HPC and
+			real-time workloads.  It can also improve energy
+			efficiency for asymmetric multiprocessors.
+
+	rcu_nocbs_poll	[KNL,BOOT]
+			Rather than requiring that offloaded CPUs
+			(specified by rcu_nocbs= above) explicitly
+			awaken the corresponding "rcuoN" kthreads,
+			make these kthreads poll for callbacks.
+			This improves the real-time response for the
+			offloaded CPUs by relieving them of the need to
+			wake up the corresponding kthread, but degrades
+			energy efficiency by requiring that the kthreads
+			periodically wake up to do the polling.
+
 	rcutree.blimit=	[KNL,BOOT]
 			Set maximum number of finished RCU callbacks to process
 			in one batch.
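As an illustrative (hypothetical) boot command-line fragment, offloading RCU callbacks from CPUs 1 through 7 and having the "rcuoN" kthreads poll for them would look like:

	rcu_nocbs=1-7 rcu_nocbs_poll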
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 2759f7c188f0..3c4e1b3b80a1 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -251,12 +251,13 @@ And there are a number of things that _must_ or _must_not_ be assumed:
 
 And for:
 
-	*A = X; Y = *A;
+	*A = X; *(A + 4) = Y;
 
-we may get either of:
+we may get any of:
 
-	STORE *A = X; Y = LOAD *A;
-	STORE *A = Y = X;
+	STORE *A = X; STORE *(A + 4) = Y;
+	STORE *(A + 4) = Y; STORE *A = X;
+	STORE {*A, *(A + 4) } = {X, Y};
 
 
 =========================
diff --git a/arch/Kconfig b/arch/Kconfig
index 366ec06a5185..cc74aaea116c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -300,15 +300,16 @@ config SECCOMP_FILTER
 
 	  See Documentation/prctl/seccomp_filter.txt for details.
 
-config HAVE_RCU_USER_QS
+config HAVE_CONTEXT_TRACKING
 	bool
 	help
-	  Provide kernel entry/exit hooks necessary for userspace
-	  RCU extended quiescent state. Syscalls need to be wrapped inside
-	  rcu_user_exit()-rcu_user_enter() through the slow path using
-	  TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs
-	  are already protected inside rcu_irq_enter/rcu_irq_exit() but
-	  preemption or signal handling on irq exit still need to be protected.
+	  Provide kernel/user boundaries probes necessary for subsystems
+	  that need it, such as userspace RCU extended quiescent state.
+	  Syscalls need to be wrapped inside user_exit()-user_enter() through
+	  the slow path using TIF_NOHZ flag. Exceptions handlers must be
+	  wrapped as well. Irqs are already protected inside
+	  rcu_irq_enter/rcu_irq_exit() but preemption or signal handling on
+	  irq exit still need to be protected.
 
 config HAVE_VIRT_CPU_ACCOUNTING
 	bool
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 79ccfe6c7078..49e3b49e552f 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -648,7 +648,7 @@ static void stack_proc(void *arg)
 	struct task_struct *from = current, *to = arg;
 
 	to->thread.saved_task = from;
-	rcu_switch(from, to);
+	rcu_user_hooks_switch(from, to);
 	switch_to(from, to, from);
 }
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced2..110cfad24f26 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -106,7 +106,7 @@ config X86
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
-	select HAVE_RCU_USER_QS if X86_64
+	select HAVE_CONTEXT_TRACKING if X86_64
 	select HAVE_IRQ_TIME_ACCOUNTING
 	select GENERIC_KERNEL_THREAD
 	select GENERIC_KERNEL_EXECVE
diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/context_tracking.h
index d1ac07a23979..1616562683e9 100644
--- a/arch/x86/include/asm/rcu.h
+++ b/arch/x86/include/asm/context_tracking.h
@@ -1,27 +1,26 @@
-#ifndef _ASM_X86_RCU_H
-#define _ASM_X86_RCU_H
+#ifndef _ASM_X86_CONTEXT_TRACKING_H
+#define _ASM_X86_CONTEXT_TRACKING_H
 
 #ifndef __ASSEMBLY__
-
-#include <linux/rcupdate.h>
+#include <linux/context_tracking.h>
 #include <asm/ptrace.h>
 
 static inline void exception_enter(struct pt_regs *regs)
 {
-	rcu_user_exit();
+	user_exit();
 }
 
 static inline void exception_exit(struct pt_regs *regs)
 {
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_CONTEXT_TRACKING
 	if (user_mode(regs))
-		rcu_user_enter();
+		user_enter();
 #endif
 }
 
 #else /* __ASSEMBLY__ */
 
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_CONTEXT_TRACKING
 # define SCHEDULE_USER call schedule_user
 #else
 # define SCHEDULE_USER call schedule
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1328fe49a3f1..2a3806b95831 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,7 +56,7 @@
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
 #include <asm/asm.h>
-#include <asm/rcu.h>
+#include <asm/context_tracking.h>
 #include <asm/smap.h>
 #include <linux/err.h>
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 974b67e46dd0..b629bbe0d9bd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -23,6 +23,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/context_tracking.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -1491,7 +1492,7 @@ long syscall_trace_enter(struct pt_regs *regs)
 {
 	long ret = 0;
 
-	rcu_user_exit();
+	user_exit();
 
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1546,7 +1547,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 	 * or do_notify_resume(), in which case we can be in RCU
 	 * user mode.
 	 */
-	rcu_user_exit();
+	user_exit();
 
 	audit_syscall_exit(regs);
 
@@ -1564,5 +1565,5 @@ void syscall_trace_leave(struct pt_regs *regs)
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, step);
 
-	rcu_user_enter();
+	user_enter();
 }
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 70b27ee6118e..fbbb604313a2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/uaccess.h>
 #include <linux/user-return-notifier.h>
 #include <linux/uprobes.h>
+#include <linux/context_tracking.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -816,7 +817,7 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-	rcu_user_exit();
+	user_exit();
 
 #ifdef CONFIG_X86_MCE
 	/* notify userspace of pending MCEs */
@@ -838,7 +839,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
 		fire_user_return_notifiers();
 
-	rcu_user_enter();
+	user_enter();
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8276dc6794cc..eb8586693e0b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -55,7 +55,7 @@
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/mce.h>
-#include <asm/rcu.h>
+#include <asm/context_tracking.h>
 
 #include <asm/mach_traps.h>
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8e13ecb41bee..7a529cbab7ad 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -18,7 +18,7 @@
 #include <asm/pgalloc.h>		/* pgd_*(), ... */
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ... */
 #include <asm/fixmap.h>			/* VSYSCALL_START */
-#include <asm/rcu.h>			/* exception_enter(), ... */
+#include <asm/context_tracking.h>	/* exception_enter(), ... */
 
 /*
  * Page fault error code bits:
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
new file mode 100644
index 000000000000..e24339ccb7f0
--- /dev/null
+++ b/include/linux/context_tracking.h
@@ -0,0 +1,18 @@
1#ifndef _LINUX_CONTEXT_TRACKING_H
2#define _LINUX_CONTEXT_TRACKING_H
3
4#ifdef CONFIG_CONTEXT_TRACKING
5#include <linux/sched.h>
6
7extern void user_enter(void);
8extern void user_exit(void);
9extern void context_tracking_task_switch(struct task_struct *prev,
10 struct task_struct *next);
11#else
12static inline void user_enter(void) { }
13static inline void user_exit(void) { }
14static inline void context_tracking_task_switch(struct task_struct *prev,
15 struct task_struct *next) { }
16#endif /* !CONFIG_CONTEXT_TRACKING */
17
18#endif
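
The new header boils down to a pair of boundary probes plus a task-switch hook. For orientation, a minimal sketch of the pairing an architecture's exception path is expected to provide -- the handler and body names here are hypothetical, and the x86 changes in this series reach these calls through the exception_enter()/exception_exit()-style wrappers in asm/context_tracking.h:

    #include <linux/context_tracking.h>
    #include <linux/ptrace.h>

    static void handle_the_exception(struct pt_regs *regs)
    {
    	/* arch-specific exception work would go here */
    }

    /* Hypothetical arch entry point; sketch only. */
    void example_exception_handler(struct pt_regs *regs)
    {
    	user_exit();                 /* left userspace: end the RCU extended QS */
    	handle_the_exception(regs);  /* RCU read-side critical sections legal here */
    	user_enter();                /* resuming userspace: re-enter the extended QS */
    }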
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e0f0fab20415..c92dd28eaa6c 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -286,23 +286,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
286 &pos->member != (head); \ 286 &pos->member != (head); \
287 pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) 287 pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
288 288
289
290/**
291 * list_for_each_continue_rcu
292 * @pos: the &struct list_head to use as a loop cursor.
293 * @head: the head for your list.
294 *
295 * Iterate over an rcu-protected list, continuing after current point.
296 *
297 * This list-traversal primitive may safely run concurrently with
298 * the _rcu list-mutation primitives such as list_add_rcu()
299 * as long as the traversal is guarded by rcu_read_lock().
300 */
301#define list_for_each_continue_rcu(pos, head) \
302 for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
303 (pos) != (head); \
304 (pos) = rcu_dereference_raw(list_next_rcu(pos)))
305
306/** 289/**
307 * list_for_each_entry_continue_rcu - continue iteration over list of given type 290 * list_for_each_entry_continue_rcu - continue iteration over list of given type
308 * @pos: the type * to use as a loop cursor. 291 * @pos: the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 7c968e4f929e..275aa3f1062d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -90,6 +90,25 @@ extern void do_trace_rcu_torture_read(char *rcutorturename,
90 * that started after call_rcu() was invoked. RCU read-side critical 90 * that started after call_rcu() was invoked. RCU read-side critical
91 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 91 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
92 * and may be nested. 92 * and may be nested.
93 *
94 * Note that all CPUs must agree that the grace period extended beyond
 95 * all pre-existing RCU read-side critical sections. On systems with more
96 * than one CPU, this means that when "func()" is invoked, each CPU is
97 * guaranteed to have executed a full memory barrier since the end of its
98 * last RCU read-side critical section whose beginning preceded the call
99 * to call_rcu(). It also means that each CPU executing an RCU read-side
100 * critical section that continues beyond the start of "func()" must have
101 * executed a memory barrier after the call_rcu() but before the beginning
102 * of that RCU read-side critical section. Note that these guarantees
103 * include CPUs that are offline, idle, or executing in user mode, as
104 * well as CPUs that are executing in the kernel.
105 *
106 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
107 * resulting RCU callback function "func()", then both CPU A and CPU B are
108 * guaranteed to execute a full memory barrier during the time interval
109 * between the call to call_rcu() and the invocation of "func()" -- even
110 * if CPU A and CPU B are the same CPU (but again only if the system has
111 * more than one CPU).
93 */ 112 */
94extern void call_rcu(struct rcu_head *head, 113extern void call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *head)); 114 void (*func)(struct rcu_head *head));
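
These ordering guarantees are what make the classic unlink-then-defer-free pattern safe; a minimal sketch, with struct foo and its list purely illustrative rather than taken from this series:

    #include <linux/rculist.h>
    #include <linux/slab.h>

    struct foo {
    	struct list_head list;	/* on some RCU-protected list */
    	int data;
    	struct rcu_head rcu;
    };

    static void free_foo(struct rcu_head *head)
    {
    	kfree(container_of(head, struct foo, rcu));
    }

    /* Updater: unlink now, free only after all pre-existing readers finish. */
    static void remove_foo(struct foo *fp)
    {
    	list_del_rcu(&fp->list);	/* caller holds the list's update-side lock */
    	call_rcu(&fp->rcu, free_foo);
    }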
@@ -118,6 +137,9 @@ extern void call_rcu(struct rcu_head *head,
118 * OR 137 * OR
119 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. 138 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
120 * These may be nested. 139 * These may be nested.
140 *
141 * See the description of call_rcu() for more detailed information on
142 * memory ordering guarantees.
121 */ 143 */
122extern void call_rcu_bh(struct rcu_head *head, 144extern void call_rcu_bh(struct rcu_head *head,
123 void (*func)(struct rcu_head *head)); 145 void (*func)(struct rcu_head *head));
@@ -137,6 +159,9 @@ extern void call_rcu_bh(struct rcu_head *head,
137 * OR 159 * OR
138 * anything that disables preemption. 160 * anything that disables preemption.
139 * These may be nested. 161 * These may be nested.
162 *
163 * See the description of call_rcu() for more detailed information on
164 * memory ordering guarantees.
140 */ 165 */
141extern void call_rcu_sched(struct rcu_head *head, 166extern void call_rcu_sched(struct rcu_head *head,
142 void (*func)(struct rcu_head *rcu)); 167 void (*func)(struct rcu_head *rcu));
@@ -197,13 +222,13 @@ extern void rcu_user_enter(void);
197extern void rcu_user_exit(void); 222extern void rcu_user_exit(void);
198extern void rcu_user_enter_after_irq(void); 223extern void rcu_user_enter_after_irq(void);
199extern void rcu_user_exit_after_irq(void); 224extern void rcu_user_exit_after_irq(void);
200extern void rcu_user_hooks_switch(struct task_struct *prev,
201 struct task_struct *next);
202#else 225#else
203static inline void rcu_user_enter(void) { } 226static inline void rcu_user_enter(void) { }
204static inline void rcu_user_exit(void) { } 227static inline void rcu_user_exit(void) { }
205static inline void rcu_user_enter_after_irq(void) { } 228static inline void rcu_user_enter_after_irq(void) { }
206static inline void rcu_user_exit_after_irq(void) { } 229static inline void rcu_user_exit_after_irq(void) { }
230static inline void rcu_user_hooks_switch(struct task_struct *prev,
231 struct task_struct *next) { }
207#endif /* CONFIG_RCU_USER_QS */ 232#endif /* CONFIG_RCU_USER_QS */
208 233
209extern void exit_rcu(void); 234extern void exit_rcu(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..530c52ef873e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -109,6 +109,8 @@ extern void update_cpu_load_nohz(void);
109 109
110extern unsigned long get_parent_ip(unsigned long addr); 110extern unsigned long get_parent_ip(unsigned long addr);
111 111
112extern void dump_cpu_task(int cpu);
113
112struct seq_file; 114struct seq_file;
113struct cfs_rq; 115struct cfs_rq;
114struct task_group; 116struct task_group;
@@ -1844,14 +1846,6 @@ static inline void rcu_copy_process(struct task_struct *p)
1844 1846
1845#endif 1847#endif
1846 1848
1847static inline void rcu_switch(struct task_struct *prev,
1848 struct task_struct *next)
1849{
1850#ifdef CONFIG_RCU_USER_QS
1851 rcu_user_hooks_switch(prev, next);
1852#endif
1853}
1854
1855static inline void tsk_restore_flags(struct task_struct *task, 1849static inline void tsk_restore_flags(struct task_struct *task,
1856 unsigned long orig_flags, unsigned long flags) 1850 unsigned long orig_flags, unsigned long flags)
1857{ 1851{
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 55a5c52cbb25..6eb691b08358 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -40,6 +42,8 @@ struct rcu_batch {
40 struct rcu_head *head, **tail; 42 struct rcu_head *head, **tail;
41}; 43};
42 44
45#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
46
43struct srcu_struct { 47struct srcu_struct {
44 unsigned completed; 48 unsigned completed;
45 struct srcu_struct_array __percpu *per_cpu_ref; 49 struct srcu_struct_array __percpu *per_cpu_ref;
@@ -70,12 +74,42 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
70 __init_srcu_struct((sp), #sp, &__srcu_key); \ 74 __init_srcu_struct((sp), #sp, &__srcu_key); \
71}) 75})
72 76
77#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name },
73#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 78#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 79
75int init_srcu_struct(struct srcu_struct *sp); 80int init_srcu_struct(struct srcu_struct *sp);
76 81
82#define __SRCU_DEP_MAP_INIT(srcu_name)
77#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 83#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
78 84
85void process_srcu(struct work_struct *work);
86
87#define __SRCU_STRUCT_INIT(name) \
88 { \
89 .completed = -300, \
90 .per_cpu_ref = &name##_srcu_array, \
91 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
92 .running = false, \
93 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
94 .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
95 .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
96 .batch_done = RCU_BATCH_INIT(name.batch_done), \
97 .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
98 __SRCU_DEP_MAP_INIT(name) \
99 }
100
101/*
102 * Define and initialize an srcu struct at build time.
103 * Don't call init_srcu_struct() or cleanup_srcu_struct() on it.
104 */
105#define DEFINE_SRCU(name) \
106 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
107 struct srcu_struct name = __SRCU_STRUCT_INIT(name);
108
109#define DEFINE_STATIC_SRCU(name) \
110 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
111 static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
112
79/** 113/**
80 * call_srcu() - Queue a callback for invocation after an SRCU grace period 114 * call_srcu() - Queue a callback for invocation after an SRCU grace period
 81 * @sp: srcu_struct in which to queue the callback 115 * @sp: srcu_struct in which to queue the callback
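
A sketch of what the new build-time initializer enables -- my_srcu, struct config, and the reader/updater bodies are illustrative, not taken from this series:

    #include <linux/srcu.h>
    #include <linux/slab.h>

    DEFINE_STATIC_SRCU(my_srcu);	/* usable without init_srcu_struct() */

    struct config {
    	int setting;
    };
    static struct config __rcu *active;	/* assumed non-NULL once published */

    static int read_setting(void)
    {
    	int idx, val;

    	idx = srcu_read_lock(&my_srcu);
    	val = srcu_dereference(active, &my_srcu)->setting;
    	srcu_read_unlock(&my_srcu, idx);
    	return val;
    }

    static void install_config(struct config *newc)
    {
    	struct config *oldc;

    	oldc = rcu_dereference_protected(active, 1);	/* caller serializes updates */
    	rcu_assign_pointer(active, newc);
    	synchronize_srcu(&my_srcu);	/* wait out readers still using oldc */
    	kfree(oldc);
    }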
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5bde94d8585b..d4f559b1ec34 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -549,6 +549,7 @@ TRACE_EVENT(rcu_torture_read,
549 * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. 549 * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit.
550 * "Inc1": rcu_barrier_callback() piggyback check counter incremented. 550 * "Inc1": rcu_barrier_callback() piggyback check counter incremented.
551 * "Offline": rcu_barrier_callback() found offline CPU 551 * "Offline": rcu_barrier_callback() found offline CPU
552 * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU.
552 * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. 553 * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks.
553 * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. 554 * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks.
554 * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. 555 * "IRQ": An rcu_barrier_callback() callback posted on remote CPU.
diff --git a/init/Kconfig b/init/Kconfig
index 6fdd6e339326..2054e048bb98 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -486,35 +486,35 @@ config PREEMPT_RCU
486 This option enables preemptible-RCU code that is common between 486 This option enables preemptible-RCU code that is common between
487 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. 487 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
488 488
489config CONTEXT_TRACKING
490 bool
491
489config RCU_USER_QS 492config RCU_USER_QS
490 bool "Consider userspace as in RCU extended quiescent state" 493 bool "Consider userspace as in RCU extended quiescent state"
491 depends on HAVE_RCU_USER_QS && SMP 494 depends on HAVE_CONTEXT_TRACKING && SMP
495 select CONTEXT_TRACKING
492 help 496 help
493 This option sets hooks on kernel / userspace boundaries and 497 This option sets hooks on kernel / userspace boundaries and
494 puts RCU in extended quiescent state when the CPU runs in 498 puts RCU in extended quiescent state when the CPU runs in
495 userspace. It means that when a CPU runs in userspace, it is 499 userspace. It means that when a CPU runs in userspace, it is
496 excluded from the global RCU state machine and thus doesn't 500 excluded from the global RCU state machine and thus doesn't
497 to keep the timer tick on for RCU. 501 try to keep the timer tick on for RCU.
498 502
499 Unless you want to hack and help the development of the full 503 Unless you want to hack and help the development of the full
500 tickless feature, you shouldn't enable this option. It adds 504 dynticks mode, you shouldn't enable this option. It also
501 unnecessary overhead. 505 adds unnecessary overhead.
502 506
503 If unsure say N 507 If unsure say N
504 508
505config RCU_USER_QS_FORCE 509config CONTEXT_TRACKING_FORCE
506 bool "Force userspace extended QS by default" 510 bool "Force context tracking"
507 depends on RCU_USER_QS 511 depends on CONTEXT_TRACKING
508 help 512 help
509 Set the hooks in user/kernel boundaries by default in order to 513 Probe on user/kernel boundaries by default in order to
510 test this feature that treats userspace as an extended quiescent 514 test the features that rely on it, such as userspace RCU extended
511 state until we have a real user like a full adaptive nohz option. 515 quiescent states.
512 516 This test is there for debugging until we have a real user like the
513 Unless you want to hack and help the development of the full 517 full dynticks mode.
514 tickless feature, you shouldn't enable this option. It adds
515 unnecessary overhead.
516
517 If unsure say N
518 518
519config RCU_FANOUT 519config RCU_FANOUT
520 int "Tree-based hierarchical RCU fanout value" 520 int "Tree-based hierarchical RCU fanout value"
@@ -582,14 +582,13 @@ config RCU_FAST_NO_HZ
582 depends on NO_HZ && SMP 582 depends on NO_HZ && SMP
583 default n 583 default n
584 help 584 help
585 This option causes RCU to attempt to accelerate grace periods 585 This option causes RCU to attempt to accelerate grace periods in
586 in order to allow CPUs to enter dynticks-idle state more 586 order to allow CPUs to enter dynticks-idle state more quickly.
587 quickly. On the other hand, this option increases the overhead 587 On the other hand, this option increases the overhead of the
588 of the dynticks-idle checking, particularly on systems with 588 dynticks-idle checking, thus degrading scheduling latency.
589 large numbers of CPUs.
590 589
591 Say Y if energy efficiency is critically important, particularly 590 Say Y if energy efficiency is critically important, and you don't
592 if you have relatively few CPUs. 591 care about real-time response.
593 592
594 Say N if you are unsure. 593 Say N if you are unsure.
595 594
@@ -655,6 +654,28 @@ config RCU_BOOST_DELAY
655 654
656 Accept the default if unsure. 655 Accept the default if unsure.
657 656
657config RCU_NOCB_CPU
658 bool "Offload RCU callback processing from boot-selected CPUs"
659 depends on TREE_RCU || TREE_PREEMPT_RCU
660 default n
661 help
662 Use this option to reduce OS jitter for aggressive HPC or
663 real-time workloads. It can also be used to offload RCU
664 callback invocation to energy-efficient CPUs in battery-powered
665 asymmetric multiprocessors.
666
667 This option offloads callback invocation from the set of
668 CPUs specified at boot time by the rcu_nocbs parameter.
669 For each such CPU, a kthread ("rcuoN") will be created to
 670 invoke callbacks, where "N" is the CPU being offloaded.
671 Nothing prevents this kthread from running on the specified
672 CPUs, but (1) the kthreads may be preempted between each
673 callback, and (2) affinity or cgroups can be used to force
674 the kthreads to run on whatever set of CPUs is desired.
675
676 Say Y here if you want reduced OS jitter on selected CPUs.
677 Say N here if you are unsure.
678
658endmenu # "RCU Subsystem" 679endmenu # "RCU Subsystem"
659 680
660config IKCONFIG 681config IKCONFIG
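
The offloaded set that RCU_NOCB_CPU acts on is chosen at boot; a hedged example of the rcu_nocbs= command line this help text refers to (the CPU list is illustrative):

    rcu_nocbs=1-3	# callbacks for CPUs 1-3 run in the rcuo1..rcuo3 kthreads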
diff --git a/kernel/Makefile b/kernel/Makefile
index 86e3285ae7e5..ac0d533eb7de 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
110obj-$(CONFIG_PADATA) += padata.o 110obj-$(CONFIG_PADATA) += padata.o
111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
112obj-$(CONFIG_JUMP_LABEL) += jump_label.o 112obj-$(CONFIG_JUMP_LABEL) += jump_label.o
113obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
113 114
114$(obj)/configs.o: $(obj)/config_data.h 115$(obj)/configs.o: $(obj)/config_data.h
115 116
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000000..e0e07fd55508
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,83 @@
1#include <linux/context_tracking.h>
2#include <linux/rcupdate.h>
3#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h>
6
7struct context_tracking {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true,
24#endif
25};
26
27void user_enter(void)
28{
29 unsigned long flags;
30
31 /*
 32 * Some contexts may involve an exception occurring in an irq,
33 * leading to that nesting:
34 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
35 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
36 * helpers are enough to protect RCU uses inside the exception. So
37 * just return immediately if we detect we are in an IRQ.
38 */
39 if (in_interrupt())
40 return;
41
42 WARN_ON_ONCE(!current->mm);
43
44 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER);
48 rcu_user_enter();
49 }
50 local_irq_restore(flags);
51}
52
53void user_exit(void)
54{
55 unsigned long flags;
56
57 /*
 58 * Some contexts may involve an exception occurring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt())
66 return;
67
68 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL);
71 rcu_user_exit();
72 }
73 local_irq_restore(flags);
74}
75
76void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next)
78{
79 if (__this_cpu_read(context_tracking.active)) {
80 clear_tsk_thread_flag(prev, TIF_NOHZ);
81 set_tsk_thread_flag(next, TIF_NOHZ);
82 }
83}
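
For reference, a sketch of the scheduler-side call that replaces the rcu_switch() helper removed from sched.h above -- the surrounding function is hypothetical, with the real context-switch details elided:

    #include <linux/context_tracking.h>
    #include <linux/sched.h>

    /* Sketch of a context-switch tail; not the actual kernel/sched code. */
    static inline void example_switch_tail(struct task_struct *prev,
    				       struct task_struct *next)
    {
    	/* Migrate TIF_NOHZ so only the incoming task probes the boundaries. */
    	context_tracking_task_switch(prev, next);
    }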
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..8715a798aa7c 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -141,6 +141,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 141}
142KERNEL_ATTR_RO(fscaps); 142KERNEL_ATTR_RO(fscaps);
143 143
144int rcu_expedited;
145static ssize_t rcu_expedited_show(struct kobject *kobj,
146 struct kobj_attribute *attr, char *buf)
147{
148 return sprintf(buf, "%d\n", rcu_expedited);
149}
150static ssize_t rcu_expedited_store(struct kobject *kobj,
151 struct kobj_attribute *attr,
152 const char *buf, size_t count)
153{
154 if (kstrtoint(buf, 0, &rcu_expedited))
155 return -EINVAL;
156
157 return count;
158}
159KERNEL_ATTR_RW(rcu_expedited);
160
144/* 161/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 162 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 163 */
@@ -182,6 +199,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 199 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 200 &vmcoreinfo_attr.attr,
184#endif 201#endif
202 &rcu_expedited_attr.attr,
185 NULL 203 NULL
186}; 204};
187 205
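
Together with the module_param() added to rcupdate.c below, this attribute gives both boot-time and run-time control over expedited grace periods; a hedged usage example (the sysfs path follows from the attribute registration above):

    echo 1 > /sys/kernel/rcu_expedited	# map synchronize_*() to expedited variants
    echo 0 > /sys/kernel/rcu_expedited	# restore normal grace periods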
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..20dfba576c2b 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
109 } 109 }
110} 110}
111 111
112extern int rcu_expedited;
113
112#endif /* __LINUX_RCU_H */ 114#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da594..a2cf76177b44 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h>
49 50
50#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
51#include <trace/events/rcu.h> 52#include <trace/events/rcu.h>
52 53
53#include "rcu.h" 54#include "rcu.h"
54 55
56module_param(rcu_expedited, int, 0);
57
55#ifdef CONFIG_PREEMPT_RCU 58#ifdef CONFIG_PREEMPT_RCU
56 59
57/* 60/*
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f7..e7dce58f9c2a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 0; 198 return rcu_dynticks_nesting <= 1;
199} 199}
200 200
201/* 201/*
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d0190282204..f85016a2309b 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -706,7 +706,10 @@ void synchronize_rcu(void)
706 return; 706 return;
707 707
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 rcu_barrier(); 709 if (rcu_expedited)
710 synchronize_rcu_expedited();
711 else
712 rcu_barrier();
710} 713}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 714EXPORT_SYMBOL_GPL(synchronize_rcu);
712 715
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index aaa7b9f3532a..31dea01c85fd 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title)
339 339
340struct rcu_torture_ops { 340struct rcu_torture_ops {
341 void (*init)(void); 341 void (*init)(void);
342 void (*cleanup)(void);
343 int (*readlock)(void); 342 int (*readlock)(void);
344 void (*read_delay)(struct rcu_random_state *rrsp); 343 void (*read_delay)(struct rcu_random_state *rrsp);
345 void (*readunlock)(int idx); 344 void (*readunlock)(int idx);
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
431 430
432static struct rcu_torture_ops rcu_ops = { 431static struct rcu_torture_ops rcu_ops = {
433 .init = NULL, 432 .init = NULL,
434 .cleanup = NULL,
435 .readlock = rcu_torture_read_lock, 433 .readlock = rcu_torture_read_lock,
436 .read_delay = rcu_read_delay, 434 .read_delay = rcu_read_delay,
437 .readunlock = rcu_torture_read_unlock, 435 .readunlock = rcu_torture_read_unlock,
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void)
475 473
476static struct rcu_torture_ops rcu_sync_ops = { 474static struct rcu_torture_ops rcu_sync_ops = {
477 .init = rcu_sync_torture_init, 475 .init = rcu_sync_torture_init,
478 .cleanup = NULL,
479 .readlock = rcu_torture_read_lock, 476 .readlock = rcu_torture_read_lock,
480 .read_delay = rcu_read_delay, 477 .read_delay = rcu_read_delay,
481 .readunlock = rcu_torture_read_unlock, 478 .readunlock = rcu_torture_read_unlock,
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
493 490
494static struct rcu_torture_ops rcu_expedited_ops = { 491static struct rcu_torture_ops rcu_expedited_ops = {
495 .init = rcu_sync_torture_init, 492 .init = rcu_sync_torture_init,
496 .cleanup = NULL,
497 .readlock = rcu_torture_read_lock, 493 .readlock = rcu_torture_read_lock,
498 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
499 .readunlock = rcu_torture_read_unlock, 495 .readunlock = rcu_torture_read_unlock,
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
536 532
537static struct rcu_torture_ops rcu_bh_ops = { 533static struct rcu_torture_ops rcu_bh_ops = {
538 .init = NULL, 534 .init = NULL,
539 .cleanup = NULL,
540 .readlock = rcu_bh_torture_read_lock, 535 .readlock = rcu_bh_torture_read_lock,
541 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
542 .readunlock = rcu_bh_torture_read_unlock, 537 .readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
553 548
554static struct rcu_torture_ops rcu_bh_sync_ops = { 549static struct rcu_torture_ops rcu_bh_sync_ops = {
555 .init = rcu_sync_torture_init, 550 .init = rcu_sync_torture_init,
556 .cleanup = NULL,
557 .readlock = rcu_bh_torture_read_lock, 551 .readlock = rcu_bh_torture_read_lock,
558 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
559 .readunlock = rcu_bh_torture_read_unlock, 553 .readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
570 564
571static struct rcu_torture_ops rcu_bh_expedited_ops = { 565static struct rcu_torture_ops rcu_bh_expedited_ops = {
572 .init = rcu_sync_torture_init, 566 .init = rcu_sync_torture_init,
573 .cleanup = NULL,
574 .readlock = rcu_bh_torture_read_lock, 567 .readlock = rcu_bh_torture_read_lock,
575 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
576 .readunlock = rcu_bh_torture_read_unlock, 569 .readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
589 * Definitions for srcu torture testing. 582 * Definitions for srcu torture testing.
590 */ 583 */
591 584
592static struct srcu_struct srcu_ctl; 585DEFINE_STATIC_SRCU(srcu_ctl);
593
594static void srcu_torture_init(void)
595{
596 init_srcu_struct(&srcu_ctl);
597 rcu_sync_torture_init();
598}
599
600static void srcu_torture_cleanup(void)
601{
602 synchronize_srcu(&srcu_ctl);
603 cleanup_srcu_struct(&srcu_ctl);
604}
605 586
606static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
607{ 588{
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page)
672} 653}
673 654
674static struct rcu_torture_ops srcu_ops = { 655static struct rcu_torture_ops srcu_ops = {
675 .init = srcu_torture_init, 656 .init = rcu_sync_torture_init,
676 .cleanup = srcu_torture_cleanup,
677 .readlock = srcu_torture_read_lock, 657 .readlock = srcu_torture_read_lock,
678 .read_delay = srcu_read_delay, 658 .read_delay = srcu_read_delay,
679 .readunlock = srcu_torture_read_unlock, 659 .readunlock = srcu_torture_read_unlock,
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = {
687}; 667};
688 668
689static struct rcu_torture_ops srcu_sync_ops = { 669static struct rcu_torture_ops srcu_sync_ops = {
690 .init = srcu_torture_init, 670 .init = rcu_sync_torture_init,
691 .cleanup = srcu_torture_cleanup,
692 .readlock = srcu_torture_read_lock, 671 .readlock = srcu_torture_read_lock,
693 .read_delay = srcu_read_delay, 672 .read_delay = srcu_read_delay,
694 .readunlock = srcu_torture_read_unlock, 673 .readunlock = srcu_torture_read_unlock,
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
712} 691}
713 692
714static struct rcu_torture_ops srcu_raw_ops = { 693static struct rcu_torture_ops srcu_raw_ops = {
715 .init = srcu_torture_init, 694 .init = rcu_sync_torture_init,
716 .cleanup = srcu_torture_cleanup,
717 .readlock = srcu_torture_read_lock_raw, 695 .readlock = srcu_torture_read_lock_raw,
718 .read_delay = srcu_read_delay, 696 .read_delay = srcu_read_delay,
719 .readunlock = srcu_torture_read_unlock_raw, 697 .readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
727}; 705};
728 706
729static struct rcu_torture_ops srcu_raw_sync_ops = { 707static struct rcu_torture_ops srcu_raw_sync_ops = {
730 .init = srcu_torture_init, 708 .init = rcu_sync_torture_init,
731 .cleanup = srcu_torture_cleanup,
732 .readlock = srcu_torture_read_lock_raw, 709 .readlock = srcu_torture_read_lock_raw,
733 .read_delay = srcu_read_delay, 710 .read_delay = srcu_read_delay,
734 .readunlock = srcu_torture_read_unlock_raw, 711 .readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void)
747} 724}
748 725
749static struct rcu_torture_ops srcu_expedited_ops = { 726static struct rcu_torture_ops srcu_expedited_ops = {
750 .init = srcu_torture_init, 727 .init = rcu_sync_torture_init,
751 .cleanup = srcu_torture_cleanup,
752 .readlock = srcu_torture_read_lock, 728 .readlock = srcu_torture_read_lock,
753 .read_delay = srcu_read_delay, 729 .read_delay = srcu_read_delay,
754 .readunlock = srcu_torture_read_unlock, 730 .readunlock = srcu_torture_read_unlock,
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
783 759
784static struct rcu_torture_ops sched_ops = { 760static struct rcu_torture_ops sched_ops = {
785 .init = rcu_sync_torture_init, 761 .init = rcu_sync_torture_init,
786 .cleanup = NULL,
787 .readlock = sched_torture_read_lock, 762 .readlock = sched_torture_read_lock,
788 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
789 .readunlock = sched_torture_read_unlock, 764 .readunlock = sched_torture_read_unlock,
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = {
799 774
800static struct rcu_torture_ops sched_sync_ops = { 775static struct rcu_torture_ops sched_sync_ops = {
801 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
802 .cleanup = NULL,
803 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
804 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
805 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = {
814 788
815static struct rcu_torture_ops sched_expedited_ops = { 789static struct rcu_torture_ops sched_expedited_ops = {
816 .init = rcu_sync_torture_init, 790 .init = rcu_sync_torture_init,
817 .cleanup = NULL,
818 .readlock = sched_torture_read_lock, 791 .readlock = sched_torture_read_lock,
819 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
820 .readunlock = sched_torture_read_unlock, 793 .readunlock = sched_torture_read_unlock,
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1397 "test_boost=%d/%d test_boost_interval=%d " 1370 "test_boost=%d/%d test_boost_interval=%d "
1398 "test_boost_duration=%d shutdown_secs=%d " 1371 "test_boost_duration=%d shutdown_secs=%d "
1372 "stall_cpu=%d stall_cpu_holdoff=%d "
1373 "n_barrier_cbs=%d "
1399 "onoff_interval=%d onoff_holdoff=%d\n", 1374 "onoff_interval=%d onoff_holdoff=%d\n",
1400 torture_type, tag, nrealreaders, nfakewriters, 1375 torture_type, tag, nrealreaders, nfakewriters,
1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1403 test_boost, cur_ops->can_boost, 1378 test_boost, cur_ops->can_boost,
1404 test_boost_interval, test_boost_duration, shutdown_secs, 1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1405 onoff_interval, onoff_holdoff); 1382 onoff_interval, onoff_holdoff);
1406} 1383}
1407 1384
@@ -1502,6 +1479,7 @@ rcu_torture_onoff(void *arg)
1502 unsigned long delta; 1479 unsigned long delta;
1503 int maxcpu = -1; 1480 int maxcpu = -1;
1504 DEFINE_RCU_RANDOM(rand); 1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1505 unsigned long starttime; 1483 unsigned long starttime;
1506 1484
1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
@@ -1522,7 +1500,13 @@ rcu_torture_onoff(void *arg)
1522 torture_type, cpu); 1500 torture_type, cpu);
1523 starttime = jiffies; 1501 starttime = jiffies;
1524 n_offline_attempts++; 1502 n_offline_attempts++;
1525 if (cpu_down(cpu) == 0) { 1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1526 if (verbose) 1510 if (verbose)
1527 pr_alert("%s" TORTURE_FLAG 1511 pr_alert("%s" TORTURE_FLAG
1528 "rcu_torture_onoff task: offlined %d\n", 1512 "rcu_torture_onoff task: offlined %d\n",
@@ -1936,8 +1920,6 @@ rcu_torture_cleanup(void)
1936 1920
1937 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1938 1922
1939 if (cur_ops->cleanup)
1940 cur_ops->cleanup();
1941 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1942 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1943 else if (n_online_successes != n_online_attempts || 1925 else if (n_online_successes != n_online_attempts ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd9204..e441b77b614e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
71 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
72 .completed = -300, \ 72 .completed = 0UL - 300UL, \
73 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
@@ -207,18 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
209 .dynticks = ATOMIC_INIT(1), 209 .dynticks = ATOMIC_INIT(1),
210#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
211 .ignore_user_qs = true,
212#endif
213}; 210};
214 211
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 213static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 214static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 215
219module_param(blimit, int, 0444); 216module_param(blimit, long, 0444);
220module_param(qhimark, int, 0444); 217module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0444); 218module_param(qlowmark, long, 0444);
222 219
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -303,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
303static int 300static int
304cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 301cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305{ 302{
306 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 303 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
304 rdp->nxttail[RCU_DONE_TAIL] != NULL;
307} 305}
308 306
309/* 307/*
@@ -312,8 +310,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
312static int 310static int
313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
314{ 312{
315 return *rdp->nxttail[RCU_DONE_TAIL + 313 struct rcu_head **ntp;
316 ACCESS_ONCE(rsp->completed) != rdp->completed] && 314
315 ntp = rdp->nxttail[RCU_DONE_TAIL +
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
317 !rcu_gp_in_progress(rsp); 318 !rcu_gp_in_progress(rsp);
318} 319}
319 320
@@ -416,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
416 */ 417 */
417void rcu_user_enter(void) 418void rcu_user_enter(void)
418{ 419{
419 unsigned long flags; 420 rcu_eqs_enter(1);
420 struct rcu_dynticks *rdtp;
421
422 /*
423 * Some contexts may involve an exception occuring in an irq,
424 * leading to that nesting:
425 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
426 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
427 * helpers are enough to protect RCU uses inside the exception. So
428 * just return immediately if we detect we are in an IRQ.
429 */
430 if (in_interrupt())
431 return;
432
433 WARN_ON_ONCE(!current->mm);
434
435 local_irq_save(flags);
436 rdtp = &__get_cpu_var(rcu_dynticks);
437 if (!rdtp->ignore_user_qs && !rdtp->in_user) {
438 rdtp->in_user = true;
439 rcu_eqs_enter(true);
440 }
441 local_irq_restore(flags);
442} 421}
443 422
444/** 423/**
@@ -575,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
575 */ 554 */
576void rcu_user_exit(void) 555void rcu_user_exit(void)
577{ 556{
578 unsigned long flags; 557 rcu_eqs_exit(1);
579 struct rcu_dynticks *rdtp;
580
581 /*
582 * Some contexts may involve an exception occuring in an irq,
583 * leading to that nesting:
584 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
585 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
586 * helpers are enough to protect RCU uses inside the exception. So
587 * just return immediately if we detect we are in an IRQ.
588 */
589 if (in_interrupt())
590 return;
591
592 local_irq_save(flags);
593 rdtp = &__get_cpu_var(rcu_dynticks);
594 if (rdtp->in_user) {
595 rdtp->in_user = false;
596 rcu_eqs_exit(true);
597 }
598 local_irq_restore(flags);
599} 558}
600 559
601/** 560/**
@@ -718,21 +677,6 @@ int rcu_is_cpu_idle(void)
718} 677}
719EXPORT_SYMBOL(rcu_is_cpu_idle); 678EXPORT_SYMBOL(rcu_is_cpu_idle);
720 679
721#ifdef CONFIG_RCU_USER_QS
722void rcu_user_hooks_switch(struct task_struct *prev,
723 struct task_struct *next)
724{
725 struct rcu_dynticks *rdtp;
726
727 /* Interrupts are disabled in context switch */
728 rdtp = &__get_cpu_var(rcu_dynticks);
729 if (!rdtp->ignore_user_qs) {
730 clear_tsk_thread_flag(prev, TIF_NOHZ);
731 set_tsk_thread_flag(next, TIF_NOHZ);
732 }
733}
734#endif /* #ifdef CONFIG_RCU_USER_QS */
735
736#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 680#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
737 681
738/* 682/*
@@ -873,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
874} 818}
875 819
820/*
821 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
822 * for architectures that do not implement trigger_all_cpu_backtrace().
823 * The NMI-triggered stack traces are more accurate because they are
824 * printed by the target CPU.
825 */
826static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
827{
828 int cpu;
829 unsigned long flags;
830 struct rcu_node *rnp;
831
832 rcu_for_each_leaf_node(rsp, rnp) {
833 raw_spin_lock_irqsave(&rnp->lock, flags);
834 if (rnp->qsmask != 0) {
835 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
836 if (rnp->qsmask & (1UL << cpu))
837 dump_cpu_task(rnp->grplo + cpu);
838 }
839 raw_spin_unlock_irqrestore(&rnp->lock, flags);
840 }
841}
842
876static void print_other_cpu_stall(struct rcu_state *rsp) 843static void print_other_cpu_stall(struct rcu_state *rsp)
877{ 844{
878 int cpu; 845 int cpu;
@@ -880,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
880 unsigned long flags; 847 unsigned long flags;
881 int ndetected = 0; 848 int ndetected = 0;
882 struct rcu_node *rnp = rcu_get_root(rsp); 849 struct rcu_node *rnp = rcu_get_root(rsp);
850 long totqlen = 0;
883 851
884 /* Only let one CPU complain about others per time interval. */ 852 /* Only let one CPU complain about others per time interval. */
885 853
@@ -924,12 +892,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
924 raw_spin_unlock_irqrestore(&rnp->lock, flags); 892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
925 893
926 print_cpu_stall_info_end(); 894 print_cpu_stall_info_end();
927 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 895 for_each_possible_cpu(cpu)
928 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 896 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
897 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
898 smp_processor_id(), (long)(jiffies - rsp->gp_start),
899 rsp->gpnum, rsp->completed, totqlen);
929 if (ndetected == 0) 900 if (ndetected == 0)
930 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 901 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
931 else if (!trigger_all_cpu_backtrace()) 902 else if (!trigger_all_cpu_backtrace())
932 dump_stack(); 903 rcu_dump_cpu_stacks(rsp);
933 904
934 /* Complain about tasks blocking the grace period. */ 905 /* Complain about tasks blocking the grace period. */
935 906
@@ -940,8 +911,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
940 911
941static void print_cpu_stall(struct rcu_state *rsp) 912static void print_cpu_stall(struct rcu_state *rsp)
942{ 913{
914 int cpu;
943 unsigned long flags; 915 unsigned long flags;
944 struct rcu_node *rnp = rcu_get_root(rsp); 916 struct rcu_node *rnp = rcu_get_root(rsp);
917 long totqlen = 0;
945 918
946 /* 919 /*
947 * OK, time to rat on ourselves... 920 * OK, time to rat on ourselves...
@@ -952,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
952 print_cpu_stall_info_begin(); 925 print_cpu_stall_info_begin();
953 print_cpu_stall_info(rsp, smp_processor_id()); 926 print_cpu_stall_info(rsp, smp_processor_id());
954 print_cpu_stall_info_end(); 927 print_cpu_stall_info_end();
955 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 928 for_each_possible_cpu(cpu)
929 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
930 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
931 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
956 if (!trigger_all_cpu_backtrace()) 932 if (!trigger_all_cpu_backtrace())
957 dump_stack(); 933 dump_stack();
958 934
@@ -1091,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp)
1091 rdp->nxtlist = NULL; 1067 rdp->nxtlist = NULL;
1092 for (i = 0; i < RCU_NEXT_SIZE; i++) 1068 for (i = 0; i < RCU_NEXT_SIZE; i++)
1093 rdp->nxttail[i] = &rdp->nxtlist; 1069 rdp->nxttail[i] = &rdp->nxtlist;
1070 init_nocb_callback_list(rdp);
1094} 1071}
1095 1072
1096/* 1073/*
@@ -1404,15 +1381,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1404 !cpu_needs_another_gp(rsp, rdp)) { 1381 !cpu_needs_another_gp(rsp, rdp)) {
1405 /* 1382 /*
1406 * Either we have not yet spawned the grace-period 1383 * Either we have not yet spawned the grace-period
1407 * task or this CPU does not need another grace period. 1384 * task, this CPU does not need another grace period,
1385 * or a grace period is already in progress.
1408 * Either way, don't start a new grace period. 1386 * Either way, don't start a new grace period.
1409 */ 1387 */
1410 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1411 return; 1389 return;
1412 } 1390 }
1413 1391
1392 /*
1393 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be
1396 * handled after the end of the next grace period. If the
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406
1414 rsp->gp_flags = RCU_GP_FLAG_INIT; 1407 rsp->gp_flags = RCU_GP_FLAG_INIT;
1415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1409
1410 /* Ensure that CPU is aware of completion of last grace period. */
1411 rcu_process_gp_end(rsp, rdp);
1412 local_irq_restore(flags);
1413
1414 /* Wake up rcu_gp_kthread() to start the grace period. */
1416 wake_up(&rsp->gp_wq); 1415 wake_up(&rsp->gp_wq);
1417} 1416}
1418 1417
@@ -1573,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1573/* 1572/*
1574 * Send the specified CPU's RCU callbacks to the orphanage. The 1573 * Send the specified CPU's RCU callbacks to the orphanage. The
1575 * specified CPU must be offline, and the caller must hold the 1574 * specified CPU must be offline, and the caller must hold the
1576 * ->onofflock. 1575 * ->orphan_lock.
1577 */ 1576 */
1578static void 1577static void
1579rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1578rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1580 struct rcu_node *rnp, struct rcu_data *rdp) 1579 struct rcu_node *rnp, struct rcu_data *rdp)
1581{ 1580{
1581 /* No-CBs CPUs do not have orphanable callbacks. */
1582 if (is_nocb_cpu(rdp->cpu))
1583 return;
1584
1582 /* 1585 /*
1583 * Orphan the callbacks. First adjust the counts. This is safe 1586 * Orphan the callbacks. First adjust the counts. This is safe
1584 * because ->onofflock excludes _rcu_barrier()'s adoption of 1587 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1585 * the callbacks, thus no memory barrier is required. 1588 * cannot be running now. Thus no memory barrier is required.
1586 */ 1589 */
1587 if (rdp->nxtlist != NULL) { 1590 if (rdp->nxtlist != NULL) {
1588 rsp->qlen_lazy += rdp->qlen_lazy; 1591 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1623,13 +1626,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1623 1626
1624/* 1627/*
1625 * Adopt the RCU callbacks from the specified rcu_state structure's 1628 * Adopt the RCU callbacks from the specified rcu_state structure's
1626 * orphanage. The caller must hold the ->onofflock. 1629 * orphanage. The caller must hold the ->orphan_lock.
1627 */ 1630 */
1628static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1631static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1629{ 1632{
1630 int i; 1633 int i;
1631 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1634 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1632 1635
1636 /* No-CBs CPUs are handled specially. */
1637 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1638 return;
1639
1633 /* Do the accounting first. */ 1640 /* Do the accounting first. */
1634 rdp->qlen_lazy += rsp->qlen_lazy; 1641 rdp->qlen_lazy += rsp->qlen_lazy;
1635 rdp->qlen += rsp->qlen; 1642 rdp->qlen += rsp->qlen;
@@ -1702,7 +1709,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1702 1709
1703 /* Exclude any attempts to start a new grace period. */ 1710 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex); 1711 mutex_lock(&rsp->onoff_mutex);
1705 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1712 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1706 1713
1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1714 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1708 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1715 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1729,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1729 /* 1736 /*
1730 * We still hold the leaf rcu_node structure lock here, and 1737 * We still hold the leaf rcu_node structure lock here, and
1731 * irqs are still disabled. The reason for this subterfuge is 1738 * irqs are still disabled. The reason for this subterfuge is
1732 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1739 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1733 * held leads to deadlock. 1740 * held leads to deadlock.
1734 */ 1741 */
1735 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1742 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1736 rnp = rdp->mynode; 1743 rnp = rdp->mynode;
1737 if (need_report & RCU_OFL_TASKS_NORM_GP) 1744 if (need_report & RCU_OFL_TASKS_NORM_GP)
1738 rcu_report_unblock_qs_rnp(rnp, flags); 1745 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1769,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1769{ 1776{
1770 unsigned long flags; 1777 unsigned long flags;
1771 struct rcu_head *next, *list, **tail; 1778 struct rcu_head *next, *list, **tail;
1772 int bl, count, count_lazy, i; 1779 long bl, count, count_lazy;
1780 int i;
1773 1781
1774 /* If no callbacks are ready, just return.*/ 1782 /* If no callbacks are ready, just return.*/
1775 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -2107,9 +2115,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2107 } 2115 }
2108} 2116}
2109 2117
2118/*
2119 * Helper function for call_rcu() and friends. The cpu argument will
2120 * normally be -1, indicating "currently running CPU". It may specify
2121 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2122 * is expected to specify a CPU.
2123 */
2110static void 2124static void
2111__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2125__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2112 struct rcu_state *rsp, bool lazy) 2126 struct rcu_state *rsp, int cpu, bool lazy)
2113{ 2127{
2114 unsigned long flags; 2128 unsigned long flags;
2115 struct rcu_data *rdp; 2129 struct rcu_data *rdp;
@@ -2129,9 +2143,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2129 rdp = this_cpu_ptr(rsp->rda); 2143 rdp = this_cpu_ptr(rsp->rda);
2130 2144
2131 /* Add the callback to our list. */ 2145 /* Add the callback to our list. */
2132 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { 2146 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2147 int offline;
2148
2149 if (cpu != -1)
2150 rdp = per_cpu_ptr(rsp->rda, cpu);
2151 offline = !__call_rcu_nocb(rdp, head, lazy);
2152 WARN_ON_ONCE(offline);
2133 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2153 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2134 WARN_ON_ONCE(1);
2135 local_irq_restore(flags); 2154 local_irq_restore(flags);
2136 return; 2155 return;
2137 } 2156 }
@@ -2160,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2160 */ 2179 */
2161void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2180void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2162{ 2181{
2163 __call_rcu(head, func, &rcu_sched_state, 0); 2182 __call_rcu(head, func, &rcu_sched_state, -1, 0);
2164} 2183}
2165EXPORT_SYMBOL_GPL(call_rcu_sched); 2184EXPORT_SYMBOL_GPL(call_rcu_sched);
2166 2185
@@ -2169,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
2169 */ 2188 */
2170void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2189void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2171{ 2190{
2172 __call_rcu(head, func, &rcu_bh_state, 0); 2191 __call_rcu(head, func, &rcu_bh_state, -1, 0);
2173} 2192}
2174EXPORT_SYMBOL_GPL(call_rcu_bh); 2193EXPORT_SYMBOL_GPL(call_rcu_bh);
2175 2194
@@ -2205,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void)
2205 * rcu_read_lock_sched(). 2224 * rcu_read_lock_sched().
2206 * 2225 *
2207 * This means that all preempt_disable code sequences, including NMI and 2226 * This means that all preempt_disable code sequences, including NMI and
2208 * hardware-interrupt handlers, in progress on entry will have completed 2227 * non-threaded hardware-interrupt handlers, in progress on entry will
2209 * before this primitive returns. However, this does not guarantee that 2228 * have completed before this primitive returns. However, this does not
2210 * softirq handlers will have completed, since in some kernels, these 2229 * guarantee that softirq handlers will have completed, since in some
2211 * handlers can run in process context, and can block. 2230 * kernels, these handlers can run in process context, and can block.
2231 *
2232 * Note that this guarantee implies further memory-ordering guarantees.
2233 * On systems with more than one CPU, when synchronize_sched() returns,
2234 * each CPU is guaranteed to have executed a full memory barrier since the
2235 * end of its last RCU-sched read-side critical section whose beginning
2236 * preceded the call to synchronize_sched(). In addition, each CPU having
2237 * an RCU read-side critical section that extends beyond the return from
2238 * synchronize_sched() is guaranteed to have executed a full memory barrier
2239 * after the beginning of synchronize_sched() and before the beginning of
2240 * that RCU read-side critical section. Note that these guarantees include
2241 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2242 * that are executing in the kernel.
2243 *
2244 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2245 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2246 * to have executed a full memory barrier during the execution of
2247 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2248 * again only if the system has more than one CPU).
2212 * 2249 *
2213 * This primitive provides the guarantees made by the (now removed) 2250 * This primitive provides the guarantees made by the (now removed)
2214 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2251 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2224,7 +2261,10 @@ void synchronize_sched(void)
2224 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2261 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2225 if (rcu_blocking_is_gp()) 2262 if (rcu_blocking_is_gp())
2226 return; 2263 return;
2227 wait_rcu_gp(call_rcu_sched); 2264 if (rcu_expedited)
2265 synchronize_sched_expedited();
2266 else
2267 wait_rcu_gp(call_rcu_sched);
2228} 2268}
2229EXPORT_SYMBOL_GPL(synchronize_sched); 2269EXPORT_SYMBOL_GPL(synchronize_sched);
2230 2270
@@ -2236,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2236 * read-side critical sections have completed. RCU read-side critical 2276 * read-side critical sections have completed. RCU read-side critical
2237 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2277 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2238 * and may be nested. 2278 * and may be nested.
2279 *
2280 * See the description of synchronize_sched() for more detailed information
2281 * on memory ordering guarantees.
2239 */ 2282 */
2240void synchronize_rcu_bh(void) 2283void synchronize_rcu_bh(void)
2241{ 2284{
@@ -2245,13 +2288,13 @@ void synchronize_rcu_bh(void)
2245 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2288 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2246 if (rcu_blocking_is_gp()) 2289 if (rcu_blocking_is_gp())
2247 return; 2290 return;
2248 wait_rcu_gp(call_rcu_bh); 2291 if (rcu_expedited)
2292 synchronize_rcu_bh_expedited();
2293 else
2294 wait_rcu_gp(call_rcu_bh);
2249} 2295}
2250EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2296EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2251 2297
2252static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2253static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2254
2255static int synchronize_sched_expedited_cpu_stop(void *data) 2298static int synchronize_sched_expedited_cpu_stop(void *data)
2256{ 2299{
2257 /* 2300 /*
@@ -2308,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2308 */ 2351 */
2309void synchronize_sched_expedited(void) 2352void synchronize_sched_expedited(void)
2310{ 2353{
2311 int firstsnap, s, snap, trycount = 0; 2354 long firstsnap, s, snap;
2355 int trycount = 0;
2356 struct rcu_state *rsp = &rcu_sched_state;
2357
2358 /*
2359 * If we are in danger of counter wrap, just do synchronize_sched().
2360 * By allowing sync_sched_expedited_started to advance no more than
2361 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2362 * that more than 3.5 billion CPUs would be required to force a
2363 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2364 * course be required on a 64-bit system.
2365 */
2366 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2367 (ulong)atomic_long_read(&rsp->expedited_done) +
2368 ULONG_MAX / 8)) {
2369 synchronize_sched();
2370 atomic_long_inc(&rsp->expedited_wrap);
2371 return;
2372 }
2312 2373
2313 /* Note that atomic_inc_return() implies full memory barrier. */ 2374 /*
2314 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2375 * Take a ticket. Note that atomic_inc_return() implies a
2376 * full memory barrier.
2377 */
2378 snap = atomic_long_inc_return(&rsp->expedited_start);
2379 firstsnap = snap;
2315 get_online_cpus(); 2380 get_online_cpus();
2316 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2381 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2317 2382
@@ -2323,48 +2388,65 @@ void synchronize_sched_expedited(void)
2323 synchronize_sched_expedited_cpu_stop, 2388 synchronize_sched_expedited_cpu_stop,
2324 NULL) == -EAGAIN) { 2389 NULL) == -EAGAIN) {
2325 put_online_cpus(); 2390 put_online_cpus();
2391 atomic_long_inc(&rsp->expedited_tryfail);
2392
2393 /* Check to see if someone else did our work for us. */
2394 s = atomic_long_read(&rsp->expedited_done);
2395 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2396 /* ensure test happens before caller kfree */
2397 smp_mb__before_atomic_inc(); /* ^^^ */
2398 atomic_long_inc(&rsp->expedited_workdone1);
2399 return;
2400 }
2326 2401
2327 /* No joy, try again later. Or just synchronize_sched(). */ 2402 /* No joy, try again later. Or just synchronize_sched(). */
2328 if (trycount++ < 10) { 2403 if (trycount++ < 10) {
2329 udelay(trycount * num_online_cpus()); 2404 udelay(trycount * num_online_cpus());
2330 } else { 2405 } else {
2331 synchronize_sched(); 2406 wait_rcu_gp(call_rcu_sched);
2407 atomic_long_inc(&rsp->expedited_normal);
2332 return; 2408 return;
2333 } 2409 }
2334 2410
2335 /* Check to see if someone else did our work for us. */ 2411 /* Recheck to see if someone else did our work for us. */
2336 s = atomic_read(&sync_sched_expedited_done); 2412 s = atomic_long_read(&rsp->expedited_done);
2337 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2413 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2338 smp_mb(); /* ensure test happens before caller kfree */ 2414 /* ensure test happens before caller kfree */
2415 smp_mb__before_atomic_inc(); /* ^^^ */
2416 atomic_long_inc(&rsp->expedited_workdone2);
2339 return; 2417 return;
2340 } 2418 }
2341 2419
2342 /* 2420 /*
2343 * Refetching sync_sched_expedited_started allows later 2421 * Refetching sync_sched_expedited_started allows later
2344 * callers to piggyback on our grace period. We subtract 2422 * callers to piggyback on our grace period. We retry
2345 * 1 to get the same token that the last incrementer got. 2423 * after they started, so our grace period works for them,
2346 * We retry after they started, so our grace period works 2424 * and they started after our first try, so their grace
2347 * for them, and they started after our first try, so their 2425 * period works for us.
2348 * grace period works for us.
2349 */ 2426 */
2350 get_online_cpus(); 2427 get_online_cpus();
2351 snap = atomic_read(&sync_sched_expedited_started); 2428 snap = atomic_long_read(&rsp->expedited_start);
2352 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2429 smp_mb(); /* ensure read is before try_stop_cpus(). */
2353 } 2430 }
2431 atomic_long_inc(&rsp->expedited_stoppedcpus);
2354 2432
2355 /* 2433 /*
2356 * Everyone up to our most recent fetch is covered by our grace 2434 * Everyone up to our most recent fetch is covered by our grace
2357 * period. Update the counter, but only if our work is still 2435 * period. Update the counter, but only if our work is still
2358 * relevant -- which it won't be if someone who started later 2436 * relevant -- which it won't be if someone who started later
2359 * than we did beat us to the punch. 2437 * than we did has already done the update.
2360 */ 2438 */
2361 do { 2439 do {
2362 s = atomic_read(&sync_sched_expedited_done); 2440 atomic_long_inc(&rsp->expedited_done_tries);
2363 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2441 s = atomic_long_read(&rsp->expedited_done);
2364 smp_mb(); /* ensure test happens before caller kfree */ 2442 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2443 /* ensure test happens before caller kfree */
2444 smp_mb__before_atomic_inc(); /* ^^^ */
2445 atomic_long_inc(&rsp->expedited_done_lost);
2365 break; 2446 break;
2366 } 2447 }
2367 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2448 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2449 atomic_long_inc(&rsp->expedited_done_exit);
2368 2450
2369 put_online_cpus(); 2451 put_online_cpus();
2370} 2452}
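The wrap check and the done-ticket comparisons above all go through ULONG_CMP_GE(), which include/linux/rcupdate.h defines (at the time of this series) as a modular comparison on the difference of two free-running counters. A small illustration of why a plain >= would misbehave near the wrap point:

	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

	static void ulong_cmp_demo(void)
	{
		unsigned long done = ULONG_MAX - 1;	/* ticket counter about to wrap */
		unsigned long snap = done + 10;		/* wraps around to 8 */

		WARN_ON(snap >= done);			/* plain compare: false */
		WARN_ON(!ULONG_CMP_GE(snap, done));	/* modular compare: true */
	}

Because snap - done is 10, well under ULONG_MAX / 2, the modular comparison correctly reports that snap is at or after done even though it is numerically smaller.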
@@ -2558,9 +2640,17 @@ static void _rcu_barrier(struct rcu_state *rsp)
2558 * When that callback is invoked, we will know that all of the 2640 * When that callback is invoked, we will know that all of the
2559 * corresponding CPU's preceding callbacks have been invoked. 2641 * corresponding CPU's preceding callbacks have been invoked.
2560 */ 2642 */
2561 for_each_online_cpu(cpu) { 2643 for_each_possible_cpu(cpu) {
2644 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2645 continue;
2562 rdp = per_cpu_ptr(rsp->rda, cpu); 2646 rdp = per_cpu_ptr(rsp->rda, cpu);
2563 if (ACCESS_ONCE(rdp->qlen)) { 2647 if (is_nocb_cpu(cpu)) {
2648 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2649 rsp->n_barrier_done);
2650 atomic_inc(&rsp->barrier_cpu_count);
2651 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2652 rsp, cpu, 0);
2653 } else if (ACCESS_ONCE(rdp->qlen)) {
2564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2654 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2565 rsp->n_barrier_done); 2655 rsp->n_barrier_done);
2566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2656 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
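The counting scheme behind these _rcu_barrier() changes is the standard one: post a callback on every CPU with queued work (or, for no-CBs CPUs, directly onto the offload queue as above) that drops a shared count, and let the last decrement signal completion. Stripped to its essentials, with names simplified rather than the actual rcutree symbols:

	static atomic_t barrier_count = ATOMIC_INIT(1);	/* +1 avoids early completion */
	static DECLARE_COMPLETION(barrier_done);

	static void barrier_cb(struct rcu_head *rhp)
	{
		/* Runs only after all earlier callbacks on this CPU have run. */
		if (atomic_dec_and_test(&barrier_count))
			complete(&barrier_done);
	}

The initial count of one is dropped by the barrier initiator after all callbacks have been posted, so the completion cannot fire while posting is still in progress.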
@@ -2634,6 +2724,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2634#endif 2724#endif
2635 rdp->cpu = cpu; 2725 rdp->cpu = cpu;
2636 rdp->rsp = rsp; 2726 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp);
2637 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2728 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2638} 2729}
2639 2730
@@ -2715,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2715 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2806 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2716 struct rcu_node *rnp = rdp->mynode; 2807 struct rcu_node *rnp = rdp->mynode;
2717 struct rcu_state *rsp; 2808 struct rcu_state *rsp;
2809 int ret = NOTIFY_OK;
2718 2810
2719 trace_rcu_utilization("Start CPU hotplug"); 2811 trace_rcu_utilization("Start CPU hotplug");
2720 switch (action) { 2812 switch (action) {
@@ -2728,7 +2820,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2728 rcu_boost_kthread_setaffinity(rnp, -1); 2820 rcu_boost_kthread_setaffinity(rnp, -1);
2729 break; 2821 break;
2730 case CPU_DOWN_PREPARE: 2822 case CPU_DOWN_PREPARE:
2731 rcu_boost_kthread_setaffinity(rnp, cpu); 2823 if (nocb_cpu_expendable(cpu))
2824 rcu_boost_kthread_setaffinity(rnp, cpu);
2825 else
2826 ret = NOTIFY_BAD;
2732 break; 2827 break;
2733 case CPU_DYING: 2828 case CPU_DYING:
2734 case CPU_DYING_FROZEN: 2829 case CPU_DYING_FROZEN:
@@ -2752,7 +2847,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2752 break; 2847 break;
2753 } 2848 }
2754 trace_rcu_utilization("End CPU hotplug"); 2849 trace_rcu_utilization("End CPU hotplug");
2755 return NOTIFY_OK; 2850 return ret;
2756} 2851}
2757 2852
2758/* 2853/*
@@ -2772,6 +2867,7 @@ static int __init rcu_spawn_gp_kthread(void)
2772 raw_spin_lock_irqsave(&rnp->lock, flags); 2867 raw_spin_lock_irqsave(&rnp->lock, flags);
2773 rsp->gp_kthread = t; 2868 rsp->gp_kthread = t;
2774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2869 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2870 rcu_spawn_nocb_kthreads(rsp);
2775 } 2871 }
2776 return 0; 2872 return 0;
2777} 2873}
@@ -2967,6 +3063,7 @@ void __init rcu_init(void)
2967 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3063 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2968 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3064 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2969 __rcu_init_preempt(); 3065 __rcu_init_preempt();
3066 rcu_init_nocb();
2970 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3067 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2971 3068
2972 /* 3069 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a240f032848e..4b69291b093d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -287,6 +287,7 @@ struct rcu_data {
287 long qlen_last_fqs_check; 287 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 288 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
290 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
290 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 291 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
291 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 292 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
292 unsigned long n_force_qs_snap; 293 unsigned long n_force_qs_snap;
@@ -317,6 +318,18 @@ struct rcu_data {
317 struct rcu_head oom_head; 318 struct rcu_head oom_head;
318#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 319#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
319 320
321 /* 7) Callback offloading. */
322#ifdef CONFIG_RCU_NOCB_CPU
323 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
324 struct rcu_head **nocb_tail;
325 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
326 atomic_long_t nocb_q_count_lazy; /* (approximate). */
327 int nocb_p_count; /* # CBs being invoked by kthread */
328 int nocb_p_count_lazy; /* (approximate). */
329 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
330 struct task_struct *nocb_kthread;
331#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
332
320 int cpu; 333 int cpu;
321 struct rcu_state *rsp; 334 struct rcu_state *rsp;
322}; 335};
@@ -369,6 +382,12 @@ struct rcu_state {
369 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 382 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
370 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 383 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
371 void (*func)(struct rcu_head *head)); 384 void (*func)(struct rcu_head *head));
385#ifdef CONFIG_RCU_NOCB_CPU
386 void (*call_remote)(struct rcu_head *head,
387 void (*func)(struct rcu_head *head));
388 /* call_rcu() flavor, but for */
389 /* placing on remote CPU. */
390#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
372 391
373 /* The following fields are guarded by the root rcu_node's lock. */ 392 /* The following fields are guarded by the root rcu_node's lock. */
374 393
@@ -383,9 +402,8 @@ struct rcu_state {
383 402
384 /* End of fields guarded by root rcu_node's lock. */ 403 /* End of fields guarded by root rcu_node's lock. */
385 404
386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; 405 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */ 406 /* Protect following fields. */
388 /* starting new GP. */
389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 407 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
390 /* need a grace period. */ 408 /* need a grace period. */
391 struct rcu_head **orphan_nxttail; /* Tail of above. */ 409 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -394,7 +412,7 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 412 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 413 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 414 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */ 415 /* End of fields guarded by orphan_lock. */
398 416
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ 417 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400 418
@@ -405,6 +423,18 @@ struct rcu_state {
405 /* _rcu_barrier(). */ 423 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */ 424 /* End of fields guarded by barrier_mutex. */
407 425
426 atomic_long_t expedited_start; /* Starting ticket. */
427 atomic_long_t expedited_done; /* Done ticket. */
428 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
429 atomic_long_t expedited_tryfail; /* # acquisition failures. */
430 atomic_long_t expedited_workdone1; /* # done by others #1. */
431 atomic_long_t expedited_workdone2; /* # done by others #2. */
432 atomic_long_t expedited_normal; /* # fallbacks to normal. */
433 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
434 atomic_long_t expedited_done_tries; /* # tries to update _done. */
435 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
436 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
437
408 unsigned long jiffies_force_qs; /* Time at which to invoke */ 438 unsigned long jiffies_force_qs; /* Time at which to invoke */
409 /* force_quiescent_state(). */ 439 /* force_quiescent_state(). */
410 unsigned long n_force_qs; /* Number of calls to */ 440 unsigned long n_force_qs; /* Number of calls to */
@@ -428,6 +458,8 @@ struct rcu_state {
428#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 458#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
429 459
430extern struct list_head rcu_struct_flavors; 460extern struct list_head rcu_struct_flavors;
461
462/* Sequence through rcu_state structures for each RCU flavor. */
431#define for_each_rcu_flavor(rsp) \ 463#define for_each_rcu_flavor(rsp) \
432 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 464 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
433 465
@@ -504,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
504static void print_cpu_stall_info_end(void); 536static void print_cpu_stall_info_end(void);
505static void zero_cpu_stall_ticks(struct rcu_data *rdp); 537static void zero_cpu_stall_ticks(struct rcu_data *rdp);
506static void increment_cpu_stall_ticks(void); 538static void increment_cpu_stall_ticks(void);
539static bool is_nocb_cpu(int cpu);
540static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
541 bool lazy);
542static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
543 struct rcu_data *rdp);
544static bool nocb_cpu_expendable(int cpu);
545static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
546static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
547static void init_nocb_callback_list(struct rcu_data *rdp);
548static void __init rcu_init_nocb(void);
507 549
508#endif /* #ifndef RCU_TREE_NONCORE */ 550#endif /* #ifndef RCU_TREE_NONCORE */
551
552#ifdef CONFIG_RCU_TRACE
553#ifdef CONFIG_RCU_NOCB_CPU
554/* Sum up queue lengths for tracing. */
555static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
556{
557 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
558 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
559}
560#else /* #ifdef CONFIG_RCU_NOCB_CPU */
561static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
562{
563 *ql = 0;
564 *qll = 0;
565}
566#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
567#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f92115488187..f6e5ec2932b4 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h>
28#include <linux/oom.h> 29#include <linux/oom.h>
29#include <linux/smpboot.h> 30#include <linux/smpboot.h>
30 31
@@ -36,6 +37,14 @@
36#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
37#endif 38#endif
38 39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthreads are to poll. */
44module_param(rcu_nocb_poll, bool, 0444);
45static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47
39/* 48/*
40 * Check the RCU kernel configuration parameters and print informative 49 * Check the RCU kernel configuration parameters and print informative
41 * messages about anything out of the ordinary. If you like #ifdef, you 50 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void)
76 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
77 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
78 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89 if (have_rcu_nocb_mask) {
90 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
91 cpumask_clear_cpu(0, rcu_nocb_mask);
92 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
93 }
94 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
95 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
96 if (rcu_nocb_poll)
97 pr_info("\tExperimental polled no-CBs CPUs.\n");
98 }
99#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
79} 100}
80 101
81#ifdef CONFIG_TREE_PREEMPT_RCU 102#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void)
642 */ 663 */
643void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 664void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
644{ 665{
645 __call_rcu(head, func, &rcu_preempt_state, 0); 666 __call_rcu(head, func, &rcu_preempt_state, -1, 0);
646} 667}
647EXPORT_SYMBOL_GPL(call_rcu); 668EXPORT_SYMBOL_GPL(call_rcu);
648 669
@@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
656void kfree_call_rcu(struct rcu_head *head, 677void kfree_call_rcu(struct rcu_head *head,
657 void (*func)(struct rcu_head *rcu)) 678 void (*func)(struct rcu_head *rcu))
658{ 679{
659 __call_rcu(head, func, &rcu_preempt_state, 1); 680 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
660} 681}
661EXPORT_SYMBOL_GPL(kfree_call_rcu); 682EXPORT_SYMBOL_GPL(kfree_call_rcu);
662 683
@@ -670,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
670 * concurrently with new RCU read-side critical sections that began while 691 * concurrently with new RCU read-side critical sections that began while
671 * synchronize_rcu() was waiting. RCU read-side critical sections are 692 * synchronize_rcu() was waiting. RCU read-side critical sections are
672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 693 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
694 *
695 * See the description of synchronize_sched() for more detailed information
696 * on memory ordering guarantees.
673 */ 697 */
674void synchronize_rcu(void) 698void synchronize_rcu(void)
675{ 699{
@@ -679,7 +703,10 @@ void synchronize_rcu(void)
679 "Illegal synchronize_rcu() in RCU read-side critical section"); 703 "Illegal synchronize_rcu() in RCU read-side critical section");
680 if (!rcu_scheduler_active) 704 if (!rcu_scheduler_active)
681 return; 705 return;
682 wait_rcu_gp(call_rcu); 706 if (rcu_expedited)
707 synchronize_rcu_expedited();
708 else
709 wait_rcu_gp(call_rcu);
683} 710}
684EXPORT_SYMBOL_GPL(synchronize_rcu); 711EXPORT_SYMBOL_GPL(synchronize_rcu);
685 712
@@ -757,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
757 * grace period for the specified rcu_node structure. If there are no such 784 * grace period for the specified rcu_node structure. If there are no such
758 * tasks, report it up the rcu_node hierarchy. 785 * tasks, report it up the rcu_node hierarchy.
759 * 786 *
760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 787 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
788 * CPU hotplug operations.
761 */ 789 */
762static void 790static void
763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 791sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -831,7 +859,7 @@ void synchronize_rcu_expedited(void)
831 udelay(trycount * num_online_cpus()); 859 udelay(trycount * num_online_cpus());
832 } else { 860 } else {
833 put_online_cpus(); 861 put_online_cpus();
834 synchronize_rcu(); 862 wait_rcu_gp(call_rcu);
835 return; 863 return;
836 } 864 }
837 } 865 }
@@ -875,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
875 903
876/** 904/**
877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 905 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
906 *
907 * Note that this primitive does not necessarily wait for an RCU grace period
908 * to complete. For example, if there are no RCU callbacks queued anywhere
909 * in the system, then rcu_barrier() is within its rights to return
910 * immediately, without waiting for anything, much less an RCU grace period.
878 */ 911 */
879void rcu_barrier(void) 912void rcu_barrier(void)
880{ 913{
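The classic caller of rcu_barrier() is module-unload code: pending callbacks may point at functions in the module, so the module must wait for them before its text is freed. A hedged sketch (example_exit() and unregister_everything() are made-up names):

	static void __exit example_exit(void)
	{
		unregister_everything();	/* stop posting new callbacks... */
		rcu_barrier();			/* ...then wait out the posted ones */
	}
	module_exit(example_exit);

Note that, per the comment above, if nothing was ever queued this may return immediately; it is a callback barrier, not a grace-period wait.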
@@ -1013,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu)
1013void kfree_call_rcu(struct rcu_head *head, 1046void kfree_call_rcu(struct rcu_head *head,
1014 void (*func)(struct rcu_head *rcu)) 1047 void (*func)(struct rcu_head *rcu))
1015{ 1048{
1016 __call_rcu(head, func, &rcu_sched_state, 1); 1049 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1017} 1050}
1018EXPORT_SYMBOL_GPL(kfree_call_rcu); 1051EXPORT_SYMBOL_GPL(kfree_call_rcu);
1019 1052
@@ -2092,3 +2125,373 @@ static void increment_cpu_stall_ticks(void)
2092} 2125}
2093 2126
2094#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2127#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2128
2129#ifdef CONFIG_RCU_NOCB_CPU
2130
2131/*
2132 * Offload callback processing from the boot-time-specified set of CPUs
2133 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2134 * kthread created that pulls the callbacks from the corresponding CPU,
2135 * waits for a grace period to elapse, and invokes the callbacks.
2136 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2137 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2138 * has been specified, in which case each kthread actively polls its
2139 * CPU. (Which isn't so great for energy efficiency, but which does
2140 * reduce RCU's overhead on that CPU.)
2141 *
2142 * This is intended to be used in conjunction with Frederic Weisbecker's
2143 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2144 * running CPU-bound user-mode computations.
2145 *
2146 * Offloading of callback processing could also in theory be used as
2147 * an energy-efficiency measure because CPUs with no RCU callbacks
2148 * queued are more aggressive about entering dyntick-idle mode.
2149 */
2150
2151
2152/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2153static int __init rcu_nocb_setup(char *str)
2154{
2155 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2156 have_rcu_nocb_mask = true;
2157 cpulist_parse(str, rcu_nocb_mask);
2158 return 1;
2159}
2160__setup("rcu_nocbs=", rcu_nocb_setup);
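As an example, booting with something like

	rcu_nocbs=1-7 rcutree.rcu_nocb_poll

should offload callbacks from CPUs 1-7 and put the rcuo kthreads into polling mode. The rcutree. prefix for the module parameter is an assumption based on the object this file is built into; see the Documentation/kernel-parameters.txt update in this series for the authoritative spelling. CPU 0 cannot be offloaded, as enforced in rcu_bootup_announce_oddness() above.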
2161
2162/* Is the specified CPU a no-CBs CPU? */
2163static bool is_nocb_cpu(int cpu)
2164{
2165 if (have_rcu_nocb_mask)
2166 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2167 return false;
2168}
2169
2170/*
2171 * Enqueue the specified string of rcu_head structures onto the specified
2172 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2173 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2174 * counts are supplied by rhcount and rhcount_lazy.
2175 *
2176 * If warranted, also wake up the kthread servicing this CPU's queues.
2177 */
2178static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2179 struct rcu_head *rhp,
2180 struct rcu_head **rhtp,
2181 int rhcount, int rhcount_lazy)
2182{
2183 int len;
2184 struct rcu_head **old_rhpp;
2185 struct task_struct *t;
2186
2187 /* Enqueue the callback on the nocb list and update counts. */
2188 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2189 ACCESS_ONCE(*old_rhpp) = rhp;
2190 atomic_long_add(rhcount, &rdp->nocb_q_count);
2191 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2192
2193 /* If we are not being polled and there is a kthread, awaken it ... */
2194 t = ACCESS_ONCE(rdp->nocb_kthread);
2195 if (rcu_nocb_poll | !t)
2196 return;
2197 len = atomic_long_read(&rdp->nocb_q_count);
2198 if (old_rhpp == &rdp->nocb_head) {
2199 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2200 rdp->qlen_last_fqs_check = 0;
2201 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2202 wake_up_process(t); /* ... or if many callbacks queued. */
2203 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2204 }
2205 return;
2206}
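The enqueue above is the classic two-step lockless tail append: atomically swing the tail pointer to the new element's ->next field, then link the old tail to the new element. The same pattern in generic form (the node/list names are illustrative):

	struct node { struct node *next; };
	static struct node *head;
	static struct node **tail = &head;

	static void lockless_enqueue(struct node *n)
	{
		struct node **old_tail;

		n->next = NULL;
		/* Claim the tail slot; concurrent enqueuers serialize here. */
		old_tail = xchg(&tail, &n->next);
		/* Publish; a consumer may see a transient NULL until this store. */
		ACCESS_ONCE(*old_tail) = n;
	}

That transient NULL window between the xchg() and the final store is exactly what the "Wait for enqueuing to complete" loop in rcu_nocb_kthread() below is spinning on.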
2207
2208/*
2209 * This is a helper for __call_rcu(), which invokes this when the normal
2210 * callback queue is inoperable. If this is not a no-CBs CPU, this
2211 * function returns failure back to __call_rcu(), which can complain
2212 * appropriately.
2213 *
2214 * Otherwise, this function queues the callback where the corresponding
2215 * "rcuo" kthread can find it.
2216 */
2217static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2218 bool lazy)
2219{
2220
2221 if (!is_nocb_cpu(rdp->cpu))
2222 return 0;
2223 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2224 return 1;
2225}
2226
2227/*
2228 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2229 * not a no-CBs CPU.
2230 */
2231static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2232 struct rcu_data *rdp)
2233{
2234 long ql = rsp->qlen;
2235 long qll = rsp->qlen_lazy;
2236
2237 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2238 if (!is_nocb_cpu(smp_processor_id()))
2239 return 0;
2240 rsp->qlen = 0;
2241 rsp->qlen_lazy = 0;
2242
2243 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2244 if (rsp->orphan_donelist != NULL) {
2245 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2246 rsp->orphan_donetail, ql, qll);
2247 ql = qll = 0;
2248 rsp->orphan_donelist = NULL;
2249 rsp->orphan_donetail = &rsp->orphan_donelist;
2250 }
2251 if (rsp->orphan_nxtlist != NULL) {
2252 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2253 rsp->orphan_nxttail, ql, qll);
2254 ql = qll = 0;
2255 rsp->orphan_nxtlist = NULL;
2256 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2257 }
2258 return 1;
2259}
2260
2261/*
2262 * There must be at least one non-no-CBs CPU in operation at any given
2263 * time, because no-CBs CPUs are not capable of initiating grace periods
2264 * independently. This function therefore complains if the specified
2265 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2266 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2267 * but you have to have a base case!)
2268 */
2269static bool nocb_cpu_expendable(int cpu)
2270{
2271 cpumask_var_t non_nocb_cpus;
2272 int ret;
2273
2274 /*
2275 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2276 * then offlining this CPU is harmless. Let it happen.
2277 */
2278 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2279 return 1;
2280
2281 /* If no memory, play it safe and keep the CPU around. */
2282 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2283 return 0;
2284 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2285 cpumask_clear_cpu(cpu, non_nocb_cpus);
2286 ret = !cpumask_empty(non_nocb_cpus);
2287 free_cpumask_var(non_nocb_cpus);
2288 return ret;
2289}
2290
2291/*
2292 * Helper structure for remote registration of RCU callbacks.
2293 * This is needed for when a no-CBs CPU needs to start a grace period.
2294 * If it just invokes call_rcu(), the resulting callback will be queued,
2295 * which can result in deadlock.
2296 */
2297struct rcu_head_remote {
2298 struct rcu_head *rhp;
2299 call_rcu_func_t *crf;
2300 void (*func)(struct rcu_head *rhp);
2301};
2302
2303/*
2304 * Register a callback as specified by the rcu_head_remote struct.
2305 * This function is intended to be invoked via smp_call_function_single().
2306 */
2307static void call_rcu_local(void *arg)
2308{
2309 struct rcu_head_remote *rhrp =
2310 container_of(arg, struct rcu_head_remote, rhp);
2311
2312 rhrp->crf(rhrp->rhp, rhrp->func);
2313}
2314
2315/*
2316 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2317 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2318 * smp_call_function_single().
2319 */
2320static void invoke_crf_remote(struct rcu_head *rhp,
2321 void (*func)(struct rcu_head *rhp),
2322 call_rcu_func_t crf)
2323{
2324 struct rcu_head_remote rhr;
2325
2326 rhr.rhp = rhp;
2327 rhr.crf = crf;
2328 rhr.func = func;
2329 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2330}
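This package-and-trampoline shape is the general recipe for synchronously running work on a specific CPU: bundle the arguments in a stack structure, pass it through smp_call_function_single() with wait=1, and unpack it in a small shim on the far side. In generic form (the demo names are made up):

	static DEFINE_PER_CPU(long, demo_counter);

	struct demo_arg { long val; };

	static void demo_remote_fn(void *info)
	{
		struct demo_arg *da = info;

		/* Runs in IRQ context on the target CPU. */
		__this_cpu_add(demo_counter, da->val);
	}

	static void demo_poke_cpu(int cpu, long val)
	{
		struct demo_arg da = { .val = val };

		/* wait=1 keeps "da" valid on our stack until the call returns. */
		smp_call_function_single(cpu, demo_remote_fn, &da, 1);
	}

Here the shim registers an RCU callback rather than bumping a counter, but the stack-lifetime and wait=1 considerations are the same.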
2331
2332/*
2333 * Helper functions to be passed to wait_rcu_gp(), each of which
2334 * invokes invoke_crf_remote() to register a callback appropriately.
2335 */
2336static void __maybe_unused
2337call_rcu_preempt_remote(struct rcu_head *rhp,
2338 void (*func)(struct rcu_head *rhp))
2339{
2340 invoke_crf_remote(rhp, func, call_rcu);
2341}
2342static void call_rcu_bh_remote(struct rcu_head *rhp,
2343 void (*func)(struct rcu_head *rhp))
2344{
2345 invoke_crf_remote(rhp, func, call_rcu_bh);
2346}
2347static void call_rcu_sched_remote(struct rcu_head *rhp,
2348 void (*func)(struct rcu_head *rhp))
2349{
2350 invoke_crf_remote(rhp, func, call_rcu_sched);
2351}
2352
2353/*
2354 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2355 * callbacks queued by the corresponding no-CBs CPU.
2356 */
2357static int rcu_nocb_kthread(void *arg)
2358{
2359 int c, cl;
2360 struct rcu_head *list;
2361 struct rcu_head *next;
2362 struct rcu_head **tail;
2363 struct rcu_data *rdp = arg;
2364
2365 /* Each pass through this loop invokes one batch of callbacks */
2366 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) {
2372 schedule_timeout_interruptible(1);
2373 continue;
2374 }
2375
2376 /*
2377 * Extract queued callbacks, update counts, and wait
2378 * for a grace period to elapse.
2379 */
2380 ACCESS_ONCE(rdp->nocb_head) = NULL;
2381 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2382 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2383 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2384 ACCESS_ONCE(rdp->nocb_p_count) += c;
2385 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2386 wait_rcu_gp(rdp->rsp->call_remote);
2387
2388 /* Each pass through the following loop invokes a callback. */
2389 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2390 c = cl = 0;
2391 while (list) {
2392 next = list->next;
2393 /* Wait for enqueuing to complete, if needed. */
2394 while (next == NULL && &list->next != tail) {
2395 schedule_timeout_interruptible(1);
2396 next = list->next;
2397 }
2398 debug_rcu_head_unqueue(list);
2399 local_bh_disable();
2400 if (__rcu_reclaim(rdp->rsp->name, list))
2401 cl++;
2402 c++;
2403 local_bh_enable();
2404 list = next;
2405 }
2406 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2407 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2408 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2409 rdp->n_nocbs_invoked += c;
2410 }
2411 return 0;
2412}
2413
2414/* Initialize per-rcu_data variables for no-CBs CPUs. */
2415static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2416{
2417 rdp->nocb_tail = &rdp->nocb_head;
2418 init_waitqueue_head(&rdp->nocb_wq);
2419}
2420
2421/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2422static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2423{
2424 int cpu;
2425 struct rcu_data *rdp;
2426 struct task_struct *t;
2427
2428 if (rcu_nocb_mask == NULL)
2429 return;
2430 for_each_cpu(cpu, rcu_nocb_mask) {
2431 rdp = per_cpu_ptr(rsp->rda, cpu);
2432 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2433 BUG_ON(IS_ERR(t));
2434 ACCESS_ONCE(rdp->nocb_kthread) = t;
2435 }
2436}
2437
2438/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2439static void init_nocb_callback_list(struct rcu_data *rdp)
2440{
2441 if (rcu_nocb_mask == NULL ||
2442 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2443 return;
2444 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2445}
2446
2447/* Initialize the ->call_remote fields in the rcu_state structures. */
2448static void __init rcu_init_nocb(void)
2449{
2450#ifdef CONFIG_PREEMPT_RCU
2451 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2452#endif /* #ifdef CONFIG_PREEMPT_RCU */
2453 rcu_bh_state.call_remote = call_rcu_bh_remote;
2454 rcu_sched_state.call_remote = call_rcu_sched_remote;
2455}
2456
2457#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2458
2459static bool is_nocb_cpu(int cpu)
2460{
2461 return false;
2462}
2463
2464static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2465 bool lazy)
2466{
2467 return 0;
2468}
2469
2470static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2471 struct rcu_data *rdp)
2472{
2473 return 0;
2474}
2475
2476static bool nocb_cpu_expendable(int cpu)
2477{
2478 return 1;
2479}
2480
2481static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2482{
2483}
2484
2485static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2486{
2487}
2488
2489static void init_nocb_callback_list(struct rcu_data *rdp)
2490{
2491}
2492
2493static void __init rcu_init_nocb(void)
2494{
2495}
2496
2497#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 693513bc50e6..0d095dcaa670 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
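The ulong2long() macro defined above reinterprets an unsigned counter as signed, so that grace-period numbers near the wrap point print as small negative values rather than 20-digit ones, keeping the trace output readable. Illustration only:

	static void ulong2long_demo(void)
	{
		unsigned long completed = ULONG_MAX - 1;	/* just below wrap */

		pr_info("c=%ld\n", ulong2long(completed));	/* prints "c=-2" */
	}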
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: bcc: %d nbd: %lu\n", 72{
55 rsp->name, 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
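This open/release pairing is how each per-flavor debugfs file below learns which rcu_state it describes: the pointer is stashed in inode->i_private by debugfs_create_file() and handed to the show function as m->private. The same plumbing in generic form (my_state, demo_show, and friends are hypothetical):

	struct my_state { unsigned long count; };

	static int demo_show(struct seq_file *m, void *unused)
	{
		struct my_state *st = m->private;	/* set by single_open() */

		seq_printf(m, "count=%lu\n", st->count);
		return 0;
	}

	static int demo_open(struct inode *inode, struct file *file)
	{
		return single_open(file, demo_show, inode->i_private);
	}

A creator then passes the state as the data argument, e.g. debugfs_create_file("demo", 0444, dir, st, &demo_fops) with a matching file_operations (demo_fops, not shown).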
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status)
84 113
85static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
116 long ql, qll;
117
87 if (!rdp->beenonline) 118 if (!rdp->beenonline)
88 return; 119 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 121 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce, rdp->qs_pending);
94 seq_printf(m, " dt=%d/%llx/%d df=%lu", 125 seq_printf(m, " dt=%d/%llx/%d df=%lu",
95 atomic_read(&rdp->dynticks->dynticks), 126 atomic_read(&rdp->dynticks->dynticks),
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
97 rdp->dynticks->dynticks_nmi_nesting, 128 rdp->dynticks->dynticks_nmi_nesting,
98 rdp->dynticks_fqs); 129 rdp->dynticks_fqs);
99 seq_printf(m, " of=%lu", rdp->offline_fqs); 130 seq_printf(m, " of=%lu", rdp->offline_fqs);
131 rcu_nocb_q_lengths(rdp, &ql, &qll);
132 qll += rdp->qlen_lazy;
133 ql += rdp->qlen;
100 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
101 rdp->qlen_lazy, rdp->qlen, 135 qll, ql,
102 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
103 rdp->nxttail[RCU_NEXT_TAIL]], 137 rdp->nxttail[RCU_NEXT_TAIL]],
104 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
115#endif /* #ifdef CONFIG_RCU_BOOST */ 149#endif /* #ifdef CONFIG_RCU_BOOST */
116 seq_printf(m, " b=%ld", rdp->blimit); 150 seq_printf(m, " b=%ld", rdp->blimit);
117 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
118 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
119} 154}
120 155
121static int show_rcudata(struct seq_file *m, void *unused) 156static int show_rcudata(struct seq_file *m, void *v)
122{ 157{
123 int cpu; 158 print_one_rcu_data(m, (struct rcu_data *)v);
124 struct rcu_state *rsp;
125
126 for_each_rcu_flavor(rsp) {
127 seq_printf(m, "%s:\n", rsp->name);
128 for_each_possible_cpu(cpu)
129 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
130 }
131 return 0; 159 return 0;
132} 160}
133 161
162static const struct seq_operations rcudata_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
134static int rcudata_open(struct inode *inode, struct file *file) 169static int rcudata_open(struct inode *inode, struct file *file)
135{ 170{
136 return single_open(file, show_rcudata, NULL); 171 return r_open(inode, file, &rcudata_op);
137} 172}
138 173
139static const struct file_operations rcudata_fops = { 174static const struct file_operations rcudata_fops = {
140 .owner = THIS_MODULE, 175 .owner = THIS_MODULE,
141 .open = rcudata_open, 176 .open = rcudata_open,
142 .read = seq_read, 177 .read = seq_read,
143 .llseek = seq_lseek, 178 .llseek = no_llseek,
144 .release = single_release, 179 .release = seq_release,
145}; 180};
146 181
147static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 182static int show_rcuexp(struct seq_file *m, void *v)
148{
149 if (!rdp->beenonline)
150 return;
151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
152 rdp->cpu,
153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
154 rdp->completed, rdp->gpnum,
155 rdp->passed_quiesce, rdp->qs_pending);
156 seq_printf(m, ",%d,%llx,%d,%lu",
157 atomic_read(&rdp->dynticks->dynticks),
158 rdp->dynticks->dynticks_nesting,
159 rdp->dynticks->dynticks_nmi_nesting,
160 rdp->dynticks_fqs);
161 seq_printf(m, ",%lu", rdp->offline_fqs);
162 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
163 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
164 rdp->nxttail[RCU_NEXT_TAIL]],
165 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
166 rdp->nxttail[RCU_NEXT_READY_TAIL]],
167 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
168 rdp->nxttail[RCU_WAIT_TAIL]],
169 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
170#ifdef CONFIG_RCU_BOOST
171 seq_printf(m, ",%d,\"%c\"",
172 per_cpu(rcu_cpu_has_work, rdp->cpu),
173 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
174 rdp->cpu)));
175#endif /* #ifdef CONFIG_RCU_BOOST */
176 seq_printf(m, ",%ld", rdp->blimit);
177 seq_printf(m, ",%lu,%lu,%lu\n",
178 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
179}
180
181static int show_rcudata_csv(struct seq_file *m, void *unused)
182{ 183{
183 int cpu; 184 struct rcu_state *rsp = (struct rcu_state *)m->private;
184 struct rcu_state *rsp; 185
185 186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); 187 atomic_long_read(&rsp->expedited_start),
187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 188 atomic_long_read(&rsp->expedited_done),
188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 189 atomic_long_read(&rsp->expedited_wrap),
189#ifdef CONFIG_RCU_BOOST 190 atomic_long_read(&rsp->expedited_tryfail),
190 seq_puts(m, "\"kt\",\"ktl\""); 191 atomic_long_read(&rsp->expedited_workdone1),
191#endif /* #ifdef CONFIG_RCU_BOOST */ 192 atomic_long_read(&rsp->expedited_workdone2),
192 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 193 atomic_long_read(&rsp->expedited_normal),
193 for_each_rcu_flavor(rsp) { 194 atomic_long_read(&rsp->expedited_stoppedcpus),
194 seq_printf(m, "\"%s:\"\n", rsp->name); 195 atomic_long_read(&rsp->expedited_done_tries),
195 for_each_possible_cpu(cpu) 196 atomic_long_read(&rsp->expedited_done_lost),
196 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); 197 atomic_long_read(&rsp->expedited_done_exit));
197 }
198 return 0; 198 return 0;
199} 199}
200 200
201static int rcudata_csv_open(struct inode *inode, struct file *file) 201static int rcuexp_open(struct inode *inode, struct file *file)
202{ 202{
203 return single_open(file, show_rcudata_csv, NULL); 203 return single_open(file, show_rcuexp, inode->i_private);
204} 204}
205 205
206static const struct file_operations rcudata_csv_fops = { 206static const struct file_operations rcuexp_fops = {
207 .owner = THIS_MODULE, 207 .owner = THIS_MODULE,
208 .open = rcudata_csv_open, 208 .open = rcuexp_open,
209 .read = seq_read, 209 .read = seq_read,
210 .llseek = seq_lseek, 210 .llseek = no_llseek,
211 .release = single_release, 211 .release = seq_release,
212}; 212};
213 213
214#ifdef CONFIG_RCU_BOOST 214#ifdef CONFIG_RCU_BOOST
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 254 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 255 .open = rcu_node_boost_open,
256 .read = seq_read, 256 .read = seq_read,
257 .llseek = seq_lseek, 257 .llseek = no_llseek,
258 .release = single_release, 258 .release = single_release,
259}; 259};
260 260
261/* 261#endif /* #ifdef CONFIG_RCU_BOOST */
262 * Create the rcuboost debugfs entry. Standard error return.
263 */
264static int rcu_boost_trace_create_file(struct dentry *rcudir)
265{
266 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
267 &rcu_node_boost_fops);
268}
269
270#else /* #ifdef CONFIG_RCU_BOOST */
271
272static int rcu_boost_trace_create_file(struct dentry *rcudir)
273{
274 return 0; /* There cannot be an error if we didn't create it! */
275}
276
277#endif /* #else #ifdef CONFIG_RCU_BOOST */
278 262
279static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
280{ 264{
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 struct rcu_node *rnp; 267 struct rcu_node *rnp;
284 268
285 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
286 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
287 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 271 ulong2long(rsp->completed), ulong2long(gpnum),
272 rsp->fqs_state,
288 (long)(rsp->jiffies_force_qs - jiffies), 273 (long)(rsp->jiffies_force_qs - jiffies),
289 (int)(jiffies & 0xffff)); 274 (int)(jiffies & 0xffff));
290 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
306 seq_puts(m, "\n"); 291 seq_puts(m, "\n");
307} 292}
308 293
309static int show_rcuhier(struct seq_file *m, void *unused) 294static int show_rcuhier(struct seq_file *m, void *v)
310{ 295{
311 struct rcu_state *rsp; 296 struct rcu_state *rsp = (struct rcu_state *)m->private;
312 297 print_one_rcu_state(m, rsp);
313 for_each_rcu_flavor(rsp)
314 print_one_rcu_state(m, rsp);
315 return 0; 298 return 0;
316} 299}
317 300
318static int rcuhier_open(struct inode *inode, struct file *file) 301static int rcuhier_open(struct inode *inode, struct file *file)
319{ 302{
320 return single_open(file, show_rcuhier, NULL); 303 return single_open(file, show_rcuhier, inode->i_private);
321} 304}
322 305
323static const struct file_operations rcuhier_fops = { 306static const struct file_operations rcuhier_fops = {
324 .owner = THIS_MODULE, 307 .owner = THIS_MODULE,
325 .open = rcuhier_open, 308 .open = rcuhier_open,
326 .read = seq_read, 309 .read = seq_read,
327 .llseek = seq_lseek, 310 .llseek = no_llseek,
328 .release = single_release, 311 .release = seq_release,
329}; 312};
330 313
331static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
338 struct rcu_node *rnp = &rsp->node[0]; 321 struct rcu_node *rnp = &rsp->node[0];
339 322
340 raw_spin_lock_irqsave(&rnp->lock, flags); 323 raw_spin_lock_irqsave(&rnp->lock, flags);
341 completed = rsp->completed; 324 completed = ACCESS_ONCE(rsp->completed);
342 gpnum = rsp->gpnum; 325 gpnum = ACCESS_ONCE(rsp->gpnum);
343 if (rsp->completed == rsp->gpnum) 326 if (completed == gpnum)
344 gpage = 0; 327 gpage = 0;
345 else 328 else
346 gpage = jiffies - rsp->gp_start; 329 gpage = jiffies - rsp->gp_start;
347 gpmax = rsp->gp_max; 330 gpmax = rsp->gp_max;
348 raw_spin_unlock_irqrestore(&rnp->lock, flags); 331 raw_spin_unlock_irqrestore(&rnp->lock, flags);
349 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
350 rsp->name, completed, gpnum, gpage, gpmax); 333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
351} 334}
352 335
353static int show_rcugp(struct seq_file *m, void *unused) 336static int show_rcugp(struct seq_file *m, void *v)
354{ 337{
355 struct rcu_state *rsp; 338 struct rcu_state *rsp = (struct rcu_state *)m->private;
356 339 show_one_rcugp(m, rsp);
357 for_each_rcu_flavor(rsp)
358 show_one_rcugp(m, rsp);
359 return 0; 340 return 0;
360} 341}
361 342
362static int rcugp_open(struct inode *inode, struct file *file) 343static int rcugp_open(struct inode *inode, struct file *file)
363{ 344{
364 return single_open(file, show_rcugp, NULL); 345 return single_open(file, show_rcugp, inode->i_private);
365} 346}
366 347
367static const struct file_operations rcugp_fops = { 348static const struct file_operations rcugp_fops = {
368 .owner = THIS_MODULE, 349 .owner = THIS_MODULE,
369 .open = rcugp_open, 350 .open = rcugp_open,
370 .read = seq_read, 351 .read = seq_read,
371 .llseek = seq_lseek, 352 .llseek = no_llseek,
372 .release = single_release, 353 .release = seq_release,
373}; 354};
374 355
375static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
376{ 357{
358 if (!rdp->beenonline)
359 return;
377 seq_printf(m, "%3d%cnp=%ld ", 360 seq_printf(m, "%3d%cnp=%ld ",
378 rdp->cpu, 361 rdp->cpu,
379 cpu_is_offline(rdp->cpu) ? '!' : ' ', 362 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
389 rdp->n_rp_need_nothing); 372 rdp->n_rp_need_nothing);
390} 373}
391 374
392static int show_rcu_pending(struct seq_file *m, void *unused) 375static int show_rcu_pending(struct seq_file *m, void *v)
393{ 376{
394 int cpu; 377 print_one_rcu_pending(m, (struct rcu_data *)v);
395 struct rcu_data *rdp;
396 struct rcu_state *rsp;
397
398 for_each_rcu_flavor(rsp) {
399 seq_printf(m, "%s:\n", rsp->name);
400 for_each_possible_cpu(cpu) {
401 rdp = per_cpu_ptr(rsp->rda, cpu);
402 if (rdp->beenonline)
403 print_one_rcu_pending(m, rdp);
404 }
405 }
406 return 0; 378 return 0;
407} 379}
408 380
381static const struct seq_operations rcu_pending_op = {
382 .start = r_start,
383 .next = r_next,
384 .stop = r_stop,
385 .show = show_rcu_pending,
386};
387
409static int rcu_pending_open(struct inode *inode, struct file *file) 388static int rcu_pending_open(struct inode *inode, struct file *file)
410{ 389{
411 return single_open(file, show_rcu_pending, NULL); 390 return r_open(inode, file, &rcu_pending_op);
412} 391}
413 392
414static const struct file_operations rcu_pending_fops = { 393static const struct file_operations rcu_pending_fops = {
415 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
416 .open = rcu_pending_open, 395 .open = rcu_pending_open,
417 .read = seq_read, 396 .read = seq_read,
418 .llseek = seq_lseek, 397 .llseek = no_llseek,
419 .release = single_release, 398 .release = seq_release,
420}; 399};
421 400
422static int show_rcutorture(struct seq_file *m, void *unused) 401static int show_rcutorture(struct seq_file *m, void *unused)
@@ -446,43 +425,58 @@ static struct dentry *rcudir;
446 425
447static int __init rcutree_trace_init(void) 426static int __init rcutree_trace_init(void)
448{ 427{
428 struct rcu_state *rsp;
449 struct dentry *retval; 429 struct dentry *retval;
430 struct dentry *rspdir;
450 431
451 rcudir = debugfs_create_dir("rcu", NULL); 432 rcudir = debugfs_create_dir("rcu", NULL);
452 if (!rcudir) 433 if (!rcudir)
453 goto free_out; 434 goto free_out;
454 435
455 retval = debugfs_create_file("rcubarrier", 0444, rcudir, 436 for_each_rcu_flavor(rsp) {
456 NULL, &rcubarrier_fops); 437 rspdir = debugfs_create_dir(rsp->name, rcudir);
457 if (!retval) 438 if (!rspdir)
458 goto free_out; 439 goto free_out;
459 440
460 retval = debugfs_create_file("rcudata", 0444, rcudir, 441 retval = debugfs_create_file("rcudata", 0444,
461 NULL, &rcudata_fops); 442 rspdir, rsp, &rcudata_fops);
462 if (!retval) 443 if (!retval)
463 goto free_out; 444 goto free_out;
464 445
465 retval = debugfs_create_file("rcudata.csv", 0444, rcudir, 446 retval = debugfs_create_file("rcuexp", 0444,
466 NULL, &rcudata_csv_fops); 447 rspdir, rsp, &rcuexp_fops);
467 if (!retval) 448 if (!retval)
468 goto free_out; 449 goto free_out;
469 450
470 if (rcu_boost_trace_create_file(rcudir)) 451 retval = debugfs_create_file("rcu_pending", 0444,
471 goto free_out; 452 rspdir, rsp, &rcu_pending_fops);
453 if (!retval)
454 goto free_out;
455
456 retval = debugfs_create_file("rcubarrier", 0444,
457 rspdir, rsp, &rcubarrier_fops);
458 if (!retval)
459 goto free_out;
472 460
473 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 461#ifdef CONFIG_RCU_BOOST
474 if (!retval) 462 if (rsp == &rcu_preempt_state) {
475 goto free_out; 463 retval = debugfs_create_file("rcuboost", 0444,
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
476 469
477 retval = debugfs_create_file("rcuhier", 0444, rcudir, 470 retval = debugfs_create_file("rcugp", 0444,
478 NULL, &rcuhier_fops); 471 rspdir, rsp, &rcugp_fops);
479 if (!retval) 472 if (!retval)
480 goto free_out; 473 goto free_out;
481 474
482 retval = debugfs_create_file("rcu_pending", 0444, rcudir, 475 retval = debugfs_create_file("rcuhier", 0444,
483 NULL, &rcu_pending_fops); 476 rspdir, rsp, &rcuhier_fops);
484 if (!retval) 477 if (!retval)
485 goto free_out; 478 goto free_out;
479 }
486 480
487 retval = debugfs_create_file("rcutorture", 0444, rcudir, 481 retval = debugfs_create_file("rcutorture", 0444, rcudir,
488 NULL, &rcutorture_fops); 482 NULL, &rcutorture_fops);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..80f80dfca70e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
75 76
76#include <asm/switch_to.h> 77#include <asm/switch_to.h>
77#include <asm/tlb.h> 78#include <asm/tlb.h>
@@ -1886,8 +1887,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
1886 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1887 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887#endif 1888#endif
1888 1889
1890 context_tracking_task_switch(prev, next);
1889 /* Here we just switch the register state and the stack. */ 1891 /* Here we just switch the register state and the stack. */
1890 rcu_switch(prev, next);
1891 switch_to(prev, next, prev); 1892 switch_to(prev, next, prev);
1892 1893
1893 barrier(); 1894 barrier();
@@ -2911,7 +2912,7 @@ asmlinkage void __sched schedule(void)
2911} 2912}
2912EXPORT_SYMBOL(schedule); 2913EXPORT_SYMBOL(schedule);
2913 2914
2914#ifdef CONFIG_RCU_USER_QS 2915#ifdef CONFIG_CONTEXT_TRACKING
2915asmlinkage void __sched schedule_user(void) 2916asmlinkage void __sched schedule_user(void)
2916{ 2917{
2917 /* 2918 /*
@@ -2920,9 +2921,9 @@ asmlinkage void __sched schedule_user(void)
2920 * we haven't yet exited the RCU idle mode. Do it here manually until 2921 * we haven't yet exited the RCU idle mode. Do it here manually until
2921 * we find a better solution. 2922 * we find a better solution.
2922 */ 2923 */
2923 rcu_user_exit(); 2924 user_exit();
2924 schedule(); 2925 schedule();
2925 rcu_user_enter(); 2926 user_enter();
2926} 2927}
2927#endif 2928#endif
2928 2929
@@ -3027,7 +3028,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
-	rcu_user_exit();
+	user_exit();
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
 		local_irq_enable();
@@ -4474,6 +4475,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
+	int ppid;
 	unsigned state;
 
 	state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4493,8 +4495,11 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
+	rcu_read_lock();
+	ppid = task_pid_nr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-		task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+		task_pid_nr(p), ppid,
 		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
@@ -8076,3 +8081,9 @@ struct cgroup_subsys cpuacct_subsys = {
 	.base_cftypes = files,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
+
+void dump_cpu_task(int cpu)
+{
+	pr_info("Task dump for CPU %d:\n", cpu);
+	sched_show_task(cpu_curr(cpu));
+}
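
Two details in the sched/core.c changes deserve a closer look. sched_show_task() now copies the parent PID out of an RCU read-side critical section instead of calling rcu_dereference() bare inside the printk() argument list; rcu_dereference() is only legitimate between rcu_read_lock() and rcu_read_unlock(). A sketch of the same idiom in isolation, with parent_pid_of() as an invented helper name:

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: snapshot a task's parent PID under RCU protection. */
static int parent_pid_of(struct task_struct *p)
{
	int ppid;

	rcu_read_lock();
	/* Legal here: we hold the RCU read lock while dereferencing. */
	ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();

	return ppid;	/* a snapshot; the parent may be reassigned afterward */
}

The new dump_cpu_task() helper simply prints a header and reuses sched_show_task() on whatever cpu_curr(cpu) returns, giving diagnostic code a one-call way to dump the task currently running on a given CPU.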
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd844..2b859828cdc3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
  *
  * Author: Paul McKenney <paulmck@us.ibm.com>
+ *	   Lai Jiangshan <laijs@cn.fujitsu.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
  * 		Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
 #include <linux/delay.h>
 #include <linux/srcu.h>
 
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
+
 /*
  * Initialize an rcu_batch structure to empty.
  */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
 	}
 }
 
-/* single-thread state-machine */
-static void process_srcu(struct work_struct *work);
-
 static int init_srcu_struct_fields(struct srcu_struct *sp)
 {
 	sp->completed = 0;
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
  */
 void synchronize_srcu(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
+	__synchronize_srcu(sp, rcu_expedited
+			   ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
+			   : SYNCHRONIZE_SRCU_TRYCOUNT);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
 /*
  * This is the work-queue function that handles SRCU grace periods.
  */
-static void process_srcu(struct work_struct *work)
+void process_srcu(struct work_struct *work)
 {
 	struct srcu_struct *sp;
 
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work)
 	srcu_invoke_callbacks(sp);
 	srcu_reschedule(sp);
 }
+EXPORT_SYMBOL_GPL(process_srcu);
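
Dropping the static qualifier from process_srcu() and exporting it lets an srcu_struct be initialized at compile time, since a static initializer has to name the work-queue handler directly; the forward declaration removed earlier in this file served the same symbol while it was private. A usage sketch, assuming a DEFINE_STATIC_SRCU() macro of the kind this series introduces:

#include <linux/srcu.h>

/* Compile-time initialization: no init_srcu_struct()/cleanup_srcu_struct() pair. */
DEFINE_STATIC_SRCU(my_srcu);

static void my_reader(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);
	/* ... read data published under my_srcu ... */
	srcu_read_unlock(&my_srcu, idx);
}

static void my_updater(void)
{
	/* ... unpublish the old data ... */
	synchronize_srcu(&my_srcu);	/* takes the expedited trycount when rcu_expedited is set */
	/* ... now safe to free the old data ... */
}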
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 28e9d6c98941..41faf0b8df1d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -972,7 +972,7 @@ config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on TREE_RCU || TREE_PREEMPT_RCU
 	range 3 300
-	default 60
+	default 21
 	help
 	  If a given RCU grace period extends more than the specified
 	  number of seconds, a CPU stall warning is printed.  If the