-rw-r--r--   Documentation/RCU/00-INDEX                 2
-rw-r--r--   Documentation/RCU/stallwarn.txt           23
-rw-r--r--   Documentation/RCU/trace.txt              295
-rw-r--r--   Documentation/filesystems/proc.txt         1
-rw-r--r--   include/linux/interrupt.h                  1
-rw-r--r--   include/linux/rcupdate.h                  70
-rw-r--r--   include/linux/rcutiny.h                    8
-rw-r--r--   include/linux/rcutree.h                   13
-rw-r--r--   include/trace/events/irq.h                 3
-rw-r--r--   init/Kconfig                               2
-rw-r--r--   kernel/rcupdate.c                         32
-rw-r--r--   kernel/rcutiny.c                          45
-rw-r--r--   kernel/rcutiny_plugin.h                  203
-rw-r--r--   kernel/rcutorture.c                       26
-rw-r--r--   kernel/rcutree.c                         672
-rw-r--r--   kernel/rcutree.h                         116
-rw-r--r--   kernel/rcutree_plugin.h                  595
-rw-r--r--   kernel/rcutree_trace.c                   192
-rw-r--r--   kernel/softirq.c                           2
-rw-r--r--   lib/Kconfig.debug                         32
-rw-r--r--   tools/perf/util/trace-event-parse.c        1
21 files changed, 1783 insertions, 551 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 71b6f500ddb9..1d7a885761f5 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -21,7 +21,7 @@ rcu.txt
21RTFP.txt 21RTFP.txt
22 - List of RCU papers (bibliography) going back to 1980. 22 - List of RCU papers (bibliography) going back to 1980.
23stallwarn.txt 23stallwarn.txt
24 - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR) 24 - RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
25torture.txt 25torture.txt
26 - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) 26 - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
27trace.txt 27trace.txt
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 862c08ef1fde..4e959208f736 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,22 +1,25 @@
1Using RCU's CPU Stall Detector 1Using RCU's CPU Stall Detector
2 2
3The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables 3The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall
4RCU's CPU stall detector, which detects conditions that unduly delay 4detector, which detects conditions that unduly delay RCU grace periods.
5RCU grace periods. The stall detector's idea of what constitutes 5This module parameter enables CPU stall detection by default, but
6"unduly delayed" is controlled by a set of C preprocessor macros: 6may be overridden via boot-time parameter or at runtime via sysfs.
7The stall detector's idea of what constitutes "unduly delayed" is
8controlled by a set of kernel configuration variables and cpp macros:
7 9
8RCU_SECONDS_TILL_STALL_CHECK 10CONFIG_RCU_CPU_STALL_TIMEOUT
9 11
10 This macro defines the period of time that RCU will wait from 12 This kernel configuration parameter defines the period of time
11 the beginning of a grace period until it issues an RCU CPU 13 that RCU will wait from the beginning of a grace period until it
12 stall warning. This time period is normally ten seconds. 14 issues an RCU CPU stall warning. This time period is normally
15 ten seconds.
13 16
14RCU_SECONDS_TILL_STALL_RECHECK 17RCU_SECONDS_TILL_STALL_RECHECK
15 18
16 This macro defines the period of time that RCU will wait after 19 This macro defines the period of time that RCU will wait after
17 issuing a stall warning until it issues another stall warning 20 issuing a stall warning until it issues another stall warning
18 for the same stall. This time period is normally set to thirty 21 for the same stall. This time period is normally set to three
19 seconds. 22 times the check interval plus thirty seconds.
20 23
21RCU_STALL_RAT_DELAY 24RCU_STALL_RAT_DELAY
22 25
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 6a8c73f55b80..8173cec473aa 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -10,34 +10,46 @@ for rcutree and next for rcutiny.
10 10
11CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats 11CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
12 12
13These implementations of RCU provides five debugfs files under the 13These implementations of RCU provides several debugfs files under the
14top-level directory RCU: rcu/rcudata (which displays fields in struct 14top-level directory "rcu":
15rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of 15
16rcu/rcudata), rcu/rcugp (which displays grace-period counters), 16rcu/rcudata:
17rcu/rcuhier (which displays the struct rcu_node hierarchy), and 17 Displays fields in struct rcu_data.
18rcu/rcu_pending (which displays counts of the reasons that the 18rcu/rcudata.csv:
19rcu_pending() function decided that there was core RCU work to do). 19 Comma-separated values spreadsheet version of rcudata.
20rcu/rcugp:
21 Displays grace-period counters.
22rcu/rcuhier:
23 Displays the struct rcu_node hierarchy.
24rcu/rcu_pending:
25 Displays counts of the reasons rcu_pending() decided that RCU had
26 work to do.
27rcu/rcutorture:
28 Displays rcutorture test progress.
29rcu/rcuboost:
30 Displays RCU boosting statistics. Only present if
31 CONFIG_RCU_BOOST=y.
20 32
21The output of "cat rcu/rcudata" looks as follows: 33The output of "cat rcu/rcudata" looks as follows:
22 34
23rcu_sched: 35rcu_sched:
24 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 36 0 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0
25 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 37 1 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0
26 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 38 2 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0
27 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10 39 3 c=20942 g=20943 pq=1 pqc=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0
28 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10 40 4 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0
29 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10 41 5 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0
30 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10 42 6 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0
31 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10 43 7 c=20897 g=20897 pq=1 pqc=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0
32rcu_bh: 44rcu_bh:
33 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10 45 0 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0
34 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10 46 1 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
35 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 47 2 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
36 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10 48 3 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
37 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 49 4 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
38 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 50 5 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
39 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 51 6 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
40 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 52 7 c=1474 g=1474 pq=1 pqc=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
41 53
42The first section lists the rcu_data structures for rcu_sched, the second 54The first section lists the rcu_data structures for rcu_sched, the second
43for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an 55for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
@@ -52,17 +64,18 @@ o The number at the beginning of each line is the CPU number.
52 substantially larger than the number of actual CPUs. 64 substantially larger than the number of actual CPUs.
53 65
54o "c" is the count of grace periods that this CPU believes have 66o "c" is the count of grace periods that this CPU believes have
55 completed. CPUs in dynticks idle mode may lag quite a ways 67 completed. Offlined CPUs and CPUs in dynticks idle mode may
56 behind, for example, CPU 4 under "rcu_sched" above, which has 68 lag quite a ways behind, for example, CPU 6 under "rcu_sched"
57 slept through the past 25 RCU grace periods. It is not unusual 69 above, which has been offline through not quite 40,000 RCU grace
58 to see CPUs lagging by thousands of grace periods. 70 periods. It is not unusual to see CPUs lagging by thousands of
71 grace periods.
59 72
60o "g" is the count of grace periods that this CPU believes have 73o "g" is the count of grace periods that this CPU believes have
61 started. Again, CPUs in dynticks idle mode may lag behind. 74 started. Again, offlined CPUs and CPUs in dynticks idle mode
62 If the "c" and "g" values are equal, this CPU has already 75 may lag behind. If the "c" and "g" values are equal, this CPU
63 reported a quiescent state for the last RCU grace period that 76 has already reported a quiescent state for the last RCU grace
64 it is aware of, otherwise, the CPU believes that it owes RCU a 77 period that it is aware of, otherwise, the CPU believes that it
65 quiescent state. 78 owes RCU a quiescent state.
66 79
67o "pq" indicates that this CPU has passed through a quiescent state 80o "pq" indicates that this CPU has passed through a quiescent state
68 for the current grace period. It is possible for "pq" to be 81 for the current grace period. It is possible for "pq" to be
@@ -81,22 +94,16 @@ o "pqc" indicates which grace period the last-observed quiescent
81 the next grace period! 94 the next grace period!
82 95
83o "qp" indicates that RCU still expects a quiescent state from 96o "qp" indicates that RCU still expects a quiescent state from
84 this CPU. 97 this CPU. Offlined CPUs and CPUs in dyntick idle mode might
98 well have qp=1, which is OK: RCU is still ignoring them.
85 99
86o "dt" is the current value of the dyntick counter that is incremented 100o "dt" is the current value of the dyntick counter that is incremented
87 when entering or leaving dynticks idle state, either by the 101 when entering or leaving dynticks idle state, either by the
88 scheduler or by irq. The number after the "/" is the interrupt 102 scheduler or by irq. This number is even if the CPU is in
89 nesting depth when in dyntick-idle state, or one greater than 103 dyntick idle mode and odd otherwise. The number after the first
90 the interrupt-nesting depth otherwise. 104 "/" is the interrupt nesting depth when in dyntick-idle state,
91 105 or one greater than the interrupt-nesting depth otherwise.
92 This field is displayed only for CONFIG_NO_HZ kernels. 106 The number after the second "/" is the NMI nesting depth.
93
94o "dn" is the current value of the dyntick counter that is incremented
95 when entering or leaving dynticks idle state via NMI. If both
96 the "dt" and "dn" values are even, then this CPU is in dynticks
97 idle mode and may be ignored by RCU. If either of these two
98 counters is odd, then RCU must be alert to the possibility of
99 an RCU read-side critical section running on this CPU.
100 107
101 This field is displayed only for CONFIG_NO_HZ kernels. 108 This field is displayed only for CONFIG_NO_HZ kernels.
102 109
@@ -108,7 +115,7 @@ o "df" is the number of times that some other CPU has forced a
108 115
109o "of" is the number of times that some other CPU has forced a 116o "of" is the number of times that some other CPU has forced a
110 quiescent state on behalf of this CPU due to this CPU being 117 quiescent state on behalf of this CPU due to this CPU being
111 offline. In a perfect world, this might neve happen, but it 118 offline. In a perfect world, this might never happen, but it
112 turns out that offlining and onlining a CPU can take several grace 119 turns out that offlining and onlining a CPU can take several grace
113 periods, and so there is likely to be an extended period of time 120 periods, and so there is likely to be an extended period of time
114 when RCU believes that the CPU is online when it really is not. 121 when RCU believes that the CPU is online when it really is not.
@@ -125,6 +132,62 @@ o "ql" is the number of RCU callbacks currently residing on
125 of what state they are in (new, waiting for grace period to 132 of what state they are in (new, waiting for grace period to
126 start, waiting for grace period to end, ready to invoke). 133 start, waiting for grace period to end, ready to invoke).
127 134
135o "qs" gives an indication of the state of the callback queue
136 with four characters:
137
138 "N" Indicates that there are callbacks queued that are not
139 ready to be handled by the next grace period, and thus
140 will be handled by the grace period following the next
141 one.
142
143 "R" Indicates that there are callbacks queued that are
144 ready to be handled by the next grace period.
145
146 "W" Indicates that there are callbacks queued that are
147 waiting on the current grace period.
148
149 "D" Indicates that there are callbacks queued that have
150 already been handled by a prior grace period, and are
151 thus waiting to be invoked. Note that callbacks in
152 the process of being invoked are not counted here.
153 Callbacks in the process of being invoked are those
 155 that have been removed from the rcu_data structure's
155 queues by rcu_do_batch(), but which have not yet been
156 invoked.
157
158 If there are no callbacks in a given one of the above states,
159 the corresponding character is replaced by ".".
160
161o "kt" is the per-CPU kernel-thread state. The digit preceding
162 the first slash is zero if there is no work pending and 1
163 otherwise. The character between the first pair of slashes is
164 as follows:
165
166 "S" The kernel thread is stopped, in other words, all
167 CPUs corresponding to this rcu_node structure are
168 offline.
169
170 "R" The kernel thread is running.
171
172 "W" The kernel thread is waiting because there is no work
173 for it to do.
174
175 "O" The kernel thread is waiting because it has been
176 forced off of its designated CPU or because its
177 ->cpus_allowed mask permits it to run on other than
178 its designated CPU.
179
180 "Y" The kernel thread is yielding to avoid hogging CPU.
181
182 "?" Unknown value, indicates a bug.
183
184 The number after the final slash is the CPU that the kthread
185 is actually running on.
186
187o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
188 the number of times that this CPU's per-CPU kthread has gone
189 through its loop servicing invoke_rcu_cpu_kthread() requests.
190
128o "b" is the batch limit for this CPU. If more than this number 191o "b" is the batch limit for this CPU. If more than this number
129 of RCU callbacks is ready to invoke, then the remainder will 192 of RCU callbacks is ready to invoke, then the remainder will
130 be deferred. 193 be deferred.
@@ -174,14 +237,14 @@ o "gpnum" is the number of grace periods that have started. It is
174The output of "cat rcu/rcuhier" looks as follows, with very long lines: 237The output of "cat rcu/rcuhier" looks as follows, with very long lines:
175 238
176c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 239c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1771/1 .>. 0:127 ^0 2401/1 ..>. 0:127 ^0
1783/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 2413/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3
1793/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 2423/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
180rcu_bh: 243rcu_bh:
181c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 244c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
1820/1 .>. 0:127 ^0 2450/1 ..>. 0:127 ^0
1830/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 2460/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3
1840/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 2470/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
185 248
186This is once again split into "rcu_sched" and "rcu_bh" portions, 249This is once again split into "rcu_sched" and "rcu_bh" portions,
187and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional 250and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
@@ -240,13 +303,20 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
240 current grace period. 303 current grace period.
241 304
242 o The characters separated by the ">" indicate the state 305 o The characters separated by the ">" indicate the state
243 of the blocked-tasks lists. A "T" preceding the ">" 306 of the blocked-tasks lists. A "G" preceding the ">"
244 indicates that at least one task blocked in an RCU 307 indicates that at least one task blocked in an RCU
245 read-side critical section blocks the current grace 308 read-side critical section blocks the current grace
246 period, while a "." preceding the ">" indicates otherwise. 309 period, while a "E" preceding the ">" indicates that
247 The character following the ">" indicates similarly for 310 at least one task blocked in an RCU read-side critical
248 the next grace period. A "T" should appear in this 311 section blocks the current expedited grace period.
249 field only for rcu-preempt. 312 A "T" character following the ">" indicates that at
313 least one task is blocked within an RCU read-side
314 critical section, regardless of whether any current
315 grace period (expedited or normal) is inconvenienced.
316 A "." character appears if the corresponding condition
317 does not hold, so that "..>." indicates that no tasks
318 are blocked. In contrast, "GE>T" indicates maximal
319 inconvenience from blocked tasks.
250 320
251 o The numbers separated by the ":" are the range of CPUs 321 o The numbers separated by the ":" are the range of CPUs
252 served by this struct rcu_node. This can be helpful 322 served by this struct rcu_node. This can be helpful
@@ -328,6 +398,113 @@ o "nn" is the number of times that this CPU needed nothing. Alert
328 is due to short-circuit evaluation in rcu_pending(). 398 is due to short-circuit evaluation in rcu_pending().
329 399
330 400
401The output of "cat rcu/rcutorture" looks as follows:
402
403rcutorture test sequence: 0 (test in progress)
404rcutorture update version number: 615
405
406The first line shows the number of rcutorture tests that have completed
407since boot. If a test is currently running, the "(test in progress)"
408string will appear as shown above. The second line shows the number of
409update cycles that the current test has started, or zero if there is
410no test in progress.
411
412
413The output of "cat rcu/rcuboost" looks as follows:
414
4150:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f
416 balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16
4176:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f
418 balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6
419
420This information is output only for rcu_preempt. Each two-line entry
421corresponds to a leaf rcu_node structure. The fields are as follows:
422
423o "n:m" is the CPU-number range for the corresponding two-line
424 entry. In the sample output above, the first entry covers
425 CPUs zero through five and the second entry covers CPUs 6
426 and 7.
427
428o "tasks=TNEB" gives the state of the various segments of the
429 rnp->blocked_tasks list:
430
431 "T" This indicates that there are some tasks that blocked
432 while running on one of the corresponding CPUs while
433 in an RCU read-side critical section.
434
435 "N" This indicates that some of the blocked tasks are preventing
436 the current normal (non-expedited) grace period from
437 completing.
438
439 "E" This indicates that some of the blocked tasks are preventing
440 the current expedited grace period from completing.
441
442 "B" This indicates that some of the blocked tasks are in
443 need of RCU priority boosting.
444
445 Each character is replaced with "." if the corresponding
446 condition does not hold.
447
448o "kt" is the state of the RCU priority-boosting kernel
449 thread associated with the corresponding rcu_node structure.
450 The state can be one of the following:
451
452 "S" The kernel thread is stopped, in other words, all
453 CPUs corresponding to this rcu_node structure are
454 offline.
455
456 "R" The kernel thread is running.
457
458 "W" The kernel thread is waiting because there is no work
459 for it to do.
460
461 "Y" The kernel thread is yielding to avoid hogging CPU.
462
463 "?" Unknown value, indicates a bug.
464
465o "ntb" is the number of tasks boosted.
466
467o "neb" is the number of tasks boosted in order to complete an
468 expedited grace period.
469
470o "nnb" is the number of tasks boosted in order to complete a
471 normal (non-expedited) grace period. When boosting a task
472 that was blocking both an expedited and a normal grace period,
473 it is counted against the expedited total above.
474
475o "j" is the low-order 16 bits of the jiffies counter in
476 hexadecimal.
477
478o "bt" is the low-order 16 bits of the value that the jiffies
479 counter will have when we next start boosting, assuming that
480 the current grace period does not end beforehand. This is
481 also in hexadecimal.
482
483o "balk: nt" counts the number of times we didn't boost (in
484 other words, we balked) even though it was time to boost because
485 there were no blocked tasks to boost. This situation occurs
486 when there is one blocked task on one rcu_node structure and
487 none on some other rcu_node structure.
488
489o "egt" counts the number of times we balked because although
490 there were blocked tasks, none of them were blocking the
491 current grace period, whether expedited or otherwise.
492
493o "bt" counts the number of times we balked because boosting
494 had already been initiated for the current grace period.
495
496o "nb" counts the number of times we balked because there
497 was at least one task blocking the current non-expedited grace
498 period that never had blocked. If it is already running, it
499 just won't help to boost its priority!
500
501o "ny" counts the number of times we balked because it was
502 not yet time to start boosting.
503
504o "nos" counts the number of times we balked for other
505 reasons, e.g., the grace period ended first.
506
507
331CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats 508CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
332 509
333These implementations of RCU provides a single debugfs file under the 510These implementations of RCU provides a single debugfs file under the
@@ -394,9 +571,9 @@ o "neb" is the number of expedited grace periods that have had
394o "nnb" is the number of normal grace periods that have had 571o "nnb" is the number of normal grace periods that have had
395 to resort to RCU priority boosting since boot. 572 to resort to RCU priority boosting since boot.
396 573
397o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. 574o "j" is the low-order 16 bits of the jiffies counter in hexadecimal.
398 575
399o "bt" is the low-order 12 bits of the value that the jiffies counter 576o "bt" is the low-order 16 bits of the value that the jiffies counter
400 will have at the next time that boosting is scheduled to begin. 577 will have at the next time that boosting is scheduled to begin.
401 578
402o In the line beginning with "normal balk", the fields are as follows: 579o In the line beginning with "normal balk", the fields are as follows:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index b0b814d75ca1..60740e8ecb37 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -836,7 +836,6 @@ Provides counts of softirq handlers serviced since boot time, for each cpu.
836 TASKLET: 0 0 0 290 836 TASKLET: 0 0 0 290
837 SCHED: 27035 26983 26971 26746 837 SCHED: 27035 26983 26971 26746
838 HRTIMER: 0 0 0 0 838 HRTIMER: 0 0 0 0
839 RCU: 1678 1769 2178 2250
840 839
841 840
8421.3 IDE devices in /proc/ide 8411.3 IDE devices in /proc/ide
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index bea0ac750712..6c12989839d9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -414,7 +414,6 @@ enum
414 TASKLET_SOFTIRQ, 414 TASKLET_SOFTIRQ,
415 SCHED_SOFTIRQ, 415 SCHED_SOFTIRQ,
416 HRTIMER_SOFTIRQ, 416 HRTIMER_SOFTIRQ,
417 RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
418 417
419 NR_SOFTIRQS 418 NR_SOFTIRQS
420}; 419};
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ff422d2b7f90..99f9aa7c2804 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,18 @@
47extern int rcutorture_runnable; /* for sysctl */ 47extern int rcutorture_runnable; /* for sysctl */
48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
49 49
50#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
51extern void rcutorture_record_test_transition(void);
52extern void rcutorture_record_progress(unsigned long vernum);
53#else
54static inline void rcutorture_record_test_transition(void)
55{
56}
57static inline void rcutorture_record_progress(unsigned long vernum)
58{
59}
60#endif
61
50#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) 62#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
51#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) 63#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
52#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 64#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
@@ -68,7 +80,6 @@ extern void call_rcu_sched(struct rcu_head *head,
68extern void synchronize_sched(void); 80extern void synchronize_sched(void);
69extern void rcu_barrier_bh(void); 81extern void rcu_barrier_bh(void);
70extern void rcu_barrier_sched(void); 82extern void rcu_barrier_sched(void);
71extern int sched_expedited_torture_stats(char *page);
72 83
73static inline void __rcu_read_lock_bh(void) 84static inline void __rcu_read_lock_bh(void)
74{ 85{
@@ -774,6 +785,7 @@ extern struct debug_obj_descr rcuhead_debug_descr;
774 785
775static inline void debug_rcu_head_queue(struct rcu_head *head) 786static inline void debug_rcu_head_queue(struct rcu_head *head)
776{ 787{
788 WARN_ON_ONCE((unsigned long)head & 0x3);
777 debug_object_activate(head, &rcuhead_debug_descr); 789 debug_object_activate(head, &rcuhead_debug_descr);
778 debug_object_active_state(head, &rcuhead_debug_descr, 790 debug_object_active_state(head, &rcuhead_debug_descr,
779 STATE_RCU_HEAD_READY, 791 STATE_RCU_HEAD_READY,
@@ -797,4 +809,60 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
797} 809}
798#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 810#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
799 811
812static __always_inline bool __is_kfree_rcu_offset(unsigned long offset)
813{
814 return offset < 4096;
815}
816
817static __always_inline
818void __kfree_rcu(struct rcu_head *head, unsigned long offset)
819{
820 typedef void (*rcu_callback)(struct rcu_head *);
821
822 BUILD_BUG_ON(!__builtin_constant_p(offset));
823
824 /* See the kfree_rcu() header comment. */
825 BUILD_BUG_ON(!__is_kfree_rcu_offset(offset));
826
827 call_rcu(head, (rcu_callback)offset);
828}
829
830extern void kfree(const void *);
831
832static inline void __rcu_reclaim(struct rcu_head *head)
833{
834 unsigned long offset = (unsigned long)head->func;
835
836 if (__is_kfree_rcu_offset(offset))
837 kfree((void *)head - offset);
838 else
839 head->func(head);
840}
841
842/**
843 * kfree_rcu() - kfree an object after a grace period.
844 * @ptr: pointer to kfree
845 * @rcu_head: the name of the struct rcu_head within the type of @ptr.
846 *
 847 * Many rcu callback functions just call kfree() on the base structure.
848 * These functions are trivial, but their size adds up, and furthermore
849 * when they are used in a kernel module, that module must invoke the
850 * high-latency rcu_barrier() function at module-unload time.
851 *
852 * The kfree_rcu() function handles this issue. Rather than encoding a
853 * function address in the embedded rcu_head structure, kfree_rcu() instead
854 * encodes the offset of the rcu_head structure within the base structure.
855 * Because the functions are not allowed in the low-order 4096 bytes of
856 * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
857 * If the offset is larger than 4095 bytes, a compile-time error will
858 * be generated in __kfree_rcu(). If this error is triggered, you can
859 * either fall back to use of call_rcu() or rearrange the structure to
860 * position the rcu_head structure into the first 4096 bytes.
861 *
862 * Note that the allowable offset might decrease in the future, for example,
863 * to allow something like kmem_cache_free_rcu().
864 */
865#define kfree_rcu(ptr, rcu_head) \
866 __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
867
800#endif /* __LINUX_RCUPDATE_H */ 868#endif /* __LINUX_RCUPDATE_H */
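
The kfree_rcu() kernel-doc added in the hunk above describes replacing a
trivial kfree()-only RCU callback with an offset encoded in place of the
callback pointer; __rcu_reclaim() then recognizes offsets below 4096 and
calls kfree() directly.  The following is a minimal usage sketch only: the
struct foo, foo_free_rcu(), foo_release_old(), and foo_release_new() names
are hypothetical illustrations and are not part of this patch.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical RCU-protected structure embedding an rcu_head. */
struct foo {
        int key;
        struct rcu_head rcu;    /* offsetof(struct foo, rcu) < 4096 */
};

/* Old style: a callback whose only job is to kfree() the object. */
static void foo_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu));
}

static void foo_release_old(struct foo *fp)
{
        call_rcu(&fp->rcu, foo_free_rcu);
}

/*
 * New style: kfree_rcu() stores offsetof(typeof(*fp), rcu) where the
 * callback pointer would go, so no per-structure callback is needed and,
 * as the header comment above notes, a module using it no longer needs a
 * high-latency rcu_barrier() at unload time just for kfree()-only callbacks.
 */
static void foo_release_new(struct foo *fp)
{
        kfree_rcu(fp, rcu);
}

If the rcu_head were placed beyond the first 4095 bytes of the structure,
the BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)) check in __kfree_rcu()
above would trip at compile time, and the caller would have to fall back
to call_rcu() or move the rcu_head earlier in the structure.
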
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 30ebd7c8d874..52b3e0281fd0 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -100,6 +100,14 @@ static inline void rcu_note_context_switch(int cpu)
100} 100}
101 101
102/* 102/*
103 * Take advantage of the fact that there is only one CPU, which
104 * allows us to ignore virtualization-based context switches.
105 */
106static inline void rcu_virt_note_context_switch(int cpu)
107{
108}
109
110/*
103 * Return the number of grace periods. 111 * Return the number of grace periods.
104 */ 112 */
105static inline long rcu_batches_completed(void) 113static inline long rcu_batches_completed(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 3a933482734a..e65d06634dd8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,6 +35,16 @@ extern void rcu_note_context_switch(int cpu);
35extern int rcu_needs_cpu(int cpu); 35extern int rcu_needs_cpu(int cpu);
36extern void rcu_cpu_stall_reset(void); 36extern void rcu_cpu_stall_reset(void);
37 37
38/*
39 * Note a virtualization-based context switch. This is simply a
40 * wrapper around rcu_note_context_switch(), which allows TINY_RCU
41 * to save a few bytes.
42 */
43static inline void rcu_virt_note_context_switch(int cpu)
44{
45 rcu_note_context_switch(cpu);
46}
47
38#ifdef CONFIG_TREE_PREEMPT_RCU 48#ifdef CONFIG_TREE_PREEMPT_RCU
39 49
40extern void exit_rcu(void); 50extern void exit_rcu(void);
@@ -58,9 +68,12 @@ static inline void synchronize_rcu_bh_expedited(void)
58 68
59extern void rcu_barrier(void); 69extern void rcu_barrier(void);
60 70
71extern unsigned long rcutorture_testseq;
72extern unsigned long rcutorture_vernum;
61extern long rcu_batches_completed(void); 73extern long rcu_batches_completed(void);
62extern long rcu_batches_completed_bh(void); 74extern long rcu_batches_completed_bh(void);
63extern long rcu_batches_completed_sched(void); 75extern long rcu_batches_completed_sched(void);
76
64extern void rcu_force_quiescent_state(void); 77extern void rcu_force_quiescent_state(void);
65extern void rcu_bh_force_quiescent_state(void); 78extern void rcu_bh_force_quiescent_state(void);
66extern void rcu_sched_force_quiescent_state(void); 79extern void rcu_sched_force_quiescent_state(void);
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 1c09820df585..ae045ca7d356 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -20,8 +20,7 @@ struct softirq_action;
20 softirq_name(BLOCK_IOPOLL), \ 20 softirq_name(BLOCK_IOPOLL), \
21 softirq_name(TASKLET), \ 21 softirq_name(TASKLET), \
22 softirq_name(SCHED), \ 22 softirq_name(SCHED), \
23 softirq_name(HRTIMER), \ 23 softirq_name(HRTIMER))
24 softirq_name(RCU))
25 24
26/** 25/**
27 * irq_handler_entry - called immediately before the irq action handler 26 * irq_handler_entry - called immediately before the irq action handler
diff --git a/init/Kconfig b/init/Kconfig
index 7a71e0a9992a..119b9727d10b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -485,7 +485,7 @@ config TREE_RCU_TRACE
485 485
486config RCU_BOOST 486config RCU_BOOST
487 bool "Enable RCU priority boosting" 487 bool "Enable RCU priority boosting"
488 depends on RT_MUTEXES && TINY_PREEMPT_RCU 488 depends on RT_MUTEXES && PREEMPT_RCU
489 default n 489 default n
490 help 490 help
491 This option boosts the priority of preempted RCU readers that 491 This option boosts the priority of preempted RCU readers that
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether 231 * In !PREEMPT configurations, there is no way to tell if we are
218 * or not we are in an RCU read-side critical section 232 * in a RCU read-side critical section or not, so we never
219 * exists only in the preemptible RCU implementations 233 * attempt any fixup and just print a warning.
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
222 */ 234 */
235#ifndef CONFIG_PREEMPT
236 WARN_ON_ONCE(1);
237 return 0;
238#endif
223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
224 irqs_disabled()) { 240 irqs_disabled()) {
225 WARN_ON(1); 241 WARN_ON_ONCE(1);
226 return 0; 242 return 0;
227 } 243 }
228 rcu_barrier(); 244 rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..421abfd3641d 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -40,10 +40,10 @@
40static struct task_struct *rcu_kthread_task; 40static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 42static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 43
45/* Forward declarations for rcutiny_plugin.h. */ 44/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 45struct rcu_ctrlblk;
46static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 48static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
@@ -79,36 +79,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 79#endif /* #ifdef CONFIG_NO_HZ */
80 80
81/* 81/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 82 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 83 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 84 * invoking call_rcu().
85 */ 85 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 87{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 88 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 89 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 90 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 91 return 1;
96 } 92 }
97 local_irq_restore(flags);
98 93
99 return 0; 94 return 0;
100} 95}
101 96
102/* 97/*
98 * Wake up rcu_kthread() to process callbacks now eligible for invocation
99 * or to boost readers.
100 */
101static void invoke_rcu_kthread(void)
102{
103 have_rcu_kthread_work = 1;
104 wake_up(&rcu_kthread_wq);
105}
106
107/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 108 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 109 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 110 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 111 */
107void rcu_sched_qs(int cpu) 112void rcu_sched_qs(int cpu)
108{ 113{
114 unsigned long flags;
115
116 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 117 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 118 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 119 invoke_rcu_kthread();
120 local_irq_restore(flags);
112} 121}
113 122
114/* 123/*
@@ -116,8 +125,12 @@ void rcu_sched_qs(int cpu)
116 */ 125 */
117void rcu_bh_qs(int cpu) 126void rcu_bh_qs(int cpu)
118{ 127{
128 unsigned long flags;
129
130 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 131 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 132 invoke_rcu_kthread();
133 local_irq_restore(flags);
121} 134}
122 135
123/* 136/*
@@ -167,7 +180,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 180 prefetch(next);
168 debug_rcu_head_unqueue(list); 181 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 182 local_bh_disable();
170 list->func(list); 183 __rcu_reclaim(list);
171 local_bh_enable(); 184 local_bh_enable();
172 list = next; 185 list = next;
173 RCU_TRACE(cb_count++); 186 RCU_TRACE(cb_count++);
@@ -208,20 +221,6 @@ static int rcu_kthread(void *arg)
208} 221}
209 222
210/* 223/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 224 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 225 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 226 * Therefore, any legal call to synchronize_sched() is a quiescent
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
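
Within the rcu_read_unlock_special() hunk above, the separate list_del() + INIT_LIST_HEAD() pair becomes a single list_del_init(). The point of that helper is that the removed entry is left pointing at itself, so it can later be tested with list_empty() or unlinked again without a separate re-initialization. Below is a minimal userspace re-implementation of just that property; it is a sketch of the behavior, not the kernel's list.h.

#include <assert.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* Unlink the entry and leave it self-referential, which is the property
 * the diff relies on when it drops the explicit INIT_LIST_HEAD() call. */
static void list_del_init(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	INIT_LIST_HEAD(entry);
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
	struct list_head head, node;

	INIT_LIST_HEAD(&head);
	list_add(&node, &head);
	list_del_init(&node);
	assert(list_empty(&node));	/* safe to test or delete again */
	assert(list_empty(&head));
	printf("list_del_init leaves the node reinitialized\n");
	return 0;
}
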
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -131,7 +131,7 @@ struct rcu_torture {
131 131
132static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
133static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
134static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
146static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_allocerror;
150static long n_rcu_torture_boost_afferror;
151static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
152static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
153static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
163#endif 161#endif
164int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
165 163
166#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
167#define rcu_can_boost() 1 165#define rcu_can_boost() 1
168#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169#define rcu_can_boost() 0 167#define rcu_can_boost() 0
170#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
171 169
172static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
173DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
751 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
752 } 750 }
753 751
752 init_rcu_head_on_stack(&rbi.rcu);
754 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
755 do { 754 do {
756 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
810 809
811 /* Clean up and exit. */ 810 /* Clean up and exit. */
812 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page)
1066 } 1066 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1071 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1072 rcu_torture_current,
1073 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page)
1078 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1079 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1080 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_allocerror,
1082 n_rcu_torture_boost_afferror,
1083 n_rcu_torture_boost_failure, 1081 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 1082 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
1086 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1087 n_rcu_torture_boost_ktrerror != 0 || 1085 n_rcu_torture_boost_ktrerror != 0 ||
1088 n_rcu_torture_boost_rterror != 0 || 1086 n_rcu_torture_boost_rterror != 0 ||
1089 n_rcu_torture_boost_allocerror != 0 ||
1090 n_rcu_torture_boost_afferror != 0 ||
1091 n_rcu_torture_boost_failure != 0) 1087 n_rcu_torture_boost_failure != 0)
1092 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
1093 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void)
1331 int i; 1327 int i;
1332 1328
1333 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1334 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1335 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1336 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1483,6 @@ rcu_torture_init(void)
1486 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1487 n_rcu_torture_boost_ktrerror = 0; 1484 n_rcu_torture_boost_ktrerror = 0;
1488 n_rcu_torture_boost_rterror = 0; 1485 n_rcu_torture_boost_rterror = 0;
1489 n_rcu_torture_boost_allocerror = 0;
1490 n_rcu_torture_boost_afferror = 0;
1491 n_rcu_torture_boost_failure = 0; 1486 n_rcu_torture_boost_failure = 0;
1492 n_rcu_torture_boosts = 0; 1487 n_rcu_torture_boosts = 0;
1493 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1619,7 @@ rcu_torture_init(void)
1624 } 1619 }
1625 } 1620 }
1626 register_reboot_notifier(&rcutorture_shutdown_nb); 1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1627 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1628 return 0; 1624 return 0;
1629 1625
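
The new rcutorture_record_test_transition() and rcutorture_record_progress() calls added above (the functions themselves appear later, in the rcutree.c portion of this patch) exist so that debugfs tracing output can be lined up with rcutorture runs: the test-sequence counter is bumped at both module load and unload, so it is odd while a test is in flight, and the per-test version counter restarts at zero on each transition. A rough userspace analogue of that bookkeeping, with invented names standing in for rcutorture_testseq and rcutorture_vernum:

#include <stdio.h>

static unsigned long testseq;	/* odd while a test is running */
static unsigned long vernum;	/* writer passes within the current test */

static void record_test_transition(void)
{
	testseq++;		/* load -> odd, unload -> even again */
	vernum = 0;
}

static void record_progress(void)
{
	vernum++;
}

int main(void)
{
	record_test_transition();		/* "module load" */
	for (int i = 0; i < 3; i++)
		record_progress();		/* writer iterations */
	printf("test running: %s, testseq=%lu ver=%lu\n",
	       (testseq & 1) ? "yes" : "no", testseq, vernum);
	record_test_transition();		/* "module unload" */
	printf("test running: %s, testseq=%lu ver=%lu\n",
	       (testseq & 1) ? "yes" : "no", testseq, vernum);
	return 0;
}
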
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..54ff7eb92819 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,8 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
50 52
51#include "rcutree.h" 53#include "rcutree.h"
52 54
@@ -79,10 +81,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 83
84static struct rcu_state *rcu_state;
85
82int rcu_scheduler_active __read_mostly; 86int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 87EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 88
85/* 89/*
90 * Control variables for per-CPU and per-rcu_node kthreads. These
91 * handle all flavors of RCU.
92 */
93static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
94DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
95DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
96DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
97static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
98DEFINE_PER_CPU(char, rcu_cpu_has_work);
99static char rcu_kthreads_spawnable;
100
101static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
102static void invoke_rcu_cpu_kthread(void);
103
104#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
105
106/*
107 * Track the rcutorture test sequence number and the update version
108 * number within a given test. The rcutorture_testseq is incremented
109 * on every rcutorture module load and unload, so has an odd value
110 * when a test is running. The rcutorture_vernum is set to zero
111 * when rcutorture starts and is incremented on each rcutorture update.
112 * These variables enable correlating rcutorture output with the
113 * RCU tracing information.
114 */
115unsigned long rcutorture_testseq;
116unsigned long rcutorture_vernum;
117
118/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 119 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 120 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 121 * structure's ->lock, but of course results can be subject to change.
@@ -124,11 +157,12 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 157 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 158 rcu_preempt_note_context_switch(cpu);
126} 159}
160EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 161
128#ifdef CONFIG_NO_HZ 162#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 163DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
130 .dynticks_nesting = 1, 164 .dynticks_nesting = 1,
131 .dynticks = 1, 165 .dynticks = ATOMIC_INIT(1),
132}; 166};
133#endif /* #ifdef CONFIG_NO_HZ */ 167#endif /* #ifdef CONFIG_NO_HZ */
134 168
@@ -140,10 +174,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 174module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 175module_param(qlowmark, int, 0);
142 176
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 177int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 178module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 179
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 180static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 181static int rcu_pending(int cpu);
@@ -176,6 +208,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 208EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 209
178/* 210/*
211 * Record the number of times rcutorture tests have been initiated and
212 * terminated. This information allows the debugfs tracing stats to be
213 * correlated to the rcutorture messages, even when the rcutorture module
214 * is being repeatedly loaded and unloaded. In other words, we cannot
215 * store this state in rcutorture itself.
216 */
217void rcutorture_record_test_transition(void)
218{
219 rcutorture_testseq++;
220 rcutorture_vernum = 0;
221}
222EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
223
224/*
225 * Record the number of writer passes through the current rcutorture test.
226 * This is also used to correlate debugfs tracing stats with the rcutorture
227 * messages.
228 */
229void rcutorture_record_progress(unsigned long vernum)
230{
231 rcutorture_vernum++;
232}
233EXPORT_SYMBOL_GPL(rcutorture_record_progress);
234
235/*
179 * Force a quiescent state for RCU-sched. 236 * Force a quiescent state for RCU-sched.
180 */ 237 */
181void rcu_sched_force_quiescent_state(void) 238void rcu_sched_force_quiescent_state(void)
@@ -234,8 +291,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 291 return 1;
235 } 292 }
236 293
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 294 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 295 if (rdp->preemptible)
239 return 0; 296 return 0;
240 297
241 /* The CPU is online, so send it a reschedule IPI. */ 298 /* The CPU is online, so send it a reschedule IPI. */
@@ -264,13 +321,25 @@ void rcu_enter_nohz(void)
264 unsigned long flags; 321 unsigned long flags;
265 struct rcu_dynticks *rdtp; 322 struct rcu_dynticks *rdtp;
266 323
267 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
268 local_irq_save(flags); 324 local_irq_save(flags);
269 rdtp = &__get_cpu_var(rcu_dynticks); 325 rdtp = &__get_cpu_var(rcu_dynticks);
270 rdtp->dynticks++; 326 if (--rdtp->dynticks_nesting) {
271 rdtp->dynticks_nesting--; 327 local_irq_restore(flags);
272 WARN_ON_ONCE(rdtp->dynticks & 0x1); 328 return;
329 }
330 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
331 smp_mb__before_atomic_inc(); /* See above. */
332 atomic_inc(&rdtp->dynticks);
333 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
334 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
273 local_irq_restore(flags); 335 local_irq_restore(flags);
336
337 /* If the interrupt queued a callback, get out of dyntick mode. */
338 if (in_irq() &&
339 (__get_cpu_var(rcu_sched_data).nxtlist ||
340 __get_cpu_var(rcu_bh_data).nxtlist ||
341 rcu_preempt_needs_cpu(smp_processor_id())))
342 set_need_resched();
274} 343}
275 344
276/* 345/*
@@ -286,11 +355,16 @@ void rcu_exit_nohz(void)
286 355
287 local_irq_save(flags); 356 local_irq_save(flags);
288 rdtp = &__get_cpu_var(rcu_dynticks); 357 rdtp = &__get_cpu_var(rcu_dynticks);
289 rdtp->dynticks++; 358 if (rdtp->dynticks_nesting++) {
290 rdtp->dynticks_nesting++; 359 local_irq_restore(flags);
291 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 360 return;
361 }
362 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
363 atomic_inc(&rdtp->dynticks);
364 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
365 smp_mb__after_atomic_inc(); /* See above. */
366 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
292 local_irq_restore(flags); 367 local_irq_restore(flags);
293 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
294} 368}
295 369
296/** 370/**
@@ -304,11 +378,15 @@ void rcu_nmi_enter(void)
304{ 378{
305 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 379 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
306 380
307 if (rdtp->dynticks & 0x1) 381 if (rdtp->dynticks_nmi_nesting == 0 &&
382 (atomic_read(&rdtp->dynticks) & 0x1))
308 return; 383 return;
309 rdtp->dynticks_nmi++; 384 rdtp->dynticks_nmi_nesting++;
310 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 385 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
311 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 386 atomic_inc(&rdtp->dynticks);
387 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
388 smp_mb__after_atomic_inc(); /* See above. */
389 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
312} 390}
313 391
314/** 392/**
@@ -322,11 +400,14 @@ void rcu_nmi_exit(void)
322{ 400{
323 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 401 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
324 402
325 if (rdtp->dynticks & 0x1) 403 if (rdtp->dynticks_nmi_nesting == 0 ||
404 --rdtp->dynticks_nmi_nesting != 0)
326 return; 405 return;
327 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 406 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
328 rdtp->dynticks_nmi++; 407 smp_mb__before_atomic_inc(); /* See above. */
329 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 408 atomic_inc(&rdtp->dynticks);
409 smp_mb__after_atomic_inc(); /* Force delay to next write. */
410 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
330} 411}
331 412
332/** 413/**
@@ -337,13 +418,7 @@ void rcu_nmi_exit(void)
337 */ 418 */
338void rcu_irq_enter(void) 419void rcu_irq_enter(void)
339{ 420{
340 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 421 rcu_exit_nohz();
341
342 if (rdtp->dynticks_nesting++)
343 return;
344 rdtp->dynticks++;
345 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
346 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
347} 422}
348 423
349/** 424/**
@@ -355,18 +430,7 @@ void rcu_irq_enter(void)
355 */ 430 */
356void rcu_irq_exit(void) 431void rcu_irq_exit(void)
357{ 432{
358 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 433 rcu_enter_nohz();
359
360 if (--rdtp->dynticks_nesting)
361 return;
362 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
363 rdtp->dynticks++;
364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
365
366 /* If the interrupt queued a callback, get out of dyntick mode. */
367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
368 __this_cpu_read(rcu_bh_data.nxtlist))
369 set_need_resched();
370} 434}
371 435
372#ifdef CONFIG_SMP 436#ifdef CONFIG_SMP
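
Taken together, the rcu_enter_nohz()/rcu_exit_nohz()/rcu_nmi_*() and rcu_irq_*() hunks above collapse the old dynticks/dynticks_nmi pair into a single atomic counter whose low bit says whether the CPU is non-idle (odd) or dyntick-idle (even), with a nesting count deciding when the outermost transition actually increments it and memory barriers fencing that increment. The following single-threaded C11 sketch only approximates the state machine (sequentially consistent fetch_add stands in for atomic_inc plus smp_mb__{before,after}_atomic_inc(), and the per-CPU aspect is dropped); initial values mirror the ATOMIC_INIT(1)/nesting=1 initializer shown earlier in the diff.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint dynticks = 1;	/* odd: CPU active, even: dyntick-idle */
static int dynticks_nesting = 1;	/* process/irq nesting level */

static void enter_idle(void)		/* cf. rcu_enter_nohz() */
{
	if (--dynticks_nesting)
		return;			/* still nested: stay "active" */
	unsigned int v = atomic_fetch_add(&dynticks, 1) + 1;
	assert((v & 0x1) == 0);		/* outermost transition: now even */
}

static void exit_idle(void)		/* cf. rcu_exit_nohz() */
{
	if (dynticks_nesting++)
		return;
	unsigned int v = atomic_fetch_add(&dynticks, 1) + 1;
	assert(v & 0x1);		/* outermost transition: now odd */
}

int main(void)
{
	enter_idle();			/* idle loop: counter goes even */
	exit_idle();			/* irq entry (rcu_irq_enter): odd */
	exit_idle();			/* nested irq: nesting only */
	enter_idle();
	enter_idle();			/* outermost irq exit: even again */
	assert((atomic_load(&dynticks) & 0x1) == 0);
	printf("dynticks=%u after idle/irq/irq-exit sequence\n",
	       atomic_load(&dynticks));
	return 0;
}
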
@@ -378,19 +442,8 @@ void rcu_irq_exit(void)
378 */ 442 */
379static int dyntick_save_progress_counter(struct rcu_data *rdp) 443static int dyntick_save_progress_counter(struct rcu_data *rdp)
380{ 444{
381 int ret; 445 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
382 int snap; 446 return 0;
383 int snap_nmi;
384
385 snap = rdp->dynticks->dynticks;
386 snap_nmi = rdp->dynticks->dynticks_nmi;
387 smp_mb(); /* Order sampling of snap with end of grace period. */
388 rdp->dynticks_snap = snap;
389 rdp->dynticks_nmi_snap = snap_nmi;
390 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
391 if (ret)
392 rdp->dynticks_fqs++;
393 return ret;
394} 447}
395 448
396/* 449/*
@@ -401,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
401 */ 454 */
402static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 455static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
403{ 456{
404 long curr; 457 unsigned long curr;
405 long curr_nmi; 458 unsigned long snap;
406 long snap;
407 long snap_nmi;
408 459
409 curr = rdp->dynticks->dynticks; 460 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
410 snap = rdp->dynticks_snap; 461 snap = (unsigned long)rdp->dynticks_snap;
411 curr_nmi = rdp->dynticks->dynticks_nmi;
412 snap_nmi = rdp->dynticks_nmi_snap;
413 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
414 462
415 /* 463 /*
416 * If the CPU passed through or entered a dynticks idle phase with 464 * If the CPU passed through or entered a dynticks idle phase with
@@ -420,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
420 * read-side critical section that started before the beginning 468 * read-side critical section that started before the beginning
421 * of the current RCU grace period. 469 * of the current RCU grace period.
422 */ 470 */
423 if ((curr != snap || (curr & 0x1) == 0) && 471 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
424 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
425 rdp->dynticks_fqs++; 472 rdp->dynticks_fqs++;
426 return 1; 473 return 1;
427 } 474 }
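
The detection side shown in the two hunks above now needs only one snapshot: atomic_add_return(0, ...) gives a fully ordered read of the counter, and the CPU is deemed to have passed through a quiescent state if the value is currently even (dyntick-idle right now) or has advanced by at least two since the snapshot (it was idle at some point in between). A hedged userspace rendering of that check, using a plain wrap-safe unsigned difference in place of ULONG_CMP_GE(curr, snap + 2):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint dynticks = 1;	/* odd: CPU active, even: idle */

/* cf. dyntick_save_progress_counter(): a fully ordered read. */
static unsigned int snapshot(void)
{
	return atomic_fetch_add(&dynticks, 0);	/* like atomic_add_return(0, ...) */
}

/* cf. rcu_implicit_dynticks_qs(): quiescent if idle now (even counter)
 * or if the counter advanced by >= 2 since the snapshot. */
static int in_quiescent_state(unsigned int snap)
{
	unsigned int curr = atomic_fetch_add(&dynticks, 0);

	return (curr & 0x1) == 0 || (curr - snap) >= 2;
}

int main(void)
{
	unsigned int snap = snapshot();

	printf("immediately:      %d\n", in_quiescent_state(snap));	/* 0 */
	atomic_fetch_add(&dynticks, 1);		/* CPU enters dyntick idle */
	printf("while idle:       %d\n", in_quiescent_state(snap));	/* 1 */
	atomic_fetch_add(&dynticks, 1);		/* CPU active again */
	printf("went through idle: %d\n", in_quiescent_state(snap));	/* 1 */
	return 0;
}
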
@@ -450,8 +497,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 497
451#endif /* #else #ifdef CONFIG_NO_HZ */ 498#endif /* #else #ifdef CONFIG_NO_HZ */
452 499
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 500int rcu_cpu_stall_suppress __read_mostly;
456 501
457static void record_gp_stall_check_time(struct rcu_state *rsp) 502static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +582,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 582
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 583static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 584{
540 long delta; 585 unsigned long j;
586 unsigned long js;
541 struct rcu_node *rnp; 587 struct rcu_node *rnp;
542 588
543 if (rcu_cpu_stall_suppress) 589 if (rcu_cpu_stall_suppress)
544 return; 590 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 591 j = ACCESS_ONCE(jiffies);
592 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 593 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 594 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 595
549 /* We haven't checked in, so go dump stack. */ 596 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 597 print_cpu_stall(rsp);
551 598
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 599 } else if (rcu_gp_in_progress(rsp) &&
600 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 601
554 /* They had two time units to dump stack, so complain. */ 602 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 603 print_other_cpu_stall(rsp);
556 } 604 }
557} 605}
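
The stall check above now compares raw jiffies values rather than a signed delta, so it must be wraparound-safe; ULONG_CMP_GE() treats the two free-running counters as points on a circle rather than as absolute magnitudes. The standalone sketch below paraphrases the kernel's one-line definition and simulates jiffies near the wrap point to show why the naive j >= js comparison would misfire there.

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a is at or after b" for free-running unsigned counters,
 * in the spirit of the kernel's ULONG_CMP_GE() (definition paraphrased). */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long js = ULONG_MAX - 5;	/* stall deadline set just before wrap */
	unsigned long j;

	j = js - 10;				/* well before the deadline */
	printf("before deadline: %d\n", ULONG_CMP_GE(j, js) ? 1 : 0);	/* 0 */

	j = js + 20;				/* jiffies has wrapped past ULONG_MAX */
	printf("past deadline:   %d\n", ULONG_CMP_GE(j, js) ? 1 : 0);	/* 1 */
	return 0;
}
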
@@ -587,26 +635,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 635 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 636}
589 637
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 638/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 639 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 640 * This is used both when we started the grace period and when we notice
@@ -809,6 +837,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 837 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 838 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 839 rcu_start_gp_per_cpu(rsp, rnp, rdp);
840 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 841 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 842 return;
814 } 843 }
@@ -844,6 +873,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 873 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 874 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 875 rcu_start_gp_per_cpu(rsp, rnp, rdp);
876 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 877 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 878 }
849 879
@@ -864,7 +894,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 894static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 895 __releases(rcu_get_root(rsp)->lock)
866{ 896{
897 unsigned long gp_duration;
898
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 899 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
900
901 /*
902 * Ensure that all grace-period and pre-grace-period activity
903 * is seen before the assignment to rsp->completed.
904 */
905 smp_mb(); /* See above block comment. */
906 gp_duration = jiffies - rsp->gp_start;
907 if (gp_duration > rsp->gp_max)
908 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 909 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 910 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 911 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +935,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 935 return;
895 } 936 }
896 rnp->qsmask &= ~mask; 937 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 938 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 939
899 /* Other bits still set at this level, so done. */ 940 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 941 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1078,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1078/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1079 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1080 * and move all callbacks from the outgoing CPU to the current one.
1081 * There can only be one CPU hotplug operation at a time, so no other
1082 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1083 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1084static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1085{
@@ -1045,6 +1088,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1045 int need_report = 0; 1088 int need_report = 0;
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1089 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1090 struct rcu_node *rnp;
1091 struct task_struct *t;
1092
1093 /* Stop the CPU's kthread. */
1094 t = per_cpu(rcu_cpu_kthread_task, cpu);
1095 if (t != NULL) {
1096 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1097 kthread_stop(t);
1098 }
1048 1099
1049 /* Exclude any attempts to start a new grace period. */ 1100 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1101 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -1082,6 +1133,22 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1133 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1134 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1135 rcu_report_exp_rnp(rsp, rnp);
1136
1137 /*
1138 * If there are no more online CPUs for this rcu_node structure,
1139 * kill the rcu_node structure's kthread. Otherwise, adjust its
1140 * affinity.
1141 */
1142 t = rnp->node_kthread_task;
1143 if (t != NULL &&
1144 rnp->qsmaskinit == 0) {
1145 raw_spin_lock_irqsave(&rnp->lock, flags);
1146 rnp->node_kthread_task = NULL;
1147 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1148 kthread_stop(t);
1149 rcu_stop_boost_kthread(rnp);
1150 } else
1151 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1152}
1086 1153
1087/* 1154/*
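
rcu_node_kthread_setaffinity(), called in the offline path above and defined later in this patch, rebuilds the node kthread's allowed-CPU mask from rnp->qsmaskinit while leaving out the CPU that is going away (the kernel function also has a fallback when the result would be empty, which this sketch omits). A small userspace approximation using cpu_set_t; the CPU range and bitmask are invented for illustration.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Build the affinity set for a hypothetical node covering CPUs [lo, hi],
 * online CPUs given by 'mask' (bit 0 == lo), excluding 'outgoingcpu'
 * (-1 for none) -- the same shape as rcu_node_kthread_setaffinity(). */
static void build_affinity(int lo, int hi, unsigned long mask,
			   int outgoingcpu, cpu_set_t *cm)
{
	CPU_ZERO(cm);
	for (int cpu = lo; cpu <= hi; cpu++, mask >>= 1)
		if ((mask & 0x1) && cpu != outgoingcpu)
			CPU_SET(cpu, cm);
}

int main(void)
{
	cpu_set_t cm;

	build_affinity(0, 3, 0xf /* CPUs 0-3 online */, 2 /* CPU 2 leaving */, &cm);
	for (int cpu = 0; cpu <= 3; cpu++)
		printf("cpu%d: %s\n", cpu,
		       CPU_ISSET(cpu, &cm) ? "allowed" : "excluded");
	return 0;
}
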
@@ -1143,7 +1210,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1210 next = list->next;
1144 prefetch(next); 1211 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1212 debug_rcu_head_unqueue(list);
1146 list->func(list); 1213 __rcu_reclaim(list);
1147 list = next; 1214 list = next;
1148 if (++count >= rdp->blimit) 1215 if (++count >= rdp->blimit)
1149 break; 1216 break;
@@ -1179,7 +1246,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1246
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1247 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1248 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1249 invoke_rcu_cpu_kthread();
1183} 1250}
1184 1251
1185/* 1252/*
@@ -1225,7 +1292,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1292 }
1226 rcu_preempt_check_callbacks(cpu); 1293 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1294 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1295 invoke_rcu_cpu_kthread();
1229} 1296}
1230 1297
1231#ifdef CONFIG_SMP 1298#ifdef CONFIG_SMP
@@ -1233,6 +1300,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1300/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1301 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1302 * have not yet encountered a quiescent state, using the function specified.
1303 * Also initiate boosting for any threads blocked on the root rcu_node.
1304 *
1236 * The caller must have suppressed start of new grace periods. 1305 * The caller must have suppressed start of new grace periods.
1237 */ 1306 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1307static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,6 +1320,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1320 return;
1252 } 1321 }
1253 if (rnp->qsmask == 0) { 1322 if (rnp->qsmask == 0) {
1323 rcu_initiate_boost(rnp);
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1324 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1255 continue; 1325 continue;
1256 } 1326 }
@@ -1269,6 +1339,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1339 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1340 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1341 }
1342 rnp = rcu_get_root(rsp);
1343 raw_spin_lock_irqsave(&rnp->lock, flags);
1344 if (rnp->qsmask == 0)
1345 rcu_initiate_boost(rnp);
1346 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1272} 1347}
1273 1348
1274/* 1349/*
@@ -1389,31 +1464,360 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1389/* 1464/*
1390 * Do softirq processing for the current CPU. 1465 * Do softirq processing for the current CPU.
1391 */ 1466 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1467static void rcu_process_callbacks(void)
1393{ 1468{
1394 /*
1395 * Memory references from any prior RCU read-side critical sections
1396 * executed by the interrupted code must be seen before any RCU
1397 * grace-period manipulations below.
1398 */
1399 smp_mb(); /* See above block comment. */
1400
1401 __rcu_process_callbacks(&rcu_sched_state, 1469 __rcu_process_callbacks(&rcu_sched_state,
1402 &__get_cpu_var(rcu_sched_data)); 1470 &__get_cpu_var(rcu_sched_data));
1403 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1471 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1404 rcu_preempt_process_callbacks(); 1472 rcu_preempt_process_callbacks();
1405 1473
1406 /*
1407 * Memory references from any later RCU read-side critical sections
1408 * executed by the interrupted code must be seen after any RCU
1409 * grace-period manipulations above.
1410 */
1411 smp_mb(); /* See above block comment. */
1412
1413 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1474 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1414 rcu_needs_cpu_flush(); 1475 rcu_needs_cpu_flush();
1415} 1476}
1416 1477
1478/*
1479 * Wake up the current CPU's kthread. This replaces raise_softirq()
1480 * in earlier versions of RCU. Note that because we are running on
1481 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1482 * cannot disappear out from under us.
1483 */
1484static void invoke_rcu_cpu_kthread(void)
1485{
1486 unsigned long flags;
1487
1488 local_irq_save(flags);
1489 __this_cpu_write(rcu_cpu_has_work, 1);
1490 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1491 local_irq_restore(flags);
1492 return;
1493 }
1494 wake_up(&__get_cpu_var(rcu_cpu_wq));
1495 local_irq_restore(flags);
1496}
1497
1498/*
1499 * Wake up the specified per-rcu_node-structure kthread.
1500 * The caller must hold ->lock.
1501 */
1502static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1503{
1504 struct task_struct *t;
1505
1506 t = rnp->node_kthread_task;
1507 if (t != NULL)
1508 wake_up_process(t);
1509}
1510
1511/*
1512 * Set the specified CPU's kthread to run RT or not, as specified by
1513 * the to_rt argument. The CPU-hotplug locks are held, so the task
1514 * is not going away.
1515 */
1516static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1517{
1518 int policy;
1519 struct sched_param sp;
1520 struct task_struct *t;
1521
1522 t = per_cpu(rcu_cpu_kthread_task, cpu);
1523 if (t == NULL)
1524 return;
1525 if (to_rt) {
1526 policy = SCHED_FIFO;
1527 sp.sched_priority = RCU_KTHREAD_PRIO;
1528 } else {
1529 policy = SCHED_NORMAL;
1530 sp.sched_priority = 0;
1531 }
1532 sched_setscheduler_nocheck(t, policy, &sp);
1533}
1534
1535/*
1536 * Timer handler to initiate the waking up of per-CPU kthreads that
1537 * have yielded the CPU due to excess numbers of RCU callbacks.
1538 * We wake up the per-rcu_node kthread, which in turn will wake up
1539 * the booster kthread.
1540 */
1541static void rcu_cpu_kthread_timer(unsigned long arg)
1542{
1543 unsigned long flags;
1544 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1545 struct rcu_node *rnp = rdp->mynode;
1546
1547 raw_spin_lock_irqsave(&rnp->lock, flags);
1548 rnp->wakemask |= rdp->grpmask;
1549 invoke_rcu_node_kthread(rnp);
1550 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1551}
1552
1553/*
1554 * Drop to non-real-time priority and yield, but only after posting a
1555 * timer that will cause us to regain our real-time priority if we
1556 * remain preempted. Either way, we restore our real-time priority
1557 * before returning.
1558 */
1559static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1560{
1561 struct sched_param sp;
1562 struct timer_list yield_timer;
1563
1564 setup_timer_on_stack(&yield_timer, f, arg);
1565 mod_timer(&yield_timer, jiffies + 2);
1566 sp.sched_priority = 0;
1567 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1568 set_user_nice(current, 19);
1569 schedule();
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1572 del_timer(&yield_timer);
1573}
1574
1575/*
1576 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1577 * This can happen while the corresponding CPU is either coming online
1578 * or going offline. We cannot wait until the CPU is fully online
1579 * before starting the kthread, because the various notifier functions
1580 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1581 * the corresponding CPU is online.
1582 *
1583 * Return 1 if the kthread needs to stop, 0 otherwise.
1584 *
1585 * Caller must disable bh. This function can momentarily enable it.
1586 */
1587static int rcu_cpu_kthread_should_stop(int cpu)
1588{
1589 while (cpu_is_offline(cpu) ||
1590 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1591 smp_processor_id() != cpu) {
1592 if (kthread_should_stop())
1593 return 1;
1594 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1595 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1596 local_bh_enable();
1597 schedule_timeout_uninterruptible(1);
1598 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1599 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1600 local_bh_disable();
1601 }
1602 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1603 return 0;
1604}
1605
1606/*
1607 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1608 * earlier RCU softirq.
1609 */
1610static int rcu_cpu_kthread(void *arg)
1611{
1612 int cpu = (int)(long)arg;
1613 unsigned long flags;
1614 int spincnt = 0;
1615 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1616 wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
1617 char work;
1618 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1619
1620 for (;;) {
1621 *statusp = RCU_KTHREAD_WAITING;
1622 wait_event_interruptible(*wqp,
1623 *workp != 0 || kthread_should_stop());
1624 local_bh_disable();
1625 if (rcu_cpu_kthread_should_stop(cpu)) {
1626 local_bh_enable();
1627 break;
1628 }
1629 *statusp = RCU_KTHREAD_RUNNING;
1630 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1631 local_irq_save(flags);
1632 work = *workp;
1633 *workp = 0;
1634 local_irq_restore(flags);
1635 if (work)
1636 rcu_process_callbacks();
1637 local_bh_enable();
1638 if (*workp != 0)
1639 spincnt++;
1640 else
1641 spincnt = 0;
1642 if (spincnt > 10) {
1643 *statusp = RCU_KTHREAD_YIELDING;
1644 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1645 spincnt = 0;
1646 }
1647 }
1648 *statusp = RCU_KTHREAD_STOPPED;
1649 return 0;
1650}
1651
1652/*
1653 * Spawn a per-CPU kthread, setting up affinity and priority.
1654 * Because the CPU hotplug lock is held, no other CPU will be attempting
1655 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1656 * attempting to access it during boot, but the locking in kthread_bind()
1657 * will enforce sufficient ordering.
1658 */
1659static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1660{
1661 struct sched_param sp;
1662 struct task_struct *t;
1663
1664 if (!rcu_kthreads_spawnable ||
1665 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1666 return 0;
1667 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1668 if (IS_ERR(t))
1669 return PTR_ERR(t);
1670 kthread_bind(t, cpu);
1671 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1672 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1673 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1674 wake_up_process(t);
1675 sp.sched_priority = RCU_KTHREAD_PRIO;
1676 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1677 return 0;
1678}
1679
1680/*
1681 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1682 * kthreads when needed. We ignore requests to wake up kthreads
1683 * for offline CPUs, which is OK because force_quiescent_state()
1684 * takes care of this case.
1685 */
1686static int rcu_node_kthread(void *arg)
1687{
1688 int cpu;
1689 unsigned long flags;
1690 unsigned long mask;
1691 struct rcu_node *rnp = (struct rcu_node *)arg;
1692 struct sched_param sp;
1693 struct task_struct *t;
1694
1695 for (;;) {
1696 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1697 wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 ||
1698 kthread_should_stop());
1699 if (kthread_should_stop())
1700 break;
1701 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1702 raw_spin_lock_irqsave(&rnp->lock, flags);
1703 mask = rnp->wakemask;
1704 rnp->wakemask = 0;
1705 rcu_initiate_boost(rnp);
1706 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1707 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1708 if ((mask & 0x1) == 0)
1709 continue;
1710 preempt_disable();
1711 t = per_cpu(rcu_cpu_kthread_task, cpu);
1712 if (!cpu_online(cpu) || t == NULL) {
1713 preempt_enable();
1714 continue;
1715 }
1716 per_cpu(rcu_cpu_has_work, cpu) = 1;
1717 sp.sched_priority = RCU_KTHREAD_PRIO;
1718 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1719 preempt_enable();
1720 }
1721 }
1722 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1723 return 0;
1724}
1725
1726/*
1727 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1728 * served by the rcu_node in question. The CPU hotplug lock is still
1729 * held, so the value of rnp->qsmaskinit will be stable.
1730 *
1731 * We don't include outgoingcpu in the affinity set, use -1 if there is
1732 * no outgoing CPU. If there are no CPUs left in the affinity set,
1733 * this function allows the kthread to execute on any CPU.
1734 */
1735static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1736{
1737 cpumask_var_t cm;
1738 int cpu;
1739 unsigned long mask = rnp->qsmaskinit;
1740
1741 if (rnp->node_kthread_task == NULL || mask == 0)
1742 return;
1743 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1744 return;
1745 cpumask_clear(cm);
1746 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1747 if ((mask & 0x1) && cpu != outgoingcpu)
1748 cpumask_set_cpu(cpu, cm);
1749 if (cpumask_weight(cm) == 0) {
1750 cpumask_setall(cm);
1751 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1752 cpumask_clear_cpu(cpu, cm);
1753 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1754 }
1755 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1756 rcu_boost_kthread_setaffinity(rnp, cm);
1757 free_cpumask_var(cm);
1758}
1759
1760/*
1761 * Spawn a per-rcu_node kthread, setting priority and affinity.
1762 * Called during boot before online/offline can happen, or, if
1763 * during runtime, with the main CPU-hotplug locks held. So only
1764 * one of these can be executing at a time.
1765 */
1766static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1767 struct rcu_node *rnp)
1768{
1769 unsigned long flags;
1770 int rnp_index = rnp - &rsp->node[0];
1771 struct sched_param sp;
1772 struct task_struct *t;
1773
1774 if (!rcu_kthreads_spawnable ||
1775 rnp->qsmaskinit == 0)
1776 return 0;
1777 if (rnp->node_kthread_task == NULL) {
1778 t = kthread_create(rcu_node_kthread, (void *)rnp,
1779 "rcun%d", rnp_index);
1780 if (IS_ERR(t))
1781 return PTR_ERR(t);
1782 raw_spin_lock_irqsave(&rnp->lock, flags);
1783 rnp->node_kthread_task = t;
1784 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1785 wake_up_process(t);
1786 sp.sched_priority = 99;
1787 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1788 }
1789 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1790}
1791
1792/*
1793 * Spawn all kthreads -- called as soon as the scheduler is running.
1794 */
1795static int __init rcu_spawn_kthreads(void)
1796{
1797 int cpu;
1798 struct rcu_node *rnp;
1799
1800 rcu_kthreads_spawnable = 1;
1801 for_each_possible_cpu(cpu) {
1802 init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
1803 per_cpu(rcu_cpu_has_work, cpu) = 0;
1804 if (cpu_online(cpu))
1805 (void)rcu_spawn_one_cpu_kthread(cpu);
1806 }
1807 rnp = rcu_get_root(rcu_state);
1808 init_waitqueue_head(&rnp->node_wq);
1809 rcu_init_boost_waitqueue(rnp);
1810 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1811 if (NUM_RCU_NODES > 1)
1812 rcu_for_each_leaf_node(rcu_state, rnp) {
1813 init_waitqueue_head(&rnp->node_wq);
1814 rcu_init_boost_waitqueue(rnp);
1815 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1816 }
1817 return 0;
1818}
1819early_initcall(rcu_spawn_kthreads);
1820
1417static void 1821static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1822__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1823 struct rcu_state *rsp)
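
The bulk of the hunk above moves RCU callback invocation out of the RCU_SOFTIRQ handler and into a per-CPU "rcuc%d" kthread: invoke_rcu_cpu_kthread() merely sets a per-CPU work flag and wakes the thread, and the thread loops waiting for work, running callbacks, and yielding if it has been busy for too many consecutive passes. The pthread sketch below reproduces only that wake/sleep/yield skeleton, with a condition variable in place of the kernel wait queue and sched_yield() in place of rcu_yield()'s priority-and-timer trick; it is an analogy, not the kernel's code.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;	/* cf. rcu_cpu_wq */
static int has_work;					/* cf. rcu_cpu_has_work */
static int stop;

/* cf. invoke_rcu_cpu_kthread(): record work and wake the worker. */
static void invoke_worker(void)
{
	pthread_mutex_lock(&lock);
	has_work = 1;
	pthread_cond_signal(&wq);
	pthread_mutex_unlock(&lock);
}

static void process_callbacks(void)
{
	printf("processing callbacks\n");	/* stand-in for rcu_process_callbacks() */
}

/* cf. rcu_cpu_kthread(): wait for work, run it, yield after spinning. */
static void *worker(void *arg)
{
	int spincnt = 0;
	int work, stopping;

	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!has_work && !stop)
			pthread_cond_wait(&wq, &lock);
		work = has_work;
		has_work = 0;
		stopping = stop;
		pthread_mutex_unlock(&lock);

		if (work) {
			process_callbacks();
			if (++spincnt > 10) {	/* busy too long: step aside */
				sched_yield();	/* cf. rcu_yield() */
				spincnt = 0;
			}
			continue;		/* drain work queued meanwhile */
		}
		spincnt = 0;
		if (stopping)
			break;
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	invoke_worker();

	pthread_mutex_lock(&lock);
	stop = 1;
	pthread_cond_signal(&wq);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}
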
@@ -1439,6 +1843,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1843 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1844 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1845 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1846 rdp->qlen++;
1847
1848 /* If interrupts were disabled, don't dive into RCU core. */
1849 if (irqs_disabled_flags(flags)) {
1850 local_irq_restore(flags);
1851 return;
1852 }
1442 1853
1443 /* 1854 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1855 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1858,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1858 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1859 * is the only one waiting for a grace period to complete.
1449 */ 1860 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1861 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1862
1452 /* Are we ignoring a completed grace period? */ 1863 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1864 rcu_process_gp_end(rsp, rdp);
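
The two __call_rcu() hunks above make the function maintain ->qlen explicitly, bail out early when called with interrupts disabled, and escalate (forcing a grace period) only once the queue has grown by more than qhimark entries since the last such check. The high-water-mark idea in isolation, with invented names, an illustrative threshold value, and a stub for the escalation:

#include <stdio.h>

static long qhimark = 10000;		/* escalation threshold; stands in for the
					 * qhimark module parameter (value illustrative) */
static long qlen;			/* callbacks currently queued */
static long qlen_last_fqs_check;	/* queue length at the last escalation */

static void force_quiescent_state(void)
{
	printf("escalating: forcing a grace period at qlen=%ld\n", qlen);
	qlen_last_fqs_check = qlen;
}

/* cf. the tail of __call_rcu(): enqueue bookkeeping plus high-water check. */
static void call_rcu_stub(void)
{
	qlen++;
	if (qlen > qlen_last_fqs_check + qhimark)
		force_quiescent_state();
}

int main(void)
{
	for (long i = 0; i < 25000; i++)
		call_rcu_stub();
	printf("queued %ld callbacks\n", qlen);
	return 0;
}
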
@@ -1583,7 +1994,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1994 * or RCU-bh, force a local reschedule.
1584 */ 1995 */
1585 rdp->n_rp_qs_pending++; 1996 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1997 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1998 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 1999 jiffies))
1589 set_need_resched(); 2000 set_need_resched();
@@ -1760,7 +2171,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2171 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 2172 */
1762static void __cpuinit 2173static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 2174rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 2175{
1765 unsigned long flags; 2176 unsigned long flags;
1766 unsigned long mask; 2177 unsigned long mask;
@@ -1772,7 +2183,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 2183 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 2184 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 2185 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 2186 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 2187 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 2188 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 2189 rdp->blimit = blimit;
@@ -1813,6 +2224,19 @@ static void __cpuinit rcu_online_cpu(int cpu)
1813 rcu_preempt_init_percpu_data(cpu); 2224 rcu_preempt_init_percpu_data(cpu);
1814} 2225}
1815 2226
2227static void __cpuinit rcu_online_kthreads(int cpu)
2228{
2229 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2230 struct rcu_node *rnp = rdp->mynode;
2231
2232 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
2233 if (rcu_kthreads_spawnable) {
2234 (void)rcu_spawn_one_cpu_kthread(cpu);
2235 if (rnp->node_kthread_task == NULL)
2236 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
2237 }
2238}
2239
1816/* 2240/*
1817 * Handle CPU online/offline notification events. 2241 * Handle CPU online/offline notification events.
1818 */ 2242 */
@@ -1820,11 +2244,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 2244 unsigned long action, void *hcpu)
1821{ 2245{
1822 long cpu = (long)hcpu; 2246 long cpu = (long)hcpu;
2247 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2248 struct rcu_node *rnp = rdp->mynode;
1823 2249
1824 switch (action) { 2250 switch (action) {
1825 case CPU_UP_PREPARE: 2251 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 2252 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 2253 rcu_online_cpu(cpu);
2254 rcu_online_kthreads(cpu);
2255 break;
2256 case CPU_ONLINE:
2257 case CPU_DOWN_FAILED:
2258 rcu_node_kthread_setaffinity(rnp, -1);
2259 rcu_cpu_kthread_setrt(cpu, 1);
2260 break;
2261 case CPU_DOWN_PREPARE:
2262 rcu_node_kthread_setaffinity(rnp, cpu);
2263 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 2264 break;
1829 case CPU_DYING: 2265 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 2266 case CPU_DYING_FROZEN:
@@ -1943,10 +2379,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2379 j / rsp->levelspread[i - 1];
1944 } 2380 }
1945 rnp->level = i; 2381 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2382 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2383 }
1951 } 2384 }
1952 2385
@@ -1968,7 +2401,6 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2402 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2403 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2404
1973 /* 2405 /*
1974 * We don't need protection against CPU-hotplug here because 2406 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..a6a97171dac6 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,13 +84,19 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
90 /* not in nmi handler, else odd. So this */
91 /* remains even for nmi from irq handler. */
92}; 90};
93 91
92/* RCU's kthread states for tracing. */
93#define RCU_KTHREAD_STOPPED 0
94#define RCU_KTHREAD_RUNNING 1
95#define RCU_KTHREAD_WAITING 2
96#define RCU_KTHREAD_OFFCPU 3
97#define RCU_KTHREAD_YIELDING 4
98#define RCU_KTHREAD_MAX 4
99
94/* 100/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 101 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 102 */
@@ -109,10 +115,11 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 115 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 116 /* bit corresponds to a child rcu_node */
111 /* structure. */ 117 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 118 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 120 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
122 unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
116 unsigned long qsmaskinit; 123 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 124 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 125 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +129,68 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 129 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 130 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 131 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 132 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 133 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 134 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 135 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 136 struct list_head *gp_tasks;
137 /* Pointer to the first task blocking the */
138 /* current grace period, or NULL if there */
139 /* is no such task. */
140 struct list_head *exp_tasks;
141 /* Pointer to the first task blocking the */
142 /* current expedited grace period, or NULL */
143 /* if there is no such task. If there */
144 /* is no current expedited grace period, */
145 /* then there cannot be any such task. */
146#ifdef CONFIG_RCU_BOOST
147 struct list_head *boost_tasks;
148 /* Pointer to first task that needs to be */
149 /* priority boosted, or NULL if no priority */
150 /* boosting is needed for this rcu_node */
151 /* structure. If there are no tasks */
152 /* queued on this rcu_node structure that */
153 /* are blocking the current grace period, */
154 /* there can be no such task. */
155 unsigned long boost_time;
156 /* When to start boosting (jiffies). */
157 struct task_struct *boost_kthread_task;
158 /* kthread that takes care of priority */
159 /* boosting for this rcu_node structure. */
160 wait_queue_head_t boost_wq;
161 /* Wait queue on which to park the boost */
162 /* kthread. */
163 unsigned int boost_kthread_status;
164 /* State of boost_kthread_task for tracing. */
165 unsigned long n_tasks_boosted;
166 /* Total number of tasks boosted. */
167 unsigned long n_exp_boosts;
168 /* Number of tasks boosted for expedited GP. */
169 unsigned long n_normal_boosts;
170 /* Number of tasks boosted for normal GP. */
171 unsigned long n_balk_blkd_tasks;
172 /* Refused to boost: no blocked tasks. */
173 unsigned long n_balk_exp_gp_tasks;
174 /* Refused to boost: nothing blocking GP. */
175 unsigned long n_balk_boost_tasks;
176 /* Refused to boost: already boosting. */
177 unsigned long n_balk_notblocked;
178 /* Refused to boost: RCU RS CS still running. */
179 unsigned long n_balk_notyet;
180 /* Refused to boost: not yet time. */
181 unsigned long n_balk_nos;
182 /* Refused to boost: not sure why, though. */
183 /* This can happen due to race conditions. */
184#endif /* #ifdef CONFIG_RCU_BOOST */
185 struct task_struct *node_kthread_task;
186 /* kthread that takes care of this rcu_node */
187 /* structure, for example, awakening the */
188 /* per-CPU kthreads as needed. */
189 wait_queue_head_t node_wq;
190 /* Wait queue on which to park the per-node */
191 /* kthread. */
192 unsigned int node_kthread_status;
193 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 194} ____cacheline_internodealigned_in_smp;
131 195
132/* 196/*
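
The comments added above describe the invariant behind the new layout: a single ->blkd_tasks list with newly blocked tasks placed at the head, plus gp_tasks and exp_tasks (and, under CONFIG_RCU_BOOST, boost_tasks) acting as cursors into that list, each marking where the tasks still blocking the corresponding grace period begin. A deliberately simplified, singly linked userspace sketch of that cursor discipline follows; all names are invented and there is no locking.

#include <assert.h>
#include <stdio.h>

struct blocked_reader {
	struct blocked_reader *next;	/* toward older entries (tail) */
	int id;
};

struct node_ctrl {
	struct blocked_reader *head;	   /* newest blocked reader */
	struct blocked_reader *gp_tasks;   /* first reader blocking normal GP */
	struct blocked_reader *exp_tasks;  /* first reader blocking expedited GP */
};

/* A reader blocks: new arrivals go at the head and age toward the tail. */
static void reader_block(struct node_ctrl *np, struct blocked_reader *r)
{
	r->next = np->head;
	np->head = r;
}

/* A new grace period starts: every reader already queued blocks it. */
static void gp_start(struct node_ctrl *np)
{
	np->gp_tasks = np->head;
}

/* The reader at *pp leaves its critical section: unlink it and advance
 * any cursor that referenced it, as rcu_read_unlock_special() does with
 * rcu_next_node_entry()/list_del_init(). */
static void reader_unblock(struct node_ctrl *np, struct blocked_reader **pp)
{
	struct blocked_reader *r = *pp;

	if (np->gp_tasks == r)
		np->gp_tasks = r->next;
	if (np->exp_tasks == r)
		np->exp_tasks = r->next;
	*pp = r->next;
	r->next = NULL;
}

int main(void)
{
	struct blocked_reader a = { .id = 1 }, b = { .id = 2 };
	struct node_ctrl nc = { 0 };

	reader_block(&nc, &a);
	gp_start(&nc);			/* 'a' blocks the new grace period */
	reader_block(&nc, &b);		/* 'b' arrived later: does not block it */
	assert(nc.gp_tasks == &a);

	reader_unblock(&nc, &nc.head->next);	/* 'a' finishes */
	assert(nc.gp_tasks == NULL);		/* grace period can now end */
	printf("gp_tasks drained\n");
	return 0;
}
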
@@ -175,7 +239,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 239 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 240 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 241 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 242 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 243 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 244 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 245
@@ -218,7 +282,6 @@ struct rcu_data {
218 /* 3) dynticks interface. */ 282 /* 3) dynticks interface. */
219 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 283 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
220 int dynticks_snap; /* Per-GP tracking for dynticks. */ 284 int dynticks_snap; /* Per-GP tracking for dynticks. */
221 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
222#endif /* #ifdef CONFIG_NO_HZ */ 285#endif /* #ifdef CONFIG_NO_HZ */
223 286
224 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 287 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -254,7 +317,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 317#endif /* #else #ifdef CONFIG_NO_HZ */
255 318
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 319#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 320
259#ifdef CONFIG_PROVE_RCU 321#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 322#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +334,6 @@ struct rcu_data {
272 /* scheduling clock irq */ 334 /* scheduling clock irq */
273 /* before ratting on them. */ 335 /* before ratting on them. */
274 336
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
276#define RCU_CPU_STALL_SUPPRESS_INIT 0
277#else
278#define RCU_CPU_STALL_SUPPRESS_INIT 1
279#endif
280
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
282 337
283/* 338/*
284 * RCU global state, including node hierarchy. This hierarchy is 339 * RCU global state, including node hierarchy. This hierarchy is
@@ -325,12 +380,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 380 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 381 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 382 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 383 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 384 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 385 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 386 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 387 unsigned long gp_max; /* Maximum GP duration in */
388 /* jiffies. */
334 char *name; /* Name of structure. */ 389 char *name; /* Name of structure. */
335}; 390};
336 391
@@ -361,16 +416,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 416static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 417long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 418static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 419static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 420#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 421static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 422 unsigned long flags);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 423#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 424static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 425static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 426static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 427static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 428#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 429static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +443,16 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 443static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 444static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 445static void rcu_needs_cpu_flush(void);
446static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
447static void rcu_initiate_boost(struct rcu_node *rnp);
448static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
449 cpumask_var_t cm);
450static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
451static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
452 struct rcu_node *rnp,
453 int rnp_index);
454#ifdef CONFIG_HOTPLUG_CPU
455static void rcu_stop_boost_kthread(struct rcu_node *rnp);
456#endif /* #ifdef CONFIG_HOTPLUG_CPU */
393 457
394#endif /* #ifndef RCU_TREE_NONCORE */ 458#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..f629479d4b1f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 71static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 72
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 75 */
79static void __init rcu_bootup_announce(void) 76static void __init rcu_bootup_announce(void)
80{ 77{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 78 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 79 rcu_bootup_announce_oddness();
83} 80}
84 81
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 108EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 109
113/* 110/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 111 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 112 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 113 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 114 * while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 131 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 132 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 133 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 134 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 135 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 136 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 137 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 138 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 139 * rnp->gp_tasks becomes NULL.
143 * 140 *
144 * Caller must disable preemption. 141 * Caller must disable preemption.
145 */ 142 */
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 144{
148 struct task_struct *t = current; 145 struct task_struct *t = current;
149 unsigned long flags; 146 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 147 struct rcu_data *rdp;
152 struct rcu_node *rnp; 148 struct rcu_node *rnp;
153 149
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 165 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 166 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 167 * as that task remains queued, the current grace period
172 * cannot end. 168 * cannot end. Note that there is some uncertainty as
169 * to exactly when the current grace period started.
170 * We take a conservative approach, which can result
171 * in unnecessarily waiting on tasks that started very
172 * slightly after the current grace period began. C'est
173 * la vie!!!
173 * 174 *
174 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
175 * on line! 176 * on line!
176 */ 177 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
182 rnp->gp_tasks = &t->rcu_node_entry;
183#ifdef CONFIG_RCU_BOOST
184 if (rnp->boost_tasks != NULL)
185 rnp->boost_tasks = rnp->gp_tasks;
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 } else {
188 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
189 if (rnp->qsmask & rdp->grpmask)
190 rnp->gp_tasks = &t->rcu_node_entry;
191 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
182 } 193 }
183 194
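
The hunk above is the heart of the conversion: a task that blocks inside an RCU read-side critical section is queued on the single ->blkd_tasks list, and ->gp_tasks marks the first entry that is holding up the current grace period. The following user-space sketch reproduces only that list discipline; the list primitives, the task structure, and the three scenarios are stand-alone toys rather than kernel code.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

/* Insert "item" immediately after "pos", as the kernel's list_add() does. */
static void list_add_after(struct list_head *item, struct list_head *pos)
{
    item->next = pos->next;
    item->prev = pos;
    pos->next->prev = item;
    pos->next = item;
}

struct task {
    const char *name;
    struct list_head entry;
};

int main(void)
{
    struct list_head blkd_tasks;        /* plays the role of rnp->blkd_tasks */
    struct list_head *gp_tasks = NULL;  /* first task blocking the current GP */
    struct task a = { "A" }, b = { "B" }, c = { "C" };

    list_init(&blkd_tasks);

    /* A blocks while its CPU still owes a quiescent state: it blocks the
     * current grace period, so gp_tasks is made to point at it. */
    list_add_after(&a.entry, &blkd_tasks);
    gp_tasks = &a.entry;

    /* B blocks after its CPU's quiescent state was already reported:
     * it goes at the head of the list and gp_tasks does not move. */
    list_add_after(&b.entry, &blkd_tasks);

    /* C also blocks the current GP: insert it just before the current
     * gp_tasks entry and advance gp_tasks, so the GP-blocking tasks
     * stay contiguous at the tail end of the list. */
    list_add_after(&c.entry, gp_tasks->prev);
    gp_tasks = &c.entry;

    /* Everything from gp_tasks to the list head blocks the current GP. */
    for (struct list_head *p = gp_tasks; p != &blkd_tasks; p = p->next) {
        struct task *t = (struct task *)((char *)p - offsetof(struct task, entry));
        printf("blocking current GP: %s\n", t->name);
    }
    return 0;
}

Walking from gp_tasks toward the list head visits exactly the grace-period blockers, which is the traversal the reworked rcu_print_task_stall() and the boost code below rely on.
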
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 207}
197 208
198/* 209/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 210 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 211 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 212 * if we block.
202 */ 213 */
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 223 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 224 * answer, it must hold the rcu_node's ->lock.
214 */ 225 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 226static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 227{
217 int phase = rnp->gpnum & 0x1; 228 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 229}
222 230
223/* 231/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 241 unsigned long mask;
234 struct rcu_node *rnp_p; 242 struct rcu_node *rnp_p;
235 243
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 244 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 245 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 246 return; /* Still need more quiescent states! */
239 } 247 }
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 265}
258 266
259/* 267/*
268 * Advance a ->blkd_tasks-list pointer to the next entry, returning
269 * NULL instead if the pointer is already at the end of the list.
270 */
271static struct list_head *rcu_next_node_entry(struct task_struct *t,
272 struct rcu_node *rnp)
273{
274 struct list_head *np;
275
276 np = t->rcu_node_entry.next;
277 if (np == &rnp->blkd_tasks)
278 np = NULL;
279 return np;
280}
281
282/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 283 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 284 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 285 * read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
266 int empty; 289 int empty;
267 int empty_exp; 290 int empty_exp;
268 unsigned long flags; 291 unsigned long flags;
292 struct list_head *np;
269 struct rcu_node *rnp; 293 struct rcu_node *rnp;
270 int special; 294 int special;
271 295
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 330 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 331 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 332 }
309 empty = !rcu_preempted_readers(rnp); 333 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 334 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 335 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
336 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 337 list_del_init(&t->rcu_node_entry);
338 if (&t->rcu_node_entry == rnp->gp_tasks)
339 rnp->gp_tasks = np;
340 if (&t->rcu_node_entry == rnp->exp_tasks)
341 rnp->exp_tasks = np;
342#ifdef CONFIG_RCU_BOOST
343 if (&t->rcu_node_entry == rnp->boost_tasks)
344 rnp->boost_tasks = np;
345#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 346 t->rcu_blocked_node = NULL;
314 347
315 /* 348 /*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 355 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(rnp, flags);
324 357
358#ifdef CONFIG_RCU_BOOST
359 /* Unboost if we were boosted. */
360 if (special & RCU_READ_UNLOCK_BOOSTED) {
361 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
362 rt_mutex_unlock(t->rcu_boost_mutex);
363 t->rcu_boost_mutex = NULL;
364 }
365#endif /* #ifdef CONFIG_RCU_BOOST */
366
325 /* 367 /*
326 * If this was the last task on the expedited lists, 368 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 369 * then we need to report up the rcu_node hierarchy.
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 376}
335 377
336/* 378/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 379 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 380 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 381 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 382 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
356} 398}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 399EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 400
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 401#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 402
363/* 403/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 407static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 408{
369 unsigned long flags; 409 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 410 struct task_struct *t;
373 411
374 if (rcu_preempted_readers(rnp)) { 412 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 413 return;
376 phase = rnp->gpnum & 0x1; 414 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 415 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 416 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 417 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 418 sched_show_task(t);
381 } 419 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 420}
383 421
384/* 422/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 446 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 447static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 448{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 449 struct task_struct *t;
414 450
415 if (rcu_preempted_readers(rnp)) { 451 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 452 return;
417 lp = &rnp->blocked_tasks[phase]; 453 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 454 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 455 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 456 printk(" P%d", t->pid);
421} 457}
422 458
423/* 459/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 466 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 467}
432 468
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 469/*
436 * Check that the list of blocked tasks for the newly completed grace 470 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 471 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 472 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 473 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 474 * must be held by the caller.
475 *
476 * Also, if there are blocked tasks on the list, they automatically
477 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 478 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 479static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 480{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 481 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
482 if (!list_empty(&rnp->blkd_tasks))
483 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 484 WARN_ON_ONCE(rnp->qsmask);
446} 485}
447 486
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 504 struct rcu_node *rnp,
466 struct rcu_data *rdp) 505 struct rcu_data *rdp)
467{ 506{
468 int i;
469 struct list_head *lp; 507 struct list_head *lp;
470 struct list_head *lp_root; 508 struct list_head *lp_root;
471 int retval = 0; 509 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 510 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 511 struct task_struct *t;
474 512
475 if (rnp == rnp_root) { 513 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 514 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 515 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 516 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 517
480 (!list_empty(&rnp->blocked_tasks[0]) || 518 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 519 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 520
485 /* 521 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 522 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 523 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 524 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 525 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
526 * if non-NULL. This might result in waiting for more tasks than
527 * absolutely necessary, but this is a good performance/complexity
528 * tradeoff.
490 */ 529 */
491 if (rcu_preempted_readers(rnp)) 530 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 531 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 532 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 533 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 534 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 535 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 536 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 537 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 538 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 539 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 540 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 541 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 542 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 543 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 544 if (&t->rcu_node_entry == rnp->exp_tasks)
545 rnp_root->exp_tasks = rnp->exp_tasks;
546#ifdef CONFIG_RCU_BOOST
547 if (&t->rcu_node_entry == rnp->boost_tasks)
548 rnp_root->boost_tasks = rnp->boost_tasks;
549#endif /* #ifdef CONFIG_RCU_BOOST */
550 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 551 }
552
553#ifdef CONFIG_RCU_BOOST
554 /* In case root is being boosted and leaf is not. */
555 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
556 if (rnp_root->boost_tasks != NULL &&
557 rnp_root->boost_tasks != rnp_root->gp_tasks)
558 rnp_root->boost_tasks = rnp_root->gp_tasks;
559 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
560#endif /* #ifdef CONFIG_RCU_BOOST */
561
562 rnp->gp_tasks = NULL;
563 rnp->exp_tasks = NULL;
507 return retval; 564 return retval;
508} 565}
509 566
510/* 567/*
511 * Do CPU-offline processing for preemptable RCU. 568 * Do CPU-offline processing for preemptible RCU.
512 */ 569 */
513static void rcu_preempt_offline_cpu(int cpu) 570static void rcu_preempt_offline_cpu(int cpu)
514{ 571{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
537} 594}
538 595
539/* 596/*
540 * Process callbacks for preemptable RCU. 597 * Process callbacks for preemptible RCU.
541 */ 598 */
542static void rcu_preempt_process_callbacks(void) 599static void rcu_preempt_process_callbacks(void)
543{ 600{
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void)
546} 603}
547 604
548/* 605/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 606 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 607 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 608void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 609{
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 651 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 652static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 653{
597 return !list_empty(&rnp->blocked_tasks[2]) || 654 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 655}
600 656
601/* 657/*
@@ -655,12 +711,14 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 711static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 712sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 713{
658 int must_wait; 714 int must_wait = 0;
659 715
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 716 raw_spin_lock(&rnp->lock); /* irqs already disabled */
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 717 if (!list_empty(&rnp->blkd_tasks)) {
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 718 rnp->exp_tasks = rnp->blkd_tasks.next;
663 must_wait = rcu_preempted_readers_exp(rnp); 719 rcu_initiate_boost(rnp);
720 must_wait = 1;
721 }
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 722 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
665 if (!must_wait) 723 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 724 rcu_report_exp_rnp(rsp, rnp);
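
sync_rcu_preempt_exp_init() no longer splices lists; it simply snapshots the head of ->blkd_tasks into ->exp_tasks and waits for that segment to drain as readers remove themselves. A self-contained sketch of the snapshot-and-drain idea, again using toy list helpers rather than the kernel's:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_after(struct list_head *item, struct list_head *pos)
{
    item->next = pos->next;
    item->prev = pos;
    pos->next->prev = item;
    pos->next = item;
}

static void list_del(struct list_head *item)
{
    item->prev->next = item->next;
    item->next->prev = item->prev;
}

int main(void)
{
    struct list_head blkd_tasks;
    struct list_head *exp_tasks;   /* first task blocking the expedited GP */
    struct list_head readers[3];
    int i;

    list_init(&blkd_tasks);
    for (i = 0; i < 3; i++)
        list_add_after(&readers[i], &blkd_tasks);  /* three blocked readers */

    /* sync_rcu_preempt_exp_init(): snapshot the current head.  The
     * expedited grace period must now wait for every entry from this
     * snapshot to the tail of the list to drain. */
    exp_tasks = blkd_tasks.next;

    /* Readers exit their critical sections; each removal advances
     * exp_tasks past itself if it happens to be that entry (the
     * rcu_next_node_entry() fixup done in rcu_read_unlock_special()). */
    for (i = 2; i >= 0; i--) {
        struct list_head *np = readers[i].next;

        if (np == &blkd_tasks)
            np = NULL;                    /* reached the end of the list */
        if (exp_tasks == &readers[i])
            exp_tasks = np;
        list_del(&readers[i]);
        printf("reader %d done, expedited GP %s\n", i,
               exp_tasks ? "still blocked" : "complete");
    }
    return 0;
}
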
@@ -669,9 +727,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 727/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 728 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 729 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 730 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 731 */
676void synchronize_rcu_expedited(void) 732void synchronize_rcu_expedited(void)
677{ 733{
@@ -703,7 +759,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 759 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 760 goto unlock_mb_ret; /* Others did our work for us. */
705 761
706 /* force all RCU readers onto blocked_tasks[]. */ 762 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 763 synchronize_sched_expedited();
708 764
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 765 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +771,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 771 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 772 }
717 773
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 774 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 775 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 776 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 777 if (NUM_RCU_NODES > 1)
@@ -723,7 +779,7 @@ void synchronize_rcu_expedited(void)
723 779
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 780 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 781
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 782 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 783 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 784 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 785 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +795,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 795EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 796
741/* 797/*
742 * Check to see if there is any immediate preemptable-RCU-related work 798 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 799 * to be done.
744 */ 800 */
745static int rcu_preempt_pending(int cpu) 801static int rcu_preempt_pending(int cpu)
@@ -749,7 +805,7 @@ static int rcu_preempt_pending(int cpu)
749} 805}
750 806
751/* 807/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 808 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 809 */
754static int rcu_preempt_needs_cpu(int cpu) 810static int rcu_preempt_needs_cpu(int cpu)
755{ 811{
@@ -766,7 +822,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 822EXPORT_SYMBOL_GPL(rcu_barrier);
767 823
768/* 824/*
769 * Initialize preemptable RCU's per-CPU data. 825 * Initialize preemptible RCU's per-CPU data.
770 */ 826 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 827static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 828{
@@ -774,7 +830,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 830}
775 831
776/* 832/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 833 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 834 */
779static void rcu_preempt_send_cbs_to_online(void) 835static void rcu_preempt_send_cbs_to_online(void)
780{ 836{
@@ -782,7 +838,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 838}
783 839
784/* 840/*
785 * Initialize preemptable RCU's state structures. 841 * Initialize preemptible RCU's state structures.
786 */ 842 */
787static void __init __rcu_init_preempt(void) 843static void __init __rcu_init_preempt(void)
788{ 844{
@@ -790,7 +846,7 @@ static void __init __rcu_init_preempt(void)
790} 846}
791 847
792/* 848/*
793 * Check for a task exiting while in a preemptable-RCU read-side 849 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 850 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 851 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 852 * is enabled.
@@ -802,11 +858,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 858 if (t->rcu_read_lock_nesting == 0)
803 return; 859 return;
804 t->rcu_read_lock_nesting = 1; 860 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 861 __rcu_read_unlock();
806} 862}
807 863
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 864#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 865
866static struct rcu_state *rcu_state = &rcu_sched_state;
867
810/* 868/*
811 * Tell them what RCU they are running. 869 * Tell them what RCU they are running.
812 */ 870 */
@@ -836,7 +894,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 894EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 895
838/* 896/*
839 * Because preemptable RCU does not exist, we never have to check for 897 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 898 * CPUs being in quiescent states.
841 */ 899 */
842static void rcu_preempt_note_context_switch(int cpu) 900static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +902,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 902}
845 903
846/* 904/*
847 * Because preemptable RCU does not exist, there are never any preempted 905 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 906 * RCU readers.
849 */ 907 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 908static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 909{
852 return 0; 910 return 0;
853} 911}
@@ -862,10 +920,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 920
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 921#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 922
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 923/*
868 * Because preemptable RCU does not exist, we never have to check for 924 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 925 * tasks blocked within RCU read-side critical sections.
870 */ 926 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 927static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +929,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 929}
874 930
875/* 931/*
876 * Because preemptable RCU does not exist, we never have to check for 932 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 933 * tasks blocked within RCU read-side critical sections.
878 */ 934 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 935static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +944,8 @@ static void rcu_preempt_stall_reset(void)
888{ 944{
889} 945}
890 946
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 947/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 948 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 949 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 950 * bogus qsmask values.
897 */ 951 */
@@ -903,7 +957,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 957#ifdef CONFIG_HOTPLUG_CPU
904 958
905/* 959/*
906 * Because preemptable RCU does not exist, it never needs to migrate 960 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 961 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 962 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 963 * grace period.
@@ -916,7 +970,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 970}
917 971
918/* 972/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 973 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 974 * processing.
921 */ 975 */
922static void rcu_preempt_offline_cpu(int cpu) 976static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +980,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 980#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 981
928/* 982/*
929 * Because preemptable RCU does not exist, it never has any callbacks 983 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 984 * to check.
931 */ 985 */
932static void rcu_preempt_check_callbacks(int cpu) 986static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +988,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 988}
935 989
936/* 990/*
937 * Because preemptable RCU does not exist, it never has any callbacks 991 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 992 * to process.
939 */ 993 */
940static void rcu_preempt_process_callbacks(void) 994static void rcu_preempt_process_callbacks(void)
@@ -943,7 +997,7 @@ static void rcu_preempt_process_callbacks(void)
943 997
944/* 998/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 999 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1000 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1001 */
948void synchronize_rcu_expedited(void) 1002void synchronize_rcu_expedited(void)
949{ 1003{
@@ -954,7 +1008,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1008#ifdef CONFIG_HOTPLUG_CPU
955 1009
956/* 1010/*
957 * Because preemptable RCU does not exist, there is never any need to 1011 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1012 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1013 * expedited RCU grace periods.
960 */ 1014 */
@@ -966,7 +1020,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1020#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1021
968/* 1022/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1023 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1024 */
971static int rcu_preempt_pending(int cpu) 1025static int rcu_preempt_pending(int cpu)
972{ 1026{
@@ -974,7 +1028,7 @@ static int rcu_preempt_pending(int cpu)
974} 1028}
975 1029
976/* 1030/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1031 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1032 */
979static int rcu_preempt_needs_cpu(int cpu) 1033static int rcu_preempt_needs_cpu(int cpu)
980{ 1034{
@@ -982,7 +1036,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1036}
983 1037
984/* 1038/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1039 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1040 * another name for rcu_barrier_sched().
987 */ 1041 */
988void rcu_barrier(void) 1042void rcu_barrier(void)
@@ -992,7 +1046,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1046EXPORT_SYMBOL_GPL(rcu_barrier);
993 1047
994/* 1048/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1049 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1050 * data to initialize.
997 */ 1051 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1052static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1054,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1054}
1001 1055
1002/* 1056/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1057 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1058 */
1005static void rcu_preempt_send_cbs_to_online(void) 1059static void rcu_preempt_send_cbs_to_online(void)
1006{ 1060{
1007} 1061}
1008 1062
1009/* 1063/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1064 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1065 */
1012static void __init __rcu_init_preempt(void) 1066static void __init __rcu_init_preempt(void)
1013{ 1067{
@@ -1015,6 +1069,302 @@ static void __init __rcu_init_preempt(void)
1015 1069
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1070#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1071
1072#ifdef CONFIG_RCU_BOOST
1073
1074#include "rtmutex_common.h"
1075
1076#ifdef CONFIG_RCU_TRACE
1077
1078static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1079{
1080 if (list_empty(&rnp->blkd_tasks))
1081 rnp->n_balk_blkd_tasks++;
1082 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1083 rnp->n_balk_exp_gp_tasks++;
1084 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1085 rnp->n_balk_boost_tasks++;
1086 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1087 rnp->n_balk_notblocked++;
1088 else if (rnp->gp_tasks != NULL &&
1089 ULONG_CMP_LT(jiffies, rnp->boost_time))
1090 rnp->n_balk_notyet++;
1091 else
1092 rnp->n_balk_nos++;
1093}
1094
1095#else /* #ifdef CONFIG_RCU_TRACE */
1096
1097static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1098{
1099}
1100
1101#endif /* #else #ifdef CONFIG_RCU_TRACE */
1102
1103/*
1104 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1105 * or ->boost_tasks, advancing the pointer to the next task in the
1106 * ->blkd_tasks list.
1107 *
1108 * Note that irqs must be enabled: boosting the task can block.
1109 * Returns 1 if there are more tasks needing to be boosted.
1110 */
1111static int rcu_boost(struct rcu_node *rnp)
1112{
1113 unsigned long flags;
1114 struct rt_mutex mtx;
1115 struct task_struct *t;
1116 struct list_head *tb;
1117
1118 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1119 return 0; /* Nothing left to boost. */
1120
1121 raw_spin_lock_irqsave(&rnp->lock, flags);
1122
1123 /*
1124 * Recheck under the lock: all tasks in need of boosting
1125 * might exit their RCU read-side critical sections on their own.
1126 */
1127 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1128 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1129 return 0;
1130 }
1131
1132 /*
1133 * Preferentially boost tasks blocking expedited grace periods.
1134 * This cannot starve the normal grace periods because a second
1135 * expedited grace period must boost all blocked tasks, including
1136 * those blocking the pre-existing normal grace period.
1137 */
1138 if (rnp->exp_tasks != NULL) {
1139 tb = rnp->exp_tasks;
1140 rnp->n_exp_boosts++;
1141 } else {
1142 tb = rnp->boost_tasks;
1143 rnp->n_normal_boosts++;
1144 }
1145 rnp->n_tasks_boosted++;
1146
1147 /*
1148 * We boost task t by manufacturing an rt_mutex that appears to
1149 * be held by task t. We leave a pointer to that rt_mutex where
1150 * task t can find it, and task t will release the mutex when it
1151 * exits its outermost RCU read-side critical section. Then
1152 * simply acquiring this artificial rt_mutex will boost task
1153 * t's priority. (Thanks to tglx for suggesting this approach!)
1154 *
1155 * Note that task t must acquire rnp->lock to remove itself from
1156 * the ->blkd_tasks list, which it will do from exit() if from
1157 * nowhere else. We therefore are guaranteed that task t will
1158 * stay around at least until we drop rnp->lock. Note that
1159 * rnp->lock also resolves races between our priority boosting
1160 * and task t's exiting its outermost RCU read-side critical
1161 * section.
1162 */
1163 t = container_of(tb, struct task_struct, rcu_node_entry);
1164 rt_mutex_init_proxy_locked(&mtx, t);
1165 t->rcu_boost_mutex = &mtx;
1166 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
1167 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1169 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1170
1171 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1172}
1173
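
The comment above describes the whole trick: an rt_mutex is proxy-locked so that it appears to be held by the to-be-boosted task, and the booster donates its priority simply by blocking on it. The sketch below models only that handoff in user space with a PTHREAD_PRIO_INHERIT pthread mutex; the "reader" stands in for a task on ->blkd_tasks and the "booster" for the rcub kthread, and without real-time scheduling privileges the priority transfer itself is invisible even though the lock/unlock choreography is the same. Typically built with cc -pthread.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t boost_mtx;

static void *reader(void *arg)
{
    /* Analogue of a task preempted inside rcu_read_lock(): it holds
     * the boost mutex until its outermost rcu_read_unlock(). */
    pthread_mutex_lock(&boost_mtx);
    sleep(1);                           /* "RCU read-side critical section" */
    pthread_mutex_unlock(&boost_mtx);
    return NULL;
}

static void *booster(void *arg)
{
    /* Blocking on a PRIO_INHERIT mutex lends the booster's priority to
     * the holder -- the side effect rt_mutex_lock() has in rcu_boost(). */
    pthread_mutex_lock(&boost_mtx);
    pthread_mutex_unlock(&boost_mtx);   /* nothing left to do once the reader is done */
    puts("reader exited its critical section; boost complete");
    return NULL;
}

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_t r, b;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
    pthread_mutex_init(&boost_mtx, &attr);

    pthread_create(&r, NULL, reader, NULL);
    usleep(100000);                     /* let the reader take the mutex first */
    pthread_create(&b, NULL, booster, NULL);

    pthread_join(b, NULL);
    pthread_join(r, NULL);
    return 0;
}
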
1174/*
1175 * Timer handler to initiate waking up of boost kthreads that
1176 * have yielded the CPU due to excessive numbers of tasks to
1177 * boost. We wake up the per-rcu_node kthread, which in turn
1178 * will wake up the booster kthread.
1179 */
1180static void rcu_boost_kthread_timer(unsigned long arg)
1181{
1182 unsigned long flags;
1183 struct rcu_node *rnp = (struct rcu_node *)arg;
1184
1185 raw_spin_lock_irqsave(&rnp->lock, flags);
1186 invoke_rcu_node_kthread(rnp);
1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1188}
1189
1190/*
1191 * Priority-boosting kthread. One per leaf rcu_node and one for the
1192 * root rcu_node.
1193 */
1194static int rcu_boost_kthread(void *arg)
1195{
1196 struct rcu_node *rnp = (struct rcu_node *)arg;
1197 int spincnt = 0;
1198 int more2boost;
1199
1200 for (;;) {
1201 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1202 wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
1203 rnp->exp_tasks ||
1204 kthread_should_stop());
1205 if (kthread_should_stop())
1206 break;
1207 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1208 more2boost = rcu_boost(rnp);
1209 if (more2boost)
1210 spincnt++;
1211 else
1212 spincnt = 0;
1213 if (spincnt > 10) {
1214 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1215 spincnt = 0;
1216 }
1217 }
1218 rnp->boost_kthread_status = RCU_KTHREAD_STOPPED;
1219 return 0;
1220}
1221
1222/*
1223 * Check to see if it is time to start boosting RCU readers that are
1224 * blocking the current grace period, and, if so, tell the per-rcu_node
1225 * kthread to start boosting them. If there is an expedited grace
1226 * period in progress, it is always time to boost.
1227 *
1228 * The caller must hold rnp->lock.
1229 */
1230static void rcu_initiate_boost(struct rcu_node *rnp)
1231{
1232 struct task_struct *t;
1233
1234 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1235 rnp->n_balk_exp_gp_tasks++;
1236 return;
1237 }
1238 if (rnp->exp_tasks != NULL ||
1239 (rnp->gp_tasks != NULL &&
1240 rnp->boost_tasks == NULL &&
1241 rnp->qsmask == 0 &&
1242 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1243 if (rnp->exp_tasks == NULL)
1244 rnp->boost_tasks = rnp->gp_tasks;
1245 t = rnp->boost_kthread_task;
1246 if (t != NULL)
1247 wake_up_process(t);
1248 } else
1249 rcu_initiate_boost_trace(rnp);
1250}
1251
1252/*
1253 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1254 * held, so no one should be messing with the existence of the boost
1255 * kthread.
1256 */
1257static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1258 cpumask_var_t cm)
1259{
1260 struct task_struct *t;
1261
1262 t = rnp->boost_kthread_task;
1263 if (t != NULL)
1264 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1265}
1266
1267#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1268
1269/*
1270 * Do priority-boost accounting for the start of a new grace period.
1271 */
1272static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1273{
1274 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1275}
1276
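
rcu_preempt_boost_start_gp() arms ->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES, and rcu_initiate_boost() later compares it against jiffies with ULONG_CMP_GE() so that counter wrap is harmless. Below is a stand-alone sketch of both the millisecond-to-jiffies conversion and the wrap-safe comparison; HZ and the delay are hard-coded purely as stand-ins for CONFIG_HZ and CONFIG_RCU_BOOST_DELAY, and the macro definitions are local to the demo.

#include <stdio.h>
#include <limits.h>

#define HZ                  1000        /* assumed tick rate for the demo */
#define BOOST_DELAY_MS      500         /* stands in for CONFIG_RCU_BOOST_DELAY */
#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define BOOST_DELAY_JIFFIES DIV_ROUND_UP(BOOST_DELAY_MS * HZ, 1000)

/* Wrap-safe "a >= b" for jiffies-style free-running counters. */
#define ULONG_CMP_GE(a, b)  (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
    unsigned long jiffies = ULONG_MAX - 100;                   /* about to wrap */
    unsigned long boost_time = jiffies + BOOST_DELAY_JIFFIES;  /* wraps past zero */

    printf("delay = %lu jiffies\n", (unsigned long)BOOST_DELAY_JIFFIES);

    /* Not yet time to boost: boost_time is still in the future. */
    printf("now:   time to boost? %d\n", ULONG_CMP_GE(jiffies, boost_time));

    /* Advance the clock past boost_time, across the wrap point. */
    jiffies += BOOST_DELAY_JIFFIES + 1;
    printf("later: time to boost? %d\n", ULONG_CMP_GE(jiffies, boost_time));
    return 0;
}
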
1277/*
1278 * Initialize the RCU-boost waitqueue.
1279 */
1280static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1281{
1282 init_waitqueue_head(&rnp->boost_wq);
1283}
1284
1285/*
1286 * Create an RCU-boost kthread for the specified node if one does not
1287 * already exist. We only create this kthread for preemptible RCU.
1288 * Returns zero if all is well, a negated errno otherwise.
1289 */
1290static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1291 struct rcu_node *rnp,
1292 int rnp_index)
1293{
1294 unsigned long flags;
1295 struct sched_param sp;
1296 struct task_struct *t;
1297
1298 if (&rcu_preempt_state != rsp)
1299 return 0;
1300 if (rnp->boost_kthread_task != NULL)
1301 return 0;
1302 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1303 "rcub%d", rnp_index);
1304 if (IS_ERR(t))
1305 return PTR_ERR(t);
1306 raw_spin_lock_irqsave(&rnp->lock, flags);
1307 rnp->boost_kthread_task = t;
1308 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1309 wake_up_process(t);
1310 sp.sched_priority = RCU_KTHREAD_PRIO;
1311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1312 return 0;
1313}
1314
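
rcu_spawn_one_boost_kthread() follows a create-publish-wake-then-raise-priority sequence. A rough user-space analogue of that last step is sketched below with pthreads; the priority value is arbitrary, and pthread_setschedparam() will typically fail without real-time privileges, which the sketch simply reports rather than treating as fatal.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void *boost_worker(void *arg)
{
    /* Stand-in for rcu_boost_kthread(): the real kthread loops and boosts. */
    puts("boost worker running");
    return NULL;
}

int main(void)
{
    pthread_t t;
    struct sched_param sp;
    int ret;

    /* Create and start first, like kthread_create() + wake_up_process(). */
    ret = pthread_create(&t, NULL, boost_worker, NULL);
    if (ret != 0) {
        fprintf(stderr, "pthread_create: %s\n", strerror(ret));
        return 1;
    }

    /* Then raise it to a real-time priority, mirroring
     * sched_setscheduler_nocheck(t, SCHED_FIFO, &sp). */
    sp.sched_priority = 1;              /* arbitrary demo priority */
    ret = pthread_setschedparam(t, SCHED_FIFO, &sp);
    if (ret != 0)
        fprintf(stderr, "pthread_setschedparam: %s (privileges needed?)\n",
                strerror(ret));

    pthread_join(t, NULL);
    return 0;
}
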
1315#ifdef CONFIG_HOTPLUG_CPU
1316
1317static void rcu_stop_boost_kthread(struct rcu_node *rnp)
1318{
1319 unsigned long flags;
1320 struct task_struct *t;
1321
1322 raw_spin_lock_irqsave(&rnp->lock, flags);
1323 t = rnp->boost_kthread_task;
1324 rnp->boost_kthread_task = NULL;
1325 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1326 if (t != NULL)
1327 kthread_stop(t);
1328}
1329
1330#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1331
1332#else /* #ifdef CONFIG_RCU_BOOST */
1333
1334static void rcu_initiate_boost(struct rcu_node *rnp)
1335{
1336}
1337
1338static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1339 cpumask_var_t cm)
1340{
1341}
1342
1343static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1344{
1345}
1346
1347static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1348{
1349}
1350
1351static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1352 struct rcu_node *rnp,
1353 int rnp_index)
1354{
1355 return 0;
1356}
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360static void rcu_stop_boost_kthread(struct rcu_node *rnp)
1361{
1362}
1363
1364#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1365
1366#endif /* #else #ifdef CONFIG_RCU_BOOST */
1367
1018#ifndef CONFIG_SMP 1368#ifndef CONFIG_SMP
1019 1369
1020void synchronize_sched_expedited(void) 1370void synchronize_sched_expedited(void)
@@ -1187,14 +1537,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1537 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1538 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1539 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1540 * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1541 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1542 */
1193int rcu_needs_cpu(int cpu) 1543int rcu_needs_cpu(int cpu)
1194{ 1544{
1195 int c = 0; 1545 int c = 0;
1196 int snap; 1546 int snap;
1197 int snap_nmi;
1198 int thatcpu; 1547 int thatcpu;
1199 1548
1200 /* Check for being in the holdoff period. */ 1549 /* Check for being in the holdoff period. */
@@ -1205,10 +1554,10 @@ int rcu_needs_cpu(int cpu)
1205 for_each_online_cpu(thatcpu) { 1554 for_each_online_cpu(thatcpu) {
1206 if (thatcpu == cpu) 1555 if (thatcpu == cpu)
1207 continue; 1556 continue;
1208 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1557 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1209 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1558 thatcpu).dynticks);
1210 smp_mb(); /* Order sampling of snap with end of grace period. */ 1559 smp_mb(); /* Order sampling of snap with end of grace period. */
1211 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1560 if ((snap & 0x1) != 0) {
1212 per_cpu(rcu_dyntick_drain, cpu) = 0; 1561 per_cpu(rcu_dyntick_drain, cpu) = 0;
1213 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1562 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1214 return rcu_needs_cpu_quick_check(cpu); 1563 return rcu_needs_cpu_quick_check(cpu);
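
The check above now samples a single per-CPU dynticks counter with atomic_add_return(0, ...): the zero add turns the read into a full-barrier read-modify-write, and an odd value means the CPU is not in dyntick-idle. A small C11 sketch of that even/odd convention; the counter, helpers, and transitions here are illustrative rather than the kernel's.

#include <stdatomic.h>
#include <stdio.h>

/* Toy per-CPU dynticks counter: even means idle, odd means non-idle. */
static atomic_ulong dynticks;

static void cpu_exit_idle(void)  { atomic_fetch_add(&dynticks, 1); } /* even -> odd */
static void cpu_enter_idle(void) { atomic_fetch_add(&dynticks, 1); } /* odd -> even */

/* Analogue of atomic_add_return(0, &dynticks): a read that is also a
 * full barrier because it is a seq_cst read-modify-write. */
static unsigned long dynticks_snap(void)
{
    return atomic_fetch_add(&dynticks, 0);
}

static void report(void)
{
    unsigned long snap = dynticks_snap();

    printf("snap=%lu -> %s\n", snap,
           (snap & 0x1) ? "CPU flagged non-idle" : "CPU flagged idle");
}

int main(void)
{
    report();           /* idle */
    cpu_exit_idle();
    report();           /* non-idle */
    cpu_enter_idle();
    report();           /* idle again */
    return 0;
}
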
@@ -1239,7 +1588,7 @@ int rcu_needs_cpu(int cpu)
1239 1588
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1589 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1590 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1591 invoke_rcu_cpu_kthread();
1243 return c; 1592 return c;
1244} 1593}
1245 1594
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..9678cc3650f5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,18 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
50DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
52DECLARE_PER_CPU(char, rcu_cpu_has_work);
53
54static char convert_kthread_status(unsigned int kthread_status)
55{
56 if (kthread_status > RCU_KTHREAD_MAX)
57 return '?';
58 return "SRWOY"[kthread_status];
59}
60
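
convert_kthread_status() and the new qs=%c%c%c%c flags both use the same C idiom: indexing a string literal with a small integer (or a 0/1 condition) to pick a one-character code. A tiny stand-alone illustration; the enum values below merely mirror the apparent "SRWOY" ordering and are not the kernel's definitions.

#include <stdio.h>

/* Illustrative stand-ins for the RCU_KTHREAD_* states. */
enum { KT_STOPPED, KT_RUNNING, KT_WAITING, KT_OFFCPU, KT_YIELDING, KT_MAX = 4 };

static char kthread_status_char(unsigned int status)
{
    if (status > KT_MAX)
        return '?';                 /* out of range: unknown */
    return "SRWOY"[status];         /* index the string literal */
}

int main(void)
{
    int queue_nonempty = 1;

    printf("status %d -> %c\n", KT_WAITING, kthread_status_char(KT_WAITING));
    printf("status %d -> %c\n", 42, kthread_status_char(42));

    /* Same idiom with a boolean: '.' when false, the flag letter when
     * true, as in the ".N"[cond] callback-queue flags printed above. */
    printf("queue flag: %c\n", ".N"[queue_nonempty]);
    return 0;
}
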
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 61static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 62{
51 if (!rdp->beenonline) 63 if (!rdp->beenonline)
@@ -57,14 +69,28 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
57 rdp->passed_quiesc, rdp->passed_quiesc_completed, 69 rdp->passed_quiesc, rdp->passed_quiesc_completed,
58 rdp->qs_pending); 70 rdp->qs_pending);
59#ifdef CONFIG_NO_HZ 71#ifdef CONFIG_NO_HZ
60 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 72 seq_printf(m, " dt=%d/%d/%d df=%lu",
61 rdp->dynticks->dynticks, 73 atomic_read(&rdp->dynticks->dynticks),
62 rdp->dynticks->dynticks_nesting, 74 rdp->dynticks->dynticks_nesting,
63 rdp->dynticks->dynticks_nmi, 75 rdp->dynticks->dynticks_nmi_nesting,
64 rdp->dynticks_fqs); 76 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 77#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 79 seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
80 rdp->qlen,
81 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
82 rdp->nxttail[RCU_NEXT_TAIL]],
83 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
84 rdp->nxttail[RCU_NEXT_READY_TAIL]],
85 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
86 rdp->nxttail[RCU_WAIT_TAIL]],
87 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
88 per_cpu(rcu_cpu_has_work, rdp->cpu),
89 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
90 rdp->cpu)),
91 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
92 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
93 rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 94 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 96}
@@ -115,13 +141,24 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
115 rdp->qs_pending); 141 rdp->qs_pending);
116#ifdef CONFIG_NO_HZ 142#ifdef CONFIG_NO_HZ
117 seq_printf(m, ",%d,%d,%d,%lu", 143 seq_printf(m, ",%d,%d,%d,%lu",
118 rdp->dynticks->dynticks, 144 atomic_read(&rdp->dynticks->dynticks),
119 rdp->dynticks->dynticks_nesting, 145 rdp->dynticks->dynticks_nesting,
120 rdp->dynticks->dynticks_nmi, 146 rdp->dynticks->dynticks_nmi_nesting,
121 rdp->dynticks_fqs); 147 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 148#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 150 seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
151 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
152 rdp->nxttail[RCU_NEXT_TAIL]],
153 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
154 rdp->nxttail[RCU_NEXT_READY_TAIL]],
155 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
156 rdp->nxttail[RCU_WAIT_TAIL]],
157 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
158 per_cpu(rcu_cpu_has_work, rdp->cpu),
159 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
160 rdp->cpu)),
161 rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 162 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 163 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 164}
@@ -130,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
130{ 167{
131 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 168 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
132#ifdef CONFIG_NO_HZ 169#ifdef CONFIG_NO_HZ
133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
134#endif /* #ifdef CONFIG_NO_HZ */ 171#endif /* #ifdef CONFIG_NO_HZ */
135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 172 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
136#ifdef CONFIG_TREE_PREEMPT_RCU 173#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 194 .release = single_release,
158}; 195};
159 196
197#ifdef CONFIG_RCU_BOOST
198
199static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
200{
201 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
202 "j=%04x bt=%04x\n",
203 rnp->grplo, rnp->grphi,
204 "T."[list_empty(&rnp->blkd_tasks)],
205 "N."[!rnp->gp_tasks],
206 "E."[!rnp->exp_tasks],
207 "B."[!rnp->boost_tasks],
208 convert_kthread_status(rnp->boost_kthread_status),
209 rnp->n_tasks_boosted, rnp->n_exp_boosts,
210 rnp->n_normal_boosts,
211 (int)(jiffies & 0xffff),
212 (int)(rnp->boost_time & 0xffff));
213 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
214 " balk",
215 rnp->n_balk_blkd_tasks,
216 rnp->n_balk_exp_gp_tasks,
217 rnp->n_balk_boost_tasks,
218 rnp->n_balk_notblocked,
219 rnp->n_balk_notyet,
220 rnp->n_balk_nos);
221}
222
223static int show_rcu_node_boost(struct seq_file *m, void *unused)
224{
225 struct rcu_node *rnp;
226
227 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
228 print_one_rcu_node_boost(m, rnp);
229 return 0;
230}
231
232static int rcu_node_boost_open(struct inode *inode, struct file *file)
233{
234 return single_open(file, show_rcu_node_boost, NULL);
235}
236
237static const struct file_operations rcu_node_boost_fops = {
238 .owner = THIS_MODULE,
239 .open = rcu_node_boost_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = single_release,
243};
244
245/*
246 * Create the rcuboost debugfs entry. Standard error return.
247 */
248static int rcu_boost_trace_create_file(struct dentry *rcudir)
249{
250 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
251 &rcu_node_boost_fops);
252}
253
254#else /* #ifdef CONFIG_RCU_BOOST */
255
256static int rcu_boost_trace_create_file(struct dentry *rcudir)
257{
258 return 0; /* There cannot be an error if we didn't create it! */
259}
260
261#endif /* #else #ifdef CONFIG_RCU_BOOST */
262
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 264{
162 unsigned long gpnum; 265 unsigned long gpnum;
163 int level = 0; 266 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 267 struct rcu_node *rnp;
166 268
167 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 280 seq_puts(m, "\n");
179 level = rnp->level; 281 level = rnp->level;
180 } 282 }
181 phase = gpnum & 0x1; 283 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 284 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 285 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 286 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 287 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 288 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 289 }
190 seq_puts(m, "\n"); 290 seq_puts(m, "\n");
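
The new rcuboost dump follows the usual single_open()/seq_file recipe for a read-only debugfs file: a show routine, an open routine that binds it via single_open(), and a file_operations structure wired to seq_read()/seq_lseek()/single_release(). A self-contained sketch of that recipe only, not part of the patch (all demo_* names are invented), assuming this era's debugfs API in which a NULL return means failure:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *demo_dir;

static int demo_show(struct seq_file *m, void *unused)
{
	seq_puts(m, "hello from debugfs\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	if (!demo_dir)
		return -ENOMEM;
	/* In this era debugfs_create_file() returns NULL on failure. */
	if (!debugfs_create_file("status", 0444, demo_dir, NULL, &demo_fops)) {
		debugfs_remove_recursive(demo_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
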
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 316 .release = single_release,
217}; 317};
218 318
319static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
320{
321 unsigned long flags;
322 unsigned long completed;
323 unsigned long gpnum;
324 unsigned long gpage;
325 unsigned long gpmax;
326 struct rcu_node *rnp = &rsp->node[0];
327
328 raw_spin_lock_irqsave(&rnp->lock, flags);
329 completed = rsp->completed;
330 gpnum = rsp->gpnum;
331 if (rsp->completed == rsp->gpnum)
332 gpage = 0;
333 else
334 gpage = jiffies - rsp->gp_start;
335 gpmax = rsp->gp_max;
336 raw_spin_unlock_irqrestore(&rnp->lock, flags);
337 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
338 rsp->name, completed, gpnum, gpage, gpmax);
339}
340
219static int show_rcugp(struct seq_file *m, void *unused) 341static int show_rcugp(struct seq_file *m, void *unused)
220{ 342{
221#ifdef CONFIG_TREE_PREEMPT_RCU 343#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 344 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 345#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 346 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 347 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 348 return 0;
230} 349}
231 350
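
show_one_rcugp() samples completed, gpnum, gp_start, and gp_max while holding the root rcu_node lock so that all four values describe the same instant rather than racing with grace-period start and end. A generic sketch of that snapshot-under-lock pattern (struct stats and the stats_* names are invented, not taken from the patch):

#include <linux/jiffies.h>
#include <linux/spinlock.h>

struct stats {
	raw_spinlock_t lock;
	unsigned long completed;	/* last finished grace period    */
	unsigned long gpnum;		/* current grace-period number   */
	unsigned long gp_start;		/* jiffies when current GP began */
};

/* Copy every field under one lock hold so they describe the same instant. */
static void stats_snapshot(struct stats *sp, unsigned long *completed,
			   unsigned long *gpnum, unsigned long *age)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&sp->lock, flags);
	*completed = sp->completed;
	*gpnum = sp->gpnum;
	*age = (sp->completed == sp->gpnum) ? 0 : jiffies - sp->gp_start;
	raw_spin_unlock_irqrestore(&sp->lock, flags);
}
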
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 417 .release = single_release,
299}; 418};
300 419
420static int show_rcutorture(struct seq_file *m, void *unused)
421{
422 seq_printf(m, "rcutorture test sequence: %lu %s\n",
423 rcutorture_testseq >> 1,
424 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
425 seq_printf(m, "rcutorture update version number: %lu\n",
426 rcutorture_vernum);
427 return 0;
428}
429
430static int rcutorture_open(struct inode *inode, struct file *file)
431{
432 return single_open(file, show_rcutorture, NULL);
433}
434
435static const struct file_operations rcutorture_fops = {
436 .owner = THIS_MODULE,
437 .open = rcutorture_open,
438 .read = seq_read,
439 .llseek = seq_lseek,
440 .release = single_release,
441};
442
301static struct dentry *rcudir; 443static struct dentry *rcudir;
302 444
303static int __init rcutree_trace_init(void) 445static int __init rcutree_trace_init(void)
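
The rcutorture file decodes rcutorture_testseq on the assumption of an even/odd sequence convention: the counter is bumped once when a test starts and once when it finishes, so bit 0 means "test in progress" and the value shifted right by one counts test runs. A user-space sketch of that convention (demo_testseq and the helpers are invented names):

#include <stdio.h>

/*
 * demo_testseq mimics the convention show_rcutorture() implies: bumped at
 * test start and again at test end, so bit 0 flags "in progress" and
 * seq >> 1 counts runs.
 */
static unsigned long demo_testseq;

static void report(void)
{
	printf("sequence: %lu %s\n", demo_testseq >> 1,
	       (demo_testseq & 0x1) ? "(test in progress)" : "");
}

int main(void)
{
	report();		/* sequence: 0                    */
	demo_testseq++;		/* test starts (counter now odd)  */
	report();		/* sequence: 0 (test in progress) */
	demo_testseq++;		/* test ends (counter now even)   */
	report();		/* sequence: 1                    */
	return 0;
}
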
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 460 if (!retval)
319 goto free_out; 461 goto free_out;
320 462
463 if (rcu_boost_trace_create_file(rcudir))
464 goto free_out;
465
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 466 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 467 if (!retval)
323 goto free_out; 468 goto free_out;
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 476 NULL, &rcu_pending_fops);
332 if (!retval) 477 if (!retval)
333 goto free_out; 478 goto free_out;
479
480 retval = debugfs_create_file("rcutorture", 0444, rcudir,
481 NULL, &rcutorture_fops);
482 if (!retval)
483 goto free_out;
334 return 0; 484 return 0;
335free_out: 485free_out:
336 debugfs_remove_recursive(rcudir); 486 debugfs_remove_recursive(rcudir);
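
rcutree_trace_init() mixes two error conventions: debugfs_create_file() returns a dentry pointer with NULL meaning failure (hence the if (!retval) checks), while rcu_boost_trace_create_file(), added earlier, returns a nonzero flag on failure (hence the bare if). A condensed sketch of that structure only; "example" and example_fops are placeholders rather than anything in the patch:

/*
 * Sketch only: example_fops is assumed to be defined as in the
 * single_open() recipe shown earlier.
 */
static int __init example_trace_init(struct dentry *dir)
{
	struct dentry *retval;

	retval = debugfs_create_file("example", 0444, dir, NULL, &example_fops);
	if (!retval)				/* pointer convention: NULL == failed   */
		goto free_out;

	if (rcu_boost_trace_create_file(dir))	/* flag convention: nonzero == failed   */
		goto free_out;

	return 0;
free_out:
	debugfs_remove_recursive(dir);
	return 1;
}
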
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 174f976c2874..13960170cad4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER"
62}; 62};
63 63
64/* 64/*
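
Dropping "RCU" from softirq_to_name[] only stays consistent if the matching entry also leaves the softirq enum in include/linux/interrupt.h (the diffstat shows that header changing by one line); the name table must track the enum order, with NR_SOFTIRQS last. A reconstructed sketch of what the trimmed enum presumably looks like; it is inferred from the name table above, not copied from the patch:

/* include/linux/interrupt.h (sketch): same order as softirq_to_name[]. */
enum {
	HI_SOFTIRQ = 0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,
	NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ,
	BLOCK_IOPOLL_SOFTIRQ,
	TASKLET_SOFTIRQ,
	SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ,	/* RCU_SOFTIRQ removed by this series */

	NR_SOFTIRQS
};
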
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c768bcdda1b7..3aa278046d78 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -337,7 +337,7 @@ config DEBUG_OBJECTS_WORK
337 337
338config DEBUG_OBJECTS_RCU_HEAD 338config DEBUG_OBJECTS_RCU_HEAD
339 bool "Debug RCU callbacks objects" 339 bool "Debug RCU callbacks objects"
340 depends on DEBUG_OBJECTS && PREEMPT 340 depends on DEBUG_OBJECTS
341 help 341 help
342 Enable this to turn on debugging of RCU list heads (call_rcu() usage). 342 Enable this to turn on debugging of RCU list heads (call_rcu() usage).
343 343
@@ -875,22 +875,9 @@ config RCU_TORTURE_TEST_RUNNABLE
875 Say N here if you want the RCU torture tests to start only 875 Say N here if you want the RCU torture tests to start only
876 after being manually enabled via /proc. 876 after being manually enabled via /proc.
877 877
878config RCU_CPU_STALL_DETECTOR
879 bool "Check for stalled CPUs delaying RCU grace periods"
880 depends on TREE_RCU || TREE_PREEMPT_RCU
881 default y
882 help
883 This option causes RCU to printk information on which
884 CPUs are delaying the current grace period, but only when
885 the grace period extends for excessive time periods.
886
887 Say N if you want to disable such checks.
888
889 Say Y if you are unsure.
890
891config RCU_CPU_STALL_TIMEOUT 878config RCU_CPU_STALL_TIMEOUT
892 int "RCU CPU stall timeout in seconds" 879 int "RCU CPU stall timeout in seconds"
893 depends on RCU_CPU_STALL_DETECTOR 880 depends on TREE_RCU || TREE_PREEMPT_RCU
894 range 3 300 881 range 3 300
895 default 60 882 default 60
896 help 883 help
@@ -899,22 +886,9 @@ config RCU_CPU_STALL_TIMEOUT
899 RCU grace period persists, additional CPU stall warnings are 886 RCU grace period persists, additional CPU stall warnings are
900 printed at more widely spaced intervals. 887 printed at more widely spaced intervals.
901 888
902config RCU_CPU_STALL_DETECTOR_RUNNABLE
903 bool "RCU CPU stall checking starts automatically at boot"
904 depends on RCU_CPU_STALL_DETECTOR
905 default y
906 help
907 If set, start checking for RCU CPU stalls immediately on
908 boot. Otherwise, RCU CPU stall checking must be manually
909 enabled.
910
911 Say Y if you are unsure.
912
913 Say N if you wish to suppress RCU CPU stall checking during boot.
914
915config RCU_CPU_STALL_VERBOSE 889config RCU_CPU_STALL_VERBOSE
916 bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" 890 bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
917 depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU 891 depends on TREE_PREEMPT_RCU
918 default y 892 default y
919 help 893 help
920 This option causes RCU to printk detailed per-task information 894 This option causes RCU to printk detailed per-task information
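
With RCU_CPU_STALL_DETECTOR and RCU_CPU_STALL_DETECTOR_RUNNABLE removed, stall checking is always built for TREE_RCU/TREE_PREEMPT_RCU and is toggled at run time instead. A hypothetical sketch of the mechanism such a run-time switch relies on, a writable module parameter; demo_stall_suppress is an invented stand-in, not the real variable:

#include <linux/module.h>
#include <linux/moduleparam.h>

/*
 * demo_stall_suppress is an invented name.  A 0644 module parameter can be
 * set on the kernel command line and appears as a writable file under
 * /sys/module/<module>/parameters/, which is the kind of run-time switch
 * that stands in for the removed *_RUNNABLE Kconfig option.
 */
static int demo_stall_suppress;
module_param(demo_stall_suppress, int, 0644);
MODULE_PARM_DESC(demo_stall_suppress, "Suppress demo stall warnings if nonzero.");

MODULE_LICENSE("GPL");
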
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 0a7ed5b5e281..1e88485c16a0 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -2187,7 +2187,6 @@ static const struct flag flags[] = {
2187 { "TASKLET_SOFTIRQ", 6 }, 2187 { "TASKLET_SOFTIRQ", 6 },
2188 { "SCHED_SOFTIRQ", 7 }, 2188 { "SCHED_SOFTIRQ", 7 },
2189 { "HRTIMER_SOFTIRQ", 8 }, 2189 { "HRTIMER_SOFTIRQ", 8 },
2190 { "RCU_SOFTIRQ", 9 },
2191 2190
2192 { "HRTIMER_NORESTART", 0 }, 2191 { "HRTIMER_NORESTART", 0 },
2193 { "HRTIMER_RESTART", 1 }, 2192 { "HRTIMER_RESTART", 1 },