-rw-r--r--  Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png   | bin 0 -> 100825 bytes
-rw-r--r--  Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg  |  374
-rw-r--r--  Documentation/RCU/Design/Requirements/RCUApplicability.svg     |  237
-rw-r--r--  Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg  |  639
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.html        | 2897
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.htmlx       | 2741
-rwxr-xr-x  Documentation/RCU/Design/htmlqqz.sh                            |  108
-rw-r--r--  Documentation/kernel-parameters.txt                            |   29
-rw-r--r--  Documentation/memory-barriers.txt                              |    8
-rw-r--r--  drivers/tty/sysrq.c                                            |    6
-rw-r--r--  include/linux/list.h                                           |   14
-rw-r--r--  include/linux/list_bl.h                                        |    2
-rw-r--r--  include/linux/list_nulls.h                                     |    2
-rw-r--r--  include/linux/rculist.h                                        |  105
-rw-r--r--  include/linux/rcupdate.h                                       |   21
-rw-r--r--  include/linux/rcutiny.h                                        |    8
-rw-r--r--  include/linux/rcutree.h                                        |    4
-rw-r--r--  include/linux/tracepoint.h                                     |    4
-rw-r--r--  init/main.c                                                    |    2
-rw-r--r--  kernel/ksysfs.c                                                |   26
-rw-r--r--  kernel/rcu/rcutorture.c                                        |   24
-rw-r--r--  kernel/rcu/srcu.c                                              |    2
-rw-r--r--  kernel/rcu/tree.c                                              |  313
-rw-r--r--  kernel/rcu/tree.h                                              |   61
-rw-r--r--  kernel/rcu/tree_plugin.h                                       |   66
-rw-r--r--  kernel/rcu/tree_trace.c                                        |   39
-rw-r--r--  kernel/rcu/update.c                                            |   22
-rw-r--r--  kernel/sched/core.c                                            |    6
-rw-r--r--  lib/list_debug.c                                               |    2
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh      |    9
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm.sh                  |   22
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/parse-console.sh       |   41
-rw-r--r--  tools/testing/selftests/rcutorture/doc/TINY_RCU.txt           |    1
-rw-r--r--  tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt   |    4
34 files changed, 7552 insertions(+), 287 deletions(-)
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
new file mode 100644
index 000000000000..7496a55e4e7b
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
Binary files differ
diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
new file mode 100644
index 000000000000..4b4014fda770
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
@@ -0,0 +1,374 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3
4<svg
5 xmlns:dc="http://purl.org/dc/elements/1.1/"
6 xmlns:cc="http://creativecommons.org/ns#"
7 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
8 xmlns:svg="http://www.w3.org/2000/svg"
9 xmlns="http://www.w3.org/2000/svg"
10 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
11 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
12 width="447.99197"
13 height="428.19299"
14 id="svg2"
15 version="1.1"
16 inkscape:version="0.48.3.1 r9886"
17 sodipodi:docname="GPpartitionReaders1.svg">
18 <defs
19 id="defs4">
20 <marker
21 inkscape:stockid="Arrow2Lend"
22 orient="auto"
23 refY="0"
24 refX="0"
25 id="Arrow2Lend"
26 style="overflow:visible">
27 <path
28 id="path3792"
29 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
30 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
31 transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
32 inkscape:connector-curvature="0" />
33 </marker>
34 <marker
35 inkscape:stockid="Arrow2Lstart"
36 orient="auto"
37 refY="0"
38 refX="0"
39 id="Arrow2Lstart"
40 style="overflow:visible">
41 <path
42 id="path3789"
43 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
44 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
45 transform="matrix(1.1,0,0,1.1,1.1,0)"
46 inkscape:connector-curvature="0" />
47 </marker>
48 </defs>
49 <sodipodi:namedview
50 id="base"
51 pagecolor="#ffffff"
52 bordercolor="#666666"
53 borderopacity="1.0"
54 inkscape:pageopacity="0.0"
55 inkscape:pageshadow="2"
56 inkscape:zoom="1.6184291"
57 inkscape:cx="223.99599"
58 inkscape:cy="214.0965"
59 inkscape:document-units="px"
60 inkscape:current-layer="layer1"
61 showgrid="false"
62 inkscape:window-width="979"
63 inkscape:window-height="836"
64 inkscape:window-x="571"
65 inkscape:window-y="335"
66 inkscape:window-maximized="0"
67 fit-margin-top="5"
68 fit-margin-left="5"
69 fit-margin-right="5"
70 fit-margin-bottom="5" />
71 <metadata
72 id="metadata7">
73 <rdf:RDF>
74 <cc:Work
75 rdf:about="">
76 <dc:format>image/svg+xml</dc:format>
77 <dc:type
78 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
79 <dc:title></dc:title>
80 </cc:Work>
81 </rdf:RDF>
82 </metadata>
83 <g
84 inkscape:label="Layer 1"
85 inkscape:groupmode="layer"
86 id="layer1"
87 transform="translate(-28.441125,-185.60612)">
88 <flowRoot
89 xml:space="preserve"
90 id="flowRoot2985"
91 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
92 id="flowRegion2987"><rect
93 id="rect2989"
94 width="82.85714"
95 height="11.428572"
96 x="240"
97 y="492.36218" /></flowRegion><flowPara
98 id="flowPara2991"></flowPara></flowRoot> <g
99 id="g4433"
100 transform="translate(2,0)">
101 <text
102 sodipodi:linespacing="125%"
103 id="text2993"
104 y="-261.66608"
105 x="412.12299"
106 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
107 xml:space="preserve"
108 transform="matrix(0,1,-1,0,0,0)"><tspan
109 y="-261.66608"
110 x="412.12299"
111 id="tspan2995"
112 sodipodi:role="line">synchronize_rcu()</tspan></text>
113 <g
114 id="g4417"
115 transform="matrix(0,1,-1,0,730.90257,222.4928)">
116 <path
117 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
118 d="m 97.580736,477.4048 183.140664,0"
119 id="path2997"
120 inkscape:connector-curvature="0"
121 sodipodi:nodetypes="cc" />
122 <path
123 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
124 d="m 96.752718,465.38398 0,22.62742"
125 id="path4397"
126 inkscape:connector-curvature="0"
127 sodipodi:nodetypes="cc" />
128 <path
129 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
130 d="m 281.54942,465.38397 0,22.62742"
131 id="path4397-5"
132 inkscape:connector-curvature="0"
133 sodipodi:nodetypes="cc" />
134 </g>
135 </g>
136 <text
137 xml:space="preserve"
138 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
139 x="112.04738"
140 y="268.18076"
141 id="text4429"
142 sodipodi:linespacing="125%"><tspan
143 sodipodi:role="line"
144 id="tspan4431"
145 x="112.04738"
146 y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
147 <text
148 xml:space="preserve"
149 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
150 x="112.04738"
151 y="439.13766"
152 id="text4441"
153 sodipodi:linespacing="125%"><tspan
154 sodipodi:role="line"
155 id="tspan4443"
156 x="112.04738"
157 y="439.13766">WRITE_ONCE(b, 1);</tspan></text>
158 <text
159 xml:space="preserve"
160 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
161 x="255.60869"
162 y="309.29346"
163 id="text4445"
164 sodipodi:linespacing="125%"><tspan
165 sodipodi:role="line"
166 id="tspan4447"
167 x="255.60869"
168 y="309.29346">r1 = READ_ONCE(a);</tspan></text>
169 <text
170 xml:space="preserve"
171 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
172 x="255.14423"
173 y="520.61786"
174 id="text4449"
175 sodipodi:linespacing="125%"><tspan
176 sodipodi:role="line"
177 id="tspan4451"
178 x="255.14423"
179 y="520.61786">WRITE_ONCE(c, 1);</tspan></text>
180 <text
181 xml:space="preserve"
182 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
183 x="396.10254"
184 y="384.71124"
185 id="text4453"
186 sodipodi:linespacing="125%"><tspan
187 sodipodi:role="line"
188 id="tspan4455"
189 x="396.10254"
190 y="384.71124">r2 = READ_ONCE(b);</tspan></text>
191 <text
192 xml:space="preserve"
193 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
194 x="396.10254"
195 y="582.13617"
196 id="text4457"
197 sodipodi:linespacing="125%"><tspan
198 sodipodi:role="line"
199 id="tspan4459"
200 x="396.10254"
201 y="582.13617">r3 = READ_ONCE(c);</tspan></text>
202 <text
203 xml:space="preserve"
204 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
205 x="112.08231"
206 y="213.91006"
207 id="text4461"
208 sodipodi:linespacing="125%"><tspan
209 sodipodi:role="line"
210 id="tspan4463"
211 x="112.08231"
212 y="213.91006">thread0()</tspan></text>
213 <text
214 xml:space="preserve"
215 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
216 x="252.34512"
217 y="213.91006"
218 id="text4461-6"
219 sodipodi:linespacing="125%"><tspan
220 sodipodi:role="line"
221 id="tspan4463-0"
222 x="252.34512"
223 y="213.91006">thread1()</tspan></text>
224 <text
225 xml:space="preserve"
226 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
227 x="396.42557"
228 y="213.91006"
229 id="text4461-2"
230 sodipodi:linespacing="125%"><tspan
231 sodipodi:role="line"
232 id="tspan4463-2"
233 x="396.42557"
234 y="213.91006">thread2()</tspan></text>
235 <rect
236 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
237 id="rect4495"
238 width="436.28488"
239 height="416.4859"
240 x="34.648232"
241 y="191.10612" />
242 <path
243 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
244 d="m 183.14066,191.10612 0,417.193 -0.70711,0"
245 id="path4497"
246 inkscape:connector-curvature="0" />
247 <path
248 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
249 d="m 325.13867,191.10612 0,417.193 -0.70711,0"
250 id="path4497-5"
251 inkscape:connector-curvature="0" />
252 <text
253 xml:space="preserve"
254 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
255 x="111.75929"
256 y="251.53981"
257 id="text4429-8"
258 sodipodi:linespacing="125%"><tspan
259 sodipodi:role="line"
260 id="tspan4431-9"
261 x="111.75929"
262 y="251.53981">rcu_read_lock();</tspan></text>
263 <text
264 xml:space="preserve"
265 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
266 x="396.10254"
267 y="367.91556"
268 id="text4429-8-9"
269 sodipodi:linespacing="125%"><tspan
270 sodipodi:role="line"
271 id="tspan4431-9-4"
272 x="396.10254"
273 y="367.91556">rcu_read_lock();</tspan></text>
274 <text
275 xml:space="preserve"
276 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
277 x="396.10254"
278 y="597.40289"
279 id="text4429-8-9-3"
280 sodipodi:linespacing="125%"><tspan
281 sodipodi:role="line"
282 id="tspan4431-9-4-4"
283 x="396.10254"
284 y="597.40289">rcu_read_unlock();</tspan></text>
285 <text
286 xml:space="preserve"
287 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
288 x="111.75929"
289 y="453.15311"
290 id="text4429-8-9-3-1"
291 sodipodi:linespacing="125%"><tspan
292 sodipodi:role="line"
293 id="tspan4431-9-4-4-6"
294 x="111.75929"
295 y="453.15311">rcu_read_unlock();</tspan></text>
296 <path
297 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
298 d="m 33.941125,227.87568 436.284885,0 0,0.7071"
299 id="path4608"
300 inkscape:connector-curvature="0" />
301 <text
302 xml:space="preserve"
303 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
304 x="394.94427"
305 y="345.66351"
306 id="text4648"
307 sodipodi:linespacing="125%"><tspan
308 sodipodi:role="line"
309 id="tspan4650"
310 x="394.94427"
311 y="345.66351">QS</tspan></text>
312 <path
313 sodipodi:type="arc"
314 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
315 id="path4652"
316 sodipodi:cx="358.85669"
317 sodipodi:cy="142.87541"
318 sodipodi:rx="10.960155"
319 sodipodi:ry="10.253048"
320 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
321 transform="translate(36.441125,199.60612)"
322 sodipodi:start="4.7135481"
323 sodipodi:end="10.994651"
324 sodipodi:open="true" />
325 <text
326 xml:space="preserve"
327 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
328 x="112.11968"
329 y="475.77856"
330 id="text4648-4"
331 sodipodi:linespacing="125%"><tspan
332 sodipodi:role="line"
333 id="tspan4650-4"
334 x="112.11968"
335 y="475.77856">QS</tspan></text>
336 <path
337 sodipodi:type="arc"
338 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
339 id="path4652-7"
340 sodipodi:cx="358.85669"
341 sodipodi:cy="142.87541"
342 sodipodi:rx="10.960155"
343 sodipodi:ry="10.253048"
344 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
345 transform="translate(-246.38346,329.72117)"
346 sodipodi:start="4.7135481"
347 sodipodi:end="10.994651"
348 sodipodi:open="true" />
349 <path
350 sodipodi:type="arc"
351 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
352 id="path4652-7-7"
353 sodipodi:cx="358.85669"
354 sodipodi:cy="142.87541"
355 sodipodi:rx="10.960155"
356 sodipodi:ry="10.253048"
357 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
358 transform="translate(-103.65246,202.90878)"
359 sodipodi:start="4.7135481"
360 sodipodi:end="10.994651"
361 sodipodi:open="true" />
362 <text
363 xml:space="preserve"
364 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
365 x="254.85066"
366 y="348.96619"
367 id="text4648-4-3"
368 sodipodi:linespacing="125%"><tspan
369 sodipodi:role="line"
370 id="tspan4650-4-5"
371 x="254.85066"
372 y="348.96619">QS</tspan></text>
373 </g>
374</svg>
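
The three-thread scenario that GPpartitionReaders1.svg depicts can be read directly off its
text labels. The following C sketch is reconstructed from those labels only; the function
framing, the static globals, and the assumption that a, b, and c start out zero are
illustrative additions, not part of the patch:

	static int a, b, c;
	static int r1, r2, r3;

	static void thread0(void)		/* leftmost column of the figure */
	{
		rcu_read_lock();
		WRITE_ONCE(a, 1);
		WRITE_ONCE(b, 1);
		rcu_read_unlock();
	}

	static void thread1(void)		/* middle column: the updater */
	{
		r1 = READ_ONCE(a);
		synchronize_rcu();		/* must wait for all pre-existing readers */
		WRITE_ONCE(c, 1);
	}

	static void thread2(void)		/* rightmost column of the figure */
	{
		rcu_read_lock();
		r2 = READ_ONCE(b);
		r3 = READ_ONCE(c);
		rcu_read_unlock();
	}

The "QS" circles in the figure mark points at which each thread might pass through a
quiescent state with respect to thread1()'s grace period.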
diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
new file mode 100644
index 000000000000..ebcbeee391ed
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
@@ -0,0 +1,237 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Creator: fig2dev Version 3.2 Patchlevel 5d -->
3
4<!-- CreationDate: Tue Mar 4 18:34:25 2014 -->
5
6<!-- Magnification: 3.000 -->
7
8<svg
9 xmlns:dc="http://purl.org/dc/elements/1.1/"
10 xmlns:cc="http://creativecommons.org/ns#"
11 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
12 xmlns:svg="http://www.w3.org/2000/svg"
13 xmlns="http://www.w3.org/2000/svg"
14 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
15 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
16 width="1089.1382"
17 height="668.21368"
18 viewBox="-2121 -36 14554.634 8876.4061"
19 id="svg2"
20 version="1.1"
21 inkscape:version="0.48.3.1 r9886"
22 sodipodi:docname="RCUApplicability.svg">
23 <metadata
24 id="metadata40">
25 <rdf:RDF>
26 <cc:Work
27 rdf:about="">
28 <dc:format>image/svg+xml</dc:format>
29 <dc:type
30 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
31 <dc:title />
32 </cc:Work>
33 </rdf:RDF>
34 </metadata>
35 <defs
36 id="defs38" />
37 <sodipodi:namedview
38 pagecolor="#ffffff"
39 bordercolor="#666666"
40 borderopacity="1"
41 objecttolerance="10"
42 gridtolerance="10"
43 guidetolerance="10"
44 inkscape:pageopacity="0"
45 inkscape:pageshadow="2"
46 inkscape:window-width="849"
47 inkscape:window-height="639"
48 id="namedview36"
49 showgrid="false"
50 inkscape:zoom="0.51326165"
51 inkscape:cx="544.56912"
52 inkscape:cy="334.10686"
53 inkscape:window-x="149"
54 inkscape:window-y="448"
55 inkscape:window-maximized="0"
56 inkscape:current-layer="g4"
57 fit-margin-top="5"
58 fit-margin-left="5"
59 fit-margin-right="5"
60 fit-margin-bottom="5" />
61 <g
62 style="fill:none;stroke-width:0.025in"
63 id="g4"
64 transform="translate(-2043.6828,14.791398)">
65 <!-- Line: box -->
66 <rect
67 x="0"
68 y="0"
69 width="14400"
70 height="8775"
71 rx="0"
72 style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
73 id="rect6" />
74 <!-- Line: box -->
75 <rect
76 x="1350"
77 y="0"
78 width="11700"
79 height="6075"
80 rx="0"
81 style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
82 id="rect8" />
83 <!-- Line: box -->
84 <rect
85 x="2700"
86 y="0"
87 width="9000"
88 height="4275"
89 rx="0"
90 style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
91 id="rect10" />
92 <!-- Line: box -->
93 <rect
94 x="4050"
95 y="0"
96 width="6300"
97 height="2475"
98 rx="0"
99 style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
100 id="rect12" />
101 <!-- Text -->
102 <text
103 xml:space="preserve"
104 x="7200"
105 y="900"
106 font-style="normal"
107 font-weight="normal"
108 font-size="324"
109 id="text14"
110 sodipodi:linespacing="125%"
111 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
112 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
113 id="tspan3017">Read-Mostly, Stale &amp;</tspan></text>
114 <!-- Text -->
115 <text
116 xml:space="preserve"
117 x="7200"
118 y="1350"
119 font-style="normal"
120 font-weight="normal"
121 font-size="324"
122 id="text16"
123 sodipodi:linespacing="125%"
124 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
125 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
126 id="tspan3019">Inconsistent Data OK</tspan></text>
127 <!-- Text -->
128 <text
129 xml:space="preserve"
130 x="7200"
131 y="1800"
132 font-style="normal"
133 font-weight="normal"
134 font-size="324"
135 id="text18"
136 sodipodi:linespacing="125%"
137 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
138 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
139 id="tspan3021">(RCU Works Great!!!)</tspan></text>
140 <!-- Text -->
141 <text
142 xml:space="preserve"
143 x="7200"
144 y="3825"
145 font-style="normal"
146 font-weight="normal"
147 font-size="324"
148 id="text20"
149 sodipodi:linespacing="125%"
150 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
151 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
152 id="tspan3023">(RCU Works Well)</tspan></text>
153 <!-- Text -->
154 <text
155 xml:space="preserve"
156 x="7200"
157 y="3375"
158 font-style="normal"
159 font-weight="normal"
160 font-size="324"
161 id="text22"
162 sodipodi:linespacing="125%"
163 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
164 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
165 id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text>
166 <!-- Text -->
167 <text
168 xml:space="preserve"
169 x="7200"
170 y="5175"
171 font-style="normal"
172 font-weight="normal"
173 font-size="324"
174 id="text24"
175 sodipodi:linespacing="125%"
176 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
177 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
178 id="tspan3027">Read-Write, Need Consistent Data</tspan></text>
179 <!-- Text -->
180 <text
181 xml:space="preserve"
182 x="7200"
183 y="6975"
184 font-style="normal"
185 font-weight="normal"
186 font-size="324"
187 id="text26"
188 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
189 sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text>
190 <!-- Text -->
191 <text
192 xml:space="preserve"
193 x="7200"
194 y="5625"
195 font-style="normal"
196 font-weight="normal"
197 font-size="324"
198 id="text28"
199 sodipodi:linespacing="125%"
200 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
201 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
202 id="tspan3029">(RCU Might Be OK...)</tspan></text>
203 <!-- Text -->
204 <text
205 xml:space="preserve"
206 x="7200"
207 y="7875"
208 font-style="normal"
209 font-weight="normal"
210 font-size="324"
211 id="text30"
212 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
213 sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text>
214 <!-- Text -->
215 <text
216 xml:space="preserve"
217 x="7200"
218 y="8325"
219 font-style="normal"
220 font-weight="normal"
221 font-size="324"
222 id="text32"
223 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
224 sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text>
225 <!-- Text -->
226 <text
227 xml:space="preserve"
228 x="7200"
229 y="7425"
230 font-style="normal"
231 font-weight="normal"
232 font-size="324"
233 id="text34"
234 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
235 sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text>
236 </g>
237</svg>
diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
new file mode 100644
index 000000000000..48cd1623d4d4
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
@@ -0,0 +1,639 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3
4<svg
5 xmlns:dc="http://purl.org/dc/elements/1.1/"
6 xmlns:cc="http://creativecommons.org/ns#"
7 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
8 xmlns:svg="http://www.w3.org/2000/svg"
9 xmlns="http://www.w3.org/2000/svg"
10 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
11 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
12 width="735.25"
13 height="516.21875"
14 id="svg2"
15 version="1.1"
16 inkscape:version="0.48.3.1 r9886"
17 sodipodi:docname="ReadersPartitionGP1.svg">
18 <defs
19 id="defs4">
20 <marker
21 inkscape:stockid="Arrow2Lend"
22 orient="auto"
23 refY="0"
24 refX="0"
25 id="Arrow2Lend"
26 style="overflow:visible">
27 <path
28 id="path3792"
29 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
30 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
31 transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
32 inkscape:connector-curvature="0" />
33 </marker>
34 <marker
35 inkscape:stockid="Arrow2Lstart"
36 orient="auto"
37 refY="0"
38 refX="0"
39 id="Arrow2Lstart"
40 style="overflow:visible">
41 <path
42 id="path3789"
43 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
44 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
45 transform="matrix(1.1,0,0,1.1,1.1,0)"
46 inkscape:connector-curvature="0" />
47 </marker>
48 <marker
49 inkscape:stockid="Arrow2Lstart"
50 orient="auto"
51 refY="0"
52 refX="0"
53 id="Arrow2Lstart-4"
54 style="overflow:visible">
55 <path
56 id="path3789-9"
57 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
58 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
59 transform="matrix(1.1,0,0,1.1,1.1,0)"
60 inkscape:connector-curvature="0" />
61 </marker>
62 <marker
63 inkscape:stockid="Arrow2Lend"
64 orient="auto"
65 refY="0"
66 refX="0"
67 id="Arrow2Lend-4"
68 style="overflow:visible">
69 <path
70 id="path3792-4"
71 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
72 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
73 transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
74 inkscape:connector-curvature="0" />
75 </marker>
76 </defs>
77 <sodipodi:namedview
78 id="base"
79 pagecolor="#ffffff"
80 bordercolor="#666666"
81 borderopacity="1.0"
82 inkscape:pageopacity="0.0"
83 inkscape:pageshadow="2"
84 inkscape:zoom="1.3670394"
85 inkscape:cx="367.26465"
86 inkscape:cy="258.46182"
87 inkscape:document-units="px"
88 inkscape:current-layer="g4433-6"
89 showgrid="false"
90 inkscape:window-width="1351"
91 inkscape:window-height="836"
92 inkscape:window-x="438"
93 inkscape:window-y="335"
94 inkscape:window-maximized="0"
95 fit-margin-top="5"
96 fit-margin-left="5"
97 fit-margin-right="5"
98 fit-margin-bottom="5" />
99 <metadata
100 id="metadata7">
101 <rdf:RDF>
102 <cc:Work
103 rdf:about="">
104 <dc:format>image/svg+xml</dc:format>
105 <dc:type
106 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
107 <dc:title />
108 </cc:Work>
109 </rdf:RDF>
110 </metadata>
111 <g
112 inkscape:label="Layer 1"
113 inkscape:groupmode="layer"
114 id="layer1"
115 transform="translate(-29.15625,-185.59375)">
116 <flowRoot
117 xml:space="preserve"
118 id="flowRoot2985"
119 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
120 id="flowRegion2987"><rect
121 id="rect2989"
122 width="82.85714"
123 height="11.428572"
124 x="240"
125 y="492.36218" /></flowRegion><flowPara
126 id="flowPara2991" /></flowRoot> <g
127 id="g4433"
128 transform="translate(2,-12)">
129 <text
130 sodipodi:linespacing="125%"
131 id="text2993"
132 y="-261.66608"
133 x="436.12299"
134 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
135 xml:space="preserve"
136 transform="matrix(0,1,-1,0,0,0)"><tspan
137 y="-261.66608"
138 x="436.12299"
139 id="tspan2995"
140 sodipodi:role="line">synchronize_rcu()</tspan></text>
141 <g
142 id="g4417"
143 transform="matrix(0,1,-1,0,730.90257,222.4928)">
144 <path
145 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
146 d="M 97.580736,477.4048 327.57913,476.09759"
147 id="path2997"
148 inkscape:connector-curvature="0"
149 sodipodi:nodetypes="cc" />
150 <path
151 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
152 d="m 96.752718,465.38398 0,22.62742"
153 id="path4397"
154 inkscape:connector-curvature="0"
155 sodipodi:nodetypes="cc" />
156 <path
157 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
158 d="m 328.40703,465.38397 0,22.62742"
159 id="path4397-5"
160 inkscape:connector-curvature="0"
161 sodipodi:nodetypes="cc" />
162 </g>
163 </g>
164 <text
165 xml:space="preserve"
166 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
167 x="112.04738"
168 y="268.18076"
169 id="text4429"
170 sodipodi:linespacing="125%"><tspan
171 sodipodi:role="line"
172 id="tspan4431"
173 x="112.04738"
174 y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
175 <text
176 xml:space="preserve"
177 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
178 x="112.04738"
179 y="487.13766"
180 id="text4441"
181 sodipodi:linespacing="125%"><tspan
182 sodipodi:role="line"
183 id="tspan4443"
184 x="112.04738"
185 y="487.13766">WRITE_ONCE(b, 1);</tspan></text>
186 <text
187 xml:space="preserve"
188 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
189 x="255.60869"
190 y="297.29346"
191 id="text4445"
192 sodipodi:linespacing="125%"><tspan
193 sodipodi:role="line"
194 id="tspan4447"
195 x="255.60869"
196 y="297.29346">r1 = READ_ONCE(a);</tspan></text>
197 <text
198 xml:space="preserve"
199 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
200 x="255.14423"
201 y="554.61786"
202 id="text4449"
203 sodipodi:linespacing="125%"><tspan
204 sodipodi:role="line"
205 id="tspan4451"
206 x="255.14423"
207 y="554.61786">WRITE_ONCE(c, 1);</tspan></text>
208 <text
209 xml:space="preserve"
210 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
211 x="396.10254"
212 y="370.71124"
213 id="text4453"
214 sodipodi:linespacing="125%"><tspan
215 sodipodi:role="line"
216 id="tspan4455"
217 x="396.10254"
218 y="370.71124">WRITE_ONCE(d, 1);</tspan></text>
219 <text
220 xml:space="preserve"
221 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
222 x="396.10254"
223 y="572.13617"
224 id="text4457"
225 sodipodi:linespacing="125%"><tspan
226 sodipodi:role="line"
227 id="tspan4459"
228 x="396.10254"
229 y="572.13617">r2 = READ_ONCE(c);</tspan></text>
230 <text
231 xml:space="preserve"
232 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
233 x="112.08231"
234 y="213.91006"
235 id="text4461"
236 sodipodi:linespacing="125%"><tspan
237 sodipodi:role="line"
238 id="tspan4463"
239 x="112.08231"
240 y="213.91006">thread0()</tspan></text>
241 <text
242 xml:space="preserve"
243 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
244 x="252.34512"
245 y="213.91006"
246 id="text4461-6"
247 sodipodi:linespacing="125%"><tspan
248 sodipodi:role="line"
249 id="tspan4463-0"
250 x="252.34512"
251 y="213.91006">thread1()</tspan></text>
252 <text
253 xml:space="preserve"
254 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
255 x="396.42557"
256 y="213.91006"
257 id="text4461-2"
258 sodipodi:linespacing="125%"><tspan
259 sodipodi:role="line"
260 id="tspan4463-2"
261 x="396.42557"
262 y="213.91006">thread2()</tspan></text>
263 <rect
264 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
265 id="rect4495"
266 width="724.25244"
267 height="505.21201"
268 x="34.648232"
269 y="191.10612" />
270 <path
271 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
272 d="m 183.14066,191.10612 0,504.24243"
273 id="path4497"
274 inkscape:connector-curvature="0"
275 sodipodi:nodetypes="cc" />
276 <path
277 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
278 d="m 325.13867,191.10612 0,504.24243"
279 id="path4497-5"
280 inkscape:connector-curvature="0"
281 sodipodi:nodetypes="cc" />
282 <text
283 xml:space="preserve"
284 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
285 x="111.75929"
286 y="251.53981"
287 id="text4429-8"
288 sodipodi:linespacing="125%"><tspan
289 sodipodi:role="line"
290 id="tspan4431-9"
291 x="111.75929"
292 y="251.53981">rcu_read_lock();</tspan></text>
293 <text
294 xml:space="preserve"
295 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
296 x="396.10254"
297 y="353.91556"
298 id="text4429-8-9"
299 sodipodi:linespacing="125%"><tspan
300 sodipodi:role="line"
301 id="tspan4431-9-4"
302 x="396.10254"
303 y="353.91556">rcu_read_lock();</tspan></text>
304 <text
305 xml:space="preserve"
306 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
307 x="396.10254"
308 y="587.40289"
309 id="text4429-8-9-3"
310 sodipodi:linespacing="125%"><tspan
311 sodipodi:role="line"
312 id="tspan4431-9-4-4"
313 x="396.10254"
314 y="587.40289">rcu_read_unlock();</tspan></text>
315 <text
316 xml:space="preserve"
317 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
318 x="111.75929"
319 y="501.15311"
320 id="text4429-8-9-3-1"
321 sodipodi:linespacing="125%"><tspan
322 sodipodi:role="line"
323 id="tspan4431-9-4-4-6"
324 x="111.75929"
325 y="501.15311">rcu_read_unlock();</tspan></text>
326 <path
327 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
328 d="m 33.941125,227.87568 724.941765,0"
329 id="path4608"
330 inkscape:connector-curvature="0"
331 sodipodi:nodetypes="cc" />
332 <text
333 xml:space="preserve"
334 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
335 x="394.94427"
336 y="331.66351"
337 id="text4648"
338 sodipodi:linespacing="125%"><tspan
339 sodipodi:role="line"
340 id="tspan4650"
341 x="394.94427"
342 y="331.66351">QS</tspan></text>
343 <path
344 sodipodi:type="arc"
345 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
346 id="path4652"
347 sodipodi:cx="358.85669"
348 sodipodi:cy="142.87541"
349 sodipodi:rx="10.960155"
350 sodipodi:ry="10.253048"
351 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
352 transform="translate(36.441125,185.60612)"
353 sodipodi:start="4.7135481"
354 sodipodi:end="10.994651"
355 sodipodi:open="true" />
356 <text
357 xml:space="preserve"
358 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
359 x="112.11968"
360 y="523.77856"
361 id="text4648-4"
362 sodipodi:linespacing="125%"><tspan
363 sodipodi:role="line"
364 id="tspan4650-4"
365 x="112.11968"
366 y="523.77856">QS</tspan></text>
367 <path
368 sodipodi:type="arc"
369 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
370 id="path4652-7"
371 sodipodi:cx="358.85669"
372 sodipodi:cy="142.87541"
373 sodipodi:rx="10.960155"
374 sodipodi:ry="10.253048"
375 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
376 transform="translate(-246.38346,377.72117)"
377 sodipodi:start="4.7135481"
378 sodipodi:end="10.994651"
379 sodipodi:open="true" />
380 <path
381 sodipodi:type="arc"
382 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
383 id="path4652-7-7"
384 sodipodi:cx="358.85669"
385 sodipodi:cy="142.87541"
386 sodipodi:rx="10.960155"
387 sodipodi:ry="10.253048"
388 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
389 transform="translate(-103.65246,190.90878)"
390 sodipodi:start="4.7135481"
391 sodipodi:end="10.994651"
392 sodipodi:open="true" />
393 <text
394 xml:space="preserve"
395 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
396 x="254.85066"
397 y="336.96619"
398 id="text4648-4-3"
399 sodipodi:linespacing="125%"><tspan
400 sodipodi:role="line"
401 id="tspan4650-4-5"
402 x="254.85066"
403 y="336.96619">QS</tspan></text>
404 <path
405 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
406 d="m 470.93311,190.39903 0,504.24243"
407 id="path4497-5-6"
408 inkscape:connector-curvature="0"
409 sodipodi:nodetypes="cc" />
410 <path
411 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
412 d="m 616.22755,190.38323 0,504.24243"
413 id="path4497-5-2"
414 inkscape:connector-curvature="0"
415 sodipodi:nodetypes="cc" />
416 <g
417 id="g4433-6"
418 transform="translate(288.0964,78.32827)">
419 <text
420 sodipodi:linespacing="125%"
421 id="text2993-7"
422 y="-261.66608"
423 x="440.12299"
424 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
425 xml:space="preserve"
426 transform="matrix(0,1,-1,0,0,0)"><tspan
427 y="-261.66608"
428 x="440.12299"
429 id="tspan2995-1"
430 sodipodi:role="line">synchronize_rcu()</tspan></text>
431 <g
432 id="g4417-1"
433 transform="matrix(0,1,-1,0,730.90257,222.4928)">
434 <path
435 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
436 d="M 97.580736,477.4048 328.5624,477.07246"
437 id="path2997-2"
438 inkscape:connector-curvature="0"
439 sodipodi:nodetypes="cc" />
440 <path
441 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
442 d="m 96.752718,465.38398 0,22.62742"
443 id="path4397-3"
444 inkscape:connector-curvature="0"
445 sodipodi:nodetypes="cc" />
446 <path
447 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
448 d="m 329.39039,465.38397 0,22.62742"
449 id="path4397-5-4"
450 inkscape:connector-curvature="0"
451 sodipodi:nodetypes="cc" />
452 </g>
453 </g>
454 <text
455 xml:space="preserve"
456 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
457 x="541.70508"
458 y="387.6217"
459 id="text4445-0"
460 sodipodi:linespacing="125%"><tspan
461 sodipodi:role="line"
462 id="tspan4447-5"
463 x="541.70508"
464 y="387.6217">r3 = READ_ONCE(d);</tspan></text>
465 <text
466 xml:space="preserve"
467 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
468 x="541.2406"
469 y="646.94611"
470 id="text4449-6"
471 sodipodi:linespacing="125%"><tspan
472 sodipodi:role="line"
473 id="tspan4451-6"
474 x="541.2406"
475 y="646.94611">WRITE_ONCE(e, 1);</tspan></text>
476 <path
477 sodipodi:type="arc"
478 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
479 id="path4652-7-7-5"
480 sodipodi:cx="358.85669"
481 sodipodi:cy="142.87541"
482 sodipodi:rx="10.960155"
483 sodipodi:ry="10.253048"
484 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
485 transform="translate(182.44393,281.23704)"
486 sodipodi:start="4.7135481"
487 sodipodi:end="10.994651"
488 sodipodi:open="true" />
489 <text
490 xml:space="preserve"
491 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
492 x="540.94702"
493 y="427.29443"
494 id="text4648-4-3-1"
495 sodipodi:linespacing="125%"><tspan
496 sodipodi:role="line"
497 id="tspan4650-4-5-7"
498 x="540.94702"
499 y="427.29443">QS</tspan></text>
500 <text
501 xml:space="preserve"
502 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
503 x="686.27747"
504 y="461.83929"
505 id="text4453-7"
506 sodipodi:linespacing="125%"><tspan
507 sodipodi:role="line"
508 id="tspan4455-1"
509 x="686.27747"
510 y="461.83929">r4 = READ_ONCE(b);</tspan></text>
511 <text
512 xml:space="preserve"
513 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
514 x="686.27747"
515 y="669.26422"
516 id="text4457-9"
517 sodipodi:linespacing="125%"><tspan
518 sodipodi:role="line"
519 id="tspan4459-2"
520 x="686.27747"
521 y="669.26422">r5 = READ_ONCE(e);</tspan></text>
522 <text
523 xml:space="preserve"
524 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
525 x="686.27747"
526 y="445.04358"
527 id="text4429-8-9-33"
528 sodipodi:linespacing="125%"><tspan
529 sodipodi:role="line"
530 id="tspan4431-9-4-2"
531 x="686.27747"
532 y="445.04358">rcu_read_lock();</tspan></text>
533 <text
534 xml:space="preserve"
535 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
536 x="686.27747"
537 y="684.53094"
538 id="text4429-8-9-3-8"
539 sodipodi:linespacing="125%"><tspan
540 sodipodi:role="line"
541 id="tspan4431-9-4-4-5"
542 x="686.27747"
543 y="684.53094">rcu_read_unlock();</tspan></text>
544 <text
545 xml:space="preserve"
546 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
547 x="685.11914"
548 y="422.79153"
549 id="text4648-9"
550 sodipodi:linespacing="125%"><tspan
551 sodipodi:role="line"
552 id="tspan4650-7"
553 x="685.11914"
554 y="422.79153">QS</tspan></text>
555 <path
556 sodipodi:type="arc"
557 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
558 id="path4652-8"
559 sodipodi:cx="358.85669"
560 sodipodi:cy="142.87541"
561 sodipodi:rx="10.960155"
562 sodipodi:ry="10.253048"
563 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
564 transform="translate(326.61602,276.73415)"
565 sodipodi:start="4.7135481"
566 sodipodi:end="10.994651"
567 sodipodi:open="true" />
568 <text
569 xml:space="preserve"
570 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
571 x="397.85934"
572 y="609.59003"
573 id="text4648-5"
574 sodipodi:linespacing="125%"><tspan
575 sodipodi:role="line"
576 id="tspan4650-77"
577 x="397.85934"
578 y="609.59003">QS</tspan></text>
579 <path
580 sodipodi:type="arc"
581 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
582 id="path4652-80"
583 sodipodi:cx="358.85669"
584 sodipodi:cy="142.87541"
585 sodipodi:rx="10.960155"
586 sodipodi:ry="10.253048"
587 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
588 transform="translate(39.356201,463.53264)"
589 sodipodi:start="4.7135481"
590 sodipodi:end="10.994651"
591 sodipodi:open="true" />
592 <text
593 xml:space="preserve"
594 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
595 x="256.75986"
596 y="586.99133"
597 id="text4648-5-2"
598 sodipodi:linespacing="125%"><tspan
599 sodipodi:role="line"
600 id="tspan4650-77-7"
601 x="256.75986"
602 y="586.99133">QS</tspan></text>
603 <path
604 sodipodi:type="arc"
605 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
606 id="path4652-80-5"
607 sodipodi:cx="358.85669"
608 sodipodi:cy="142.87541"
609 sodipodi:rx="10.960155"
610 sodipodi:ry="10.253048"
611 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
612 transform="translate(-101.74328,440.93395)"
613 sodipodi:start="4.7135481"
614 sodipodi:end="10.994651"
615 sodipodi:open="true" />
616 <text
617 xml:space="preserve"
618 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
619 x="546.22791"
620 y="213.91006"
621 id="text4461-2-5"
622 sodipodi:linespacing="125%"><tspan
623 sodipodi:role="line"
624 id="tspan4463-2-6"
625 x="546.22791"
626 y="213.91006">thread3()</tspan></text>
627 <text
628 xml:space="preserve"
629 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
630 x="684.00067"
631 y="213.91006"
632 id="text4461-2-1"
633 sodipodi:linespacing="125%"><tspan
634 sodipodi:role="line"
635 id="tspan4463-2-0"
636 x="684.00067"
637 y="213.91006">thread4()</tspan></text>
638 </g>
639</svg>
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
new file mode 100644
index 000000000000..a725f9900ec8
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -0,0 +1,2897 @@
1<!-- DO NOT HAND EDIT. -->
2<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' -->
3<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
4 "http://www.w3.org/TR/html4/loose.dtd">
5 <html>
6 <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
7 <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
8
9<h1>A Tour Through RCU's Requirements</h1>
10
11<p>Copyright IBM Corporation, 2015</p>
12<p>Author: Paul E.&nbsp;McKenney</p>
13<p><i>The initial version of this document appeared in the
14<a href="https://lwn.net/">LWN</a> articles
15<a href="https://lwn.net/Articles/652156/">here</a>,
16<a href="https://lwn.net/Articles/652677/">here</a>, and
17<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
18
19<h2>Introduction</h2>
20
21<p>
22Read-copy update (RCU) is a synchronization mechanism that is often
23used as a replacement for reader-writer locking.
24RCU is unusual in that updaters do not block readers,
25which means that RCU's read-side primitives can be exceedingly fast
26and scalable.
27In addition, updaters can make useful forward progress concurrently
28with readers.
29However, all this concurrency between RCU readers and updaters does raise
30the question of exactly what RCU readers are doing, which in turn
31raises the question of exactly what RCU's requirements are.
32
33<p>
34This document therefore summarizes RCU's requirements, and can be thought
35of as an informal, high-level specification for RCU.
36It is important to understand that RCU's specification is primarily
37empirical in nature;
38in fact, I learned about many of these requirements the hard way.
39This situation might cause some consternation; however, not only
40has this learning process been a lot of fun, but it has also been
41a great privilege to work with so many people willing to apply
42technologies in interesting new ways.
43
44<p>
45All that aside, here are the categories of currently known RCU requirements:
46</p>
47
48<ol>
49<li> <a href="#Fundamental Requirements">
50 Fundamental Requirements</a>
51<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
52<li> <a href="#Parallelism Facts of Life">
53 Parallelism Facts of Life</a>
54<li> <a href="#Quality-of-Implementation Requirements">
55 Quality-of-Implementation Requirements</a>
56<li> <a href="#Linux Kernel Complications">
57 Linux Kernel Complications</a>
58<li> <a href="#Software-Engineering Requirements">
59 Software-Engineering Requirements</a>
60<li> <a href="#Other RCU Flavors">
61 Other RCU Flavors</a>
62<li> <a href="#Possible Future Changes">
63 Possible Future Changes</a>
64</ol>
65
66<p>
67This is followed by a <a href="#Summary">summary</a>,
68which is in turn followed by the inevitable
69<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
70
71<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
72
73<p>
74RCU's fundamental requirements are the closest thing RCU has to hard
75mathematical requirements.
76These are:
77
78<ol>
79<li> <a href="#Grace-Period Guarantee">
80 Grace-Period Guarantee</a>
81<li> <a href="#Publish-Subscribe Guarantee">
82 Publish-Subscribe Guarantee</a>
83<li> <a href="#Memory-Barrier Guarantees">
84 Memory-Barrier Guarantees</a>
85<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
86 RCU Primitives Guaranteed to Execute Unconditionally</a>
87<li> <a href="#Guaranteed Read-to-Write Upgrade">
88 Guaranteed Read-to-Write Upgrade</a>
89</ol>
90
91<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
92
93<p>
94RCU's grace-period guarantee is unusual in being premeditated:
95Jack Slingwine and I had this guarantee firmly in mind when we started
96work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
97That said, the past two decades of experience with RCU have produced
98a much more detailed understanding of this guarantee.
99
100<p>
101RCU's grace-period guarantee allows updaters to wait for the completion
102of all pre-existing RCU read-side critical sections.
103An RCU read-side critical section
104begins with the marker <tt>rcu_read_lock()</tt> and ends with
105the marker <tt>rcu_read_unlock()</tt>.
106These markers may be nested, and RCU treats a nested set as one
107big RCU read-side critical section.
108Production-quality implementations of <tt>rcu_read_lock()</tt> and
109<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
110fact have exactly zero overhead in Linux kernels built for production
111use with <tt>CONFIG_PREEMPT=n</tt>.
112
113<p>
114This guarantee allows ordering to be enforced with extremely low
115overhead to readers, for example:
116
117<blockquote>
118<pre>
119 1 int x, y;
120 2
121 3 void thread0(void)
122 4 {
123 5 rcu_read_lock();
124 6 r1 = READ_ONCE(x);
125 7 r2 = READ_ONCE(y);
126 8 rcu_read_unlock();
127 9 }
12810
12911 void thread1(void)
13012 {
13113 WRITE_ONCE(x, 1);
13214 synchronize_rcu();
13315 WRITE_ONCE(y, 1);
13416 }
135</pre>
136</blockquote>
137
138<p>
139Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
140all pre-existing readers, any instance of <tt>thread0()</tt> that
141loads a value of zero from <tt>x</tt> must complete before
142<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
143also load a value of zero from <tt>y</tt>.
144Similarly, any instance of <tt>thread0()</tt> that loads a value of
145one from <tt>y</tt> must have started after the
146<tt>synchronize_rcu()</tt> started, and must therefore also load
147a value of one from <tt>x</tt>.
148Therefore, the outcome:
149<blockquote>
150<pre>
151(r1 == 0 &amp;&amp; r2 == 1)
152</pre>
153</blockquote>
154cannot happen.
155
156<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a>
157Wait a minute!
158You said that updaters can make useful forward progress concurrently
159with readers, but pre-existing readers will block
160<tt>synchronize_rcu()</tt>!!!
161Just who are you trying to fool???
162<br><a href="#qq1answer">Answer</a>
163
164<p>
165This scenario resembles one of the first uses of RCU in
166<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
167which managed a distributed lock manager's transition into
168a state suitable for handling recovery from node failure,
169more or less as follows:
170
171<blockquote>
172<pre>
173 1 #define STATE_NORMAL 0
174 2 #define STATE_WANT_RECOVERY 1
175 3 #define STATE_RECOVERING 2
176 4 #define STATE_WANT_NORMAL 3
177 5
178 6 int state = STATE_NORMAL;
179 7
180 8 void do_something_dlm(void)
181 9 {
18210 int state_snap;
18311
18412 rcu_read_lock();
18513 state_snap = READ_ONCE(state);
18614 if (state_snap == STATE_NORMAL)
18715 do_something();
18816 else
18917 do_something_carefully();
19018 rcu_read_unlock();
19119 }
19220
19321 void start_recovery(void)
19422 {
19523 WRITE_ONCE(state, STATE_WANT_RECOVERY);
19624 synchronize_rcu();
19725 WRITE_ONCE(state, STATE_RECOVERING);
19826 recovery();
19927 WRITE_ONCE(state, STATE_WANT_NORMAL);
20028 synchronize_rcu();
20129 WRITE_ONCE(state, STATE_NORMAL);
20230 }
203</pre>
204</blockquote>
205
206<p>
207The RCU read-side critical section in <tt>do_something_dlm()</tt>
208works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
209to guarantee that <tt>do_something()</tt> never runs concurrently
210with <tt>recovery()</tt>, but with little or no synchronization
211overhead in <tt>do_something_dlm()</tt>.
212
213<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a>
214Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
215<br><a href="#qq2answer">Answer</a>
216
217<p>
218In order to avoid fatal problems such as deadlocks,
219an RCU read-side critical section must not contain calls to
220<tt>synchronize_rcu()</tt>.
221Similarly, an RCU read-side critical section must not
222contain anything that waits, directly or indirectly, on completion of
223an invocation of <tt>synchronize_rcu()</tt>.
224
225<p>
226Although RCU's grace-period guarantee is useful in and of itself, with
227<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
228it would be good to be able to use RCU to coordinate read-side
229access to linked data structures.
230For this, the grace-period guarantee is not sufficient, as can
231be seen in function <tt>add_gp_buggy()</tt> below.
232We will look at the reader's code later, but in the meantime, just think of
233the reader as locklessly picking up the <tt>gp</tt> pointer,
234and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
235<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
236
237<blockquote>
238<pre>
239 1 bool add_gp_buggy(int a, int b)
240 2 {
241 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
242 4 if (!p)
243 5 return -ENOMEM;
244 6 spin_lock(&amp;gp_lock);
245 7 if (rcu_access_pointer(gp)) {
246 8 spin_unlock(&amp;gp_lock);
247 9 return false;
24810 }
24911 p-&gt;a = a;
25012   p-&gt;b = b;
25113 gp = p; /* ORDERING BUG */
25214 spin_unlock(&amp;gp_lock);
25315 return true;
25416 }
255</pre>
256</blockquote>
257
258<p>
259The problem is that both the compiler and weakly ordered CPUs are within
260their rights to reorder this code as follows:
261
262<blockquote>
263<pre>
264 1 bool add_gp_buggy_optimized(int a, int b)
265 2 {
266 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
267 4 if (!p)
268 5 return -ENOMEM;
269 6 spin_lock(&amp;gp_lock);
270 7 if (rcu_access_pointer(gp)) {
271 8 spin_unlock(&amp;gp_lock);
272 9 return false;
27310 }
274<b>11 gp = p; /* ORDERING BUG */
27512 p-&gt;a = a;
27613   p-&gt;b = b;</b>
27714 spin_unlock(&amp;gp_lock);
27815 return true;
27916 }
280</pre>
281</blockquote>
282
283<p>
284If an RCU reader fetches <tt>gp</tt> just after
285<tt>add_gp_buggy_optimized</tt> executes line&nbsp;11,
286it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
287fields.
288And this is but one of many ways in which compiler and hardware optimizations
289could cause trouble.
290Therefore, we clearly need some way to prevent the compiler and the CPU from
291reordering in this manner, which brings us to the publish-subscribe
292guarantee discussed in the next section.
293
294<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3>
295
296<p>
297RCU's publish-subscribe guarantee allows data to be inserted
298into a linked data structure without disrupting RCU readers.
299The updater uses <tt>rcu_assign_pointer()</tt> to insert the
300new data, and readers use <tt>rcu_dereference()</tt> to
301access data, whether new or old.
302The following shows an example of insertion:
303
304<blockquote>
305<pre>
306 1 bool add_gp(int a, int b)
307 2 {
308 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
309 4 if (!p)
310 5 return -ENOMEM;
311 6 spin_lock(&amp;gp_lock);
312 7 if (rcu_access_pointer(gp)) {
313 8 spin_unlock(&amp;gp_lock);
314 9 return false;
31510 }
31611 p-&gt;a = a;
31712   p-&gt;b = b;
31813 rcu_assign_pointer(gp, p);
31914 spin_unlock(&amp;gp_lock);
32015 return true;
32116 }
322</pre>
323</blockquote>
324
325<p>
326The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
327equivalent to a simple assignment statement, but also guarantees
328that its assignment will
329happen after the two assignments in lines&nbsp;11 and&nbsp;12,
330similar to the C11 <tt>memory_order_release</tt> store operation.
331It also prevents any number of &ldquo;interesting&rdquo; compiler
332optimizations, for example, the use of <tt>gp</tt> as a scratch
333location immediately preceding the assignment.
334
335<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a>
336But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
337two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
338from being reordered.
339Can't that also cause problems?
340<br><a href="#qq3answer">Answer</a>
341
342<p>
343It is tempting to assume that the reader need not do anything special
344to control its accesses to the RCU-protected data,
345as shown in <tt>do_something_gp_buggy()</tt> below:
346
347<blockquote>
348<pre>
349 1 bool do_something_gp_buggy(void)
350 2 {
351 3 rcu_read_lock();
352 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
353 5 if (p) {
354 6 do_something(p-&gt;a, p-&gt;b);
355 7 rcu_read_unlock();
356 8 return true;
357 9 }
35810 rcu_read_unlock();
35911 return false;
36012 }
361</pre>
362</blockquote>
363
364<p>
365However, this temptation must be resisted because there are a
366surprisingly large number of ways that the compiler
367(to say nothing of
368<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
369can trip this code up.
370For but one example, if the compiler were short of registers, it
371might choose to refetch from <tt>gp</tt> rather than keeping
372a separate copy in <tt>p</tt> as follows:
373
374<blockquote>
375<pre>
376 1 bool do_something_gp_buggy_optimized(void)
377 2 {
378 3 rcu_read_lock();
379 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
380<b> 5 do_something(gp-&gt;a, gp-&gt;b);</b>
381 6 rcu_read_unlock();
382 7 return true;
383 8 }
384 9 rcu_read_unlock();
38510 return false;
38611 }
387</pre>
388</blockquote>
389
390<p>
391If this function ran concurrently with a series of updates that
392replaced the current structure with a new one,
393the fetches of <tt>gp-&gt;a</tt>
394and <tt>gp-&gt;b</tt> might well come from two different structures,
395which could cause serious confusion.
396To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
397<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
398
399<blockquote>
400<pre>
401 1 bool do_something_gp(void)
402 2 {
403 3 rcu_read_lock();
404 4 p = rcu_dereference(gp);
405 5 if (p) {
406 6 do_something(p-&gt;a, p-&gt;b);
407 7 rcu_read_unlock();
408 8 return true;
409 9 }
41010 rcu_read_unlock();
41111 return false;
41212 }
413</pre>
414</blockquote>
415
416<p>
417The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
418memory barriers in the Linux kernel.
419Should a
420<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
421ever appear, then <tt>rcu_dereference()</tt> could be implemented
422as a <tt>memory_order_consume</tt> load.
423Regardless of the exact implementation, a pointer fetched by
424<tt>rcu_dereference()</tt> may not be used outside of the
425outermost RCU read-side critical section containing that
426<tt>rcu_dereference()</tt>, unless protection of
427the corresponding data element has been passed from RCU to some
428other synchronization mechanism, most commonly locking or
429<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
430
431<p>
432In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
433use <tt>rcu_dereference()</tt>, and these two RCU API elements
434work together to ensure that readers have a consistent view of
435newly added data elements.
436
437<p>
438Of course, it is also necessary to remove elements from RCU-protected
439data structures, for example, using the following process:
440
441<ol>
442<li> Remove the data element from the enclosing structure.
443<li> Wait for all pre-existing RCU read-side critical sections
444 to complete (because only pre-existing readers can possibly have
445 a reference to the newly removed data element).
446<li> At this point, only the updater has a reference to the
447 newly removed data element, so it can safely reclaim
448 the data element, for example, by passing it to <tt>kfree()</tt>.
449</ol>
450
451This process is implemented by <tt>remove_gp_synchronous()</tt>:
452
453<blockquote>
454<pre>
455 1 bool remove_gp_synchronous(void)
456 2 {
457 3 struct foo *p;
458 4
459 5 spin_lock(&amp;gp_lock);
460 6 p = rcu_access_pointer(gp);
461 7 if (!p) {
462 8 spin_unlock(&amp;gp_lock);
463 9 return false;
46410 }
46511 rcu_assign_pointer(gp, NULL);
46612 spin_unlock(&amp;gp_lock);
46713 synchronize_rcu();
46814 kfree(p);
46915 return true;
47016 }
471</pre>
472</blockquote>
473
474<p>
475This function is straightforward, with line&nbsp;13 waiting for a grace
476period before line&nbsp;14 frees the old data element.
477This waiting ensures that readers will reach line&nbsp;7 of
478<tt>do_something_gp()</tt> before the data element referenced by
479<tt>p</tt> is freed.
480The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
481<tt>rcu_dereference()</tt>, except that:
482
483<ol>
484<li> The value returned by <tt>rcu_access_pointer()</tt>
485 cannot be dereferenced.
486 If you want to access the value pointed to as well as
487 the pointer itself, use <tt>rcu_dereference()</tt>
488 instead of <tt>rcu_access_pointer()</tt>.
489<li> The call to <tt>rcu_access_pointer()</tt> need not be
490 protected.
491 In contrast, <tt>rcu_dereference()</tt> must either be
492 within an RCU read-side critical section or in a code
493 segment where the pointer cannot change, for example, in
494 code protected by the corresponding update-side lock.
495</ol>
496
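<p>
For example, code that merely needs to learn whether or not
<tt>gp</tt> is currently <tt>NULL</tt> might use
<tt>rcu_access_pointer()</tt> with no other protection whatsoever,
as in the following sketch, where <tt>do_default_action()</tt> is a
hypothetical stand-in for whatever the caller does in the
<tt>NULL</tt> case:

<blockquote>
<pre>
 1 if (!rcu_access_pointer(gp))
 2   do_default_action(); /* hypothetical NULL-case handler */
</pre>
</blockquote>

<p>
Because the pointer itself is never dereferenced, neither
<tt>rcu_read_lock()</tt> nor the update-side lock is needed here.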
497<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a>
498Without the <tt>rcu_dereference()</tt> or the
499<tt>rcu_access_pointer()</tt>, what destructive optimizations
500might the compiler make use of?
501<br><a href="#qq4answer">Answer</a>
502
503<p>
504In short, RCU's publish-subscribe guarantee is provided by the combination
505of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
506This guarantee allows data elements to be safely added to RCU-protected
507linked data structures without disrupting RCU readers.
508This guarantee can be used in combination with the grace-period
509guarantee to also allow data elements to be removed from RCU-protected
510linked data structures, again without disrupting RCU readers.
511
512<p>
513This guarantee was only partially premeditated.
514DYNIX/ptx used an explicit memory barrier for publication, but had nothing
515resembling <tt>rcu_dereference()</tt> for subscription, nor did it
516have anything resembling the <tt>smp_read_barrier_depends()</tt>
517that was later subsumed into <tt>rcu_dereference()</tt>.
518The need for these operations made itself known quite suddenly at a
519late-1990s meeting with the DEC Alpha architects, back in the days when
520DEC was still a free-standing company.
521It took the Alpha architects a good hour to convince me that any sort
522of barrier would ever be needed, and it then took me a good <i>two</i> hours
523to convince them that their documentation did not make this point clear.
524More recent work with the C and C++ standards committees has provided
525much education on tricks and traps from the compiler.
526In short, compilers were much less tricky in the early 1990s, but in
5272015, don't even think about omitting <tt>rcu_dereference()</tt>!
528
529<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
530
531<p>
532The previous section's simple linked-data-structure scenario clearly
533demonstrates the need for RCU's stringent memory-ordering guarantees on
534systems with more than one CPU:
535
536<ol>
537<li> Each CPU that has an RCU read-side critical section that
538 begins before <tt>synchronize_rcu()</tt> starts is
539 guaranteed to execute a full memory barrier between the time
540 that the RCU read-side critical section ends and the time that
541 <tt>synchronize_rcu()</tt> returns.
542 Without this guarantee, a pre-existing RCU read-side critical section
543 might hold a reference to the newly removed <tt>struct foo</tt>
544 after the <tt>kfree()</tt> on line&nbsp;14 of
545 <tt>remove_gp_synchronous()</tt>.
546<li> Each CPU that has an RCU read-side critical section that ends
547 after <tt>synchronize_rcu()</tt> returns is guaranteed
548 to execute a full memory barrier between the time that
549 <tt>synchronize_rcu()</tt> begins and the time that the RCU
550 read-side critical section begins.
551 Without this guarantee, a later RCU read-side critical section
552 running after the <tt>kfree()</tt> on line&nbsp;14 of
553 <tt>remove_gp_synchronous()</tt> might
554 later run <tt>do_something_gp()</tt> and find the
555 newly deleted <tt>struct foo</tt>.
556<li> If the task invoking <tt>synchronize_rcu()</tt> remains
557 on a given CPU, then that CPU is guaranteed to execute a full
558 memory barrier sometime during the execution of
559 <tt>synchronize_rcu()</tt>.
560 This guarantee ensures that the <tt>kfree()</tt> on
561 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
562 execute after the removal on line&nbsp;11.
563<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
564 among a group of CPUs during that invocation, then each of the
565 CPUs in that group is guaranteed to execute a full memory barrier
566 sometime during the execution of <tt>synchronize_rcu()</tt>.
567 This guarantee also ensures that the <tt>kfree()</tt> on
568 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
569 execute after the removal on
570 line&nbsp;11, but also in the case where the thread executing the
571 <tt>synchronize_rcu()</tt> migrates in the meantime.
572</ol>
573
574<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a>
575Given that multiple CPUs can start RCU read-side critical sections
576at any time without any ordering whatsoever, how can RCU possibly tell whether
577or not a given RCU read-side critical section starts before a
578given instance of <tt>synchronize_rcu()</tt>?
579<br><a href="#qq5answer">Answer</a>
580
581<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a>
582The first and second guarantees require unbelievably strict ordering!
583Are all these memory barriers <i>really</i> required?
584<br><a href="#qq6answer">Answer</a>
585
586<p>
587Note that these memory-barrier requirements do not replace the fundamental
588RCU requirement that a grace period wait for all pre-existing readers.
589On the contrary, the memory barriers called out in this section must operate in
590such a way as to <i>enforce</i> this fundamental requirement.
591Of course, different implementations enforce this requirement in different
592ways, but enforce it they must.
593
594<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
595
596<p>
597The common-case RCU primitives are unconditional.
598They are invoked, they do their job, and they return, with no possibility
599of error, and no need to retry.
600This is a key RCU design philosophy.
601
602<p>
603However, this philosophy is pragmatic rather than pigheaded.
604If someone comes up with a good justification for a particular conditional
605RCU primitive, it might well be implemented and added.
606After all, this guarantee was reverse-engineered, not premeditated.
607The unconditional nature of the RCU primitives was initially an
608accident of implementation, and later experience with conditional
609synchronization primitives caused me to elevate this
610accident to a guarantee.
611Therefore, the justification for adding a conditional primitive to
612RCU would need to be based on detailed and compelling use cases.
613
614<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
615
616<p>
617As far as RCU is concerned, it is always possible to carry out an
618update within an RCU read-side critical section.
619For example, that RCU read-side critical section might search for
620a given data element, and then might acquire the update-side
621spinlock in order to update that element, all while remaining
622in that RCU read-side critical section.
623Of course, it is necessary to exit the RCU read-side critical section
624before invoking <tt>synchronize_rcu()</tt>; however, this
625inconvenience can be avoided through use of the
626<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
627described later in this document.
628
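<p>
For example, a read-to-write upgrade might be sketched as follows,
assuming that <tt>struct foo</tt> has been given a hypothetical
<tt>-&gt;lock</tt> spinlock that serializes updates to its fields:

<blockquote>
<pre>
 1 rcu_read_lock();
 2 p = rcu_dereference(gp);
 3 if (p) {
 4   spin_lock(&amp;p-&gt;lock);   /* acquire the update-side lock... */
 5   p-&gt;a++;                /* ...then update, still within the reader */
 6   spin_unlock(&amp;p-&gt;lock);
 7 }
 8 rcu_read_unlock();
</pre>
</blockquote>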
629<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a>
630But how does the upgrade-to-write operation exclude other readers?
631<br><a href="#qq7answer">Answer</a>
632
633<p>
634This guarantee allows lookup code to be shared between read-side
635and update-side code, and was premeditated, appearing in the earliest
636DYNIX/ptx RCU documentation.
637
638<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
639
640<p>
641RCU provides extremely lightweight readers, and its read-side guarantees,
642though quite useful, are correspondingly lightweight.
643It is therefore all too easy to assume that RCU is guaranteeing more
644than it really is.
645Of course, the list of things that RCU does not guarantee is infinitely
646long; however, the following sections list a few non-guarantees that
647have caused confusion.
648Except where otherwise noted, these non-guarantees were premeditated.
649
650<ol>
651<li> <a href="#Readers Impose Minimal Ordering">
652 Readers Impose Minimal Ordering</a>
653<li> <a href="#Readers Do Not Exclude Updaters">
654 Readers Do Not Exclude Updaters</a>
655<li> <a href="#Updaters Only Wait For Old Readers">
656 Updaters Only Wait For Old Readers</a>
657<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
658 Grace Periods Don't Partition Read-Side Critical Sections</a>
659<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
660 Read-Side Critical Sections Don't Partition Grace Periods</a>
661<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
662 Disabling Preemption Does Not Block Grace Periods</a>
663</ol>
664
665<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
666
667<p>
668Reader-side markers such as <tt>rcu_read_lock()</tt> and
669<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
670except through their interaction with the grace-period APIs such as
671<tt>synchronize_rcu()</tt>.
672To see this, consider the following pair of threads:
673
674<blockquote>
675<pre>
676 1 void thread0(void)
677 2 {
678 3 rcu_read_lock();
679 4 WRITE_ONCE(x, 1);
680 5 rcu_read_unlock();
681 6 rcu_read_lock();
682 7 WRITE_ONCE(y, 1);
683 8 rcu_read_unlock();
684 9 }
68510
68611 void thread1(void)
68712 {
68813 rcu_read_lock();
68914 r1 = READ_ONCE(y);
69015 rcu_read_unlock();
69116 rcu_read_lock();
69217 r2 = READ_ONCE(x);
69318 rcu_read_unlock();
69419 }
695</pre>
696</blockquote>
697
698<p>
699After <tt>thread0()</tt> and <tt>thread1()</tt> execute
700concurrently, it is quite possible to have
701
702<blockquote>
703<pre>
704(r1 == 1 &amp;&amp; r2 == 0)
705</pre>
706</blockquote>
707
708(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
709which would not be possible if <tt>rcu_read_lock()</tt> and
710<tt>rcu_read_unlock()</tt> had much in the way of ordering
711properties.
712But they do not, so the CPU is within its rights
713to do significant reordering.
714This is by design: Any significant ordering constraints would slow down
715these fast-path APIs.
716
717<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a>
718Can't the compiler also reorder this code?
719<br><a href="#qq8answer">Answer</a>
720
721<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
722
723<p>
724Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
725exclude updates.
726All they do is to prevent grace periods from ending.
727The following example illustrates this:
728
729<blockquote>
730<pre>
731 1 void thread0(void)
732 2 {
733 3 rcu_read_lock();
734 4 r1 = READ_ONCE(y);
735 5 if (r1) {
736 6 do_something_with_nonzero_x();
737 7 r2 = READ_ONCE(x);
738 8 WARN_ON(!r2); /* BUG!!! */
739 9 }
74010 rcu_read_unlock();
74111 }
74212
74313 void thread1(void)
74414 {
74515 spin_lock(&amp;my_lock);
74616 WRITE_ONCE(x, 1);
74717 WRITE_ONCE(y, 1);
74818 spin_unlock(&amp;my_lock);
74919 }
750</pre>
751</blockquote>
752
753<p>
754If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
755excluded the <tt>thread1()</tt> function's update,
756the <tt>WARN_ON()</tt> could never fire.
757But the fact is that <tt>rcu_read_lock()</tt> does not exclude
758much of anything aside from subsequent grace periods, of which
759<tt>thread1()</tt> has none, so the
760<tt>WARN_ON()</tt> can and does fire.
761
762<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
763
764<p>
765It might be tempting to assume that after <tt>synchronize_rcu()</tt>
766completes, there are no readers executing.
767This temptation must be avoided because
768new readers can start immediately after <tt>synchronize_rcu()</tt>
769starts, and <tt>synchronize_rcu()</tt> is under no
770obligation to wait for these new readers.
771
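<p>
For example, in the following sketch, the reader might begin after the
<tt>synchronize_rcu()</tt> has already started, in which case
<tt>synchronize_rcu()</tt> is free to return while that reader is
still running:

<blockquote>
<pre>
 1 void updater(void)
 2 {
 3   WRITE_ONCE(x, 1);
 4   synchronize_rcu(); /* waits only for pre-existing readers */
 5   WRITE_ONCE(y, 1);
 6 }
 7
 8 void late_reader(void)
 9 {
10   rcu_read_lock();   /* might start after synchronize_rcu() does */
11   r1 = READ_ONCE(x);
12   r2 = READ_ONCE(y); /* (r1 == 1 &amp;&amp; r2 == 1) is entirely possible */
13   rcu_read_unlock();
14 }
</pre>
</blockquote>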
772<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a>
773Suppose that synchronize_rcu() did wait until all readers had completed.
774Would the updater be able to rely on this?
775<br><a href="#qq9answer">Answer</a>
776
777<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
778Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
779
780<p>
781It is tempting to assume that if any part of one RCU read-side critical
782section precedes a given grace period, and if any part of another RCU
783read-side critical section follows that same grace period, then all of
784the first RCU read-side critical section must precede all of the second.
785However, this just isn't the case: A single grace period does not
786partition the set of RCU read-side critical sections.
787An example of this situation can be illustrated as follows, where
788<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
789
790<blockquote>
791<pre>
792 1 void thread0(void)
793 2 {
794 3 rcu_read_lock();
795 4 WRITE_ONCE(a, 1);
796 5 WRITE_ONCE(b, 1);
797 6 rcu_read_unlock();
798 7 }
799 8
800 9 void thread1(void)
80110 {
80211 r1 = READ_ONCE(a);
80312 synchronize_rcu();
80413 WRITE_ONCE(c, 1);
80514 }
80615
80716 void thread2(void)
80817 {
80918 rcu_read_lock();
81019 r2 = READ_ONCE(b);
81120 r3 = READ_ONCE(c);
81221 rcu_read_unlock();
81322 }
814</pre>
815</blockquote>
816
817<p>
818It turns out that the outcome:
819
820<blockquote>
821<pre>
822(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
823</pre>
824</blockquote>
825
826is entirely possible.
827The following figure shows how this can happen, with each circled
828<tt>QS</tt> indicating the point at which RCU recorded a
829<i>quiescent state</i> for each thread, that is, a state in which
830RCU knows that the thread cannot be in the midst of an RCU read-side
831critical section that started before the current grace period:
832
833<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
834
835<p>
836If it is necessary to partition RCU read-side critical sections in this
837manner, it is necessary to use two grace periods, where the first
838grace period is known to end before the second grace period starts:
839
840<blockquote>
841<pre>
842 1 void thread0(void)
843 2 {
844 3 rcu_read_lock();
845 4 WRITE_ONCE(a, 1);
846 5 WRITE_ONCE(b, 1);
847 6 rcu_read_unlock();
848 7 }
849 8
850 9 void thread1(void)
85110 {
85211 r1 = READ_ONCE(a);
85312 synchronize_rcu();
85413 WRITE_ONCE(c, 1);
85514 }
85615
85716 void thread2(void)
85817 {
85918 r2 = READ_ONCE(c);
86019 synchronize_rcu();
86120 WRITE_ONCE(d, 1);
86221 }
86322
86423 void thread3(void)
86524 {
86625 rcu_read_lock();
86726 r3 = READ_ONCE(b);
86827 r4 = READ_ONCE(d);
86928 rcu_read_unlock();
87029 }
871</pre>
872</blockquote>
873
874<p>
875Here, if <tt>(r1 == 1)</tt>, then
876<tt>thread0()</tt>'s write to <tt>b</tt> must happen
877before the end of <tt>thread1()</tt>'s grace period.
878If in addition <tt>(r4 == 1)</tt>, then
879<tt>thread3()</tt>'s read from <tt>b</tt> must happen
880after the beginning of <tt>thread2()</tt>'s grace period.
881If it is also the case that <tt>(r2 == 1)</tt>, then the
882end of <tt>thread1()</tt>'s grace period must precede the
883beginning of <tt>thread2()</tt>'s grace period.
884This means that the two RCU read-side critical sections cannot overlap,
885guaranteeing that <tt>(r3 == 1)</tt>.
886As a result, the outcome:
887
888<blockquote>
889<pre>
890(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
891</pre>
892</blockquote>
893
894cannot happen.
895
896<p>
897This non-requirement was also non-premeditated, but became apparent
898when studying RCU's interaction with memory ordering.
899
900<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
901Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
902
903<p>
904It is also tempting to assume that if an RCU read-side critical section
905happens between a pair of grace periods, then those grace periods cannot
906overlap.
907However, this temptation leads nowhere good, as can be illustrated by
908the following, with all variables initially zero:
909
910<blockquote>
911<pre>
912 1 void thread0(void)
913 2 {
914 3 rcu_read_lock();
915 4 WRITE_ONCE(a, 1);
916 5 WRITE_ONCE(b, 1);
917 6 rcu_read_unlock();
918 7 }
919 8
920 9 void thread1(void)
92110 {
92211 r1 = READ_ONCE(a);
92312 synchronize_rcu();
92413 WRITE_ONCE(c, 1);
92514 }
92615
92716 void thread2(void)
92817 {
92918 rcu_read_lock();
93019 WRITE_ONCE(d, 1);
93120 r2 = READ_ONCE(c);
93221 rcu_read_unlock();
93322 }
93423
93524 void thread3(void)
93625 {
93726 r3 = READ_ONCE(d);
93827 synchronize_rcu();
93928 WRITE_ONCE(e, 1);
94029 }
94130
94231 void thread4(void)
94332 {
94433 rcu_read_lock();
94534 r4 = READ_ONCE(b);
94635 r5 = READ_ONCE(e);
94736 rcu_read_unlock();
94837 }
949</pre>
950</blockquote>
951
952<p>
953In this case, the outcome:
954
955<blockquote>
956<pre>
957(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
958</pre>
959</blockquote>
960
961is entirely possible, as illustrated below:
962
963<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
964
965<p>
966Again, an RCU read-side critical section can overlap almost all of a
967given grace period, just so long as it does not overlap the entire
968grace period.
969As a result, an RCU read-side critical section cannot partition a pair
970of RCU grace periods.
971
972<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a>
973How long a sequence of grace periods, each separated by an RCU read-side
974critical section, would be required to partition the RCU read-side
975critical sections at the beginning and end of the chain?
976<br><a href="#qq10answer">Answer</a>
977
978<h3><a name="Disabling Preemption Does Not Block Grace Periods">
979Disabling Preemption Does Not Block Grace Periods</a></h3>
980
981<p>
982There was a time when disabling preemption on any given CPU would block
983subsequent grace periods.
984However, this was an accident of implementation and is not a requirement.
985And in the current Linux-kernel implementation, disabling preemption
986on a given CPU in fact does not block grace periods, as Oleg Nesterov
987<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
988
989<p>
990If you need a preempt-disable region to block grace periods, you need to add
991<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
992as follows:
993
994<blockquote>
995<pre>
996 1 preempt_disable();
997 2 rcu_read_lock();
998 3 do_something();
999 4 rcu_read_unlock();
1000 5 preempt_enable();
1001 6
1002 7 /* Spinlocks implicitly disable preemption. */
1003 8 spin_lock(&amp;mylock);
1004 9 rcu_read_lock();
100510 do_something();
100611 rcu_read_unlock();
100712 spin_unlock(&amp;mylock);
1008</pre>
1009</blockquote>
1010
1011<p>
1012In theory, you could enter the RCU read-side critical section first,
1013but it is more efficient to keep the entire RCU read-side critical
1014section contained in the preempt-disable region as shown above.
1015Of course, RCU read-side critical sections that extend outside of
1016preempt-disable regions will work correctly, but such critical sections
1017can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
1018more work.
1019And no, this is <i>not</i> an invitation to enclose all of your RCU
1020read-side critical sections within preempt-disable regions, because
1021doing so would degrade real-time response.
1022
1023<p>
1024This non-requirement appeared with preemptible RCU.
1025If you need a grace period that waits on non-preemptible code regions, use
1026<a href="#Sched Flavor">RCU-sched</a>.
1027
1028<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
1029
1030<p>
1031These parallelism facts of life are by no means specific to RCU, but
1032the RCU implementation must abide by them.
1033They therefore bear repeating:
1034
1035<ol>
1036<li> Any CPU or task may be delayed at any time,
1037 and any attempts to avoid these delays by disabling
1038 preemption, interrupts, or whatever are completely futile.
1039 This is most obvious in preemptible user-level
1040 environments and in virtualized environments (where
1041 a given guest OS's VCPUs can be preempted at any time by
1042 the underlying hypervisor), but can also happen in bare-metal
1043 environments due to ECC errors, NMIs, and other hardware
1044 events.
1045 Although a delay of more than about 20 seconds can result
1046 in splats, the RCU implementation is obligated to use
1047 algorithms that can tolerate extremely long delays, but where
1048 &ldquo;extremely long&rdquo; is not long enough to allow
1049 wrap-around when incrementing a 64-bit counter.
1050<li> Both the compiler and the CPU can reorder memory accesses.
1051 Where it matters, RCU must use compiler directives and
1052 memory-barrier instructions to preserve ordering.
1053<li> Conflicting writes to memory locations in any given cache line
1054 will result in expensive cache misses.
1055 Greater numbers of concurrent writes and more-frequent
1056 concurrent writes will result in more dramatic slowdowns.
1057 RCU is therefore obligated to use algorithms that have
1058 sufficient locality to avoid significant performance and
1059 scalability problems.
1060<li> As a rough rule of thumb, only one CPU's worth of processing
1061 may be carried out under the protection of any given exclusive
1062 lock.
1063 RCU must therefore use scalable locking designs.
1064<li> Counters are finite, especially on 32-bit systems.
1065 RCU's use of counters must therefore tolerate counter wrap,
1066 or be designed such that counter wrap would take way more
1067 time than a single system is likely to run.
1068 An uptime of ten years is quite possible, a runtime
1069 of a century much less so.
1070 As an example of the latter, RCU's dyntick-idle nesting counter
1071 allows 54 bits for interrupt nesting level (this counter
1072 is 64 bits even on a 32-bit system).
1073 Overflowing this counter requires 2<sup>54</sup>
1074 half-interrupts on a given CPU without that CPU ever going idle.
1075 If a half-interrupt happened every microsecond, it would take
1076 570 years of runtime to overflow this counter, which is currently
1077 believed to be an acceptably long time.
1078<li> Linux systems can have thousands of CPUs running a single
1079 Linux kernel in a single shared-memory environment.
1080 RCU must therefore pay close attention to high-end scalability.
1081</ol>
1082
1083<p>
1084This last parallelism fact of life means that RCU must pay special
1085attention to the preceding facts of life.
1086The idea that Linux might scale to systems with thousands of CPUs would
1087have been met with some skepticism in the 1990s, but these requirements
1088would otherwise have been unsurprising, even in the early 1990s.
1089
1090<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
1091
1092<p>
1093These sections list quality-of-implementation requirements.
1094Although an RCU implementation that ignores these requirements could
1095still be used, it would likely be subject to limitations that would
1096make it inappropriate for industrial-strength production use.
1097Classes of quality-of-implementation requirements are as follows:
1098
1099<ol>
1100<li> <a href="#Specialization">Specialization</a>
1101<li> <a href="#Performance and Scalability">Performance and Scalability</a>
1102<li> <a href="#Composability">Composability</a>
1103<li> <a href="#Corner Cases">Corner Cases</a>
1104</ol>
1105
1106<p>
1107These classes are covered in the following sections.
1108
1109<h3><a name="Specialization">Specialization</a></h3>
1110
1111<p>
1112RCU is and always has been intended primarily for read-mostly situations, as
1113illustrated by the following figure.
1114This means that RCU's read-side primitives are optimized, often at the
1115expense of its update-side primitives.
1116
1117<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
1118
1119<p>
1120This focus on read-mostly situations means that RCU must interoperate
1121with other synchronization primitives.
1122For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
1123examples discussed earlier use RCU to protect readers and locking to
1124coordinate updaters.
1125However, the need extends much farther, requiring that a variety of
1126synchronization primitives be legal within RCU read-side critical sections,
1127including spinlocks, sequence locks, atomic operations, reference
1128counters, and memory barriers.
1129
1130<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a>
1131What about sleeping locks?
1132<br><a href="#qq11answer">Answer</a>
1133
1134<p>
1135It often comes as a surprise, but many algorithms do not require a
1136consistent view of data and can function quite well without one,
1137with network routing being the poster child.
1138Internet routing algorithms take significant time to propagate
1139updates, so that by the time an update arrives at a given system,
1140that system has been sending network traffic the wrong way for
1141a considerable length of time.
1142Having a few threads continue to send traffic the wrong way for a
1143few more milliseconds is clearly not a problem: In the worst case,
1144TCP retransmissions will eventually get the data where it needs to go.
1145In general, when tracking the state of the universe outside of the
1146computer, some level of inconsistency must be tolerated due to
1147speed-of-light delays if nothing else.
1148
1149<p>
1150Furthermore, uncertainty about external state is inherent in many cases.
1151For example, a pair of veterinarians might use heartbeat to determine
1152whether or not a given cat was alive.
1153But how long should they wait after the last heartbeat to decide that
1154the cat is in fact dead?
1155Waiting less than 400 milliseconds makes no sense because this would
1156mean that a relaxed cat would be considered to cycle between death
1157and life more than 100 times per minute.
1158Moreover, just as with human beings, a cat's heart might stop for
1159some period of time, so the exact wait period is a judgment call.
1160One of our pair of veterinarians might wait 30 seconds before pronouncing
1161the cat dead, while the other might insist on waiting a full minute.
1162The two veterinarians would then disagree on the state of the cat during
1163the final 30 seconds of the minute following the last heartbeat, as
1164fancifully illustrated below:
1165
1166<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
1167
1168<p>
1169Interestingly enough, this same situation applies to hardware.
1170When push comes to shove, how do we tell whether or not some
1171external server has failed?
1172We send messages to it periodically, and declare it failed if we
1173don't receive a response within a given period of time.
1174Policy decisions can usually tolerate short
1175periods of inconsistency.
1176The policy was decided some time ago, and is only now being put into
1177effect, so a few milliseconds of delay is normally inconsequential.
1178
1179<p>
1180However, there are algorithms that absolutely must see consistent data.
1181For example, the translation of a user-level SystemV semaphore
1182ID to the corresponding in-kernel data structure is protected by RCU,
1183but it is absolutely forbidden to update a semaphore that has just been
1184removed.
1185In the Linux kernel, this need for consistency is accommodated by acquiring
1186spinlocks located in the in-kernel data structure from within
1187the RCU read-side critical section, and this is indicated by the
1188green box in the figure above.
1189Many other techniques may be used, and are in fact used within the
1190Linux kernel.
1191
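<p>
One way of arranging this can be sketched as follows, using a
hypothetical RCU-protected <tt>idtable[]</tt> array along with
hypothetical <tt>-&gt;lock</tt> and <tt>-&gt;deleted</tt> fields
rather than the actual SystemV semaphore data structures:

<blockquote>
<pre>
 1 rcu_read_lock();
 2 p = rcu_dereference(idtable[id]);
 3 if (p) {
 4   spin_lock(&amp;p-&gt;lock);   /* block concurrent removal */
 5   if (!p-&gt;deleted)       /* recheck now that the lock is held */
 6     update_element(p);     /* hypothetical consistent update */
 7   spin_unlock(&amp;p-&gt;lock);
 8 }
 9 rcu_read_unlock();
</pre>
</blockquote>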
1192<p>
1193In short, RCU is not required to maintain consistency, and other
1194mechanisms may be used in concert with RCU when consistency is required.
1195RCU's specialization allows it to do its job extremely well, and its
1196ability to interoperate with other synchronization mechanisms allows
1197the right mix of synchronization tools to be used for a given job.
1198
1199<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
1200
1201<p>
1202Energy efficiency is a critical component of performance today,
1203and Linux-kernel RCU implementations must therefore avoid unnecessarily
1204awakening idle CPUs.
1205I cannot claim that this requirement was premeditated.
1206In fact, I learned of it during a telephone conversation in which I
1207was given &ldquo;frank and open&rdquo; feedback on the importance
1208of energy efficiency in battery-powered systems and on specific
1209energy-efficiency shortcomings of the Linux-kernel RCU implementation.
1210In my experience, the battery-powered embedded community will consider
1211any unnecessary wakeups to be extremely unfriendly acts.
1212So much so that mere Linux-kernel-mailing-list posts are
1213insufficient to vent their ire.
1214
1215<p>
1216Memory consumption is not particularly important in most
1217situations, and has become decreasingly
1218so as memory sizes have expanded and memory
1219costs have plummeted.
1220However, as I learned from Matt Mackall's
1221<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
1222efforts, memory footprint is critically important on single-CPU systems with
1223non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
1224<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
1225was born.
1226Josh Triplett has since taken over the small-memory banner with his
1227<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
1228project, which resulted in
1229<a href="#Sleepable RCU">SRCU</a>
1230becoming optional for those kernels not needing it.
1231
1232<p>
1233The remaining performance requirements are, for the most part,
1234unsurprising.
1235For example, in keeping with RCU's read-side specialization,
1236<tt>rcu_dereference()</tt> should have negligible overhead (for
1237example, suppression of a few minor compiler optimizations).
1238Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
1239<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
1240
1241<p>
1242In preemptible environments, in the case where the RCU read-side
1243critical section was not preempted (as will be the case for the
1244highest-priority real-time process), <tt>rcu_read_lock()</tt> and
1245<tt>rcu_read_unlock()</tt> should have minimal overhead.
1246In particular, they should not contain atomic read-modify-write
1247operations, memory-barrier instructions, preemption disabling,
1248interrupt disabling, or backwards branches.
1249However, in the case where the RCU read-side critical section was preempted,
1250<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
1251This is why it is better to nest an RCU read-side critical section
1252within a preempt-disable region than vice versa, at least in cases
1253where that critical section is short enough to avoid unduly degrading
1254real-time latencies.
1255
1256<p>
1257The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
1258optimized for throughput.
1259It may therefore incur several milliseconds of latency in addition to
1260the duration of the longest RCU read-side critical section.
1261On the other hand, multiple concurrent invocations of
1262<tt>synchronize_rcu()</tt> are required to use batching optimizations
1263so that they can be satisfied by a single underlying grace-period-wait
1264operation.
1265For example, in the Linux kernel, it is not unusual for a single
1266grace-period-wait operation to serve more than
1267<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
1268of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
1269overhead down to nearly zero.
1270However, the grace-period optimization is also required to avoid
1271measurable degradation of real-time scheduling and interrupt latencies.
1272
1273<p>
1274In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
1275latencies are unacceptable.
1276In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
1277instead, reducing the grace-period latency down to a few tens of
1278microseconds on small systems, at least in cases where the RCU read-side
1279critical sections are short.
1280There are currently no special latency requirements for
1281<tt>synchronize_rcu_expedited()</tt> on large systems, but,
1282consistent with the empirical nature of the RCU specification,
1283that is subject to change.
1284However, there most definitely are scalability requirements:
1285A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
1286CPUs should at least make reasonable forward progress.
1287In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
1288is permitted to impose modest degradation of real-time latency
1289on non-idle online CPUs.
1290That said, it will likely be necessary to take further steps to reduce this
1291degradation, hopefully to roughly that of a scheduling-clock interrupt.
1292
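<p>
For example, a latency-sensitive variant of
<tt>remove_gp_synchronous()</tt> might be sketched as follows
(with the hypothetical name <tt>remove_gp_expedited()</tt>),
differing only in its choice of grace-period primitive:

<blockquote>
<pre>
 1 bool remove_gp_expedited(void)
 2 {
 3   struct foo *p;
 4
 5   spin_lock(&amp;gp_lock);
 6   p = rcu_access_pointer(gp);
 7   if (!p) {
 8     spin_unlock(&amp;gp_lock);
 9     return false;
10   }
11   rcu_assign_pointer(gp, NULL);
12   spin_unlock(&amp;gp_lock);
13   synchronize_rcu_expedited(); /* much shorter grace-period latency */
14   kfree(p);
15   return true;
16 }
</pre>
</blockquote>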
1293<p>
1294There are a number of situations where even
1295<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
1296latency is unacceptable.
1297In these situations, the asynchronous <tt>call_rcu()</tt> can be
1298used in place of <tt>synchronize_rcu()</tt> as follows:
1299
1300<blockquote>
1301<pre>
1302 1 struct foo {
1303 2 int a;
1304 3 int b;
1305 4 struct rcu_head rh;
1306 5 };
1307 6
1308 7 static void remove_gp_cb(struct rcu_head *rhp)
1309 8 {
1310 9 struct foo *p = container_of(rhp, struct foo, rh);
131110
131211 kfree(p);
131312 }
131413
131514 bool remove_gp_asynchronous(void)
131615 {
131716 struct foo *p;
131817
131918 spin_lock(&amp;gp_lock);
132019   p = rcu_access_pointer(gp);
132120 if (!p) {
132221 spin_unlock(&amp;gp_lock);
132322 return false;
132423 }
132524 rcu_assign_pointer(gp, NULL);
132625 call_rcu(&amp;p-&gt;rh, remove_gp_cb);
132726 spin_unlock(&amp;gp_lock);
132827 return true;
132928 }
1330</pre>
1331</blockquote>
1332
1333<p>
1334A definition of <tt>struct foo</tt> is finally needed, and appears
1335on lines&nbsp;1-5.
1336The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
1337on line&nbsp;25, and will be invoked after the end of a subsequent
1338grace period.
1339This gets the same effect as <tt>remove_gp_synchronous()</tt>,
1340but without forcing the updater to wait for a grace period to elapse.
1341The <tt>call_rcu()</tt> function may be used in a number of
1342situations where neither <tt>synchronize_rcu()</tt> nor
1343<tt>synchronize_rcu_expedited()</tt> would be legal,
1344including within preempt-disable code, <tt>local_bh_disable()</tt> code,
1345interrupt-disable code, and interrupt handlers.
1346However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
1347The callback function (<tt>remove_gp_cb()</tt> in this case) will be
1348executed within a softirq (software interrupt) environment within the
1349Linux kernel,
1350either within a real softirq handler or under the protection
1351of <tt>local_bh_disable()</tt>.
1352In both the Linux kernel and in userspace, it is bad practice to
1353write an RCU callback function that takes too long.
1354Long-running operations should be relegated to separate threads or
1355(in the Linux kernel) workqueues.
1356
1357<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a>
1358Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
1359After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
1360structure, which would interact badly with concurrent insertions.
1361Doesn't this mean that <tt>rcu_dereference()</tt> is required?
1362<br><a href="#qq12answer">Answer</a>
1363
1364<p>
1365However, all that <tt>remove_gp_cb()</tt> is doing is
1366invoking <tt>kfree()</tt> on the data element.
1367This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
1368which allows &ldquo;fire and forget&rdquo; operation as shown below:
1369
1370<blockquote>
1371<pre>
1372 1 struct foo {
1373 2 int a;
1374 3 int b;
1375 4 struct rcu_head rh;
1376 5 };
1377 6
1378 7 bool remove_gp_faf(void)
1379 8 {
1380 9 struct foo *p;
138110
138211 spin_lock(&amp;gp_lock);
138312 p = rcu_dereference(gp);
138413 if (!p) {
138514 spin_unlock(&amp;gp_lock);
138615 return false;
138716 }
138817 rcu_assign_pointer(gp, NULL);
138918 kfree_rcu(p, rh);
139019 spin_unlock(&amp;gp_lock);
139120 return true;
139221 }
1393</pre>
1394</blockquote>
1395
1396<p>
1397Note that <tt>remove_gp_faf()</tt> simply invokes
1398<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
1399further attention to the subsequent grace period and <tt>kfree()</tt>.
1400It is permissible to invoke <tt>kfree_rcu()</tt> from the same
1401environments as for <tt>call_rcu()</tt>.
1402Interestingly enough, DYNIX/ptx had the equivalents of
1403<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
1404<tt>synchronize_rcu()</tt>.
1405This was due to the fact that RCU was not heavily used within DYNIX/ptx,
1406so the very few places that needed something like
1407<tt>synchronize_rcu()</tt> simply open-coded it.
1408
1409<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a>
1410Earlier it was claimed that <tt>call_rcu()</tt> and
1411<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
1412by readers.
1413But how can that be correct, given that the invocation of the callback
1414and the freeing of the memory (respectively) must still wait for
1415a grace period to elapse?
1416<br><a href="#qq13answer">Answer</a>
1417
1418<p>
1419But what if the updater must wait for the completion of code to be
1420executed after the end of the grace period, but has other tasks
1421that can be carried out in the meantime?
1422The polling-style <tt>get_state_synchronize_rcu()</tt> and
1423<tt>cond_synchronize_rcu()</tt> functions may be used for this
1424purpose, as shown below:
1425
1426<blockquote>
1427<pre>
1428 1 bool remove_gp_poll(void)
1429 2 {
1430 3 struct foo *p;
1431 4 unsigned long s;
1432 5
1433 6 spin_lock(&amp;gp_lock);
1434 7 p = rcu_access_pointer(gp);
1435 8 if (!p) {
1436 9 spin_unlock(&amp;gp_lock);
143710 return false;
143811 }
143912 rcu_assign_pointer(gp, NULL);
144013 spin_unlock(&amp;gp_lock);
144114 s = get_state_synchronize_rcu();
144215 do_something_while_waiting();
144316 cond_synchronize_rcu(s);
144417 kfree(p);
144518 return true;
144619 }
1447</pre>
1448</blockquote>
1449
1450<p>
1451On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
1452&ldquo;cookie&rdquo; from RCU,
1453then line&nbsp;15 carries out other tasks,
1454and finally, line&nbsp;16 returns immediately if a grace period has
1455elapsed in the meantime, but otherwise waits as required.
1456The need for <tt>get_state_synchronize_rcu()</tt> and
1457<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
1458so it is too early to tell whether they will stand the test of time.
1459
1460<p>
1461RCU thus provides a range of tools to allow updaters to strike the
1462required tradeoff between latency, flexibility and CPU overhead.
1463
1464<h3><a name="Composability">Composability</a></h3>
1465
1466<p>
1467Composability has received much attention in recent years, perhaps in part
1468due to the collision of multicore hardware with object-oriented techniques
1469designed in single-threaded environments for single-threaded use.
1470And in theory, RCU read-side critical sections may be composed, and in
1471fact may be nested arbitrarily deeply.
1472In practice, as with all real-world implementations of composable
1473constructs, there are limitations.
1474
1475<p>
1476Implementations of RCU for which <tt>rcu_read_lock()</tt>
1477and <tt>rcu_read_unlock()</tt> generate no code, such as
1478Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
1479nested arbitrarily deeply.
1480After all, there is no overhead.
1481Except that if all these instances of <tt>rcu_read_lock()</tt>
1482and <tt>rcu_read_unlock()</tt> are visible to the compiler,
1483compilation will eventually fail due to exhausting memory,
1484mass storage, or user patience, whichever comes first.
1485If the nesting is not visible to the compiler, as is the case with
1486mutually recursive functions each in its own translation unit,
1487stack overflow will result.
1488If the nesting takes the form of loops, either the control variable
1489will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
1490Nevertheless, this class of RCU implementations is one
1491of the most composable constructs in existence.
1492
1493<p>
1494RCU implementations that explicitly track nesting depth
1495are limited by the nesting-depth counter.
1496For example, the Linux kernel's preemptible RCU limits nesting to
1497<tt>INT_MAX</tt>.
1498This should suffice for almost all practical purposes.
1499That said, a consecutive pair of RCU read-side critical sections
1500between which there is an operation that waits for a grace period
1501cannot be enclosed in another RCU read-side critical section.
1502This is because it is not legal to wait for a grace period within
1503an RCU read-side critical section: To do so would result either
1504in deadlock or
1505in RCU implicitly splitting the enclosing RCU read-side critical
1506section, neither of which is conducive to a long-lived and prosperous
1507kernel.
1508
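<p>
For example, the following sketch (reusing the <tt>do_something()</tt>
placeholder from the earlier examples) shows the forbidden pattern of
waiting for a grace period from within an enclosing RCU read-side
critical section:

<blockquote>
<pre>
 1 rcu_read_lock();
 2 do_something();
 3 synchronize_rcu(); /* BUG: grace-period wait inside an RCU reader. */
 4 rcu_read_unlock();
</pre>
</blockquote>

<p>
In this sketch, line&nbsp;3 either deadlocks or forces RCU to implicitly
split the enclosing critical section, as described above.
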
1509<p>
1510It is worth noting that RCU is not alone in limiting composability.
1511For example, many transactional-memory implementations prohibit
1512composing a pair of transactions separated by an irrevocable
1513operation (for example, a network receive operation).
1514For another example, lock-based critical sections can be composed
1515surprisingly freely, but only if deadlock is avoided.
1516
1517<p>
1518In short, although RCU read-side critical sections are highly composable,
1519care is required in some situations, just as is the case for any other
1520composable synchronization mechanism.
1521
1522<h3><a name="Corner Cases">Corner Cases</a></h3>
1523
1524<p>
1525A given RCU workload might have an endless and intense stream of
1526RCU read-side critical sections, perhaps even so intense that there
1527was never a point in time during which there was not at least one
1528RCU read-side critical section in flight.
1529RCU cannot allow this situation to block grace periods: As long as
1530all the RCU read-side critical sections are finite, grace periods
1531must also be finite.
1532
1533<p>
1534That said, preemptible RCU implementations could potentially result
1535in RCU read-side critical sections being preempted for long durations,
1536which has the effect of creating a long-duration RCU read-side
1537critical section.
1538This situation can arise only in heavily loaded systems, but systems using
1539real-time priorities are of course more vulnerable.
1540Therefore, RCU priority boosting is provided to help deal with this
1541case.
1542That said, the exact requirements on RCU priority boosting will likely
1543evolve as more experience accumulates.
1544
1545<p>
1546Other workloads might have very high update rates.
1547Although one can argue that such workloads should instead use
1548something other than RCU, the fact remains that RCU must
1549handle such workloads gracefully.
1550This requirement is another factor driving batching of grace periods,
1551but it is also the driving force behind the checks for large numbers
1552of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
1553Finally, high update rates should not delay RCU read-side critical
1554sections, although some read-side delays can occur when using
1555<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
1556of <tt>try_stop_cpus()</tt>.
1557(In the future, <tt>synchronize_rcu_expedited()</tt> will be
1558converted to use lighter-weight inter-processor interrupts (IPIs),
1559but this will still disturb readers, though to a much smaller degree.)
1560
1561<p>
1562Although all three of these corner cases were understood in the early
15631990s, a simple user-level test consisting of <tt>close(open(path))</tt>
1564in a tight loop
1565in the early 2000s suddenly provided a much deeper appreciation of the
1566high-update-rate corner case.
1567This test also motivated addition of some RCU code to react to high update
1568rates: for example, if a given CPU finds itself with more than 10,000
1569RCU callbacks queued, RCU will take evasive action by
1570more aggressively starting grace periods and more aggressively forcing
1571completion of grace-period processing.
1572This evasive action causes the grace period to complete more quickly,
1573but at the cost of restricting RCU's batching optimizations, thus
1574increasing the CPU overhead incurred by that grace period.
1575
1576<h2><a name="Software-Engineering Requirements">
1577Software-Engineering Requirements</a></h2>
1578
1579<p>
1580Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
1581guard against mishaps and misuse:
1582
1583<ol>
1584<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
1585 everywhere that it is needed, so kernels built with
1586	<tt>CONFIG_PROVE_RCU=y</tt> will splat if
1587 <tt>rcu_dereference()</tt> is used outside of an
1588 RCU read-side critical section.
1589 Update-side code can use <tt>rcu_dereference_protected()</tt>,
1590 which takes a
1591 <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
1592 to indicate what is providing the protection.
1593 If the indicated protection is not provided, a lockdep splat
1594	is emitted; a sketch of this usage appears just after this list.
1595
1596 <p>
1597 Code shared between readers and updaters can use
1598 <tt>rcu_dereference_check()</tt>, which also takes a
1599 lockdep expression, and emits a lockdep splat if neither
1600 <tt>rcu_read_lock()</tt> nor the indicated protection
1601 is in place.
1602 In addition, <tt>rcu_dereference_raw()</tt> is used in those
1603 (hopefully rare) cases where the required protection cannot
1604 be easily described.
1605 Finally, <tt>rcu_read_lock_held()</tt> is provided to
1606 allow a function to verify that it has been invoked within
1607 an RCU read-side critical section.
1608 I was made aware of this set of requirements shortly after Thomas
1609 Gleixner audited a number of RCU uses.
1610<li> A given function might wish to check for RCU-related preconditions
1611 upon entry, before using any other RCU API.
1612	The <tt>rcu_lockdep_assert()</tt> macro does this job,
1613 asserting the expression in kernels having lockdep enabled
1614 and doing nothing otherwise.
1615<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
1616 and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
1617 substituting a simple assignment.
1618 To catch this sort of error, a given RCU-protected pointer may be
1619 tagged with <tt>__rcu</tt>, after which running sparse
1620 with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
1621 about simple-assignment accesses to that pointer.
1622 Arnd Bergmann made me aware of this requirement, and also
1623 supplied the needed
1624 <a href="https://lwn.net/Articles/376011/">patch series</a>.
1625<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
1626 will splat if a data element is passed to <tt>call_rcu()</tt>
1627 twice in a row, without a grace period in between.
1628 (This error is similar to a double free.)
1629 The corresponding <tt>rcu_head</tt> structures that are
1630 dynamically allocated are automatically tracked, but
1631 <tt>rcu_head</tt> structures allocated on the stack
1632 must be initialized with <tt>init_rcu_head_on_stack()</tt>
1633 and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
1634 Similarly, statically allocated non-stack <tt>rcu_head</tt>
1635 structures must be initialized with <tt>init_rcu_head()</tt>
1636 and cleaned up with <tt>destroy_rcu_head()</tt>.
1637 Mathieu Desnoyers made me aware of this requirement, and also
1638 supplied the needed
1639 <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
1640<li> An infinite loop in an RCU read-side critical section will
1641 eventually trigger an RCU CPU stall warning splat, with
1642 the duration of &ldquo;eventually&rdquo; being controlled by the
1643 <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
1644 alternatively, by the
1645 <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
1646 parameter.
1647 However, RCU is not obligated to produce this splat
1648 unless there is a grace period waiting on that particular
1649 RCU read-side critical section.
1650 <p>
1651 Some extreme workloads might intentionally delay
1652 RCU grace periods, and systems running those workloads can
1653 be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
1654 to suppress the splats.
1655 This kernel parameter may also be set via <tt>sysfs</tt>.
1656 Furthermore, RCU CPU stall warnings are counter-productive
1657 during sysrq dumps and during panics.
1658 RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
1659 <tt>rcu_sysrq_end()</tt> API members to be called before
1660 and after long sysrq dumps.
1661 RCU also supplies the <tt>rcu_panic()</tt> notifier that is
1662 automatically invoked at the beginning of a panic to suppress
1663 further RCU CPU stall warnings.
1664
1665 <p>
1666 This requirement made itself known in the early 1990s, pretty
1667 much the first time that it was necessary to debug a CPU stall.
1668 That said, the initial implementation in DYNIX/ptx was quite
1669 generic in comparison with that of Linux.
1670<li> Although it would be very good to detect pointers leaking out
1671 of RCU read-side critical sections, there is currently no
1672 good way of doing this.
1673 One complication is the need to distinguish between pointers
1674 leaking and pointers that have been handed off from RCU to
1675 some other synchronization mechanism, for example, reference
1676 counting.
1677<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
1678 information is provided via both debugfs and event tracing.
1679<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
1680 <tt>rcu_dereference()</tt> to create typical linked
1681 data structures can be surprisingly error-prone.
1682 Therefore, RCU-protected
1683 <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
1684 and, more recently, RCU-protected
1685 <a href="https://lwn.net/Articles/612100/">hash tables</a>
1686 are available.
1687 Many other special-purpose RCU-protected data structures are
1688 available in the Linux kernel and the userspace RCU library.
1689<li> Some linked structures are created at compile time, but still
1690 require <tt>__rcu</tt> checking.
1691 The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
1692 purpose.
1693<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
1694 when creating linked structures that are to be published via
1695 a single external pointer.
1696 The <tt>RCU_INIT_POINTER()</tt> macro is provided for
1697 this task and also for assigning <tt>NULL</tt> pointers
1698 at runtime.
1699</ol>
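
<p>
As a minimal illustration of the first and third items above, the
following sketch (which assumes the <tt>gp</tt> pointer and
<tt>gp_lock</tt> from the earlier examples, with purely illustrative
function names) tags the pointer with <tt>__rcu</tt> and uses lockdep
expressions to document what provides protection in each context:

<blockquote>
<pre>
 1 struct foo __rcu *gp;  /* The __rcu tag enables sparse checking. */
 2 DEFINE_SPINLOCK(gp_lock);
 3
 4 struct foo *get_gp_update_side(void)
 5 {
 6   /* Updaters hold gp_lock, so no rcu_read_lock() is needed. */
 7   return rcu_dereference_protected(gp, lockdep_is_held(&amp;gp_lock));
 8 }
 9
10 struct foo *get_gp_either_side(void)
11 {
12   /* Either rcu_read_lock() or holding gp_lock suffices here. */
13   return rcu_dereference_check(gp, lockdep_is_held(&amp;gp_lock));
14 }
</pre>
</blockquote>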
1700
1701<p>
1702This is not a hard-and-fast list: RCU's diagnostic capabilities will
1703continue to be guided by the number and type of usage bugs found
1704in real-world RCU usage.
1705
1706<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
1707
1708<p>
1709The Linux kernel provides an interesting environment for all kinds of
1710software, including RCU.
1711Some of the relevant points of interest are as follows:
1712
1713<ol>
1714<li> <a href="#Configuration">Configuration</a>.
1715<li> <a href="#Firmware Interface">Firmware Interface</a>.
1716<li> <a href="#Early Boot">Early Boot</a>.
1717<li> <a href="#Interrupts and NMIs">
1718 Interrupts and non-maskable interrupts (NMIs)</a>.
1719<li> <a href="#Loadable Modules">Loadable Modules</a>.
1720<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
1721<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
1722<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
1723<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
1724<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
1725<li> <a href="#Performance, Scalability, Response Time, and Reliability">
1726 Performance, Scalability, Response Time, and Reliability</a>.
1727</ol>
1728
1729<p>
1730This list is probably incomplete, but it does give a feel for the
1731most notable Linux-kernel complications.
1732Each of the following sections covers one of the above topics.
1733
1734<h3><a name="Configuration">Configuration</a></h3>
1735
1736<p>
1737RCU's goal is automatic configuration, so that almost nobody
1738needs to worry about RCU's <tt>Kconfig</tt> options.
1739And for almost all users, RCU does in fact work well
1740&ldquo;out of the box.&rdquo;
1741
1742<p>
1743However, there are specialized use cases that are handled by
1744kernel boot parameters and <tt>Kconfig</tt> options.
1745Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
1746about new <tt>Kconfig</tt> options, which requires that almost all of them
1747be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
1748
1749<p>
1750This all should be quite obvious, but the fact remains that
1751Linus Torvalds recently had to
1752<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
1753me of this requirement.
1754
1755<h3><a name="Firmware Interface">Firmware Interface</a></h3>
1756
1757<p>
1758In many cases, the kernel obtains information about the system from the
1759firmware, and sometimes things are lost in translation.
1760Or the translation is accurate, but the original message is bogus.
1761
1762<p>
1763For example, some systems' firmware overreports the number of CPUs,
1764sometimes by a large factor.
1765If RCU naively believed the firmware, as it used to do,
1766it would create too many per-CPU kthreads.
1767Although the resulting system will still run correctly, the extra
1768kthreads needlessly consume memory and can cause confusion
1769when they show up in <tt>ps</tt> listings.
1770
1771<p>
1772RCU must therefore wait for a given CPU to actually come online before
1773it can allow itself to believe that the CPU actually exists.
1774The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
1775come online) cause a number of
1776<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
1777
1778<h3><a name="Early Boot">Early Boot</a></h3>
1779
1780<p>
1781The Linux kernel's boot sequence is an interesting process,
1782and RCU is used early, even before <tt>rcu_init()</tt>
1783is invoked.
1784In fact, a number of RCU's primitives can be used as soon as the
1785initial task's <tt>task_struct</tt> is available and the
1786boot CPU's per-CPU variables are set up.
1787The read-side primitives (<tt>rcu_read_lock()</tt>,
1788<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
1789and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
1790as will <tt>rcu_assign_pointer()</tt>.
1791
1792<p>
1793Although <tt>call_rcu()</tt> may be invoked at any
1794time during boot, callbacks are not guaranteed to be invoked until after
1795the scheduler is fully up and running.
1796This delay in callback invocation is due to the fact that RCU does not
1797invoke callbacks until it is fully initialized, and this full initialization
1798cannot occur until after the scheduler has initialized itself to the
1799point where RCU can spawn and run its kthreads.
1800In theory, it would be possible to invoke callbacks earlier;
1801however, this is not a panacea because there would be severe restrictions
1802on what operations those callbacks could invoke.
1803
1804<p>
1805Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
1806<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
1807(<a href="#Bottom-Half Flavor">discussed below</a>),
1808and
1809<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
1810will all operate normally
1811during very early boot, the reason being that there is only one CPU
1812and preemption is disabled.
1813This means that a call to <tt>synchronize_rcu()</tt> (or friends)
1814itself is a quiescent
1815state and thus a grace period, so the early-boot implementation can
1816be a no-op.
1817
1818<p>
1819Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
1820continue to operate normally through the remainder of boot, courtesy
1821of the fact that preemption is disabled across their RCU read-side
1822critical sections and also courtesy of the fact that there is still
1823only one CPU.
1824However, once the scheduler starts initializing, preemption is enabled.
1825There is still only a single CPU, but the fact that preemption is enabled
1826means that the no-op implementation of <tt>synchronize_rcu()</tt> no
1827longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
1828Therefore, as soon as the scheduler starts initializing, the early-boot
1829fastpath is disabled.
1830This means that <tt>synchronize_rcu()</tt> switches to its runtime
1831mode of operation where it posts callbacks, which in turn means that
1832any call to <tt>synchronize_rcu()</tt> will block until the corresponding
1833callback is invoked.
1834Unfortunately, the callback cannot be invoked until RCU's runtime
1835grace-period machinery is up and running, which cannot happen until
1836the scheduler has initialized itself sufficiently to allow RCU's
1837kthreads to be spawned.
1838Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
1839initialization can result in deadlock.
1840
1841<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a>
1842So what happens with <tt>synchronize_rcu()</tt> during
1843scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
1844kernels?
1845<br><a href="#qq14answer">Answer</a>
1846
1847<p>
1848I learned of these boot-time requirements as a result of a series of
1849system hangs.
1850
1851<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
1852
1853<p>
1854The Linux kernel has interrupts, and RCU read-side critical sections are
1855legal within interrupt handlers and within interrupt-disabled regions
1856of code, as are invocations of <tt>call_rcu()</tt>.
1857
1858<p>
1859Some Linux-kernel architectures can enter an interrupt handler from
1860non-idle process context, and then just never leave it, instead stealthily
1861transitioning back to process context.
1862This trick is sometimes used to invoke system calls from inside the kernel.
1863These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
1864about how it counts interrupt nesting levels.
1865I learned of this requirement the hard way during a rewrite
1866of RCU's dyntick-idle code.
1867
1868<p>
1869The Linux kernel has non-maskable interrupts (NMIs), and
1870RCU read-side critical sections are legal within NMI handlers.
1871Thankfully, RCU update-side primitives, including
1872<tt>call_rcu()</tt>, are prohibited within NMI handlers.
1873
1874<p>
1875The name notwithstanding, some Linux-kernel architectures
1876can have nested NMIs, which RCU must handle correctly.
1877Andy Lutomirski
1878<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
1879with this requirement;
1880he also kindly surprised me with
1881<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
1882that meets this requirement.
1883
1884<h3><a name="Loadable Modules">Loadable Modules</a></h3>
1885
1886<p>
1887The Linux kernel has loadable modules, and these modules can
1888also be unloaded.
1889After a given module has been unloaded, any attempt to call
1890one of its functions results in a segmentation fault.
1891The module-unload functions must therefore cancel any
1892delayed calls to loadable-module functions, for example,
1893any outstanding <tt>mod_timer()</tt> must be dealt with
1894via <tt>del_timer_sync()</tt> or similar.
1895
1896<p>
1897Unfortunately, there is no way to cancel an RCU callback;
1898once you invoke <tt>call_rcu()</tt>, the callback function is
1899going to eventually be invoked, unless the system goes down first.
1900Because it is normally considered socially irresponsible to crash the system
1901in response to a module unload request, we need some other way
1902to deal with in-flight RCU callbacks.
1903
1904<p>
1905RCU therefore provides
1906<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
1907which waits until all in-flight RCU callbacks have been invoked.
1908If a module uses <tt>call_rcu()</tt>, its exit function should therefore
1909prevent any future invocation of <tt>call_rcu()</tt>, then invoke
1910<tt>rcu_barrier()</tt>.
1911In theory, the underlying module-unload code could invoke
1912<tt>rcu_barrier()</tt> unconditionally, but in practice this would
1913incur unacceptable latencies.
1914
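<p>
A minimal sketch of this exit-time pattern might look as follows,
where <tt>foo_unregister_hooks()</tt> is a purely illustrative stand-in
for whatever prevents the module's code from posting further
<tt>call_rcu()</tt> callbacks:

<blockquote>
<pre>
 1 static void __exit foo_cleanup(void)
 2 {
 3   foo_unregister_hooks(); /* No more call_rcu() invocations after this. */
 4   rcu_barrier();          /* Wait for already-posted callbacks to finish. */
 5   /* It is now safe for the module's text and data to disappear. */
 6 }
 7 module_exit(foo_cleanup);
</pre>
</blockquote>
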
1915<p>
1916Nikita Danilov noted this requirement for an analogous filesystem-unmount
1917situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
1918The need for <tt>rcu_barrier()</tt> for module unloading became
1919apparent later.
1920
1921<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
1922
1923<p>
1924The Linux kernel supports CPU hotplug, which means that CPUs
1925can come and go.
1926It is of course illegal to use any RCU API member from an offline CPU.
1927This requirement was present from day one in DYNIX/ptx, but
1928on the other hand, the Linux kernel's CPU-hotplug implementation
1929is &ldquo;interesting.&rdquo;
1930
1931<p>
1932The Linux-kernel CPU-hotplug implementation has notifiers that
1933are used to allow the various kernel subsystems (including RCU)
1934to respond appropriately to a given CPU-hotplug operation.
1935Most RCU operations may be invoked from CPU-hotplug notifiers,
1936including even normal synchronous grace-period operations
1937such as <tt>synchronize_rcu()</tt>.
1938However, expedited grace-period operations such as
1939<tt>synchronize_rcu_expedited()</tt> are not supported,
1940due to the fact that current implementations block CPU-hotplug
1941operations, which could result in deadlock.
1942
1943<p>
1944In addition, all-callback-wait operations such as
1945<tt>rcu_barrier()</tt> are also not supported, due to the
1946fact that there are phases of CPU-hotplug operations where
1947the outgoing CPU's callbacks will not be invoked until after
1948the CPU-hotplug operation ends, which could also result in deadlock.
1949
1950<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
1951
1952<p>
1953RCU depends on the scheduler, and the scheduler uses RCU to
1954protect some of its data structures.
1955This means the scheduler is forbidden from acquiring
1956the runqueue locks and the priority-inheritance locks
1957in the middle of an outermost RCU read-side critical section unless either
1958(1)&nbsp;it releases them before exiting that same
1959RCU read-side critical section, or
1960(2)&nbsp;interrupts are disabled across
1961that entire RCU read-side critical section.
1962This same prohibition also applies (recursively!) to any lock that is acquired
1963while holding any lock to which this prohibition applies.
1964Adhering to this rule prevents preemptible RCU from invoking
1965<tt>rcu_read_unlock_special()</tt> while either runqueue or
1966priority-inheritance locks are held, thus avoiding deadlock.
1967
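<p>
The second option might be sketched as follows, with
<tt>some_sched_lock</tt> standing in for a runqueue or
priority-inheritance lock:

<blockquote>
<pre>
 1 unsigned long flags;
 2
 3 local_irq_save(flags); /* Interrupts disabled across the whole reader. */
 4 rcu_read_lock();
 5 raw_spin_lock(&amp;some_sched_lock);
 6 do_something();
 7 raw_spin_unlock(&amp;some_sched_lock);
 8 rcu_read_unlock();
 9 local_irq_restore(flags);
</pre>
</blockquote>
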
1968<p>
1969Prior to v4.4, it was only necessary to disable preemption across
1970RCU read-side critical sections that acquired scheduler locks.
1971In v4.4, expedited grace periods started using IPIs, and these
1972IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
1973Therefore, this expedited-grace-period change required disabling of
1974interrupts, not just preemption.
1975
1976<p>
1977For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
1978implementation must be written carefully to avoid similar deadlocks.
1979In particular, <tt>rcu_read_unlock()</tt> must tolerate an
1980interrupt where the interrupt handler invokes both
1981<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
1982This possibility requires <tt>rcu_read_unlock()</tt> to use
1983negative nesting levels to avoid destructive recursion via
1984the interrupt handler's use of RCU.
1985
1986<p>
1987This pair of mutual scheduler-RCU requirements came as a
1988<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
1989
1990<p>
1991As noted above, RCU makes use of kthreads, and it is necessary to
1992avoid excessive CPU-time accumulation by these kthreads.
1993This requirement was no surprise, but RCU's violation of it
1994when running context-switch-heavy workloads when built with
1995<tt>CONFIG_NO_HZ_FULL=y</tt>
1996<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
1997RCU has made good progress towards meeting this requirement, even
1998for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
1999but there is room for further improvement.
2000
2001<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
2002
2003<p>
2004It is possible to use tracing on RCU code, but tracing itself
2005uses RCU.
2006For this reason, <tt>rcu_dereference_raw_notrace()</tt>
2007is provided for use by tracing, which avoids the destructive
2008recursion that could otherwise ensue.
2009This API is also used by virtualization in some architectures,
2010where RCU readers execute in environments in which tracing
2011cannot be used.
2012The tracing folks both located the requirement and provided the
2013needed fix, so this surprise requirement was relatively painless.
2014
2015<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
2016
2017<p>
2018Interrupting idle CPUs is considered socially unacceptable,
2019especially by people with battery-powered embedded systems.
2020RCU therefore conserves energy by detecting which CPUs are
2021idle, including tracking CPUs that have been interrupted from idle.
2022This is a large part of the energy-efficiency requirement,
2023so I learned of this via an irate phone call.
2024
2025<p>
2026Because RCU avoids interrupting idle CPUs, it is illegal to
2027execute an RCU read-side critical section on an idle CPU.
2028(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
2029if you try it.)
2030The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
2031event tracing are provided to work around this restriction.
2032In addition, <tt>rcu_is_watching()</tt> may be used to
2033test whether or not it is currently legal to run RCU read-side
2034critical sections on this CPU.
2035I learned of the need for diagnostics on the one hand
2036and <tt>RCU_NONIDLE()</tt> on the other while inspecting
2037idle-loop code.
2038Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
2039which is used quite heavily in the idle loop.
2040
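<p>
For example, a tracepoint invoked from code that RCU believes to be
idle might be handled as sketched below, where
<tt>trace_foo_event()</tt> is a purely illustrative tracepoint:

<blockquote>
<pre>
 1 /* Tell RCU to pay attention for the duration of the statement. */
 2 RCU_NONIDLE(trace_foo_event(arg));
 3
 4 /* Alternatively, check whether RCU readers are currently legal. */
 5 if (rcu_is_watching())
 6   trace_foo_event(arg);
</pre>
</blockquote>
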
2041<p>
2042It is similarly socially unacceptable to interrupt an
2043<tt>nohz_full</tt> CPU running in userspace.
2044RCU must therefore track <tt>nohz_full</tt> userspace
2045execution.
2046And in
2047<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
2048kernels, RCU must separately track idle CPUs on the one hand and
2049CPUs that are either idle or executing in userspace on the other.
2050In both cases, RCU must be able to sample state at two points in
2051time, and be able to determine whether or not some other CPU spent
2052any time idle and/or executing in userspace.
2053
2054<p>
2055These energy-efficiency requirements have proven quite difficult to
2056understand and to meet, for example, there have been more than five
2057clean-sheet rewrites of RCU's energy-efficiency code, the last of
2058which was finally able to demonstrate
2059<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
2060As noted earlier,
2061I learned of many of these requirements via angry phone calls:
2062Flaming me on the Linux-kernel mailing list was apparently not
2063sufficient to fully vent their ire at RCU's energy-efficiency bugs!
2064
2065<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
2066
2067<p>
2068Although small-memory non-realtime systems can simply use Tiny RCU,
2069code size is only one aspect of memory efficiency.
2070Another aspect is the size of the <tt>rcu_head</tt> structure
2071used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
2072Although this structure contains nothing more than a pair of pointers,
2073it does appear in many RCU-protected data structures, including
2074some that are size critical.
2075The <tt>page</tt> structure is a case in point, as evidenced by
2076the many occurrences of the <tt>union</tt> keyword within that structure.
2077
2078<p>
2079This need for memory efficiency is one reason that RCU uses hand-crafted
2080singly linked lists to track the <tt>rcu_head</tt> structures that
2081are waiting for a grace period to elapse.
2082It is also the reason why <tt>rcu_head</tt> structures do not contain
2083debug information, such as fields tracking the file and line of the
2084<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
2085Although this information might appear in debug-only kernel builds at some
2086point, in the meantime, the <tt>-&gt;func</tt> field will often provide
2087the needed debug information.
2088
2089<p>
2090However, in some cases, the need for memory efficiency leads to even
2091more extreme measures.
2092Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
2093shares storage with a great many other structures that are used at
2094various points in the corresponding page's lifetime.
2095In order to correctly resolve certain
2096<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
2097the Linux kernel's memory-management subsystem needs a particular bit
2098to remain zero during all phases of grace-period processing,
2099and that bit happens to map to the bottom bit of the
2100<tt>rcu_head</tt> structure's <tt>-&gt;next</tt> field.
2101RCU makes this guarantee as long as <tt>call_rcu()</tt>
2102is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
2103or some future &ldquo;lazy&rdquo;
2104variant of <tt>call_rcu()</tt> that might one day be created for
2105energy-efficiency purposes.
2106
2107<h3><a name="Performance, Scalability, Response Time, and Reliability">
2108Performance, Scalability, Response Time, and Reliability</a></h3>
2109
2110<p>
2111Expanding on the
2112<a href="#Performance and Scalability">earlier discussion</a>,
2113RCU is used heavily by hot code paths in performance-critical
2114portions of the Linux kernel's networking, security, virtualization,
2115and scheduling subsystems.
2116RCU must therefore use efficient implementations, especially in its
2117read-side primitives.
2118To that end, it would be good if preemptible RCU's implementation
2119of <tt>rcu_read_lock()</tt> could be inlined; however, doing
2120this requires resolving <tt>#include</tt> issues with the
2121<tt>task_struct</tt> structure.
2122
2123<p>
2124The Linux kernel supports hardware configurations with up to
21254096 CPUs, which means that RCU must be extremely scalable.
2126Algorithms that involve frequent acquisitions of global locks or
2127frequent atomic operations on global variables simply cannot be
2128tolerated within the RCU implementation.
2129RCU therefore makes heavy use of a combining tree based on the
2130<tt>rcu_node</tt> structure.
2131RCU is required to tolerate all CPUs continuously invoking any
2132combination of RCU's runtime primitives with minimal per-operation
2133overhead.
2134In fact, in many cases, increasing load must <i>decrease</i> the
2135per-operation overhead; witness the batching optimizations for
2136<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
2137<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
2138As a general rule, RCU must cheerfully accept whatever the
2139rest of the Linux kernel decides to throw at it.
2140
2141<p>
2142The Linux kernel is used for real-time workloads, especially
2143in conjunction with the
2144<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
2145The real-time-latency response requirements are such that the
2146traditional approach of disabling preemption across RCU
2147read-side critical sections is inappropriate.
2148Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
2149use an RCU implementation that allows RCU read-side critical
2150sections to be preempted.
2151This requirement made its presence known after users made it
2152clear that an earlier
2153<a href="https://lwn.net/Articles/107930/">real-time patch</a>
2154did not meet their needs, in conjunction with some
2155<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
2156encountered by a very early version of the -rt patchset.
2157
2158<p>
2159In addition, RCU must make do with a sub-100-microsecond real-time latency
2160budget.
2161In fact, on smaller systems with the -rt patchset, the Linux kernel
2162provides sub-20-microsecond real-time latencies for the whole kernel,
2163including RCU.
2164RCU's scalability and latency must therefore be sufficient for
2165these sorts of configurations.
2166To my surprise, the sub-100-microsecond real-time latency budget
2167<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
2168applies to even the largest systems [PDF]</a>,
2169up to and including systems with 4096 CPUs.
2170This real-time requirement motivated the grace-period kthread, which
2171also simplified handling of a number of race conditions.
2172
2173<p>
2174Finally, RCU's status as a synchronization primitive means that
2175any RCU failure can result in arbitrary memory corruption that can be
2176extremely difficult to debug.
2177This means that RCU must be extremely reliable, which in
2178practice also means that RCU must have an aggressive stress-test
2179suite.
2180This stress-test suite is called <tt>rcutorture</tt>.
2181
2182<p>
2183Although the need for <tt>rcutorture</tt> was no surprise,
2184the current immense popularity of the Linux kernel is posing
2185interesting&mdash;and perhaps unprecedented&mdash;validation
2186challenges.
2187To see this, keep in mind that there are well over one billion
2188instances of the Linux kernel running today, given Android
2189smartphones, Linux-powered televisions, and servers.
2190This number can be expected to increase sharply with the advent of
2191the celebrated Internet of Things.
2192
2193<p>
2194Suppose that RCU contains a race condition that manifests on average
2195once per million years of runtime.
2196This bug will be occurring about three times per <i>day</i> across
2197the installed base.
2198RCU could simply hide behind hardware error rates, given that no one
2199should really expect their smartphone to last for a million years.
2200However, anyone taking too much comfort from this thought should
2201consider the fact that in most jurisdictions, a successful multi-year
2202test of a given mechanism, which might include a Linux kernel,
2203suffices for a number of types of safety-critical certifications.
2204In fact, rumor has it that the Linux kernel is already being used
2205in production for safety-critical applications.
2206I don't know about you, but I would feel quite bad if a bug in RCU
2207killed someone.
2208Which might explain my recent focus on validation and verification.
2209
2210<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
2211
2212<p>
2213One of the more surprising things about RCU is that there are now
2214no fewer than five <i>flavors</i>, or API families.
2215In addition, the primary flavor that has been the sole focus up to
2216this point has two different implementations, non-preemptible and
2217preemptible.
2218The other four flavors are listed below, with requirements for each
2219described in a separate section.
2220
2221<ol>
2222<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
2223<li> <a href="#Sched Flavor">Sched Flavor</a>
2224<li> <a href="#Sleepable RCU">Sleepable RCU</a>
2225<li> <a href="#Tasks RCU">Tasks RCU</a>
2226</ol>
2227
2228<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
2229
2230<p>
2231The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
2232hence the &ldquo;_bh&rdquo; abbreviations)
2233flavor of RCU, or <i>RCU-bh</i>, was developed by
2234Dipankar Sarma to provide a flavor of RCU that could withstand the
2235network-based denial-of-service attacks researched by Robert
2236Olsson.
2237These attacks placed so much networking load on the system
2238that some of the CPUs never exited softirq execution,
2239which in turn prevented those CPUs from ever executing a context switch,
2240which, in the RCU implementation of that time, prevented grace periods
2241from ever ending.
2242The result was an out-of-memory condition and a system hang.
2243
2244<p>
2245The solution was the creation of RCU-bh, which does
2246<tt>local_bh_disable()</tt>
2247across its read-side critical sections, and which uses the transition
2248from one type of softirq processing to another as a quiescent state
2249in addition to context switch, idle, user mode, and offline.
2250This means that RCU-bh grace periods can complete even when some of
2251the CPUs execute in softirq indefinitely, thus allowing algorithms
2252based on RCU-bh to withstand network-based denial-of-service attacks.
2253
2254<p>
2255Because
2256<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
2257disable and re-enable softirq handlers, any attempt to start a softirq
2258handler during the
2259RCU-bh read-side critical section will be deferred.
2260In this case, <tt>rcu_read_unlock_bh()</tt>
2261will invoke softirq processing, which can take considerable time.
2262One can of course argue that this softirq overhead should be associated
2263with the code following the RCU-bh read-side critical section rather
2264than <tt>rcu_read_unlock_bh()</tt>, but the fact
2265is that most profiling tools cannot be expected to make this sort
2266of fine distinction.
2267For example, suppose that a three-millisecond-long RCU-bh read-side
2268critical section executes during a time of heavy networking load.
2269There will very likely be an attempt to invoke at least one softirq
2270handler during that three milliseconds, but any such invocation will
2271be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
2272This can of course make it appear at first glance as if
2273<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
2274
2275<p>
2276The
2277<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
2278includes
2279<tt>rcu_read_lock_bh()</tt>,
2280<tt>rcu_read_unlock_bh()</tt>,
2281<tt>rcu_dereference_bh()</tt>,
2282<tt>rcu_dereference_bh_check()</tt>,
2283<tt>synchronize_rcu_bh()</tt>,
2284<tt>synchronize_rcu_bh_expedited()</tt>,
2285<tt>call_rcu_bh()</tt>,
2286<tt>rcu_barrier_bh()</tt>, and
2287<tt>rcu_read_lock_bh_held()</tt>.
2288
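<p>
An RCU-bh reader might therefore be sketched as follows, again
borrowing the <tt>gp</tt> pointer and <tt>do_something_with()</tt>
placeholder from the earlier examples:

<blockquote>
<pre>
 1 bool do_something_gp_bh(void)
 2 {
 3   struct foo *p;
 4
 5   rcu_read_lock_bh();
 6   p = rcu_dereference_bh(gp);
 7   if (!p) {
 8     rcu_read_unlock_bh();
 9     return false;
10   }
11   do_something_with(p-&gt;a);
12   rcu_read_unlock_bh();
13   return true;
14 }
</pre>
</blockquote>
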
2289<h3><a name="Sched Flavor">Sched Flavor</a></h3>
2290
2291<p>
2292Before preemptible RCU, waiting for an RCU grace period had the
2293side effect of also waiting for all pre-existing interrupt
2294and NMI handlers.
2295However, there are legitimate preemptible-RCU implementations that
2296do not have this property, given that any point in the code outside
2297of an RCU read-side critical section can be a quiescent state.
2298Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
2299RCU in that an RCU-sched grace period waits for pre-existing
2300interrupt and NMI handlers.
2301In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
2302APIs have identical implementations, while kernels built with
2303<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
2304
2305<p>
2306Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
2307<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
2308disable and re-enable preemption, respectively.
2309This means that if there was a preemption attempt during the
2310RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
2311will enter the scheduler, with all the latency and overhead entailed.
2312Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
2313as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
2314However, the highest-priority task won't be preempted, so that task
2315will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
2316
2317<p>
2318The
2319<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
2320includes
2321<tt>rcu_read_lock_sched()</tt>,
2322<tt>rcu_read_unlock_sched()</tt>,
2323<tt>rcu_read_lock_sched_notrace()</tt>,
2324<tt>rcu_read_unlock_sched_notrace()</tt>,
2325<tt>rcu_dereference_sched()</tt>,
2326<tt>rcu_dereference_sched_check()</tt>,
2327<tt>synchronize_sched()</tt>,
2328<tt>synchronize_sched_expedited()</tt>,
2329<tt>call_rcu_sched()</tt>,
2330<tt>rcu_barrier_sched()</tt>, and
2331<tt>rcu_read_lock_sched_held()</tt>.
2332However, anything that disables preemption also marks an RCU-sched
2333read-side critical section, including
2334<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
2335<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
2336and so on.
2337
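<p>
For example, both of the following sketches mark RCU-sched read-side
critical sections, again assuming the <tt>gp</tt> pointer and
<tt>do_something_with()</tt> placeholder from the earlier examples:

<blockquote>
<pre>
 1 struct foo *p;
 2
 3 /* Explicit markers. */
 4 rcu_read_lock_sched();
 5 p = rcu_dereference_sched(gp);
 6 if (p)
 7   do_something_with(p-&gt;a);
 8 rcu_read_unlock_sched();
 9
10 /* Implicit marking via preemption disabling. */
11 preempt_disable();
12 p = rcu_dereference_sched(gp);
13 if (p)
14   do_something_with(p-&gt;a);
15 preempt_enable();
</pre>
</blockquote>
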
2338<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
2339
2340<p>
2341For well over a decade, someone saying &ldquo;I need to block within
2342an RCU read-side critical section&rdquo; was a reliable indication
2343that this someone did not understand RCU.
2344After all, if you are always blocking in an RCU read-side critical
2345section, you can probably afford to use a higher-overhead synchronization
2346mechanism.
2347However, that changed with the advent of the Linux kernel's notifiers,
2348whose RCU read-side critical
2349sections almost never sleep, but sometimes need to.
2350This resulted in the introduction of
2351<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
2352or <i>SRCU</i>.
2353
2354<p>
2355SRCU allows different domains to be defined, with each such domain
2356defined by an instance of an <tt>srcu_struct</tt> structure.
2357A pointer to this structure must be passed in to each SRCU function,
2358for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
2359<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
2360The key benefit of these domains is that a slow SRCU reader in one
2361domain does not delay an SRCU grace period in some other domain.
2362That said, one consequence of these domains is that read-side code
2363must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
2364to <tt>srcu_read_unlock()</tt>, for example, as follows:
2365
2366<blockquote>
2367<pre>
2368 1 int idx;
2369 2
2370 3 idx = srcu_read_lock(&amp;ss);
2371 4 do_something();
2372 5 srcu_read_unlock(&amp;ss, idx);
2373</pre>
2374</blockquote>
2375
2376<p>
2377As noted above, it is legal to block within SRCU read-side critical sections;
2378however, with great power comes great responsibility.
2379If you block forever in one of a given domain's SRCU read-side critical
2380sections, then that domain's grace periods will also be blocked forever.
2381Of course, one good way to block forever is to deadlock, which can
2382happen if any operation in a given domain's SRCU read-side critical
2383section can block waiting, either directly or indirectly, for that domain's
2384grace period to elapse.
2385For example, this results in a self-deadlock:
2386
2387<blockquote>
2388<pre>
2389 1 int idx;
2390 2
2391 3 idx = srcu_read_lock(&amp;ss);
2392 4 do_something();
2393 5 synchronize_srcu(&amp;ss);
2394 6 srcu_read_unlock(&amp;ss, idx);
2395</pre>
2396</blockquote>
2397
2398<p>
2399However, if line&nbsp;5 acquired a mutex that was held across
2400a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
2401deadlock would still be possible.
2402Furthermore, if line&nbsp;5 acquired a mutex that was held across
2403a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
2404and if an <tt>ss1</tt>-domain SRCU read-side critical section
2405acquired another mutex that was held across an <tt>ss</tt>-domain
2406<tt>synchronize_srcu()</tt>,
2407deadlock would again be possible.
2408Such a deadlock cycle could extend across an arbitrarily large number
2409of different SRCU domains.
2410Again, with great power comes great responsibility.
2411
2412<p>
2413Unlike the other RCU flavors, SRCU read-side critical sections can
2414run on idle and even offline CPUs.
2415This ability requires that <tt>srcu_read_lock()</tt> and
2416<tt>srcu_read_unlock()</tt> contain memory barriers, which means
2417that SRCU readers will run a bit slower than would RCU readers.
2418It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
2419API, which, in combination with <tt>srcu_read_unlock()</tt>,
2420guarantees a full memory barrier.
2421
2422<p>
2423The
2424<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
2425includes
2426<tt>srcu_read_lock()</tt>,
2427<tt>srcu_read_unlock()</tt>,
2428<tt>srcu_dereference()</tt>,
2429<tt>srcu_dereference_check()</tt>,
2430<tt>synchronize_srcu()</tt>,
2431<tt>synchronize_srcu_expedited()</tt>,
2432<tt>call_srcu()</tt>,
2433<tt>srcu_barrier()</tt>, and
2434<tt>srcu_read_lock_held()</tt>.
2435It also includes
2436<tt>DEFINE_SRCU()</tt>,
2437<tt>DEFINE_STATIC_SRCU()</tt>, and
2438<tt>init_srcu_struct()</tt>
2439APIs for defining and initializing <tt>srcu_struct</tt> structures.
2440
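<p>
For example, a statically allocated domain and an asynchronous SRCU
updater might be sketched as follows, with <tt>foo_reclaim()</tt> being
a purely illustrative callback, <tt>struct foo</tt> taken from the
earlier examples, and <tt>p</tt> referencing the element to be freed:

<blockquote>
<pre>
 1 DEFINE_STATIC_SRCU(ss);
 2
 3 static void foo_reclaim(struct rcu_head *rhp)
 4 {
 5   kfree(container_of(rhp, struct foo, rh));
 6 }
 7
 8 /* Updater: wait asynchronously for an ss-domain grace period. */
 9 call_srcu(&amp;ss, &amp;p-&gt;rh, foo_reclaim);
</pre>
</blockquote>
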
2441<h3><a name="Tasks RCU">Tasks RCU</a></h3>
2442
2443<p>
2444Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
2445binary rewriting required to install different types of probes.
2446It would be good to be able to free old trampolines, which sounds
2447like a job for some form of RCU.
2448However, because it is necessary to be able to install a trace
2449anywhere in the code, it is not possible to use read-side markers
2450such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
2451In addition, it does not work to have these markers in the trampoline
2452itself, because there would need to be instructions following
2453<tt>rcu_read_unlock()</tt>.
2454Although <tt>synchronize_rcu()</tt> would guarantee that execution
2455reached the <tt>rcu_read_unlock()</tt>, it would not be able to
2456guarantee that execution had completely left the trampoline.
2457
2458<p>
2459The solution, in the form of
2460<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
2461is to have implicit
2462read-side critical sections that are delimited by voluntary context
2463switches, that is, calls to <tt>schedule()</tt>,
2464<tt>cond_resched_rcu_qs()</tt>, and
2465<tt>synchronize_rcu_tasks()</tt>.
2466In addition, transitions to and from userspace execution also delimit
2467tasks-RCU read-side critical sections.
2468
2469<p>
2470The tasks-RCU API is quite compact, consisting only of
2471<tt>call_rcu_tasks()</tt>,
2472<tt>synchronize_rcu_tasks()</tt>, and
2473<tt>rcu_barrier_tasks()</tt>.
2474
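<p>
For example, removal of a trampoline might be sketched as follows,
where everything other than <tt>synchronize_rcu_tasks()</tt> is a
purely illustrative placeholder:

<blockquote>
<pre>
 1 unhook_trampoline(tramp);  /* No new tasks can enter the trampoline. */
 2 synchronize_rcu_tasks();   /* Wait for all tasks to voluntarily switch. */
 3 free_trampoline(tramp);    /* No task can still be executing in it. */
</pre>
</blockquote>
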
2475<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
2476
2477<p>
2478One of the tricks that RCU uses to attain update-side scalability is
2479to increase grace-period latency with increasing numbers of CPUs.
2480If this becomes a serious problem, it will be necessary to rework the
2481grace-period state machine so as to avoid the need for the additional
2482latency.
2483
2484<p>
2485Expedited grace periods scan the CPUs, so their latency and overhead
2486increases with increasing numbers of CPUs.
2487If this becomes a serious problem on large systems, it will be necessary
2488to do some redesign to avoid this scalability problem.
2489
2490<p>
2491RCU disables CPU hotplug in a few places, perhaps most notably in the
2492expedited grace-period and <tt>rcu_barrier()</tt> operations.
2493If there is a strong reason to use expedited grace periods in CPU-hotplug
2494notifiers, it will be necessary to avoid disabling CPU hotplug.
2495This would introduce some complexity, so there had better be a <i>very</i>
2496good reason.
2497
2498<p>
2499The tradeoff between grace-period latency on the one hand and interruptions
2500of other CPUs on the other hand may need to be re-examined.
2501The desire is of course for zero grace-period latency as well as zero
2502interprocessor interrupts undertaken during an expedited grace period
2503operation.
2504While this ideal is unlikely to be achievable, it is quite possible that
2505further improvements can be made.
2506
2507<p>
2508The multiprocessor implementations of RCU use a combining tree that
2509groups CPUs so as to reduce lock contention and increase cache locality.
2510However, this combining tree does not spread its memory across NUMA
2511nodes nor does it align the CPU groups with hardware features such
2512as sockets or cores.
2513Such spreading and alignment is currently believed to be unnecessary
2514because the hotpath read-side primitives do not access the combining
2515tree, nor does <tt>call_rcu()</tt> in the common case.
2516If you believe that your architecture needs such spreading and alignment,
2517then your architecture should also benefit from the
2518<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
2519to the number of CPUs in a socket, NUMA node, or whatever.
2520If the number of CPUs is too large, use a fraction of the number of
2521CPUs.
2522If the number of CPUs is a large prime number, well, that certainly
2523is an &ldquo;interesting&rdquo; architectural choice!
2524More flexible arrangements might be considered, but only if
2525<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
2526if the inadequacy has been demonstrated by a carefully run and
2527realistic system-level workload.
2528
2529<p>
2530Please note that arrangements that require RCU to remap CPU numbers will
2531require extremely good demonstration of need and full exploration of
2532alternatives.
2533
2534<p>
2535There is an embarrassingly large number of flavors of RCU, and this
2536number has been increasing over time.
2537Perhaps it will be possible to combine some at some future date.
2538
2539<p>
2540RCU's various kthreads are reasonably recent additions.
2541It is quite likely that adjustments will be required to more gracefully
2542handle extreme loads.
2543It might also be necessary to be able to relate CPU utilization by
2544RCU's kthreads and softirq handlers to the code that instigated this
2545CPU utilization.
2546For example, RCU callback overhead might be charged back to the
2547originating <tt>call_rcu()</tt> instance, though probably not
2548in production kernels.
2549
2550<h2><a name="Summary">Summary</a></h2>
2551
2552<p>
2553This document has presented more than two decades' worth of RCU
2554requirements.
2555Given that the requirements keep changing, this will not be the last
2556word on this subject, but at least it serves to get an important
2557subset of the requirements set forth.
2558
2559<h2><a name="Acknowledgments">Acknowledgments</a></h2>
2560
2561I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
2562Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
2563Andy Lutomirski for their help in rendering
2564this article human readable, and to Michelle Rankin for her support
2565of this effort.
2566Other contributions are acknowledged in the Linux kernel's git archive.
2567The cartoon is copyright (c) 2013 by Melissa Broussard,
2568and is provided
2569under the terms of the Creative Commons Attribution-Share Alike 3.0
2570United States license.
2571
2572<h3><a name="Answers to Quick Quizzes">
2573Answers to Quick Quizzes</a></h3>
2574
2575<a name="qq1answer"></a>
2576<p><b>Quick Quiz 1</b>:
2577Wait a minute!
2578You said that updaters can make useful forward progress concurrently
2579with readers, but pre-existing readers will block
2580<tt>synchronize_rcu()</tt>!!!
2581Just who are you trying to fool???
2582
2583
2584</p><p><b>Answer</b>:
2585First, if updaters do not wish to be blocked by readers, they can use
2586<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
2587be discussed later.
2588Second, even when using <tt>synchronize_rcu()</tt>, the other
2589update-side code does run concurrently with readers, whether pre-existing
2590or not.
2591
2592
2593</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a>
2594
2595<a name="qq2answer"></a>
2596<p><b>Quick Quiz 2</b>:
2597Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
2598
2599
2600</p><p><b>Answer</b>:
2601Without that extra grace period, memory reordering could result in
2602<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
2603concurrently with the last bits of <tt>recovery()</tt>.
2604
2605
2606</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a>
2607
2608<a name="qq3answer"></a>
2609<p><b>Quick Quiz 3</b>:
2610But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
2611two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
2612from being reordered.
2613Can't that also cause problems?
2614
2615
2616</p><p><b>Answer</b>:
2617No, it cannot.
2618The readers cannot see either of these two fields until
2619the assignment to <tt>gp</tt>, by which time both fields are
2620fully initialized.
2621So reordering the assignments
2622to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
2623cause any problems.
2624
2625
2626</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a>
2627
2628<a name="qq4answer"></a>
2629<p><b>Quick Quiz 4</b>:
2630Without the <tt>rcu_dereference()</tt> or the
2631<tt>rcu_access_pointer()</tt>, what destructive optimizations
2632might the compiler make use of?
2633
2634
2635</p><p><b>Answer</b>:
2636Let's start with what happens to <tt>do_something_gp()</tt>
2637if it fails to use <tt>rcu_dereference()</tt>.
2638It could reuse a value formerly fetched from this same pointer.
2639It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
2640manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
2641mash-up of two distinct pointer values.
2642It might even use value-speculation optimizations, where it makes a wrong
2643guess, but by the time it gets around to checking the value, an update
2644has changed the pointer to match the wrong guess.
2645Too bad about any dereferences that returned pre-initialization garbage
2646in the meantime!
2647
2648<p>
2649For <tt>remove_gp_synchronous()</tt>, as long as all modifications
2650to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
2651the above optimizations are harmless.
2652However,
2653with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
2654<tt>sparse</tt> will complain if you
2655define <tt>gp</tt> with <tt>__rcu</tt> and then
2656access it without using
2657either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
2658
2659
2660</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a>
2661
2662<a name="qq5answer"></a>
2663<p><b>Quick Quiz 5</b>:
2664Given that multiple CPUs can start RCU read-side critical sections
2665at any time without any ordering whatsoever, how can RCU possibly tell whether
2666or not a given RCU read-side critical section starts before a
2667given instance of <tt>synchronize_rcu()</tt>?
2668
2669
2670</p><p><b>Answer</b>:
2671If RCU cannot tell whether or not a given
2672RCU read-side critical section starts before a
2673given instance of <tt>synchronize_rcu()</tt>,
2674then it must assume that the RCU read-side critical section
2675started first.
2676In other words, a given instance of <tt>synchronize_rcu()</tt>
2677can avoid waiting on a given RCU read-side critical section only
2678if it can prove that <tt>synchronize_rcu()</tt> started first.
2679
2680
2681</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a>
2682
2683<a name="qq6answer"></a>
2684<p><b>Quick Quiz 6</b>:
2685The first and second guarantees require unbelievably strict ordering!
2686Are all these memory barriers <i> really</i> required?
2687
2688
2689</p><p><b>Answer</b>:
2690Yes, they really are required.
2691To see why the first guarantee is required, consider the following
2692sequence of events:
2693
2694<ol>
2695<li> CPU 1: <tt>rcu_read_lock()</tt>
2696<li> CPU 1: <tt>q = rcu_dereference(gp);
2697 /* Very likely to return p. */</tt>
2698<li> CPU 0: <tt>list_del_rcu(p);</tt>
2699<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
2700<li> CPU 1: <tt>do_something_with(q-&gt;a);
2701 /* No smp_mb(), so might happen after kfree(). */</tt>
2702<li> CPU 1: <tt>rcu_read_unlock()</tt>
2703<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
2704<li> CPU 0: <tt>kfree(p);</tt>
2705</ol>
2706
2707<p>
2708Therefore, there absolutely must be a full memory barrier between the
2709end of the RCU read-side critical section and the end of the
2710grace period.
2711
2712<p>
2713The sequence of events demonstrating the necessity of the second rule
2714is roughly similar:
2715
2716<ol>
2717<li> CPU 0: <tt>list_del_rcu(p);</tt>
2718<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
2719<li> CPU 1: <tt>rcu_read_lock()</tt>
2720<li> CPU 1: <tt>q = rcu_dereference(gp);
2721 /* Might return p if no memory barrier. */</tt>
2722<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
2723<li> CPU 0: <tt>kfree(p);</tt>
2724<li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
2725<li> CPU 1: <tt>rcu_read_unlock()</tt>
2726</ol>
2727
2728<p>
2729And similarly, without a memory barrier between the beginning of the
2730grace period and the beginning of the RCU read-side critical section,
2731CPU&nbsp;1 might end up accessing the freelist.
2732
2733<p>
2734The &ldquo;as if&rdquo; rule of course applies, so that any implementation
2735that acts as if the appropriate memory barriers were in place is a
2736correct implementation.
2737That said, it is much easier to fool yourself into believing that you have
2738adhered to the as-if rule than it is to actually adhere to it!
2739
2740
2741</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a>
2742
2743<a name="qq7answer"></a>
2744<p><b>Quick Quiz 7</b>:
2745But how does the upgrade-to-write operation exclude other readers?
2746
2747
2748</p><p><b>Answer</b>:
2749It doesn't, just like normal RCU updates, which also do not exclude
2750RCU readers.
2751
2752
2753</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a>
2754
2755<a name="qq8answer"></a>
2756<p><b>Quick Quiz 8</b>:
2757Can't the compiler also reorder this code?
2758
2759
2760</p><p><b>Answer</b>:
2761No, the volatile casts in <tt>READ_ONCE()</tt> and
2762<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
2763this particular case.
2764
2765
2766</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a>
2767
2768<a name="qq9answer"></a>
2769<p><b>Quick Quiz 9</b>:
2770Suppose that synchronize_rcu() did wait until all readers had completed.
2771Would the updater be able to rely on this?
2772
2773
2774</p><p><b>Answer</b>:
2775No.
2776Even if <tt>synchronize_rcu()</tt> were to wait until
2777all readers had completed, a new reader might start immediately after
2778<tt>synchronize_rcu()</tt> completed.
2779Therefore, the code following
2780<tt>synchronize_rcu()</tt> cannot rely on there being no readers
2781in any case.
2782
2783
2784</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a>
2785
2786<a name="qq10answer"></a>
2787<p><b>Quick Quiz 10</b>:
2788How long a sequence of grace periods, each separated by an RCU read-side
2789critical section, would be required to partition the RCU read-side
2790critical sections at the beginning and end of the chain?
2791
2792
2793</p><p><b>Answer</b>:
2794In theory, an infinite number.
2795In practice, an unknown number that is sensitive to both implementation
2796details and timing considerations.
2797Therefore, even in practice, RCU users must abide by the theoretical rather
2798than the practical answer.
2799
2800
2801</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a>
2802
2803<a name="qq11answer"></a>
2804<p><b>Quick Quiz 11</b>:
2805What about sleeping locks?
2806
2807
2808</p><p><b>Answer</b>:
2809These are forbidden within Linux-kernel RCU read-side critical sections
2810because it is not legal to place a quiescent state (in this case,
2811voluntary context switch) within an RCU read-side critical section.
2812However, sleeping locks may be used within userspace RCU read-side critical
2813sections, and also within Linux-kernel sleepable RCU
2814<a href="#Sleepable RCU">(SRCU)</a>
2815read-side critical sections.
2816In addition, the -rt patchset turns spinlocks into sleeping locks so
2817that the corresponding critical sections can be preempted, which
2818also means that these sleeplockified spinlocks (but not other sleeping locks!)
2819may be acquired within -rt-Linux-kernel RCU read-side critical sections.
2820
2821<p>
2822Note that it <i>is</i> legal for a normal RCU read-side critical section
2823to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
2824but only as long as it does not loop indefinitely attempting to
2825conditionally acquire that sleeping lock.
2826The key point is that things like <tt>mutex_trylock()</tt>
2827either return with the mutex held, or return an error indication if
2828the mutex was not immediately available.
2829Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
2830
2831
2832</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a>
2833
2834<a name="qq12answer"></a>
2835<p><b>Quick Quiz 12</b>:
2836Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
2837After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
2838structure, which would interact badly with concurrent insertions.
2839Doesn't this mean that <tt>rcu_dereference()</tt> is required?
2840
2841
2842</p><p><b>Answer</b>:
2843Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
2844any changes, including any insertions that <tt>rcu_dereference()</tt>
2845would protect against.
2846Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
2847is released on line&nbsp;25, which in turn means that
2848<tt>rcu_access_pointer()</tt> suffices.
2849
2850
2851</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a>
2852
2853<a name="qq13answer"></a>
2854<p><b>Quick Quiz 13</b>:
2855Earlier it was claimed that <tt>call_rcu()</tt> and
2856<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
2857by readers.
2858But how can that be correct, given that the invocation of the callback
2859and the freeing of the memory (respectively) must still wait for
2860a grace period to elapse?
2861
2862
2863</p><p><b>Answer</b>:
2864We could define things this way, but keep in mind that this sort of
2865definition would say that updates in garbage-collected languages
2866cannot complete until the next time the garbage collector runs,
2867which does not seem at all reasonable.
2868The key point is that in most cases, an updater using either
2869<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
2870next update as soon as it has invoked <tt>call_rcu()</tt> or
2871<tt>kfree_rcu()</tt>, without having to wait for a subsequent
2872grace period.
2873
2874
2875</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a>
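<p>
For illustration only, the following sketch shows one way such a
non-blocking updater might look.
The name <tt>remove_gp_asynchronous()</tt> is hypothetical, and the sketch
assumes that <tt>struct foo</tt> contains a <tt>struct rcu_head</tt>
field named <tt>rh</tt>:

<blockquote>
<pre>
bool remove_gp_asynchronous(void)
{
  struct foo *p;

  spin_lock(&amp;gp_lock);
  p = rcu_access_pointer(gp);
  if (!p) {
    spin_unlock(&amp;gp_lock);
    return false;
  }
  rcu_assign_pointer(gp, NULL);
  spin_unlock(&amp;gp_lock);
  kfree_rcu(p, rh); /* Frees p only after a grace period, but returns at once. */
  return true;
}
</pre>
</blockquote>

<p>
The updater can proceed to its next update as soon as
<tt>kfree_rcu()</tt> returns; the grace period and the eventual
<tt>kfree()</tt> happen asynchronously.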
2876
2877<a name="qq14answer"></a>
2878<p><b>Quick Quiz 14</b>:
2879So what happens with <tt>synchronize_rcu()</tt> during
2880scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
2881kernels?
2882
2883
2884</p><p><b>Answer</b>:
2885In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
2886maps directly to <tt>synchronize_sched()</tt>.
2887Therefore, <tt>synchronize_rcu()</tt> works normally throughout
2888boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
2889However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
2890so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
2891during scheduler initialization.
2892
2893
2894</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a>
2895
2896
2897</body></html>
diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx
new file mode 100644
index 000000000000..3a97ba490c42
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx
@@ -0,0 +1,2741 @@
1<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2 "http://www.w3.org/TR/html4/loose.dtd">
3 <html>
4 <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
5 <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
6
7<h1>A Tour Through RCU's Requirements</h1>
8
9<p>Copyright IBM Corporation, 2015</p>
10<p>Author: Paul E.&nbsp;McKenney</p>
11<p><i>The initial version of this document appeared in the
12<a href="https://lwn.net/">LWN</a> articles
13<a href="https://lwn.net/Articles/652156/">here</a>,
14<a href="https://lwn.net/Articles/652677/">here</a>, and
15<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
16
17<h2>Introduction</h2>
18
19<p>
20Read-copy update (RCU) is a synchronization mechanism that is often
21used as a replacement for reader-writer locking.
22RCU is unusual in that updaters do not block readers,
23which means that RCU's read-side primitives can be exceedingly fast
24and scalable.
25In addition, updaters can make useful forward progress concurrently
26with readers.
27However, all this concurrency between RCU readers and updaters does raise
28the question of exactly what RCU readers are doing, which in turn
29raises the question of exactly what RCU's requirements are.
30
31<p>
32This document therefore summarizes RCU's requirements, and can be thought
33of as an informal, high-level specification for RCU.
34It is important to understand that RCU's specification is primarily
35empirical in nature;
36in fact, I learned about many of these requirements the hard way.
37This situation might cause some consternation; however, not only
38has this learning process been a lot of fun, but it has also been
39a great privilege to work with so many people willing to apply
40technologies in interesting new ways.
41
42<p>
43All that aside, here are the categories of currently known RCU requirements:
44</p>
45
46<ol>
47<li> <a href="#Fundamental Requirements">
48 Fundamental Requirements</a>
49<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
50<li> <a href="#Parallelism Facts of Life">
51 Parallelism Facts of Life</a>
52<li> <a href="#Quality-of-Implementation Requirements">
53 Quality-of-Implementation Requirements</a>
54<li> <a href="#Linux Kernel Complications">
55 Linux Kernel Complications</a>
56<li> <a href="#Software-Engineering Requirements">
57 Software-Engineering Requirements</a>
58<li> <a href="#Other RCU Flavors">
59 Other RCU Flavors</a>
60<li> <a href="#Possible Future Changes">
61 Possible Future Changes</a>
62</ol>
63
64<p>
65This is followed by a <a href="#Summary">summary</a>,
66which is in turn followed by the inevitable
67<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
68
69<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
70
71<p>
72RCU's fundamental requirements are the closest thing RCU has to hard
73mathematical requirements.
74These are:
75
76<ol>
77<li> <a href="#Grace-Period Guarantee">
78 Grace-Period Guarantee</a>
79<li> <a href="#Publish-Subscribe Guarantee">
80 Publish-Subscribe Guarantee</a>
81<li> <a href="#Memory-Barrier Guarantees">
82 Memory-Barrier Guarantees</a>
83<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
84 RCU Primitives Guaranteed to Execute Unconditionally</a>
85<li> <a href="#Guaranteed Read-to-Write Upgrade">
86 Guaranteed Read-to-Write Upgrade</a>
87</ol>
88
89<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
90
91<p>
92RCU's grace-period guarantee is unusual in being premeditated:
93Jack Slingwine and I had this guarantee firmly in mind when we started
94work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
95That said, the past two decades of experience with RCU have produced
96a much more detailed understanding of this guarantee.
97
98<p>
99RCU's grace-period guarantee allows updaters to wait for the completion
100of all pre-existing RCU read-side critical sections.
101An RCU read-side critical section
102begins with the marker <tt>rcu_read_lock()</tt> and ends with
103the marker <tt>rcu_read_unlock()</tt>.
104These markers may be nested, and RCU treats a nested set as one
105big RCU read-side critical section.
106Production-quality implementations of <tt>rcu_read_lock()</tt> and
107<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
108fact have exactly zero overhead in Linux kernels built for production
109use with <tt>CONFIG_PREEMPT=n</tt>.
110
111<p>
112This guarantee allows ordering to be enforced with extremely low
113overhead to readers, for example:
114
115<blockquote>
116<pre>
117 1 int x, y;
118 2
119 3 void thread0(void)
120 4 {
121 5 rcu_read_lock();
122 6 r1 = READ_ONCE(x);
123 7 r2 = READ_ONCE(y);
124 8 rcu_read_unlock();
125 9 }
12610
12711 void thread1(void)
12812 {
12913 WRITE_ONCE(x, 1);
13014 synchronize_rcu();
13115 WRITE_ONCE(y, 1);
13216 }
133</pre>
134</blockquote>
135
136<p>
137Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
138all pre-existing readers, any instance of <tt>thread0()</tt> that
139loads a value of zero from <tt>x</tt> must complete before
140<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
141also load a value of zero from <tt>y</tt>.
142Similarly, any instance of <tt>thread0()</tt> that loads a value of
143one from <tt>y</tt> must have started after the
144<tt>synchronize_rcu()</tt> started, and must therefore also load
145a value of one from <tt>x</tt>.
146Therefore, the outcome:
147<blockquote>
148<pre>
149(r1 == 0 &amp;&amp; r2 == 1)
150</pre>
151</blockquote>
152cannot happen.
153
154<p>@@QQ@@
155Wait a minute!
156You said that updaters can make useful forward progress concurrently
157with readers, but pre-existing readers will block
158<tt>synchronize_rcu()</tt>!!!
159Just who are you trying to fool???
160<p>@@QQA@@
161First, if updaters do not wish to be blocked by readers, they can use
162<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
163be discussed later.
164Second, even when using <tt>synchronize_rcu()</tt>, the other
165update-side code does run concurrently with readers, whether pre-existing
166or not.
167<p>@@QQE@@
168
169<p>
170This scenario resembles one of the first uses of RCU in
171<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
172which managed a distributed lock manager's transition into
173a state suitable for handling recovery from node failure,
174more or less as follows:
175
176<blockquote>
177<pre>
178 1 #define STATE_NORMAL 0
179 2 #define STATE_WANT_RECOVERY 1
180 3 #define STATE_RECOVERING 2
181 4 #define STATE_WANT_NORMAL 3
182 5
183 6 int state = STATE_NORMAL;
184 7
185 8 void do_something_dlm(void)
186 9 {
18710 int state_snap;
18811
18912 rcu_read_lock();
19013 state_snap = READ_ONCE(state);
19114 if (state_snap == STATE_NORMAL)
19215 do_something();
19316 else
19417 do_something_carefully();
19518 rcu_read_unlock();
19619 }
19720
19821 void start_recovery(void)
19922 {
20023 WRITE_ONCE(state, STATE_WANT_RECOVERY);
20124 synchronize_rcu();
20225 WRITE_ONCE(state, STATE_RECOVERING);
20326 recovery();
20427 WRITE_ONCE(state, STATE_WANT_NORMAL);
20528 synchronize_rcu();
20629 WRITE_ONCE(state, STATE_NORMAL);
20730 }
208</pre>
209</blockquote>
210
211<p>
212The RCU read-side critical section in <tt>do_something_dlm()</tt>
213works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
214to guarantee that <tt>do_something()</tt> never runs concurrently
215with <tt>recovery()</tt>, but with little or no synchronization
216overhead in <tt>do_something_dlm()</tt>.
217
218<p>@@QQ@@
219Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
220<p>@@QQA@@
221Without that extra grace period, memory reordering could result in
222<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
223concurrently with the last bits of <tt>recovery()</tt>.
224<p>@@QQE@@
225
226<p>
227In order to avoid fatal problems such as deadlocks,
228an RCU read-side critical section must not contain calls to
229<tt>synchronize_rcu()</tt>.
230Similarly, an RCU read-side critical section must not
231contain anything that waits, directly or indirectly, on completion of
232an invocation of <tt>synchronize_rcu()</tt>.
233
234<p>
235Although RCU's grace-period guarantee is useful in and of itself, with
236<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
237it would be good to be able to use RCU to coordinate read-side
238access to linked data structures.
239For this, the grace-period guarantee is not sufficient, as can
240be seen in function <tt>add_gp_buggy()</tt> below.
241We will look at the reader's code later, but in the meantime, just think of
242the reader as locklessly picking up the <tt>gp</tt> pointer,
243and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
244<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
245
246<blockquote>
247<pre>
248 1 bool add_gp_buggy(int a, int b)
249 2 {
250 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
251 4 if (!p)
252 5 return -ENOMEM;
253 6 spin_lock(&amp;gp_lock);
254 7 if (rcu_access_pointer(gp)) {
255 8 spin_unlock(&amp;gp_lock);
256 9 return false;
25710 }
25811 p-&gt;a = a;
25912   p-&gt;b = b;
26013 gp = p; /* ORDERING BUG */
26114 spin_unlock(&amp;gp_lock);
26215 return true;
26316 }
264</pre>
265</blockquote>
266
267<p>
268The problem is that both the compiler and weakly ordered CPUs are within
269their rights to reorder this code as follows:
270
271<blockquote>
272<pre>
273 1 bool add_gp_buggy_optimized(int a, int b)
274 2 {
275 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
276 4 if (!p)
277 5 return -ENOMEM;
278 6 spin_lock(&amp;gp_lock);
279 7 if (rcu_access_pointer(gp)) {
280 8 spin_unlock(&amp;gp_lock);
281 9 return false;
28210 }
283<b>11 gp = p; /* ORDERING BUG */
28412 p-&gt;a = a;
28513   p-&gt;b = b;</b>
28614 spin_unlock(&amp;gp_lock);
28715 return true;
28816 }
289</pre>
290</blockquote>
291
292<p>
293If an RCU reader fetches <tt>gp</tt> just after
294<tt>add_gp_buggy_optimized</tt> executes line&nbsp;11,
295it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
296fields.
297And this is but one of many ways in which compiler and hardware optimizations
298could cause trouble.
299Therefore, we clearly need some way to prevent the compiler and the CPU from
300reordering in this manner, which brings us to the publish-subscribe
301guarantee discussed in the next section.
302
303<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3>
304
305<p>
306RCU's publish-subscribe guarantee allows data to be inserted
307into a linked data structure without disrupting RCU readers.
308The updater uses <tt>rcu_assign_pointer()</tt> to insert the
309new data, and readers use <tt>rcu_dereference()</tt> to
310access data, whether new or old.
311The following shows an example of insertion:
312
313<blockquote>
314<pre>
315 1 bool add_gp(int a, int b)
316 2 {
317 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
318 4 if (!p)
319 5 return -ENOMEM;
320 6 spin_lock(&amp;gp_lock);
321 7 if (rcu_access_pointer(gp)) {
322 8 spin_unlock(&amp;gp_lock);
323 9 return false;
32410 }
32511 p-&gt;a = a;
32612   p-&gt;b = b;
32713 rcu_assign_pointer(gp, p);
32814 spin_unlock(&amp;gp_lock);
32915 return true;
33016 }
331</pre>
332</blockquote>
333
334<p>
335The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
336equivalent to a simple assignment statement, but also guarantees
337that its assignment will
338happen after the two assignments in lines&nbsp;11 and&nbsp;12,
339similar to the C11 <tt>memory_order_release</tt> store operation.
340It also prevents any number of &ldquo;interesting&rdquo; compiler
341optimizations, for example, the use of <tt>gp</tt> as a scratch
342location immediately preceding the assignment.
343
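<p>
Ignoring type checking and <tt>sparse</tt> annotations, one way to think
of <tt>rcu_assign_pointer()</tt> is as a release store.
The following is a conceptual sketch only, not the Linux kernel's
actual definition:

<blockquote>
<pre>
/* Conceptual sketch only, omitting type checking and __rcu annotations. */
#define rcu_assign_pointer_sketch(p, v) smp_store_release(&amp;(p), (v))
</pre>
</blockquote>

<p>
The release semantics are what order the initializations of
<tt>-&gt;a</tt> and <tt>-&gt;b</tt> before the store to <tt>gp</tt>.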
344<p>@@QQ@@
345But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
346two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
347from being reordered.
348Can't that also cause problems?
349<p>@@QQA@@
350No, it cannot.
351The readers cannot see either of these two fields until
352the assignment to <tt>gp</tt>, by which time both fields are
353fully initialized.
354So reordering the assignments
355to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
356cause any problems.
357<p>@@QQE@@
358
359<p>
360It is tempting to assume that the reader need not do anything special
361to control its accesses to the RCU-protected data,
362as shown in <tt>do_something_gp_buggy()</tt> below:
363
364<blockquote>
365<pre>
366 1 bool do_something_gp_buggy(void)
367 2 {
368 3 rcu_read_lock();
369 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
370 5 if (p) {
371 6 do_something(p-&gt;a, p-&gt;b);
372 7 rcu_read_unlock();
373 8 return true;
374 9 }
37510 rcu_read_unlock();
37611 return false;
37712 }
378</pre>
379</blockquote>
380
381<p>
382However, this temptation must be resisted because there are a
383surprisingly large number of ways that the compiler
384(to say nothing of
385<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
386can trip this code up.
387For but one example, if the compiler were short of registers, it
388might choose to refetch from <tt>gp</tt> rather than keeping
389a separate copy in <tt>p</tt> as follows:
390
391<blockquote>
392<pre>
393 1 bool do_something_gp_buggy_optimized(void)
394 2 {
395 3 rcu_read_lock();
396 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
397<b> 5 do_something(gp-&gt;a, gp-&gt;b);</b>
398 6 rcu_read_unlock();
399 7 return true;
400 8 }
401 9 rcu_read_unlock();
40210 return false;
40311 }
404</pre>
405</blockquote>
406
407<p>
408If this function ran concurrently with a series of updates that
409replaced the current structure with a new one,
410the fetches of <tt>gp-&gt;a</tt>
411and <tt>gp-&gt;b</tt> might well come from two different structures,
412which could cause serious confusion.
413To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
414<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
415
416<blockquote>
417<pre>
418 1 bool do_something_gp(void)
419 2 {
420 3 rcu_read_lock();
421 4 p = rcu_dereference(gp);
422 5 if (p) {
423 6 do_something(p-&gt;a, p-&gt;b);
424 7 rcu_read_unlock();
425 8 return true;
426 9 }
42710 rcu_read_unlock();
42811 return false;
42912 }
430</pre>
431</blockquote>
432
433<p>
434The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
435memory barriers in the Linux kernel.
436Should a
437<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
438ever appear, then <tt>rcu_dereference()</tt> could be implemented
439as a <tt>memory_order_consume</tt> load.
440Regardless of the exact implementation, a pointer fetched by
441<tt>rcu_dereference()</tt> may not be used outside of the
442outermost RCU read-side critical section containing that
443<tt>rcu_dereference()</tt>, unless protection of
444the corresponding data element has been passed from RCU to some
445other synchronization mechanism, most commonly locking or
446<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
447
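<p>
Again as a conceptual sketch only (the in-kernel macro also carries out
<tt>sparse</tt> and lockdep checking), <tt>rcu_dereference()</tt> can be
thought of as a marked load followed by a dependency barrier that is a
no-op everywhere except DEC Alpha:

<blockquote>
<pre>
/* Conceptual sketch only, not the Linux kernel's actual definition. */
#define rcu_dereference_sketch(p) \
({ \
  typeof(p) _p = READ_ONCE(p); \
  smp_read_barrier_depends(); /* No-op except on DEC Alpha. */ \
  _p; \
})
</pre>
</blockquote>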
448<p>
449In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
450use <tt>rcu_dereference()</tt>, and these two RCU API elements
451work together to ensure that readers have a consistent view of
452newly added data elements.
453
454<p>
455Of course, it is also necessary to remove elements from RCU-protected
456data structures, for example, using the following process:
457
458<ol>
459<li> Remove the data element from the enclosing structure.
460<li> Wait for all pre-existing RCU read-side critical sections
461 to complete (because only pre-existing readers can possibly have
462 a reference to the newly removed data element).
463<li> At this point, only the updater has a reference to the
464 newly removed data element, so it can safely reclaim
465 the data element, for example, by passing it to <tt>kfree()</tt>.
466</ol>
467
468This process is implemented by <tt>remove_gp_synchronous()</tt>:
469
470<blockquote>
471<pre>
472 1 bool remove_gp_synchronous(void)
473 2 {
474 3 struct foo *p;
475 4
476 5 spin_lock(&amp;gp_lock);
477 6 p = rcu_access_pointer(gp);
478 7 if (!p) {
479 8 spin_unlock(&amp;gp_lock);
480 9 return false;
48110 }
48211 rcu_assign_pointer(gp, NULL);
48312 spin_unlock(&amp;gp_lock);
48413 synchronize_rcu();
48514 kfree(p);
48615 return true;
48716 }
488</pre>
489</blockquote>
490
491<p>
492This function is straightforward, with line&nbsp;13 waiting for a grace
493period before line&nbsp;14 frees the old data element.
494This waiting ensures that readers will reach line&nbsp;7 of
495<tt>do_something_gp()</tt> before the data element referenced by
496<tt>p</tt> is freed.
497The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
498<tt>rcu_dereference()</tt>, except that:
499
500<ol>
501<li> The value returned by <tt>rcu_access_pointer()</tt>
502 cannot be dereferenced.
503 If you want to access the value pointed to as well as
504 the pointer itself, use <tt>rcu_dereference()</tt>
505 instead of <tt>rcu_access_pointer()</tt>.
506<li> The call to <tt>rcu_access_pointer()</tt> need not be
507 protected.
508 In contrast, <tt>rcu_dereference()</tt> must either be
509 within an RCU read-side critical section or in a code
510 segment where the pointer cannot change, for example, in
511 code protected by the corresponding update-side lock.
512</ol>
513
514<p>@@QQ@@
515Without the <tt>rcu_dereference()</tt> or the
516<tt>rcu_access_pointer()</tt>, what destructive optimizations
517might the compiler make use of?
518<p>@@QQA@@
519Let's start with what happens to <tt>do_something_gp()</tt>
520if it fails to use <tt>rcu_dereference()</tt>.
521It could reuse a value formerly fetched from this same pointer.
522It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
523manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
524mash-up of two distinct pointer values.
525It might even use value-speculation optimizations, where it makes a wrong
526guess, but by the time it gets around to checking the value, an update
527has changed the pointer to match the wrong guess.
528Too bad about any dereferences that returned pre-initialization garbage
529in the meantime!
530
531<p>
532For <tt>remove_gp_synchronous()</tt>, as long as all modifications
533to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
534the above optimizations are harmless.
535However,
536with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
537<tt>sparse</tt> will complain if you
538define <tt>gp</tt> with <tt>__rcu</tt> and then
539access it without using
540either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
541<p>@@QQE@@
542
543<p>
544In short, RCU's publish-subscribe guarantee is provided by the combination
545of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
546This guarantee allows data elements to be safely added to RCU-protected
547linked data structures without disrupting RCU readers.
548This guarantee can be used in combination with the grace-period
549guarantee to also allow data elements to be removed from RCU-protected
550linked data structures, again without disrupting RCU readers.
551
552<p>
553This guarantee was only partially premeditated.
554DYNIX/ptx used an explicit memory barrier for publication, but had nothing
555resembling <tt>rcu_dereference()</tt> for subscription, nor did it
556have anything resembling the <tt>smp_read_barrier_depends()</tt>
557that was later subsumed into <tt>rcu_dereference()</tt>.
558The need for these operations made itself known quite suddenly at a
559late-1990s meeting with the DEC Alpha architects, back in the days when
560DEC was still a free-standing company.
561It took the Alpha architects a good hour to convince me that any sort
562of barrier would ever be needed, and it then took me a good <i>two</i> hours
563to convince them that their documentation did not make this point clear.
564More recent work with the C and C++ standards committees has provided
565much education on tricks and traps from the compiler.
566In short, compilers were much less tricky in the early 1990s, but in
5672015, don't even think about omitting <tt>rcu_dereference()</tt>!
568
569<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
570
571<p>
572The previous section's simple linked-data-structure scenario clearly
573demonstrates the need for RCU's stringent memory-ordering guarantees on
574systems with more than one CPU:
575
576<ol>
577<li> Each CPU that has an RCU read-side critical section that
578 begins before <tt>synchronize_rcu()</tt> starts is
579 guaranteed to execute a full memory barrier between the time
580 that the RCU read-side critical section ends and the time that
581 <tt>synchronize_rcu()</tt> returns.
582 Without this guarantee, a pre-existing RCU read-side critical section
583 might hold a reference to the newly removed <tt>struct foo</tt>
584 after the <tt>kfree()</tt> on line&nbsp;14 of
585 <tt>remove_gp_synchronous()</tt>.
586<li> Each CPU that has an RCU read-side critical section that ends
587 after <tt>synchronize_rcu()</tt> returns is guaranteed
588 to execute a full memory barrier between the time that
589 <tt>synchronize_rcu()</tt> begins and the time that the RCU
590 read-side critical section begins.
591 Without this guarantee, a later RCU read-side critical section
592 running after the <tt>kfree()</tt> on line&nbsp;14 of
593 <tt>remove_gp_synchronous()</tt> might
594 later run <tt>do_something_gp()</tt> and find the
595 newly deleted <tt>struct foo</tt>.
596<li> If the task invoking <tt>synchronize_rcu()</tt> remains
597 on a given CPU, then that CPU is guaranteed to execute a full
598 memory barrier sometime during the execution of
599 <tt>synchronize_rcu()</tt>.
600 This guarantee ensures that the <tt>kfree()</tt> on
601 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
602 execute after the removal on line&nbsp;11.
603<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
604 among a group of CPUs during that invocation, then each of the
605 CPUs in that group is guaranteed to execute a full memory barrier
606 sometime during the execution of <tt>synchronize_rcu()</tt>.
607 This guarantee also ensures that the <tt>kfree()</tt> on
608 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
609 execute after the removal on
610 line&nbsp;11, but also in the case where the thread executing the
611 <tt>synchronize_rcu()</tt> migrates in the meantime.
612</ol>
613
614<p>@@QQ@@
615Given that multiple CPUs can start RCU read-side critical sections
616at any time without any ordering whatsoever, how can RCU possibly tell whether
617or not a given RCU read-side critical section starts before a
618given instance of <tt>synchronize_rcu()</tt>?
619<p>@@QQA@@
620If RCU cannot tell whether or not a given
621RCU read-side critical section starts before a
622given instance of <tt>synchronize_rcu()</tt>,
623then it must assume that the RCU read-side critical section
624started first.
625In other words, a given instance of <tt>synchronize_rcu()</tt>
626can avoid waiting on a given RCU read-side critical section only
627if it can prove that <tt>synchronize_rcu()</tt> started first.
628<p>@@QQE@@
629
630<p>@@QQ@@
631The first and second guarantees require unbelievably strict ordering!
632Are all these memory barriers <i> really</i> required?
633<p>@@QQA@@
634Yes, they really are required.
635To see why the first guarantee is required, consider the following
636sequence of events:
637
638<ol>
639<li> CPU 1: <tt>rcu_read_lock()</tt>
640<li> CPU 1: <tt>q = rcu_dereference(gp);
641 /* Very likely to return p. */</tt>
642<li> CPU 0: <tt>list_del_rcu(p);</tt>
643<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
644<li> CPU 1: <tt>do_something_with(q-&gt;a);
645 /* No smp_mb(), so might happen after kfree(). */</tt>
646<li> CPU 1: <tt>rcu_read_unlock()</tt>
647<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
648<li> CPU 0: <tt>kfree(p);</tt>
649</ol>
650
651<p>
652Therefore, there absolutely must be a full memory barrier between the
653end of the RCU read-side critical section and the end of the
654grace period.
655
656<p>
657The sequence of events demonstrating the necessity of the second rule
658is roughly similar:
659
660<ol>
661<li> CPU 0: <tt>list_del_rcu(p);</tt>
662<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
663<li> CPU 1: <tt>rcu_read_lock()</tt>
664<li> CPU 1: <tt>q = rcu_dereference(gp);
665 /* Might return p if no memory barrier. */</tt>
666<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
667<li> CPU 0: <tt>kfree(p);</tt>
668<li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
669<li> CPU 1: <tt>rcu_read_unlock()</tt>
670</ol>
671
672<p>
673And similarly, without a memory barrier between the beginning of the
674grace period and the beginning of the RCU read-side critical section,
675CPU&nbsp;1 might end up accessing the freelist.
676
677<p>
678The &ldquo;as if&rdquo; rule of course applies, so that any implementation
679that acts as if the appropriate memory barriers were in place is a
680correct implementation.
681That said, it is much easier to fool yourself into believing that you have
682adhered to the as-if rule than it is to actually adhere to it!
683<p>@@QQE@@
684
685<p>
686Note that these memory-barrier requirements do not replace the fundamental
687RCU requirement that a grace period wait for all pre-existing readers.
688On the contrary, the memory barriers called out in this section must operate in
689such a way as to <i>enforce</i> this fundamental requirement.
690Of course, different implementations enforce this requirement in different
691ways, but enforce it they must.
692
693<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
694
695<p>
696The common-case RCU primitives are unconditional.
697They are invoked, they do their job, and they return, with no possibility
698of error, and no need to retry.
699This is a key RCU design philosophy.
700
701<p>
702However, this philosophy is pragmatic rather than pigheaded.
703If someone comes up with a good justification for a particular conditional
704RCU primitive, it might well be implemented and added.
705After all, this guarantee was reverse-engineered, not premeditated.
706The unconditional nature of the RCU primitives was initially an
707accident of implementation, and later experience with other synchronization
708primitives that do have conditional forms caused me to elevate this
709accident to a guarantee.
710Therefore, the justification for adding a conditional primitive to
711RCU would need to be based on detailed and compelling use cases.
712
713<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
714
715<p>
716As far as RCU is concerned, it is always possible to carry out an
717update within an RCU read-side critical section.
718For example, that RCU read-side critical section might search for
719a given data element, and then might acquire the update-side
720spinlock in order to update that element, all while remaining
721in that RCU read-side critical section.
722Of course, it is necessary to exit the RCU read-side critical section
723before invoking <tt>synchronize_rcu()</tt>; however, this
724inconvenience can be avoided through use of the
725<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
726described later in this document.
727
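<p>
A minimal sketch of such a read-to-write upgrade, reusing <tt>gp</tt> and
<tt>gp_lock</tt> from the earlier examples, might look as follows
(the <tt>key</tt> and <tt>new_b</tt> parameters and the function name
are purely illustrative):

<blockquote>
<pre>
bool upgrade_and_update(int key, int new_b)
{
  struct foo *p;
  bool ret = false;

  rcu_read_lock();
  p = rcu_dereference(gp);
  if (p &amp;&amp; p-&gt;a == key) {
    spin_lock(&amp;gp_lock); /* Upgrade to write while still within the reader. */
    if (p == rcu_access_pointer(gp)) { /* Recheck: p might have been removed. */
      WRITE_ONCE(p-&gt;b, new_b);
      ret = true;
    }
    spin_unlock(&amp;gp_lock);
  }
  rcu_read_unlock();
  return ret;
}
</pre>
</blockquote>

<p>
Any subsequent <tt>synchronize_rcu()</tt> must of course be invoked only
after the <tt>rcu_read_unlock()</tt>.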
728<p>@@QQ@@
729But how does the upgrade-to-write operation exclude other readers?
730<p>@@QQA@@
731It doesn't, just like normal RCU updates, which also do not exclude
732RCU readers.
733<p>@@QQE@@
734
735<p>
736This guarantee allows lookup code to be shared between read-side
737and update-side code, and was premeditated, appearing in the earliest
738DYNIX/ptx RCU documentation.
739
740<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
741
742<p>
743RCU provides extremely lightweight readers, and its read-side guarantees,
744though quite useful, are correspondingly lightweight.
745It is therefore all too easy to assume that RCU is guaranteeing more
746than it really is.
747Of course, the list of things that RCU does not guarantee is infinitely
748long; however, the following sections list a few non-guarantees that
749have caused confusion.
750Except where otherwise noted, these non-guarantees were premeditated.
751
752<ol>
753<li> <a href="#Readers Impose Minimal Ordering">
754 Readers Impose Minimal Ordering</a>
755<li> <a href="#Readers Do Not Exclude Updaters">
756 Readers Do Not Exclude Updaters</a>
757<li> <a href="#Updaters Only Wait For Old Readers">
758 Updaters Only Wait For Old Readers</a>
759<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
760 Grace Periods Don't Partition Read-Side Critical Sections</a>
761<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
762 Read-Side Critical Sections Don't Partition Grace Periods</a>
763<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
764 Disabling Preemption Does Not Block Grace Periods</a>
765</ol>
766
767<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
768
769<p>
770Reader-side markers such as <tt>rcu_read_lock()</tt> and
771<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
772except through their interaction with the grace-period APIs such as
773<tt>synchronize_rcu()</tt>.
774To see this, consider the following pair of threads:
775
776<blockquote>
777<pre>
778 1 void thread0(void)
779 2 {
780 3 rcu_read_lock();
781 4 WRITE_ONCE(x, 1);
782 5 rcu_read_unlock();
783 6 rcu_read_lock();
784 7 WRITE_ONCE(y, 1);
785 8 rcu_read_unlock();
786 9 }
78710
78811 void thread1(void)
78912 {
79013 rcu_read_lock();
79114 r1 = READ_ONCE(y);
79215 rcu_read_unlock();
79316 rcu_read_lock();
79417 r2 = READ_ONCE(x);
79518 rcu_read_unlock();
79619 }
797</pre>
798</blockquote>
799
800<p>
801After <tt>thread0()</tt> and <tt>thread1()</tt> execute
802concurrently, it is quite possible to have
803
804<blockquote>
805<pre>
806(r1 == 1 &amp;&amp; r2 == 0)
807</pre>
808</blockquote>
809
810(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
811which would not be possible if <tt>rcu_read_lock()</tt> and
812<tt>rcu_read_unlock()</tt> had much in the way of ordering
813properties.
814But they do not, so the CPU is within its rights
815to do significant reordering.
816This is by design: Any significant ordering constraints would slow down
817these fast-path APIs.
818
819<p>@@QQ@@
820Can't the compiler also reorder this code?
821<p>@@QQA@@
822No, the volatile casts in <tt>READ_ONCE()</tt> and
823<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
824this particular case.
825<p>@@QQE@@
826
827<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
828
829<p>
830Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
831exclude updates.
832All they do is to prevent grace periods from ending.
833The following example illustrates this:
834
835<blockquote>
836<pre>
837 1 void thread0(void)
838 2 {
839 3 rcu_read_lock();
840 4 r1 = READ_ONCE(y);
841 5 if (r1) {
842 6 do_something_with_nonzero_x();
843 7 r2 = READ_ONCE(x);
844 8 WARN_ON(!r2); /* BUG!!! */
845 9 }
84610 rcu_read_unlock();
84711 }
84812
84913 void thread1(void)
85014 {
85115 spin_lock(&amp;my_lock);
85216 WRITE_ONCE(x, 1);
85317 WRITE_ONCE(y, 1);
85418 spin_unlock(&amp;my_lock);
85519 }
856</pre>
857</blockquote>
858
859<p>
860If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
861excluded the <tt>thread1()</tt> function's update,
862the <tt>WARN_ON()</tt> could never fire.
863But the fact is that <tt>rcu_read_lock()</tt> does not exclude
864much of anything aside from subsequent grace periods, of which
865<tt>thread1()</tt> has none, so the
866<tt>WARN_ON()</tt> can and does fire.
867
868<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
869
870<p>
871It might be tempting to assume that after <tt>synchronize_rcu()</tt>
872completes, there are no readers executing.
873This temptation must be avoided because
874new readers can start immediately after <tt>synchronize_rcu()</tt>
875starts, and <tt>synchronize_rcu()</tt> is under no
876obligation to wait for these new readers.
877
878<p>@@QQ@@
879Suppose that synchronize_rcu() did wait until all readers had completed.
880Would the updater be able to rely on this?
881<p>@@QQA@@
882No.
883Even if <tt>synchronize_rcu()</tt> were to wait until
884all readers had completed, a new reader might start immediately after
885<tt>synchronize_rcu()</tt> completed.
886Therefore, the code following
887<tt>synchronize_rcu()</tt> cannot rely on there being no readers
888in any case.
889<p>@@QQE@@
890
891<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
892Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
893
894<p>
895It is tempting to assume that if any part of one RCU read-side critical
896section precedes a given grace period, and if any part of another RCU
897read-side critical section follows that same grace period, then all of
898the first RCU read-side critical section must precede all of the second.
899However, this just isn't the case: A single grace period does not
900partition the set of RCU read-side critical sections.
901An example of this situation can be illustrated as follows, where
902<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
903
904<blockquote>
905<pre>
906 1 void thread0(void)
907 2 {
908 3 rcu_read_lock();
909 4 WRITE_ONCE(a, 1);
910 5 WRITE_ONCE(b, 1);
911 6 rcu_read_unlock();
912 7 }
913 8
914 9 void thread1(void)
91510 {
91611 r1 = READ_ONCE(a);
91712 synchronize_rcu();
91813 WRITE_ONCE(c, 1);
91914 }
92015
92116 void thread2(void)
92217 {
92318 rcu_read_lock();
92419 r2 = READ_ONCE(b);
92520 r3 = READ_ONCE(c);
92621 rcu_read_unlock();
92722 }
928</pre>
929</blockquote>
930
931<p>
932It turns out that the outcome:
933
934<blockquote>
935<pre>
936(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
937</pre>
938</blockquote>
939
940is entirely possible.
941The following figure shows how this can happen, with each circled
942<tt>QS</tt> indicating the point at which RCU recorded a
943<i>quiescent state</i> for each thread, that is, a state in which
944RCU knows that the thread cannot be in the midst of an RCU read-side
945critical section that started before the current grace period:
946
947<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
948
949<p>
950If it is necessary to partition RCU read-side critical sections in this
951manner, two grace periods are required, where the first
952grace period is known to end before the second grace period starts:
953
954<blockquote>
955<pre>
956 1 void thread0(void)
957 2 {
958 3 rcu_read_lock();
959 4 WRITE_ONCE(a, 1);
960 5 WRITE_ONCE(b, 1);
961 6 rcu_read_unlock();
962 7 }
963 8
964 9 void thread1(void)
96510 {
96611 r1 = READ_ONCE(a);
96712 synchronize_rcu();
96813 WRITE_ONCE(c, 1);
96914 }
97015
97116 void thread2(void)
97217 {
97318 r2 = READ_ONCE(c);
97419 synchronize_rcu();
97520 WRITE_ONCE(d, 1);
97621 }
97722
97823 void thread3(void)
97924 {
98025 rcu_read_lock();
98126 r3 = READ_ONCE(b);
98227 r4 = READ_ONCE(d);
98328 rcu_read_unlock();
98429 }
985</pre>
986</blockquote>
987
988<p>
989Here, if <tt>(r1 == 1)</tt>, then
990<tt>thread0()</tt>'s write to <tt>b</tt> must happen
991before the end of <tt>thread1()</tt>'s grace period.
992If in addition <tt>(r4 == 1)</tt>, then
993<tt>thread3()</tt>'s read from <tt>b</tt> must happen
994after the beginning of <tt>thread2()</tt>'s grace period.
995If it is also the case that <tt>(r2 == 1)</tt>, then the
996end of <tt>thread1()</tt>'s grace period must precede the
997beginning of <tt>thread2()</tt>'s grace period.
998This means that the two RCU read-side critical sections cannot overlap,
999guaranteeing that <tt>(r3 == 1)</tt>.
1000As a result, the outcome:
1001
1002<blockquote>
1003<pre>
1004(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
1005</pre>
1006</blockquote>
1007
1008cannot happen.
1009
1010<p>
1011This non-requirement was also non-premeditated, but became apparent
1012when studying RCU's interaction with memory ordering.
1013
1014<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
1015Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
1016
1017<p>
1018It is also tempting to assume that if an RCU read-side critical section
1019happens between a pair of grace periods, then those grace periods cannot
1020overlap.
1021However, this temptation leads nowhere good, as can be illustrated by
1022the following, with all variables initially zero:
1023
1024<blockquote>
1025<pre>
1026 1 void thread0(void)
1027 2 {
1028 3 rcu_read_lock();
1029 4 WRITE_ONCE(a, 1);
1030 5 WRITE_ONCE(b, 1);
1031 6 rcu_read_unlock();
1032 7 }
1033 8
1034 9 void thread1(void)
103510 {
103611 r1 = READ_ONCE(a);
103712 synchronize_rcu();
103813 WRITE_ONCE(c, 1);
103914 }
104015
104116 void thread2(void)
104217 {
104318 rcu_read_lock();
104419 WRITE_ONCE(d, 1);
104520 r2 = READ_ONCE(c);
104621 rcu_read_unlock();
104722 }
104823
104924 void thread3(void)
105025 {
105126 r3 = READ_ONCE(d);
105227 synchronize_rcu();
105328 WRITE_ONCE(e, 1);
105429 }
105530
105631 void thread4(void)
105732 {
105833 rcu_read_lock();
105934 r4 = READ_ONCE(b);
106035 r5 = READ_ONCE(e);
106136 rcu_read_unlock();
106237 }
1063</pre>
1064</blockquote>
1065
1066<p>
1067In this case, the outcome:
1068
1069<blockquote>
1070<pre>
1071(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
1072</pre>
1073</blockquote>
1074
1075is entirely possible, as illustrated below:
1076
1077<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
1078
1079<p>
1080Again, an RCU read-side critical section can overlap almost all of a
1081given grace period, just so long as it does not overlap the entire
1082grace period.
1083As a result, an RCU read-side critical section cannot partition a pair
1084of RCU grace periods.
1085
1086<p>@@QQ@@
1087How long a sequence of grace periods, each separated by an RCU read-side
1088critical section, would be required to partition the RCU read-side
1089critical sections at the beginning and end of the chain?
1090<p>@@QQA@@
1091In theory, an infinite number.
1092In practice, an unknown number that is sensitive to both implementation
1093details and timing considerations.
1094Therefore, even in practice, RCU users must abide by the theoretical rather
1095than the practical answer.
1096<p>@@QQE@@
1097
1098<h3><a name="Disabling Preemption Does Not Block Grace Periods">
1099Disabling Preemption Does Not Block Grace Periods</a></h3>
1100
1101<p>
1102There was a time when disabling preemption on any given CPU would block
1103subsequent grace periods.
1104However, this was an accident of implementation and is not a requirement.
1105And in the current Linux-kernel implementation, disabling preemption
1106on a given CPU in fact does not block grace periods, as Oleg Nesterov
1107<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
1108
1109<p>
1110If you need a preempt-disable region to block grace periods, you need to add
1111<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
1112as follows:
1113
1114<blockquote>
1115<pre>
1116 1 preempt_disable();
1117 2 rcu_read_lock();
1118 3 do_something();
1119 4 rcu_read_unlock();
1120 5 preempt_enable();
1121 6
1122 7 /* Spinlocks implicitly disable preemption. */
1123 8 spin_lock(&amp;mylock);
1124 9 rcu_read_lock();
112510 do_something();
112611 rcu_read_unlock();
112712 spin_unlock(&amp;mylock);
1128</pre>
1129</blockquote>
1130
1131<p>
1132In theory, you could enter the RCU read-side critical section first,
1133but it is more efficient to keep the entire RCU read-side critical
1134section contained in the preempt-disable region as shown above.
1135Of course, RCU read-side critical sections that extend outside of
1136preempt-disable regions will work correctly, but such critical sections
1137can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
1138more work.
1139And no, this is <i>not</i> an invitation to enclose all of your RCU
1140read-side critical sections within preempt-disable regions, because
1141doing so would degrade real-time response.
1142
1143<p>
1144This non-requirement appeared with preemptible RCU.
1145If you need a grace period that waits on non-preemptible code regions, use
1146<a href="#Sched Flavor">RCU-sched</a>.
1147
1148<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
1149
1150<p>
1151These parallelism facts of life are by no means specific to RCU, but
1152the RCU implementation must abide by them.
1153They therefore bear repeating:
1154
1155<ol>
1156<li> Any CPU or task may be delayed at any time,
1157 and any attempts to avoid these delays by disabling
1158 preemption, interrupts, or whatever are completely futile.
1159 This is most obvious in preemptible user-level
1160 environments and in virtualized environments (where
1161 a given guest OS's VCPUs can be preempted at any time by
1162 the underlying hypervisor), but can also happen in bare-metal
1163 environments due to ECC errors, NMIs, and other hardware
1164 events.
1165 Although a delay of more than about 20 seconds can result
1166 in splats, the RCU implementation is obligated to use
1167 algorithms that can tolerate extremely long delays, but where
1168 &ldquo;extremely long&rdquo; is not long enough to allow
1169 wrap-around when incrementing a 64-bit counter.
1170<li> Both the compiler and the CPU can reorder memory accesses.
1171 Where it matters, RCU must use compiler directives and
1172 memory-barrier instructions to preserve ordering.
1173<li> Conflicting writes to memory locations in any given cache line
1174 will result in expensive cache misses.
1175 Greater numbers of concurrent writes and more-frequent
1176 concurrent writes will result in more dramatic slowdowns.
1177 RCU is therefore obligated to use algorithms that have
1178 sufficient locality to avoid significant performance and
1179 scalability problems.
1180<li> As a rough rule of thumb, only one CPU's worth of processing
1181 may be carried out under the protection of any given exclusive
1182 lock.
1183 RCU must therefore use scalable locking designs.
1184<li> Counters are finite, especially on 32-bit systems.
1185 RCU's use of counters must therefore tolerate counter wrap,
1186 or be designed such that counter wrap would take way more
1187 time than a single system is likely to run.
1188 An uptime of ten years is quite possible, a runtime
1189 of a century much less so.
1190 As an example of the latter, RCU's dyntick-idle nesting counter
1191 allows 54 bits for interrupt nesting level (this counter
1192 is 64 bits even on a 32-bit system).
1193 Overflowing this counter requires 2<sup>54</sup>
1194 half-interrupts on a given CPU without that CPU ever going idle.
1195 If a half-interrupt happened every microsecond, it would take
1196 570 years of runtime to overflow this counter, which is currently
1197 believed to be an acceptably long time.
1198<li> Linux systems can have thousands of CPUs running a single
1199 Linux kernel in a single shared-memory environment.
1200 RCU must therefore pay close attention to high-end scalability.
1201</ol>
1202
1203<p>
1204This last parallelism fact of life means that RCU must pay special
1205attention to the preceding facts of life.
1206The idea that Linux might scale to systems with thousands of CPUs would
1207have been met with some skepticism in the 1990s, but these requirements
1208would otherwise have been unsurprising, even in the early 1990s.
1209
1210<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
1211
1212<p>
1213These sections list quality-of-implementation requirements.
1214Although an RCU implementation that ignores these requirements could
1215still be used, it would likely be subject to limitations that would
1216make it inappropriate for industrial-strength production use.
1217Classes of quality-of-implementation requirements are as follows:
1218
1219<ol>
1220<li> <a href="#Specialization">Specialization</a>
1221<li> <a href="#Performance and Scalability">Performance and Scalability</a>
1222<li> <a href="#Composability">Composability</a>
1223<li> <a href="#Corner Cases">Corner Cases</a>
1224</ol>
1225
1226<p>
1227These classes are covered in the following sections.
1228
1229<h3><a name="Specialization">Specialization</a></h3>
1230
1231<p>
1232RCU is and always has been intended primarily for read-mostly situations, as
1233illustrated by the following figure.
1234This means that RCU's read-side primitives are optimized, often at the
1235expense of its update-side primitives.
1236
1237<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
1238
1239<p>
1240This focus on read-mostly situations means that RCU must interoperate
1241with other synchronization primitives.
1242For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
1243examples discussed earlier use RCU to protect readers and locking to
1244coordinate updaters.
1245However, the need extends much farther, requiring that a variety of
1246synchronization primitives be legal within RCU read-side critical sections,
1247including spinlocks, sequence locks, atomic operations, reference
1248counters, and memory barriers.
1249
1250<p>@@QQ@@
1251What about sleeping locks?
1252<p>@@QQA@@
1253These are forbidden within Linux-kernel RCU read-side critical sections
1254because it is not legal to place a quiescent state (in this case,
1255voluntary context switch) within an RCU read-side critical section.
1256However, sleeping locks may be used within userspace RCU read-side critical
1257sections, and also within Linux-kernel sleepable RCU
1258<a href="#Sleepable RCU">(SRCU)</a>
1259read-side critical sections.
1260In addition, the -rt patchset turns spinlocks into sleeping locks so
1261that the corresponding critical sections can be preempted, which
1262also means that these sleeplockified spinlocks (but not other sleeping locks!)
1263may be acquired within -rt-Linux-kernel RCU read-side critical sections.
1264
1265<p>
1266Note that it <i>is</i> legal for a normal RCU read-side critical section
1267to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
1268but only as long as it does not loop indefinitely attempting to
1269conditionally acquire that sleeping lock.
1270The key point is that things like <tt>mutex_trylock()</tt>
1271either return with the mutex held, or return an error indication if
1272the mutex was not immediately available.
1273Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
1274<p>@@QQE@@
1275
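<p>
A minimal sketch of this conditional-acquisition pattern follows, where the
<tt>-&gt;mtx</tt> field and the <tt>do_update()</tt> helper are purely
illustrative:

<blockquote>
<pre>
rcu_read_lock();
p = rcu_dereference(gp);
if (p &amp;&amp; mutex_trylock(&amp;p-&gt;mtx)) { /* Returns immediately, success or failure. */
  do_update(p); /* Hypothetical helper; must not block inside the RCU reader. */
  mutex_unlock(&amp;p-&gt;mtx);
}
rcu_read_unlock();
</pre>
</blockquote>

<p>
The key constraint is the absence of a retry loop: if
<tt>mutex_trylock()</tt> fails, this code simply skips the update,
perhaps retrying only after exiting the RCU read-side critical section.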
1276<p>
1277It often comes as a surprise, but many algorithms do not require a
1278consistent view of data and can function quite well without one,
1279with network routing being the poster child.
1280Internet routing algorithms take significant time to propagate
1281updates, so that by the time an update arrives at a given system,
1282that system has been sending network traffic the wrong way for
1283a considerable length of time.
1284Having a few threads continue to send traffic the wrong way for a
1285few more milliseconds is clearly not a problem: In the worst case,
1286TCP retransmissions will eventually get the data where it needs to go.
1287In general, when tracking the state of the universe outside of the
1288computer, some level of inconsistency must be tolerated due to
1289speed-of-light delays if nothing else.
1290
1291<p>
1292Furthermore, uncertainty about external state is inherent in many cases.
1293For example, a pair of veterinarians might use heartbeat to determine
1294whether or not a given cat was alive.
1295But how long should they wait after the last heartbeat to decide that
1296the cat is in fact dead?
1297Waiting less than 400 milliseconds makes no sense because this would
1298mean that a relaxed cat would be considered to cycle between death
1299and life more than 100 times per minute.
1300Moreover, just as with human beings, a cat's heart might stop for
1301some period of time, so the exact wait period is a judgment call.
1302One of our pair of veterinarians might wait 30 seconds before pronouncing
1303the cat dead, while the other might insist on waiting a full minute.
1304The two veterinarians would then disagree on the state of the cat during
1305the final 30 seconds of the minute following the last heartbeat, as
1306fancifully illustrated below:
1307
1308<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
1309
1310<p>
1311Interestingly enough, this same situation applies to hardware.
1312When push comes to shove, how do we tell whether or not some
1313external server has failed?
1314We send messages to it periodically, and declare it failed if we
1315don't receive a response within a given period of time.
1316Policy decisions can usually tolerate short
1317periods of inconsistency.
1318The policy was decided some time ago, and is only now being put into
1319effect, so a few milliseconds of delay is normally inconsequential.
1320
1321<p>
1322However, there are algorithms that absolutely must see consistent data.
1323For example, the translation between a user-level SystemV semaphore
1324ID to the corresponding in-kernel data structure is protected by RCU,
1325but it is absolutely forbidden to update a semaphore that has just been
1326removed.
1327In the Linux kernel, this need for consistency is accommodated by acquiring
1328spinlocks located in the in-kernel data structure from within
1329the RCU read-side critical section, and this is indicated by the
1330green box in the figure above.
1331Many other techniques may be used, and are in fact used within the
1332Linux kernel.
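
<p>
A minimal sketch of this spinlock-in-reader pattern is shown below; the
<tt>sem</tt> structure, its <tt>-&gt;lock</tt> and <tt>-&gt;deleted</tt>
fields, and the <tt>sem_lookup()</tt> and <tt>update_semaphore()</tt>
functions are hypothetical stand-ins for the actual SystemV-semaphore code:

<blockquote>
<pre>
 1 struct sem *sem;
 2
 3 rcu_read_lock();
 4 sem = sem_lookup(id);        /* RCU-protected lookup. */
 5 if (sem) {
 6   spin_lock(&amp;sem-&gt;lock);
 7   if (!sem-&gt;deleted)         /* Recheck under the per-entry lock. */
 8     update_semaphore(sem);   /* Deletion is now excluded by the lock. */
 9   spin_unlock(&amp;sem-&gt;lock);
10 }
11 rcu_read_unlock();
</pre>
</blockquote>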
1333
1334<p>
1335In short, RCU is not required to maintain consistency, and other
1336mechanisms may be used in concert with RCU when consistency is required.
1337RCU's specialization allows it to do its job extremely well, and its
1338ability to interoperate with other synchronization mechanisms allows
1339the right mix of synchronization tools to be used for a given job.
1340
1341<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
1342
1343<p>
1344Energy efficiency is a critical component of performance today,
1345and Linux-kernel RCU implementations must therefore avoid unnecessarily
1346awakening idle CPUs.
1347I cannot claim that this requirement was premeditated.
1348In fact, I learned of it during a telephone conversation in which I
1349was given &ldquo;frank and open&rdquo; feedback on the importance
1350of energy efficiency in battery-powered systems and on specific
1351energy-efficiency shortcomings of the Linux-kernel RCU implementation.
1352In my experience, the battery-powered embedded community will consider
1353any unnecessary wakeups to be extremely unfriendly acts,
1354so much so that mere Linux-kernel-mailing-list posts are
1355insufficient to vent their ire.
1356
1357<p>
1358Memory consumption is not particularly important in most
1359situations, and has become decreasingly
1360so as memory sizes have expanded and memory
1361costs have plummeted.
1362However, as I learned from Matt Mackall's
1363<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
1364efforts, memory footprint is critically important on single-CPU systems with
1365non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
1366<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
1367was born.
1368Josh Triplett has since taken over the small-memory banner with his
1369<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
1370project, which resulted in
1371<a href="#Sleepable RCU">SRCU</a>
1372becoming optional for those kernels not needing it.
1373
1374<p>
1375The remaining performance requirements are, for the most part,
1376unsurprising.
1377For example, in keeping with RCU's read-side specialization,
1378<tt>rcu_dereference()</tt> should have negligible overhead (for
1379example, suppression of a few minor compiler optimizations).
1380Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
1381<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
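
<p>
A simplified sketch showing how a non-preemptible implementation can meet
this zero-overhead goal appears below; this is an illustration rather than
the Linux kernel's exact code, but the idea is the same: both markers
reduce to compiler directives that emit no instructions.

<blockquote>
<pre>
 1 static inline void rcu_read_lock(void)
 2 {
 3   barrier();  /* Compiler barrier only: no instructions emitted. */
 4 }
 5
 6 static inline void rcu_read_unlock(void)
 7 {
 8   barrier();  /* Quiescent states are supplied by context switches. */
 9 }
</pre>
</blockquote>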
1382
1383<p>
1384In preemptible environments, in the case where the RCU read-side
1385critical section was not preempted (as will be the case for the
1386highest-priority real-time process), <tt>rcu_read_lock()</tt> and
1387<tt>rcu_read_unlock()</tt> should have minimal overhead.
1388In particular, they should not contain atomic read-modify-write
1389operations, memory-barrier instructions, preemption disabling,
1390interrupt disabling, or backwards branches.
1391However, in the case where the RCU read-side critical section was preempted,
1392<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
1393This is why it is better to nest an RCU read-side critical section
1394within a preempt-disable region than vice versa, at least in cases
1395where that critical section is short enough to avoid unduly degrading
1396real-time latencies.
1397
1398<p>
1399The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
1400optimized for throughput.
1401It may therefore incur several milliseconds of latency in addition to
1402the duration of the longest RCU read-side critical section.
1403On the other hand, multiple concurrent invocations of
1404<tt>synchronize_rcu()</tt> are required to use batching optimizations
1405so that they can be satisfied by a single underlying grace-period-wait
1406operation.
1407For example, in the Linux kernel, it is not unusual for a single
1408grace-period-wait operation to serve more than
1409<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
1410of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
1411overhead down to nearly zero.
1412However, the grace-period optimization is also required to avoid
1413measurable degradation of real-time scheduling and interrupt latencies.
1414
1415<p>
1416In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
1417latencies are unacceptable.
1418In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
1419instead, reducing the grace-period latency down to a few tens of
1420microseconds on small systems, at least in cases where the RCU read-side
1421critical sections are short.
1422There are currently no special latency requirements for
1423<tt>synchronize_rcu_expedited()</tt> on large systems, but,
1424consistent with the empirical nature of the RCU specification,
1425that is subject to change.
1426However, there most definitely are scalability requirements:
1427A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
1428CPUs should at least make reasonable forward progress.
1429In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
1430is permitted to impose modest degradation of real-time latency
1431on non-idle online CPUs.
1432That said, it will likely be necessary to take further steps to reduce this
1433degradation, hopefully to roughly that of a scheduling-clock interrupt.
1434
1435<p>
1436There are a number of situations where even
1437<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
1438latency is unacceptable.
1439In these situations, the asynchronous <tt>call_rcu()</tt> can be
1440used in place of <tt>synchronize_rcu()</tt> as follows:
1441
1442<blockquote>
1443<pre>
1444 1 struct foo {
1445 2 int a;
1446 3 int b;
1447 4 struct rcu_head rh;
1448 5 };
1449 6
1450 7 static void remove_gp_cb(struct rcu_head *rhp)
1451 8 {
1452 9 struct foo *p = container_of(rhp, struct foo, rh);
145310
145411 kfree(p);
145512 }
145613
145714 bool remove_gp_asynchronous(void)
145815 {
145916 struct foo *p;
146017
146118 spin_lock(&amp;gp_lock);
146319 p = rcu_access_pointer(gp);
146320 if (!p) {
146421 spin_unlock(&amp;gp_lock);
146522 return false;
146623 }
146724 rcu_assign_pointer(gp, NULL);
146825 call_rcu(&amp;p-&gt;rh, remove_gp_cb);
146926 spin_unlock(&amp;gp_lock);
147027 return true;
147128 }
1472</pre>
1473</blockquote>
1474
1475<p>
1476A definition of <tt>struct foo</tt> is finally needed, and appears
1477on lines&nbsp;1-5.
1478The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
1479on line&nbsp;25, and will be invoked after the end of a subsequent
1480grace period.
1481This gets the same effect as <tt>remove_gp_synchronous()</tt>,
1482but without forcing the updater to wait for a grace period to elapse.
1483The <tt>call_rcu()</tt> function may be used in a number of
1484situations where neither <tt>synchronize_rcu()</tt> nor
1485<tt>synchronize_rcu_expedited()</tt> would be legal,
1486including within preempt-disable code, <tt>local_bh_disable()</tt> code,
1487interrupt-disable code, and interrupt handlers.
1488However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
1489The callback function (<tt>remove_gp_cb()</tt> in this case) will be
1490executed within softirq (software interrupt) environment within the
1491Linux kernel,
1492either within a real softirq handler or under the protection
1493of <tt>local_bh_disable()</tt>.
1494In both the Linux kernel and in userspace, it is bad practice to
1495write an RCU callback function that takes too long.
1496Long-running operations should be relegated to separate threads or
1497(in the Linux kernel) workqueues.
1498
1499<p>@@QQ@@
1500Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
1501After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
1502structure, which would interact badly with concurrent insertions.
1503Doesn't this mean that <tt>rcu_dereference()</tt> is required?
1504<p>@@QQA@@
1505Presumably the <tt>gp_lock</tt> acquired on line&nbsp;18 excludes
1506any changes, including any insertions that <tt>rcu_dereference()</tt>
1507would protect against.
1508Therefore, any insertions will be delayed until after <tt>gp_lock</tt>
1509is released on line&nbsp;26, which in turn means that
1510<tt>rcu_access_pointer()</tt> suffices.
1511<p>@@QQE@@
1512
1513<p>
1514However, all that <tt>remove_gp_cb()</tt> is doing is
1515invoking <tt>kfree()</tt> on the data element.
1516This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
1517which allows &ldquo;fire and forget&rdquo; operation as shown below:
1518
1519<blockquote>
1520<pre>
1521 1 struct foo {
1522 2 int a;
1523 3 int b;
1524 4 struct rcu_head rh;
1525 5 };
1526 6
1527 7 bool remove_gp_faf(void)
1528 8 {
1529 9 struct foo *p;
153010
153111 spin_lock(&amp;gp_lock);
153212 p = rcu_dereference(gp);
153313 if (!p) {
153414 spin_unlock(&amp;gp_lock);
153515 return false;
153616 }
153717 rcu_assign_pointer(gp, NULL);
153818 kfree_rcu(p, rh);
153919 spin_unlock(&amp;gp_lock);
154020 return true;
154121 }
1542</pre>
1543</blockquote>
1544
1545<p>
1546Note that <tt>remove_gp_faf()</tt> simply invokes
1547<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
1548further attention to the subsequent grace period and <tt>kfree()</tt>.
1549It is permissible to invoke <tt>kfree_rcu()</tt> from the same
1550environments as for <tt>call_rcu()</tt>.
1551Interestingly enough, DYNIX/ptx had the equivalents of
1552<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
1553<tt>synchronize_rcu()</tt>.
1554This was due to the fact that RCU was not heavily used within DYNIX/ptx,
1555so the very few places that needed something like
1556<tt>synchronize_rcu()</tt> simply open-coded it.
1557
1558<p>@@QQ@@
1559Earlier it was claimed that <tt>call_rcu()</tt> and
1560<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
1561by readers.
1562But how can that be correct, given that the invocation of the callback
1563and the freeing of the memory (respectively) must still wait for
1564a grace period to elapse?
1565<p>@@QQA@@
1566We could define things this way, but keep in mind that this sort of
1567definition would say that updates in garbage-collected languages
1568cannot complete until the next time the garbage collector runs,
1569which does not seem at all reasonable.
1570The key point is that in most cases, an updater using either
1571<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
1572next update as soon as it has invoked <tt>call_rcu()</tt> or
1573<tt>kfree_rcu()</tt>, without having to wait for a subsequent
1574grace period.
1575<p>@@QQE@@
1576
1577<p>
1578But what if the updater must wait for the completion of code to be
1579executed after the end of the grace period, but has other tasks
1580that can be carried out in the meantime?
1581The polling-style <tt>get_state_synchronize_rcu()</tt> and
1582<tt>cond_synchronize_rcu()</tt> functions may be used for this
1583purpose, as shown below:
1584
1585<blockquote>
1586<pre>
1587 1 bool remove_gp_poll(void)
1588 2 {
1589 3 struct foo *p;
1590 4 unsigned long s;
1591 5
1592 6 spin_lock(&amp;gp_lock);
1593 7 p = rcu_access_pointer(gp);
1594 8 if (!p) {
1595 9 spin_unlock(&amp;gp_lock);
159610 return false;
159711 }
159812 rcu_assign_pointer(gp, NULL);
159913 spin_unlock(&amp;gp_lock);
160014 s = get_state_synchronize_rcu();
160115 do_something_while_waiting();
160216 cond_synchronize_rcu(s);
160317 kfree(p);
160418 return true;
160519 }
1606</pre>
1607</blockquote>
1608
1609<p>
1610On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
1611&ldquo;cookie&rdquo; from RCU,
1612then line&nbsp;15 carries out other tasks,
1613and finally, line&nbsp;16 returns immediately if a grace period has
1614elapsed in the meantime, but otherwise waits as required.
1615The need for <tt>get_state_synchronize_rcu()</tt> and
1616<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
1617so it is too early to tell whether they will stand the test of time.
1618
1619<p>
1620RCU thus provides a range of tools to allow updaters to strike the
1621required tradeoff between latency, flexibility and CPU overhead.
1622
1623<h3><a name="Composability">Composability</a></h3>
1624
1625<p>
1626Composability has received much attention in recent years, perhaps in part
1627due to the collision of multicore hardware with object-oriented techniques
1628designed in single-threaded environments for single-threaded use.
1629And in theory, RCU read-side critical sections may be composed, and in
1630fact may be nested arbitrarily deeply.
1631In practice, as with all real-world implementations of composable
1632constructs, there are limitations.
1633
1634<p>
1635Implementations of RCU for which <tt>rcu_read_lock()</tt>
1636and <tt>rcu_read_unlock()</tt> generate no code, such as
1637Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
1638nested arbitrarily deeply.
1639After all, there is no overhead.
1640Except that if all these instances of <tt>rcu_read_lock()</tt>
1641and <tt>rcu_read_unlock()</tt> are visible to the compiler,
1642compilation will eventually fail due to exhausting memory,
1643mass storage, or user patience, whichever comes first.
1644If the nesting is not visible to the compiler, as is the case with
1645mutually recursive functions each in its own translation unit,
1646stack overflow will result.
1647If the nesting takes the form of loops, either the control variable
1648will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
1649Nevertheless, this class of RCU implementations is one
1650of the most composable constructs in existence.
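
<p>
For illustration, nesting composes as one would hope, with the outermost
pair of markers delimiting the effective read-side critical section
(the <tt>do_something_with()</tt> calls are placeholders):

<blockquote>
<pre>
 1 struct foo *p;
 2
 3 rcu_read_lock();
 4 rcu_read_lock();           /* Nesting is legal... */
 5 p = rcu_dereference(gp);
 6 do_something_with(p);
 7 rcu_read_unlock();         /* ...and protection persists... */
 8 do_something_else_with(p);
 9 rcu_read_unlock();         /* ...until this outermost marker. */
</pre>
</blockquote>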
1651
1652<p>
1653RCU implementations that explicitly track nesting depth
1654are limited by the nesting-depth counter.
1655For example, the Linux kernel's preemptible RCU limits nesting to
1656<tt>INT_MAX</tt>.
1657This should suffice for almost all practical purposes.
1658That said, a consecutive pair of RCU read-side critical sections
1659between which there is an operation that waits for a grace period
1660cannot be enclosed in another RCU read-side critical section.
1661This is because it is not legal to wait for a grace period within
1662an RCU read-side critical section: To do so would result either
1663in deadlock or
1664in RCU implicitly splitting the enclosing RCU read-side critical
1665section, neither of which is conducive to a long-lived and prosperous
1666kernel.
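
<p>
For example, the following sequence is legal in and of itself, but cannot
be enclosed in an outer RCU read-side critical section:

<blockquote>
<pre>
 1 rcu_read_lock();
 2 do_something();
 3 rcu_read_unlock();
 4 synchronize_rcu();  /* Illegal if enclosed in an outer rcu_read_lock(). */
 5 rcu_read_lock();
 6 do_something_else();
 7 rcu_read_unlock();
</pre>
</blockquote>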
1667
1668<p>
1669It is worth noting that RCU is not alone in limiting composability.
1670For example, many transactional-memory implementations prohibit
1671composing a pair of transactions separated by an irrevocable
1672operation (for example, a network receive operation).
1673For another example, lock-based critical sections can be composed
1674surprisingly freely, but only if deadlock is avoided.
1675
1676<p>
1677In short, although RCU read-side critical sections are highly composable,
1678care is required in some situations, just as is the case for any other
1679composable synchronization mechanism.
1680
1681<h3><a name="Corner Cases">Corner Cases</a></h3>
1682
1683<p>
1684A given RCU workload might have an endless and intense stream of
1685RCU read-side critical sections, perhaps even so intense that there
1686was never a point in time during which there was not at least one
1687RCU read-side critical section in flight.
1688RCU cannot allow this situation to block grace periods: As long as
1689all the RCU read-side critical sections are finite, grace periods
1690must also be finite.
1691
1692<p>
1693That said, preemptible RCU implementations could potentially result
1694in RCU read-side critical sections being preempted for long durations,
1695which has the effect of creating a long-duration RCU read-side
1696critical section.
1697This situation can arise only in heavily loaded systems, but systems using
1698real-time priorities are of course more vulnerable.
1699Therefore, RCU priority boosting is provided to help deal with this
1700case.
1701That said, the exact requirements on RCU priority boosting will likely
1702evolve as more experience accumulates.
1703
1704<p>
1705Other workloads might have very high update rates.
1706Although one can argue that such workloads should instead use
1707something other than RCU, the fact remains that RCU must
1708handle such workloads gracefully.
1709This requirement is another factor driving batching of grace periods,
1710but it is also the driving force behind the checks for large numbers
1711of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
1712Finally, high update rates should not delay RCU read-side critical
1713sections, although some read-side delays can occur when using
1714<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
1715of <tt>try_stop_cpus()</tt>.
1716(In the future, <tt>synchronize_rcu_expedited()</tt> will be
1717converted to use lighter-weight inter-processor interrupts (IPIs),
1718but this will still disturb readers, though to a much smaller degree.)
1719
1720<p>
1721Although all three of these corner cases were understood in the early
17221990s, a simple user-level test consisting of <tt>close(open(path))</tt>
1723in a tight loop
1724in the early 2000s suddenly provided a much deeper appreciation of the
1725high-update-rate corner case.
1726This test also motivated addition of some RCU code to react to high update
1727rates, for example, if a given CPU finds itself with more than 10,000
1728RCU callbacks queued, it will cause RCU to take evasive action by
1729more aggressively starting grace periods and more aggressively forcing
1730completion of grace-period processing.
1731This evasive action causes the grace period to complete more quickly,
1732but at the cost of restricting RCU's batching optimizations, thus
1733increasing the CPU overhead incurred by that grace period.
1734
1735<h2><a name="Software-Engineering Requirements">
1736Software-Engineering Requirements</a></h2>
1737
1738<p>
1739Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
1740guard against mishaps and misuse:
1741
1742<ol>
1743<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
1744 everywhere that it is needed, so kernels built with
1745	<tt>CONFIG_PROVE_RCU=y</tt> will splat if
1746 <tt>rcu_dereference()</tt> is used outside of an
1747 RCU read-side critical section.
1748 Update-side code can use <tt>rcu_dereference_protected()</tt>,
1749 which takes a
1750 <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
1751 to indicate what is providing the protection.
1752 If the indicated protection is not provided, a lockdep splat
1753	is emitted (see the sketch following this list).
1754
1755 <p>
1756 Code shared between readers and updaters can use
1757 <tt>rcu_dereference_check()</tt>, which also takes a
1758 lockdep expression, and emits a lockdep splat if neither
1759 <tt>rcu_read_lock()</tt> nor the indicated protection
1760 is in place.
1761 In addition, <tt>rcu_dereference_raw()</tt> is used in those
1762 (hopefully rare) cases where the required protection cannot
1763 be easily described.
1764 Finally, <tt>rcu_read_lock_held()</tt> is provided to
1765 allow a function to verify that it has been invoked within
1766 an RCU read-side critical section.
1767 I was made aware of this set of requirements shortly after Thomas
1768 Gleixner audited a number of RCU uses.
1769<li> A given function might wish to check for RCU-related preconditions
1770 upon entry, before using any other RCU API.
1771	The <tt>rcu_lockdep_assert()</tt> macro does this job,
1772 asserting the expression in kernels having lockdep enabled
1773 and doing nothing otherwise.
1774<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
1775 and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
1776 substituting a simple assignment.
1777 To catch this sort of error, a given RCU-protected pointer may be
1778 tagged with <tt>__rcu</tt>, after which running sparse
1779 with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
1780 about simple-assignment accesses to that pointer.
1781 Arnd Bergmann made me aware of this requirement, and also
1782 supplied the needed
1783 <a href="https://lwn.net/Articles/376011/">patch series</a>.
1784<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
1785 will splat if a data element is passed to <tt>call_rcu()</tt>
1786 twice in a row, without a grace period in between.
1787 (This error is similar to a double free.)
1788 The corresponding <tt>rcu_head</tt> structures that are
1789 dynamically allocated are automatically tracked, but
1790 <tt>rcu_head</tt> structures allocated on the stack
1791 must be initialized with <tt>init_rcu_head_on_stack()</tt>
1792 and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
1793 Similarly, statically allocated non-stack <tt>rcu_head</tt>
1794 structures must be initialized with <tt>init_rcu_head()</tt>
1795 and cleaned up with <tt>destroy_rcu_head()</tt>.
1796 Mathieu Desnoyers made me aware of this requirement, and also
1797 supplied the needed
1798 <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
1799<li> An infinite loop in an RCU read-side critical section will
1800 eventually trigger an RCU CPU stall warning splat, with
1801 the duration of &ldquo;eventually&rdquo; being controlled by the
1802 <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
1803 alternatively, by the
1804 <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
1805 parameter.
1806 However, RCU is not obligated to produce this splat
1807 unless there is a grace period waiting on that particular
1808 RCU read-side critical section.
1809 <p>
1810 Some extreme workloads might intentionally delay
1811 RCU grace periods, and systems running those workloads can
1812 be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
1813 to suppress the splats.
1814 This kernel parameter may also be set via <tt>sysfs</tt>.
1815 Furthermore, RCU CPU stall warnings are counter-productive
1816 during sysrq dumps and during panics.
1817 RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
1818 <tt>rcu_sysrq_end()</tt> API members to be called before
1819 and after long sysrq dumps.
1820 RCU also supplies the <tt>rcu_panic()</tt> notifier that is
1821 automatically invoked at the beginning of a panic to suppress
1822 further RCU CPU stall warnings.
1823
1824 <p>
1825 This requirement made itself known in the early 1990s, pretty
1826 much the first time that it was necessary to debug a CPU stall.
1827 That said, the initial implementation in DYNIX/ptx was quite
1828 generic in comparison with that of Linux.
1829<li> Although it would be very good to detect pointers leaking out
1830 of RCU read-side critical sections, there is currently no
1831 good way of doing this.
1832 One complication is the need to distinguish between pointers
1833 leaking and pointers that have been handed off from RCU to
1834 some other synchronization mechanism, for example, reference
1835 counting.
1836<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
1837 information is provided via both debugfs and event tracing.
1838<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
1839 <tt>rcu_dereference()</tt> to create typical linked
1840 data structures can be surprisingly error-prone.
1841 Therefore, RCU-protected
1842 <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
1843 and, more recently, RCU-protected
1844 <a href="https://lwn.net/Articles/612100/">hash tables</a>
1845 are available.
1846 Many other special-purpose RCU-protected data structures are
1847 available in the Linux kernel and the userspace RCU library.
1848<li> Some linked structures are created at compile time, but still
1849 require <tt>__rcu</tt> checking.
1850 The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
1851 purpose.
1852<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
1853 when creating linked structures that are to be published via
1854 a single external pointer.
1855 The <tt>RCU_INIT_POINTER()</tt> macro is provided for
1856 this task and also for assigning <tt>NULL</tt> pointers
1857 at runtime.
1858</ol>
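
<p>
The following sketch, referenced from the list above, illustrates the
<tt>__rcu</tt> tagging and the lockdep-checked accessors; it reuses the
<tt>gp</tt> pointer, <tt>gp_lock</tt>, and <tt>struct foo</tt> from the
earlier examples, while <tt>update_gp()</tt> and <tt>peek_gp()</tt> are
hypothetical:

<blockquote>
<pre>
 1 struct foo __rcu *gp;  /* Sparse now checks accesses to gp. */
 2
 3 void update_gp(struct foo *newp)
 4 {
 5   struct foo *oldp;
 6
 7   spin_lock(&amp;gp_lock);
 8   oldp = rcu_dereference_protected(gp, lockdep_is_held(&amp;gp_lock));
 9   rcu_assign_pointer(gp, newp);
10   spin_unlock(&amp;gp_lock);
11   if (oldp)
12     kfree_rcu(oldp, rh);
13 }
14
15 int peek_gp(void)  /* Caller holds either rcu_read_lock() or gp_lock. */
16 {
17   struct foo *p;
18
19   p = rcu_dereference_check(gp, lockdep_is_held(&amp;gp_lock));
20   return p ? p-&gt;a : -1;
21 }
</pre>
</blockquote>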
1859
1860<p>
1861This is not a hard-and-fast list: RCU's diagnostic capabilities will
1862continue to be guided by the number and type of usage bugs found
1863in real-world RCU usage.
1864
1865<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
1866
1867<p>
1868The Linux kernel provides an interesting environment for all kinds of
1869software, including RCU.
1870Some of the relevant points of interest are as follows:
1871
1872<ol>
1873<li> <a href="#Configuration">Configuration</a>.
1874<li> <a href="#Firmware Interface">Firmware Interface</a>.
1875<li> <a href="#Early Boot">Early Boot</a>.
1876<li> <a href="#Interrupts and NMIs">
1877 Interrupts and non-maskable interrupts (NMIs)</a>.
1878<li> <a href="#Loadable Modules">Loadable Modules</a>.
1879<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
1880<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
1881<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
1882<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
1883<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
1884<li> <a href="#Performance, Scalability, Response Time, and Reliability">
1885 Performance, Scalability, Response Time, and Reliability</a>.
1886</ol>
1887
1888<p>
1889This list is probably incomplete, but it does give a feel for the
1890most notable Linux-kernel complications.
1891Each of the following sections covers one of the above topics.
1892
1893<h3><a name="Configuration">Configuration</a></h3>
1894
1895<p>
1896RCU's goal is automatic configuration, so that almost nobody
1897needs to worry about RCU's <tt>Kconfig</tt> options.
1898And for almost all users, RCU does in fact work well
1899&ldquo;out of the box.&rdquo;
1900
1901<p>
1902However, there are specialized use cases that are handled by
1903kernel boot parameters and <tt>Kconfig</tt> options.
1904Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
1905about new <tt>Kconfig</tt> options, which requires that almost all of
1906them be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
1907
1908<p>
1909This all should be quite obvious, but the fact remains that
1910Linus Torvalds recently had to
1911<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
1912me of this requirement.
1913
1914<h3><a name="Firmware Interface">Firmware Interface</a></h3>
1915
1916<p>
1917In many cases, the kernel obtains information about the system from the
1918firmware, and sometimes things are lost in translation.
1919Or the translation is accurate, but the original message is bogus.
1920
1921<p>
1922For example, some systems' firmware overreports the number of CPUs,
1923sometimes by a large factor.
1924If RCU naively believed the firmware, as it used to do,
1925it would create too many per-CPU kthreads.
1926Although the resulting system will still run correctly, the extra
1927kthreads needlessly consume memory and can cause confusion
1928when they show up in <tt>ps</tt> listings.
1929
1930<p>
1931RCU must therefore wait for a given CPU to actually come online before
1932it can allow itself to believe that the CPU actually exists.
1933The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
1934come online) cause a number of
1935<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
1936
1937<h3><a name="Early Boot">Early Boot</a></h3>
1938
1939<p>
1940The Linux kernel's boot sequence is an interesting process,
1941and RCU is used early, even before <tt>rcu_init()</tt>
1942is invoked.
1943In fact, a number of RCU's primitives can be used as soon as the
1944initial task's <tt>task_struct</tt> is available and the
1945boot CPU's per-CPU variables are set up.
1946The read-side primitives (<tt>rcu_read_lock()</tt>,
1947<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
1948and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
1949as will <tt>rcu_assign_pointer()</tt>.
1950
1951<p>
1952Although <tt>call_rcu()</tt> may be invoked at any
1953time during boot, callbacks are not guaranteed to be invoked until after
1954the scheduler is fully up and running.
1955This delay in callback invocation is due to the fact that RCU does not
1956invoke callbacks until it is fully initialized, and this full initialization
1957cannot occur until after the scheduler has initialized itself to the
1958point where RCU can spawn and run its kthreads.
1959In theory, it would be possible to invoke callbacks earlier;
1960however, this is not a panacea because there would be severe restrictions
1961on what operations those callbacks could invoke.
1962
1963<p>
1964Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
1965<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
1966(<a href="#Bottom-Half Flavor">discussed below</a>),
1967and
1968<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
1969will all operate normally
1970during very early boot, the reason being that there is only one CPU
1971and preemption is disabled.
1972This means that a call to <tt>synchronize_rcu()</tt> (or friends)
1973is itself a quiescent
1974state and thus a grace period, so the early-boot implementation can
1975be a no-op.
1976
1977<p>
1978Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
1979continue to operate normally through the remainder of boot, courtesy
1980of the fact that preemption is disabled across their RCU read-side
1981critical sections and also courtesy of the fact that there is still
1982only one CPU.
1983However, once the scheduler starts initializing, preemption is enabled.
1984There is still only a single CPU, but the fact that preemption is enabled
1985means that the no-op implementation of <tt>synchronize_rcu()</tt> no
1986longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
1987Therefore, as soon as the scheduler starts initializing, the early-boot
1988fastpath is disabled.
1989This means that <tt>synchronize_rcu()</tt> switches to its runtime
1990mode of operation where it posts callbacks, which in turn means that
1991any call to <tt>synchronize_rcu()</tt> will block until the corresponding
1992callback is invoked.
1993Unfortunately, the callback cannot be invoked until RCU's runtime
1994grace-period machinery is up and running, which cannot happen until
1995the scheduler has initialized itself sufficiently to allow RCU's
1996kthreads to be spawned.
1997Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
1998initialization can result in deadlock.
1999
2000<p>@@QQ@@
2001So what happens with <tt>synchronize_rcu()</tt> during
2002scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
2003kernels?
2004<p>@@QQA@@
2005In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
2006maps directly to <tt>synchronize_sched()</tt>.
2007Therefore, <tt>synchronize_rcu()</tt> works normally throughout
2008boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
2009However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
2010so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
2011during scheduler initialization.
2012<p>@@QQE@@
2013
2014<p>
2015I learned of these boot-time requirements as a result of a series of
2016system hangs.
2017
2018<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
2019
2020<p>
2021The Linux kernel has interrupts, and RCU read-side critical sections are
2022legal within interrupt handlers and within interrupt-disabled regions
2023of code, as are invocations of <tt>call_rcu()</tt>.
2024
2025<p>
2026Some Linux-kernel architectures can enter an interrupt handler from
2027non-idle process context, and then just never leave it, instead stealthily
2028transitioning back to process context.
2029This trick is sometimes used to invoke system calls from inside the kernel.
2030These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
2031about how it counts interrupt nesting levels.
2032I learned of this requirement the hard way during a rewrite
2033of RCU's dyntick-idle code.
2034
2035<p>
2036The Linux kernel has non-maskable interrupts (NMIs), and
2037RCU read-side critical sections are legal within NMI handlers.
2038Thankfully, RCU update-side primitives, including
2039<tt>call_rcu()</tt>, are prohibited within NMI handlers.
2040
2041<p>
2042The name notwithstanding, some Linux-kernel architectures
2043can have nested NMIs, which RCU must handle correctly.
2044Andy Lutomirski
2045<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
2046with this requirement;
2047he also kindly surprised me with
2048<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
2049that meets this requirement.
2050
2051<h3><a name="Loadable Modules">Loadable Modules</a></h3>
2052
2053<p>
2054The Linux kernel has loadable modules, and these modules can
2055also be unloaded.
2056After a given module has been unloaded, any attempt to call
2057one of its functions results in a segmentation fault.
2058The module-unload functions must therefore cancel any
2059delayed calls to loadable-module functions, for example,
2060any outstanding <tt>mod_timer()</tt> must be dealt with
2061via <tt>del_timer_sync()</tt> or similar.
2062
2063<p>
2064Unfortunately, there is no way to cancel an RCU callback;
2065once you invoke <tt>call_rcu()</tt>, the callback function is
2066going to eventually be invoked, unless the system goes down first.
2067Because it is normally considered socially irresponsible to crash the system
2068in response to a module unload request, we need some other way
2069to deal with in-flight RCU callbacks.
2070
2071<p>
2072RCU therefore provides
2073<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
2074which waits until all in-flight RCU callbacks have been invoked.
2075If a module uses <tt>call_rcu()</tt>, its exit function should therefore
2076prevent any future invocation of <tt>call_rcu()</tt>, then invoke
2077<tt>rcu_barrier()</tt>.
2078In theory, the underlying module-unload code could invoke
2079<tt>rcu_barrier()</tt> unconditionally, but in practice this would
2080incur unacceptable latencies.
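
<p>
A minimal sketch of this pattern is shown below; the <tt>foo_exit()</tt>
and <tt>foo_shutdown()</tt> functions are hypothetical, with the latter
standing in for whatever mechanism prevents further <tt>call_rcu()</tt>
invocations by this module:

<blockquote>
<pre>
 1 static void __exit foo_exit(void)
 2 {
 3   foo_shutdown();  /* No further call_rcu() invocations after this. */
 4   rcu_barrier();   /* Wait for all in-flight callbacks to complete. */
 5   /* Now safe to free data structures and unload the module text. */
 6 }
 7 module_exit(foo_exit);
</pre>
</blockquote>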
2081
2082<p>
2083Nikita Danilov noted this requirement for an analogous filesystem-unmount
2084situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
2085The need for <tt>rcu_barrier()</tt> for module unloading became
2086apparent later.
2087
2088<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
2089
2090<p>
2091The Linux kernel supports CPU hotplug, which means that CPUs
2092can come and go.
2093It is of course illegal to use any RCU API member from an offline CPU.
2094This requirement was present from day one in DYNIX/ptx, but
2095on the other hand, the Linux kernel's CPU-hotplug implementation
2096is &ldquo;interesting.&rdquo;
2097
2098<p>
2099The Linux-kernel CPU-hotplug implementation has notifiers that
2100are used to allow the various kernel subsystems (including RCU)
2101to respond appropriately to a given CPU-hotplug operation.
2102Most RCU operations may be invoked from CPU-hotplug notifiers,
2103including even normal synchronous grace-period operations
2104such as <tt>synchronize_rcu()</tt>.
2105However, expedited grace-period operations such as
2106<tt>synchronize_rcu_expedited()</tt> are not supported,
2107due to the fact that current implementations block CPU-hotplug
2108operations, which could result in deadlock.
2109
2110<p>
2111In addition, all-callback-wait operations such as
2112<tt>rcu_barrier()</tt> are also not supported, due to the
2113fact that there are phases of CPU-hotplug operations where
2114the outgoing CPU's callbacks will not be invoked until after
2115the CPU-hotplug operation ends, which could also result in deadlock.
2116
2117<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
2118
2119<p>
2120RCU depends on the scheduler, and the scheduler uses RCU to
2121protect some of its data structures.
2122This means the scheduler is forbidden from acquiring
2123the runqueue locks and the priority-inheritance locks
2124in the middle of an outermost RCU read-side critical section unless either
2125(1)&nbsp;it releases them before exiting that same
2126RCU read-side critical section, or
2127(2)&nbsp;interrupts are disabled across
2128that entire RCU read-side critical section.
2129This same prohibition also applies (recursively!) to any lock that is acquired
2130while holding any lock to which this prohibition applies.
2131Adhering to this rule prevents preemptible RCU from invoking
2132<tt>rcu_read_unlock_special()</tt> while either runqueue or
2133priority-inheritance locks are held, thus avoiding deadlock.
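
<p>
A sketch of the second option is shown below; this is illustrative only,
with <tt>do_scheduler_work()</tt> standing in for scheduler-internal
processing on runqueue <tt>rq</tt>:

<blockquote>
<pre>
 1 unsigned long flags;
 2
 3 local_irq_save(flags);       /* Interrupts disabled across the section. */
 4 rcu_read_lock();
 5 raw_spin_lock(&amp;rq-&gt;lock);    /* Runqueue lock inside the reader is now safe. */
 6 do_scheduler_work(rq);
 7 raw_spin_unlock(&amp;rq-&gt;lock);
 8 rcu_read_unlock();
 9 local_irq_restore(flags);
</pre>
</blockquote>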
2134
2135<p>
2136Prior to v4.4, it was only necessary to disable preemption across
2137RCU read-side critical sections that acquired scheduler locks.
2138In v4.4, expedited grace periods started using IPIs, and these
2139IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
2140Therefore, this expedited-grace-period change required disabling of
2141interrupts, not just preemption.
2142
2143<p>
2144For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
2145implementation must be written carefully to avoid similar deadlocks.
2146In particular, <tt>rcu_read_unlock()</tt> must tolerate an
2147interrupt where the interrupt handler invokes both
2148<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
2149This possibility requires <tt>rcu_read_unlock()</tt> to use
2150negative nesting levels to avoid destructive recursion via
2151the interrupt handler's use of RCU.
2152
2153<p>
2154This pair of mutual scheduler-RCU requirements came as a
2155<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
2156
2157<p>
2158As noted above, RCU makes use of kthreads, and it is necessary to
2159avoid excessive CPU-time accumulation by these kthreads.
2160This requirement was no surprise, but RCU's violation of it
2161when running context-switch-heavy workloads when built with
2162<tt>CONFIG_NO_HZ_FULL=y</tt>
2163<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
2164RCU has made good progress towards meeting this requirement, even
2165for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
2166but there is room for further improvement.
2167
2168<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
2169
2170<p>
2171It is possible to use tracing on RCU code, but tracing itself
2172uses RCU.
2173For this reason, <tt>rcu_dereference_raw_notrace()</tt>
2174is provided for use by tracing, which avoids the destructive
2175recursion that could otherwise ensue.
2176This API is also used by virtualization in some architectures,
2177where RCU readers execute in environments in which tracing
2178cannot be used.
2179The tracing folks both located the requirement and provided the
2180needed fix, so this surprise requirement was relatively painless.
2181
2182<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
2183
2184<p>
2185Interrupting idle CPUs is considered socially unacceptable,
2186especially by people with battery-powered embedded systems.
2187RCU therefore conserves energy by detecting which CPUs are
2188idle, including tracking CPUs that have been interrupted from idle.
2189This is a large part of the energy-efficiency requirement,
2190so I learned of this via an irate phone call.
2191
2192<p>
2193Because RCU avoids interrupting idle CPUs, it is illegal to
2194execute an RCU read-side critical section on an idle CPU.
2195(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
2196if you try it.)
2197The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
2198event tracing are provided to work around this restriction.
2199In addition, <tt>rcu_is_watching()</tt> may be used to
2200test whether or not it is currently legal to run RCU read-side
2201critical sections on this CPU.
2202I learned of the need for diagnostics on the one hand
2203and <tt>RCU_NONIDLE()</tt> on the other while inspecting
2204idle-loop code.
2205Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
2206which is used quite heavily in the idle loop.
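
<p>
A brief sketch of these facilities is shown below; the
<tt>trace_foo_event()</tt> tracepoint is hypothetical:

<blockquote>
<pre>
 1 /* From code that might run while RCU is ignoring this idle CPU: */
 2 RCU_NONIDLE(trace_foo_event(cpu));  /* Momentarily make RCU watch. */
 3
 4 /* Diagnostic check before relying on RCU readers: */
 5 WARN_ON_ONCE(!rcu_is_watching());
</pre>
</blockquote>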
2207
2208<p>
2209It is similarly socially unacceptable to interrupt an
2210<tt>nohz_full</tt> CPU running in userspace.
2211RCU must therefore track <tt>nohz_full</tt> userspace
2212execution.
2213And in
2214<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
2215kernels, RCU must separately track idle CPUs on the one hand and
2216CPUs that are either idle or executing in userspace on the other.
2217In both cases, RCU must be able to sample state at two points in
2218time, and be able to determine whether or not some other CPU spent
2219any time idle and/or executing in userspace.
2220
2221<p>
2222These energy-efficiency requirements have proven quite difficult to
2223understand and to meet. For example, there have been more than five
2224clean-sheet rewrites of RCU's energy-efficiency code, the last of
2225which was finally able to demonstrate
2226<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
2227As noted earlier,
2228I learned of many of these requirements via angry phone calls:
2229Flaming me on the Linux-kernel mailing list was apparently not
2230sufficient to fully vent their ire at RCU's energy-efficiency bugs!
2231
2232<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
2233
2234<p>
2235Although small-memory non-realtime systems can simply use Tiny RCU,
2236code size is only one aspect of memory efficiency.
2237Another aspect is the size of the <tt>rcu_head</tt> structure
2238used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
2239Although this structure contains nothing more than a pair of pointers,
2240it does appear in many RCU-protected data structures, including
2241some that are size critical.
2242The <tt>page</tt> structure is a case in point, as evidenced by
2243the many occurrences of the <tt>union</tt> keyword within that structure.
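
<p>
For example, a size-critical structure might overlay its
<tt>rcu_head</tt> with fields that are no longer needed by the time the
structure is being freed, in the manner of the <tt>page</tt> structure;
the <tt>obj</tt> structure below is an illustrative fragment rather than
actual kernel code:

<blockquote>
<pre>
 1 struct obj {
 2   unsigned long flags;
 3   union {
 4     struct list_head lru;  /* Used while the object is live. */
 5     struct rcu_head rcu;   /* Reused once the object awaits freeing. */
 6   };
 7 };
</pre>
</blockquote>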
2244
2245<p>
2246This need for memory efficiency is one reason that RCU uses hand-crafted
2247singly linked lists to track the <tt>rcu_head</tt> structures that
2248are waiting for a grace period to elapse.
2249It is also the reason why <tt>rcu_head</tt> structures do not contain
2250debug information, such as fields tracking the file and line of the
2251<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
2252Although this information might appear in debug-only kernel builds at some
2253point, in the meantime, the <tt>-&gt;func</tt> field will often provide
2254the needed debug information.
2255
2256<p>
2257However, in some cases, the need for memory efficiency leads to even
2258more extreme measures.
2259Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
2260shares storage with a great many other structures that are used at
2261various points in the corresponding page's lifetime.
2262In order to correctly resolve certain
2263<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
2264the Linux kernel's memory-management subsystem needs a particular bit
2265to remain zero during all phases of grace-period processing,
2266and that bit happens to map to the bottom bit of the
2267<tt>rcu_head</tt> structure's <tt>-&gt;next</tt> field.
2268RCU makes this guarantee as long as <tt>call_rcu()</tt>
2269is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
2270or some future &ldquo;lazy&rdquo;
2271variant of <tt>call_rcu()</tt> that might one day be created for
2272energy-efficiency purposes.
2273
2274<h3><a name="Performance, Scalability, Response Time, and Reliability">
2275Performance, Scalability, Response Time, and Reliability</a></h3>
2276
2277<p>
2278Expanding on the
2279<a href="#Performance and Scalability">earlier discussion</a>,
2280RCU is used heavily by hot code paths in performance-critical
2281portions of the Linux kernel's networking, security, virtualization,
2282and scheduling code paths.
2283RCU must therefore use efficient implementations, especially in its
2284read-side primitives.
2285To that end, it would be good if preemptible RCU's implementation
2286of <tt>rcu_read_lock()</tt> could be inlined; however, doing
2287this requires resolving <tt>#include</tt> issues with the
2288<tt>task_struct</tt> structure.
2289
2290<p>
2291The Linux kernel supports hardware configurations with up to
22924096 CPUs, which means that RCU must be extremely scalable.
2293Algorithms that involve frequent acquisitions of global locks or
2294frequent atomic operations on global variables simply cannot be
2295tolerated within the RCU implementation.
2296RCU therefore makes heavy use of a combining tree based on the
2297<tt>rcu_node</tt> structure.
2298RCU is required to tolerate all CPUs continuously invoking any
2299combination of RCU's runtime primitives with minimal per-operation
2300overhead.
2301In fact, in many cases, increasing load must <i>decrease</i> the
2302per-operation overhead; witness the batching optimizations for
2303<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
2304<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
2305As a general rule, RCU must cheerfully accept whatever the
2306rest of the Linux kernel decides to throw at it.
2307
2308<p>
2309The Linux kernel is used for real-time workloads, especially
2310in conjunction with the
2311<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
2312The real-time-latency response requirements are such that the
2313traditional approach of disabling preemption across RCU
2314read-side critical sections is inappropriate.
2315Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
2316use an RCU implementation that allows RCU read-side critical
2317sections to be preempted.
2318This requirement made its presence known after users made it
2319clear that an earlier
2320<a href="https://lwn.net/Articles/107930/">real-time patch</a>
2321did not meet their needs, in conjunction with some
2322<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
2323encountered by a very early version of the -rt patchset.
2324
2325<p>
2326In addition, RCU must make do with a sub-100-microsecond real-time latency
2327budget.
2328In fact, on smaller systems with the -rt patchset, the Linux kernel
2329provides sub-20-microsecond real-time latencies for the whole kernel,
2330including RCU.
2331RCU's scalability and latency must therefore be sufficient for
2332these sorts of configurations.
2333To my surprise, the sub-100-microsecond real-time latency budget
2334<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
2335applies to even the largest systems [PDF]</a>,
2336up to and including systems with 4096 CPUs.
2337This real-time requirement motivated the grace-period kthread, which
2338also simplified handling of a number of race conditions.
2339
2340<p>
2341Finally, RCU's status as a synchronization primitive means that
2342any RCU failure can result in arbitrary memory corruption that can be
2343extremely difficult to debug.
2344This means that RCU must be extremely reliable, which in
2345practice also means that RCU must have an aggressive stress-test
2346suite.
2347This stress-test suite is called <tt>rcutorture</tt>.
2348
2349<p>
2350Although the need for <tt>rcutorture</tt> was no surprise,
2351the current immense popularity of the Linux kernel is posing
2352interesting&mdash;and perhaps unprecedented&mdash;validation
2353challenges.
2354To see this, keep in mind that there are well over one billion
2355instances of the Linux kernel running today, given Android
2356smartphones, Linux-powered televisions, and servers.
2357This number can be expected to increase sharply with the advent of
2358the celebrated Internet of Things.
2359
2360<p>
2361Suppose that RCU contains a race condition that manifests on average
2362once per million years of runtime.
2363This bug will be occurring about three times per <i>day</i> across
2364the installed base.
2365RCU could simply hide behind hardware error rates, given that no one
2366should really expect their smartphone to last for a million years.
2367However, anyone taking too much comfort from this thought should
2368consider the fact that in most jurisdictions, a successful multi-year
2369test of a given mechanism, which might include a Linux kernel,
2370suffices for a number of types of safety-critical certifications.
2371In fact, rumor has it that the Linux kernel is already being used
2372in production for safety-critical applications.
2373I don't know about you, but I would feel quite bad if a bug in RCU
2374killed someone.
2375Which might explain my recent focus on validation and verification.
2376
2377<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
2378
2379<p>
2380One of the more surprising things about RCU is that there are now
2381no fewer than five <i>flavors</i>, or API families.
2382In addition, the primary flavor that has been the sole focus up to
2383this point has two different implementations, non-preemptible and
2384preemptible.
2385The other four flavors are listed below, with requirements for each
2386described in a separate section.
2387
2388<ol>
2389<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
2390<li> <a href="#Sched Flavor">Sched Flavor</a>
2391<li> <a href="#Sleepable RCU">Sleepable RCU</a>
2392<li> <a href="#Tasks RCU">Tasks RCU</a>
2393</ol>
2394
2395<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
2396
2397<p>
2398The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
2399hence the &ldquo;_bh&rdquo; abbreviations)
2400flavor of RCU, or <i>RCU-bh</i>, was developed by
2401Dipankar Sarma to provide a flavor of RCU that could withstand the
2402network-based denial-of-service attacks researched by Robert
2403Olsson.
2404These attacks placed so much networking load on the system
2405that some of the CPUs never exited softirq execution,
2406which in turn prevented those CPUs from ever executing a context switch,
2407which, in the RCU implementation of that time, prevented grace periods
2408from ever ending.
2409The result was an out-of-memory condition and a system hang.
2410
2411<p>
2412The solution was the creation of RCU-bh, which does
2413<tt>local_bh_disable()</tt>
2414across its read-side critical sections, and which uses the transition
2415from one type of softirq processing to another as a quiescent state
2416in addition to context switch, idle, user mode, and offline.
2417This means that RCU-bh grace periods can complete even when some of
2418the CPUs execute in softirq indefinitely, thus allowing algorithms
2419based on RCU-bh to withstand network-based denial-of-service attacks.
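
<p>
A minimal usage sketch is shown below; the <tt>route</tt> structure, its
<tt>-&gt;next</tt> list linkage and <tt>-&gt;dst</tt> field, the
<tt>route_table</tt> list, and <tt>use_route()</tt> are all hypothetical:

<blockquote>
<pre>
 1 struct route *r;
 2
 3 rcu_read_lock_bh();
 4 list_for_each_entry_rcu(r, &amp;route_table, next) {
 5   if (r-&gt;dst == dst) {
 6     use_route(r);  /* Must complete before rcu_read_unlock_bh(). */
 7     break;
 8   }
 9 }
10 rcu_read_unlock_bh();
</pre>
</blockquote>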
2420
2421<p>
2422Because
2423<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
2424disable and re-enable softirq handlers, any attempt to start a softirq
2425handler during the
2426RCU-bh read-side critical section will be deferred.
2427In this case, <tt>rcu_read_unlock_bh()</tt>
2428will invoke softirq processing, which can take considerable time.
2429One can of course argue that this softirq overhead should be associated
2430with the code following the RCU-bh read-side critical section rather
2431than <tt>rcu_read_unlock_bh()</tt>, but the fact
2432is that most profiling tools cannot be expected to make this sort
2433of fine distinction.
2434For example, suppose that a three-millisecond-long RCU-bh read-side
2435critical section executes during a time of heavy networking load.
2436There will very likely be an attempt to invoke at least one softirq
2437handler during that three milliseconds, but any such invocation will
2438be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
2439This can of course make it appear at first glance as if
2440<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
2441
2442<p>
2443The
2444<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
2445includes
2446<tt>rcu_read_lock_bh()</tt>,
2447<tt>rcu_read_unlock_bh()</tt>,
2448<tt>rcu_dereference_bh()</tt>,
2449<tt>rcu_dereference_bh_check()</tt>,
2450<tt>synchronize_rcu_bh()</tt>,
2451<tt>synchronize_rcu_bh_expedited()</tt>,
2452<tt>call_rcu_bh()</tt>,
2453<tt>rcu_barrier_bh()</tt>, and
2454<tt>rcu_read_lock_bh_held()</tt>.
2455
2456<h3><a name="Sched Flavor">Sched Flavor</a></h3>
2457
2458<p>
2459Before preemptible RCU, waiting for an RCU grace period had the
2460side effect of also waiting for all pre-existing interrupt
2461and NMI handlers.
2462However, there are legitimate preemptible-RCU implementations that
2463do not have this property, given that any point in the code outside
2464of an RCU read-side critical section can be a quiescent state.
2465Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
2466RCU in that an RCU-sched grace period waits for pre-existing
2467interrupt and NMI handlers.
2468In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
2469APIs have identical implementations, while kernels built with
2470<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
2471
2472<p>
2473Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
2474<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
2475disable and re-enable preemption, respectively.
2476This means that if there was a preemption attempt during the
2477RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
2478will enter the scheduler, with all the latency and overhead entailed.
2479Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
2480as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
2481However, the highest-priority task won't be preempted, so that task
2482will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
2483
2484<p>
2485The
2486<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
2487includes
2488<tt>rcu_read_lock_sched()</tt>,
2489<tt>rcu_read_unlock_sched()</tt>,
2490<tt>rcu_read_lock_sched_notrace()</tt>,
2491<tt>rcu_read_unlock_sched_notrace()</tt>,
2492<tt>rcu_dereference_sched()</tt>,
2493<tt>rcu_dereference_sched_check()</tt>,
2494<tt>synchronize_sched()</tt>,
2495<tt>synchronize_sched_expedited()</tt>,
2496<tt>call_rcu_sched()</tt>,
2497<tt>rcu_barrier_sched()</tt>, and
2498<tt>rcu_read_lock_sched_held()</tt>.
2499However, anything that disables preemption also marks an RCU-sched
2500read-side critical section, including
2501<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
2502<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
2503and so on.
2504
2505<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
2506
2507<p>
2508For well over a decade, someone saying &ldquo;I need to block within
2509an RCU read-side critical section&rdquo; was a reliable indication
2510that this someone did not understand RCU.
2511After all, if you are always blocking in an RCU read-side critical
2512section, you can probably afford to use a higher-overhead synchronization
2513mechanism.
2514However, that changed with the advent of the Linux kernel's notifiers,
2515whose RCU read-side critical
2516sections almost never sleep, but sometimes need to.
2517This resulted in the introduction of
2518<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
2519or <i>SRCU</i>.
2520
2521<p>
2522SRCU allows different domains to be defined, with each such domain
2523defined by an instance of an <tt>srcu_struct</tt> structure.
2524A pointer to this structure must be passed in to each SRCU function,
2525for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
2526<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
2527The key benefit of these domains is that a slow SRCU reader in one
2528domain does not delay an SRCU grace period in some other domain.
2529That said, one consequence of these domains is that read-side code
2530must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
2531to <tt>srcu_read_unlock()</tt>, for example, as follows:
2532
2533<blockquote>
2534<pre>
2535 1 int idx;
2536 2
2537 3 idx = srcu_read_lock(&amp;ss);
2538 4 do_something();
2539 5 srcu_read_unlock(&amp;ss, idx);
2540</pre>
2541</blockquote>
2542
2543<p>
2544As noted above, it is legal to block within SRCU read-side critical sections;
2545however, with great power comes great responsibility.
2546If you block forever in one of a given domain's SRCU read-side critical
2547sections, then that domain's grace periods will also be blocked forever.
2548Of course, one good way to block forever is to deadlock, which can
2549happen if any operation in a given domain's SRCU read-side critical
2550section can block waiting, either directly or indirectly, for that domain's
2551grace period to elapse.
2552For example, this results in a self-deadlock:
2553
2554<blockquote>
2555<pre>
2556 1 int idx;
2557 2
2558 3 idx = srcu_read_lock(&amp;ss);
2559 4 do_something();
2560 5 synchronize_srcu(&amp;ss);
2561 6 srcu_read_unlock(&amp;ss, idx);
2562</pre>
2563</blockquote>
2564
2565<p>
2566However, if line&nbsp;5 acquired a mutex that was held across
2567a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
2568deadlock would still be possible.
2569Furthermore, if line&nbsp;5 acquired a mutex that was held across
2570a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
2571and if an <tt>ss1</tt>-domain SRCU read-side critical section
2572acquired another mutex that was held across an <tt>ss</tt>-domain
2573<tt>synchronize_srcu()</tt>,
2574deadlock would again be possible.
2575Such a deadlock cycle could extend across an arbitrarily large number
2576of different SRCU domains.
2577Again, with great power comes great responsibility.
2578
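<p>
To make the simplest such deadlock concrete, consider the following
sketch, in which the hypothetical <tt>my_mutex</tt> stands in for any
lock that is held across an <tt>ss</tt>-domain
<tt>synchronize_srcu()</tt>:

<blockquote>
<pre>
 1 /* Reader for domain ss. */
 2 idx = srcu_read_lock(&amp;ss);
 3 mutex_lock(&amp;my_mutex);
 4 do_something();
 5 mutex_unlock(&amp;my_mutex);
 6 srcu_read_unlock(&amp;ss, idx);
 7
 8 /* Updater, possibly running concurrently. */
 9 mutex_lock(&amp;my_mutex);
10 synchronize_srcu(&amp;ss);
11 mutex_unlock(&amp;my_mutex);
</pre>
</blockquote>

<p>
If the updater acquires <tt>my_mutex</tt> first, it blocks in
<tt>synchronize_srcu()</tt> waiting for the reader, while the reader
blocks on <tt>my_mutex</tt> inside its SRCU read-side critical section,
and neither can make progress.
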
2579<p>
2580Unlike the other RCU flavors, SRCU read-side critical sections can
2581run on idle and even offline CPUs.
2582This ability requires that <tt>srcu_read_lock()</tt> and
2583<tt>srcu_read_unlock()</tt> contain memory barriers, which means
2584that SRCU readers will run a bit slower than would RCU readers.
2585It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
2586API, which, in combination with <tt>srcu_read_unlock()</tt>,
2587guarantees a full memory barrier.
2588
2589<p>
2590The
2591<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
2592includes
2593<tt>srcu_read_lock()</tt>,
2594<tt>srcu_read_unlock()</tt>,
2595<tt>srcu_dereference()</tt>,
2596<tt>srcu_dereference_check()</tt>,
2597<tt>synchronize_srcu()</tt>,
2598<tt>synchronize_srcu_expedited()</tt>,
2599<tt>call_srcu()</tt>,
2600<tt>srcu_barrier()</tt>, and
2601<tt>srcu_read_lock_held()</tt>.
2602It also includes
2603<tt>DEFINE_SRCU()</tt>,
2604<tt>DEFINE_STATIC_SRCU()</tt>, and
2605<tt>init_srcu_struct()</tt>
2606APIs for defining and initializing <tt>srcu_struct</tt> structures.
2607
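<p>
A rough sketch tying these APIs together (the names <tt>my_srcu</tt>,
<tt>gp</tt>, and <tt>do_something_that_might_sleep()</tt> are purely
illustrative) might look as follows:

<blockquote>
<pre>
 1 DEFINE_STATIC_SRCU(my_srcu);
 2
 3 /* Reader. */
 4 idx = srcu_read_lock(&amp;my_srcu);
 5 p = srcu_dereference(gp, &amp;my_srcu);
 6 do_something_that_might_sleep(p);
 7 srcu_read_unlock(&amp;my_srcu, idx);
 8
 9 /* Updater, assumed serialized by some update-side lock. */
10 old = gp;
11 rcu_assign_pointer(gp, new);
12 synchronize_srcu(&amp;my_srcu);
13 kfree(old);
</pre>
</blockquote>
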
2608<h3><a name="Tasks RCU">Tasks RCU</a></h3>
2609
2610<p>
2611Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
2612binary rewriting required to install different types of probes.
2613It would be good to be able to free old trampolines, which sounds
2614like a job for some form of RCU.
2615However, because it is necessary to be able to install a trace
2616anywhere in the code, it is not possible to use read-side markers
2617such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
2618In addition, it does not work to have these markers in the trampoline
2619itself, because there would need to be instructions following
2620<tt>rcu_read_unlock()</tt>.
2621Although <tt>synchronize_rcu()</tt> would guarantee that execution
2622reached the <tt>rcu_read_unlock()</tt>, it would not be able to
2623guarantee that execution had completely left the trampoline.
2624
2625<p>
2626The solution, in the form of
2627<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
2628is to have implicit
2629read-side critical sections that are delimited by voluntary context
2630switches, that is, calls to <tt>schedule()</tt>,
2631<tt>cond_resched_rcu_qs()</tt>, and
2632<tt>synchronize_rcu_tasks()</tt>.
2633In addition, transitions to and from userspace execution also delimit
2634tasks-RCU read-side critical sections.
2635
2636<p>
2637The tasks-RCU API is quite compact, consisting only of
2638<tt>call_rcu_tasks()</tt>,
2639<tt>synchronize_rcu_tasks()</tt>, and
2640<tt>rcu_barrier_tasks()</tt>.
2641
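<p>
A minimal sketch of trampoline replacement, in which
<tt>install_new_trampoline()</tt> and <tt>free_old_trampoline()</tt>
are hypothetical stand-ins for the architecture-specific work, might
therefore be:

<blockquote>
<pre>
 1 install_new_trampoline(); /* hypothetical binary-rewriting step */
 2 synchronize_rcu_tasks();  /* wait for pre-existing trampoline users */
 3 free_old_trampoline();    /* now safe to reclaim the old trampoline */
</pre>
</blockquote>
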
2642<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
2643
2644<p>
2645One of the tricks that RCU uses to attain update-side scalability is
2646to increase grace-period latency with increasing numbers of CPUs.
2647If this becomes a serious problem, it will be necessary to rework the
2648grace-period state machine so as to avoid the need for the additional
2649latency.
2650
2651<p>
2652Expedited grace periods scan the CPUs, so their latency and overhead
2653increase with increasing numbers of CPUs.
2654If this becomes a serious problem on large systems, it will be necessary
2655to do some redesign to avoid this scalability problem.
2656
2657<p>
2658RCU disables CPU hotplug in a few places, perhaps most notably in the
2659expedited grace-period and <tt>rcu_barrier()</tt> operations.
2660If there is a strong reason to use expedited grace periods in CPU-hotplug
2661notifiers, it will be necessary to avoid disabling CPU hotplug.
2662This would introduce some complexity, so there had better be a <i>very</i>
2663good reason.
2664
2665<p>
2666The tradeoff between grace-period latency on the one hand and interruptions
2667of other CPUs on the other hand may need to be re-examined.
2668The desire is of course for zero grace-period latency as well as zero
2669interprocessor interrupts undertaken during an expedited grace period
2670operation.
2671While this ideal is unlikely to be achievable, it is quite possible that
2672further improvements can be made.
2673
2674<p>
2675The multiprocessor implementations of RCU use a combining tree that
2676groups CPUs so as to reduce lock contention and increase cache locality.
2677However, this combining tree does not spread its memory across NUMA
2678nodes nor does it align the CPU groups with hardware features such
2679as sockets or cores.
2680Such spreading and alignment is currently believed to be unnecessary
2681because the hotpath read-side primitives do not access the combining
2682tree, nor does <tt>call_rcu()</tt> in the common case.
2683If you believe that your architecture needs such spreading and alignment,
2684then your architecture should also benefit from the
2685<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
2686to the number of CPUs in a socket, NUMA node, or whatever.
2687If the number of CPUs is too large, use a fraction of the number of
2688CPUs.
2689If the number of CPUs is a large prime number, well, that certainly
2690is an &ldquo;interesting&rdquo; architectural choice!
2691More flexible arrangements might be considered, but only if
2692<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
2693if the inadequacy has been demonstrated by a carefully run and
2694realistic system-level workload.
2695
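<p>
For example, a hypothetical system with 16 CPUs per socket might be
booted with:

<blockquote>
<pre>
rcutree.rcu_fanout_leaf=16
</pre>
</blockquote>
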
2696<p>
2697Please note that arrangements that require RCU to remap CPU numbers will
2698require extremely good demonstration of need and full exploration of
2699alternatives.
2700
2701<p>
2702There is an embarrassingly large number of flavors of RCU, and this
2703number has been increasing over time.
2704Perhaps it will be possible to combine some at some future date.
2705
2706<p>
2707RCU's various kthreads are reasonably recent additions.
2708It is quite likely that adjustments will be required to more gracefully
2709handle extreme loads.
2710It might also be necessary to be able to relate CPU utilization by
2711RCU's kthreads and softirq handlers to the code that instigated this
2712CPU utilization.
2713For example, RCU callback overhead might be charged back to the
2714originating <tt>call_rcu()</tt> instance, though probably not
2715in production kernels.
2716
2717<h2><a name="Summary">Summary</a></h2>
2718
2719<p>
2720This document has presented more than two decades' worth of RCU
2721requirements.
2722Given that the requirements keep changing, this will not be the last
2723word on this subject, but at least it serves to get an important
2724subset of the requirements set forth.
2725
2726<h2><a name="Acknowledgments">Acknowledgments</a></h2>
2727
2728I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
2729Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
2730Andy Lutomirski for their help in rendering
2731this article human readable, and to Michelle Rankin for her support
2732of this effort.
2733Other contributions are acknowledged in the Linux kernel's git archive.
2734The cartoon is copyright (c) 2013 by Melissa Broussard,
2735and is provided
2736under the terms of the Creative Commons Attribution-Share Alike 3.0
2737United States license.
2738
2739<p>@@QQAL@@
2740
2741</body></html>
diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh
new file mode 100755
index 000000000000..d354f069559b
--- /dev/null
+++ b/Documentation/RCU/Design/htmlqqz.sh
@@ -0,0 +1,108 @@
1#!/bin/sh
2#
3# Usage: sh htmlqqz.sh file
4#
5# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
6# Commands, all of which must be on a line by themselves:
7#
8# "<p>@@QQ@@": Start of a quick quiz.
9# "<p>@@QQA@@": Start of a quick-quiz answer.
10# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
11# "<p>@@QQAL@@": Place to put quick-quiz answer list.
12#
13# Places the result in file.html.
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, you can access it online at
27# http://www.gnu.org/licenses/gpl-2.0.html.
28#
29# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.
30
31fn=$1
32if test ! -r $fn.htmlx
33then
34 echo "Error: $fn.htmlx unreadable."
35 exit 1
36fi
37
38echo "<!-- DO NOT HAND EDIT. -->" > $fn.html
39echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html
40awk < $fn.htmlx >> $fn.html '
41
42state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
43 print $0;
44 if ($0 ~ /^<p>@@QQ/)
45 print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
46 next;
47}
48
49state == "" && $1 == "<p>@@QQ@@" {
50 qqn++;
51 qqlineno = NR;
52 haveqq = 1;
53 state = "qq";
54 print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
55 next;
56}
57
58state == "qq" && $1 != "<p>@@QQA@@" {
59 qq[qqn] = qq[qqn] $0 "\n";
60 print $0
61 if ($0 ~ /^<p>@@QQ/)
62 print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
63 next;
64}
65
66state == "qq" && $1 == "<p>@@QQA@@" {
67 state = "qqa";
68 print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
69 next;
70}
71
72state == "qqa" && $1 != "<p>@@QQE@@" {
73 qqa[qqn] = qqa[qqn] $0 "\n";
74 if ($0 ~ /^<p>@@QQ/)
75 print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
76 next;
77}
78
79state == "qqa" && $1 == "<p>@@QQE@@" {
80 state = "";
81 next;
82}
83
84state == "" && $1 == "<p>@@QQAL@@" {
85 haveqq = "";
86 print "<h3><a name=\"Answers to Quick Quizzes\">"
87 print "Answers to Quick Quizzes</a></h3>"
88 print "";
89 for (i = 1; i <= qqn; i++) {
90 print "<a name=\"qq" i "answer\"></a>"
91 print "<p><b>Quick Quiz " i "</b>:"
92 print qq[i];
93 print "";
94 print "</p><p><b>Answer</b>:"
95 print qqa[i];
96 print "";
97 print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>"
98 print "";
99 }
100 next;
101}
102
103END {
104 if (state != "")
105 print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
106 else if (haveqq)
107 print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
108}'
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 742f69d18fc8..d8186da15ca1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3296,18 +3296,35 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
3296 rcutorture.verbose= [KNL] 3296 rcutorture.verbose= [KNL]
3297 Enable additional printk() statements. 3297 Enable additional printk() statements.
3298 3298
3299 rcupdate.rcu_cpu_stall_suppress= [KNL]
3300 Suppress RCU CPU stall warning messages.
3301
3302 rcupdate.rcu_cpu_stall_timeout= [KNL]
3303 Set timeout for RCU CPU stall warning messages.
3304
3299 rcupdate.rcu_expedited= [KNL] 3305 rcupdate.rcu_expedited= [KNL]
3300 Use expedited grace-period primitives, for 3306 Use expedited grace-period primitives, for
3301 example, synchronize_rcu_expedited() instead 3307 example, synchronize_rcu_expedited() instead
3302 of synchronize_rcu(). This reduces latency, 3308 of synchronize_rcu(). This reduces latency,
3303 but can increase CPU utilization, degrade 3309 but can increase CPU utilization, degrade
3304 real-time latency, and degrade energy efficiency. 3310 real-time latency, and degrade energy efficiency.
3305 3311 No effect on CONFIG_TINY_RCU kernels.
3306 rcupdate.rcu_cpu_stall_suppress= [KNL] 3312
3307 Suppress RCU CPU stall warning messages. 3313 rcupdate.rcu_normal= [KNL]
3308 3314 Use only normal grace-period primitives,
3309 rcupdate.rcu_cpu_stall_timeout= [KNL] 3315 for example, synchronize_rcu() instead of
3310 Set timeout for RCU CPU stall warning messages. 3316 synchronize_rcu_expedited(). This improves
3317 real-time latency, CPU utilization, and
3318 energy efficiency, but can expose users to
3319 increased grace-period latency. This parameter
3320 overrides rcupdate.rcu_expedited. No effect on
3321 CONFIG_TINY_RCU kernels.
3322
3323 rcupdate.rcu_normal_after_boot= [KNL]
3324 Once boot has completed (that is, after
3325 rcu_end_inkernel_boot() has been invoked), use
3326 only normal grace-period primitives. No effect
3327 on CONFIG_TINY_RCU kernels.
3311 3328
3312 rcupdate.rcu_task_stall_timeout= [KNL] 3329 rcupdate.rcu_task_stall_timeout= [KNL]
3313 Set timeout in jiffies for RCU task stall warning 3330 Set timeout in jiffies for RCU task stall warning
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index aef9487303d0..85304ebd187c 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU:
194 (*) On any given CPU, dependent memory accesses will be issued in order, with 194 (*) On any given CPU, dependent memory accesses will be issued in order, with
195 respect to itself. This means that for: 195 respect to itself. This means that for:
196 196
197 WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); 197 Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
198 198
199 the CPU will issue the following memory operations: 199 the CPU will issue the following memory operations:
200 200
@@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU:
202 202
203 and always in that order. On most systems, smp_read_barrier_depends() 203 and always in that order. On most systems, smp_read_barrier_depends()
204 does nothing, but it is required for DEC Alpha. The READ_ONCE() 204 does nothing, but it is required for DEC Alpha. The READ_ONCE()
205 and WRITE_ONCE() are required to prevent compiler mischief. Please 205 is required to prevent compiler mischief. Please note that you
206 note that you should normally use something like rcu_dereference() 206 should normally use something like rcu_dereference() instead of
207 instead of open-coding smp_read_barrier_depends(). 207 open-coding smp_read_barrier_depends().
208 208
209 (*) Overlapping loads and stores within a particular CPU will appear to be 209 (*) Overlapping loads and stores within a particular CPU will appear to be
210 ordered within that CPU. This means that for: 210 ordered within that CPU. This means that for:
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 5381a728d23e..e5139402e7f8 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -133,6 +133,12 @@ static void sysrq_handle_crash(int key)
133{ 133{
134 char *killer = NULL; 134 char *killer = NULL;
135 135
136 /* we need to release the RCU read lock here,
137 * otherwise we get an annoying
138 * 'BUG: sleeping function called from invalid context'
139 * complaint from the kernel before the panic.
140 */
141 rcu_read_unlock();
136 panic_on_oops = 1; /* force panic */ 142 panic_on_oops = 1; /* force panic */
137 wmb(); 143 wmb();
138 *killer = 1; 144 *killer = 1;
diff --git a/include/linux/list.h b/include/linux/list.h
index 993395a2e55c..5356f4d661a7 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -24,7 +24,7 @@
24 24
25static inline void INIT_LIST_HEAD(struct list_head *list) 25static inline void INIT_LIST_HEAD(struct list_head *list)
26{ 26{
27 list->next = list; 27 WRITE_ONCE(list->next, list);
28 list->prev = list; 28 list->prev = list;
29} 29}
30 30
@@ -42,7 +42,7 @@ static inline void __list_add(struct list_head *new,
42 next->prev = new; 42 next->prev = new;
43 new->next = next; 43 new->next = next;
44 new->prev = prev; 44 new->prev = prev;
45 prev->next = new; 45 WRITE_ONCE(prev->next, new);
46} 46}
47#else 47#else
48extern void __list_add(struct list_head *new, 48extern void __list_add(struct list_head *new,
@@ -186,7 +186,7 @@ static inline int list_is_last(const struct list_head *list,
186 */ 186 */
187static inline int list_empty(const struct list_head *head) 187static inline int list_empty(const struct list_head *head)
188{ 188{
189 return head->next == head; 189 return READ_ONCE(head->next) == head;
190} 190}
191 191
192/** 192/**
@@ -608,7 +608,7 @@ static inline int hlist_unhashed(const struct hlist_node *h)
608 608
609static inline int hlist_empty(const struct hlist_head *h) 609static inline int hlist_empty(const struct hlist_head *h)
610{ 610{
611 return !h->first; 611 return !READ_ONCE(h->first);
612} 612}
613 613
614static inline void __hlist_del(struct hlist_node *n) 614static inline void __hlist_del(struct hlist_node *n)
@@ -642,7 +642,7 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
642 n->next = first; 642 n->next = first;
643 if (first) 643 if (first)
644 first->pprev = &n->next; 644 first->pprev = &n->next;
645 h->first = n; 645 WRITE_ONCE(h->first, n);
646 n->pprev = &h->first; 646 n->pprev = &h->first;
647} 647}
648 648
@@ -653,14 +653,14 @@ static inline void hlist_add_before(struct hlist_node *n,
653 n->pprev = next->pprev; 653 n->pprev = next->pprev;
654 n->next = next; 654 n->next = next;
655 next->pprev = &n->next; 655 next->pprev = &n->next;
656 *(n->pprev) = n; 656 WRITE_ONCE(*(n->pprev), n);
657} 657}
658 658
659static inline void hlist_add_behind(struct hlist_node *n, 659static inline void hlist_add_behind(struct hlist_node *n,
660 struct hlist_node *prev) 660 struct hlist_node *prev)
661{ 661{
662 n->next = prev->next; 662 n->next = prev->next;
663 prev->next = n; 663 WRITE_ONCE(prev->next, n);
664 n->pprev = &prev->next; 664 n->pprev = &prev->next;
665 665
666 if (n->next) 666 if (n->next)
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 8132214e8efd..ee7229a6c06a 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -70,7 +70,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
70 70
71static inline int hlist_bl_empty(const struct hlist_bl_head *h) 71static inline int hlist_bl_empty(const struct hlist_bl_head *h)
72{ 72{
73 return !((unsigned long)h->first & ~LIST_BL_LOCKMASK); 73 return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
74} 74}
75 75
76static inline void hlist_bl_add_head(struct hlist_bl_node *n, 76static inline void hlist_bl_add_head(struct hlist_bl_node *n,
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 444d2b1313bd..b01fe1009084 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -57,7 +57,7 @@ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
57 57
58static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) 58static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
59{ 59{
60 return is_a_nulls(h->first); 60 return is_a_nulls(READ_ONCE(h->first));
61} 61}
62 62
63static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, 63static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 5ed540986019..14ec1652daf4 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -179,32 +179,31 @@ static inline void list_replace_rcu(struct list_head *old,
179} 179}
180 180
181/** 181/**
182 * list_splice_init_rcu - splice an RCU-protected list into an existing list. 182 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
183 * @list: the RCU-protected list to splice 183 * @list: the RCU-protected list to splice
184 * @head: the place in the list to splice the first list into 184 * @prev: points to the last element of the existing list
185 * @next: points to the first element of the existing list
185 * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... 186 * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
186 * 187 *
187 * @head can be RCU-read traversed concurrently with this function. 188 * The list pointed to by @prev and @next can be RCU-read traversed
189 * concurrently with this function.
188 * 190 *
189 * Note that this function blocks. 191 * Note that this function blocks.
190 * 192 *
191 * Important note: the caller must take whatever action is necessary to 193 * Important note: the caller must take whatever action is necessary to prevent
192 * prevent any other updates to @head. In principle, it is possible 194 * any other updates to the existing list. In principle, it is possible to
193 * to modify the list as soon as sync() begins execution. 195 * modify the list as soon as sync() begins execution. If this sort of thing
194 * If this sort of thing becomes necessary, an alternative version 196 * becomes necessary, an alternative version based on call_rcu() could be
195 * based on call_rcu() could be created. But only if -really- 197 * created. But only if -really- needed -- there is no shortage of RCU API
196 * needed -- there is no shortage of RCU API members. 198 * members.
197 */ 199 */
198static inline void list_splice_init_rcu(struct list_head *list, 200static inline void __list_splice_init_rcu(struct list_head *list,
199 struct list_head *head, 201 struct list_head *prev,
200 void (*sync)(void)) 202 struct list_head *next,
203 void (*sync)(void))
201{ 204{
202 struct list_head *first = list->next; 205 struct list_head *first = list->next;
203 struct list_head *last = list->prev; 206 struct list_head *last = list->prev;
204 struct list_head *at = head->next;
205
206 if (list_empty(list))
207 return;
208 207
209 /* 208 /*
210 * "first" and "last" tracking list, so initialize it. RCU readers 209 * "first" and "last" tracking list, so initialize it. RCU readers
@@ -231,10 +230,40 @@ static inline void list_splice_init_rcu(struct list_head *list,
231 * this function. 230 * this function.
232 */ 231 */
233 232
234 last->next = at; 233 last->next = next;
235 rcu_assign_pointer(list_next_rcu(head), first); 234 rcu_assign_pointer(list_next_rcu(prev), first);
236 first->prev = head; 235 first->prev = prev;
237 at->prev = last; 236 next->prev = last;
237}
238
239/**
240 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
241 * designed for stacks.
242 * @list: the RCU-protected list to splice
243 * @head: the place in the existing list to splice the first list into
244 * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
245 */
246static inline void list_splice_init_rcu(struct list_head *list,
247 struct list_head *head,
248 void (*sync)(void))
249{
250 if (!list_empty(list))
251 __list_splice_init_rcu(list, head, head->next, sync);
252}
253
254/**
255 * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
256 * list, designed for queues.
257 * @list: the RCU-protected list to splice
258 * @head: the place in the existing list to splice the first list into
259 * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
260 */
261static inline void list_splice_tail_init_rcu(struct list_head *list,
262 struct list_head *head,
263 void (*sync)(void))
264{
265 if (!list_empty(list))
266 __list_splice_init_rcu(list, head->prev, head, sync);
238} 267}
239 268
240/** 269/**
@@ -305,6 +334,42 @@ static inline void list_splice_init_rcu(struct list_head *list,
305 pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) 334 pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
306 335
307/** 336/**
337 * list_entry_lockless - get the struct for this entry
338 * @ptr: the &struct list_head pointer.
339 * @type: the type of the struct this is embedded in.
340 * @member: the name of the list_head within the struct.
341 *
342 * This primitive may safely run concurrently with the _rcu list-mutation
343 * primitives such as list_add_rcu(), but requires some implicit RCU
344 * read-side guarding. One example is running within a special
345 * exception-time environment where preemption is disabled and where
346 * lockdep cannot be invoked (in which case updaters must use RCU-sched,
347 * as in synchronize_sched(), call_rcu_sched(), and friends). Another
348 * example is when items are added to the list, but never deleted.
349 */
350#define list_entry_lockless(ptr, type, member) \
351 container_of((typeof(ptr))lockless_dereference(ptr), type, member)
352
353/**
354 * list_for_each_entry_lockless - iterate over rcu list of given type
355 * @pos: the type * to use as a loop cursor.
356 * @head: the head for your list.
357 * @member: the name of the list_struct within the struct.
358 *
359 * This primitive may safely run concurrently with the _rcu list-mutation
360 * primitives such as list_add_rcu(), but requires some implicit RCU
361 * read-side guarding. One example is running within a special
362 * exception-time environment where preemption is disabled and where
363 * lockdep cannot be invoked (in which case updaters must use RCU-sched,
364 * as in synchronize_sched(), call_rcu_sched(), and friends). Another
365 * example is when items are added to the list, but never deleted.
366 */
367#define list_for_each_entry_lockless(pos, head, member) \
368 for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
369 &pos->member != (head); \
370 pos = list_entry_lockless(pos->member.next, typeof(*pos), member))
371
372/**
308 * list_for_each_entry_continue_rcu - continue iteration over list of given type 373 * list_for_each_entry_continue_rcu - continue iteration over list of given type
309 * @pos: the type * to use as a loop cursor. 374 * @pos: the type * to use as a loop cursor.
310 * @head: the head for your list. 375 * @head: the head for your list.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a0189ba67fde..14e6f47ee16f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -48,10 +48,17 @@
48 48
49#include <asm/barrier.h> 49#include <asm/barrier.h>
50 50
51#ifndef CONFIG_TINY_RCU
51extern int rcu_expedited; /* for sysctl */ 52extern int rcu_expedited; /* for sysctl */
53extern int rcu_normal; /* also for sysctl */
54#endif /* #ifndef CONFIG_TINY_RCU */
52 55
53#ifdef CONFIG_TINY_RCU 56#ifdef CONFIG_TINY_RCU
54/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ 57/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
58static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
59{
60 return true;
61}
55static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ 62static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
56{ 63{
57 return false; 64 return false;
@@ -65,6 +72,7 @@ static inline void rcu_unexpedite_gp(void)
65{ 72{
66} 73}
67#else /* #ifdef CONFIG_TINY_RCU */ 74#else /* #ifdef CONFIG_TINY_RCU */
75bool rcu_gp_is_normal(void); /* Internal RCU use. */
68bool rcu_gp_is_expedited(void); /* Internal RCU use. */ 76bool rcu_gp_is_expedited(void); /* Internal RCU use. */
69void rcu_expedite_gp(void); 77void rcu_expedite_gp(void);
70void rcu_unexpedite_gp(void); 78void rcu_unexpedite_gp(void);
@@ -321,7 +329,6 @@ static inline int rcu_preempt_depth(void)
321 329
322/* Internal to kernel */ 330/* Internal to kernel */
323void rcu_init(void); 331void rcu_init(void);
324void rcu_end_inkernel_boot(void);
325void rcu_sched_qs(void); 332void rcu_sched_qs(void);
326void rcu_bh_qs(void); 333void rcu_bh_qs(void);
327void rcu_check_callbacks(int user); 334void rcu_check_callbacks(int user);
@@ -329,6 +336,12 @@ struct notifier_block;
329int rcu_cpu_notify(struct notifier_block *self, 336int rcu_cpu_notify(struct notifier_block *self,
330 unsigned long action, void *hcpu); 337 unsigned long action, void *hcpu);
331 338
339#ifndef CONFIG_TINY_RCU
340void rcu_end_inkernel_boot(void);
341#else /* #ifndef CONFIG_TINY_RCU */
342static inline void rcu_end_inkernel_boot(void) { }
343#endif /* #ifndef CONFIG_TINY_RCU */
344
332#ifdef CONFIG_RCU_STALL_COMMON 345#ifdef CONFIG_RCU_STALL_COMMON
333void rcu_sysrq_start(void); 346void rcu_sysrq_start(void);
334void rcu_sysrq_end(void); 347void rcu_sysrq_end(void);
@@ -379,9 +392,9 @@ static inline void rcu_init_nohz(void)
379 */ 392 */
380#define RCU_NONIDLE(a) \ 393#define RCU_NONIDLE(a) \
381 do { \ 394 do { \
382 rcu_irq_enter(); \ 395 rcu_irq_enter_irqson(); \
383 do { a; } while (0); \ 396 do { a; } while (0); \
384 rcu_irq_exit(); \ 397 rcu_irq_exit_irqson(); \
385 } while (0) 398 } while (0)
386 399
387/* 400/*
@@ -741,7 +754,7 @@ static inline void rcu_preempt_sleep_check(void)
741 * The tracing infrastructure traces RCU (we want that), but unfortunately 754 * The tracing infrastructure traces RCU (we want that), but unfortunately
742 * some of the RCU checks causes tracing to lock up the system. 755 * some of the RCU checks causes tracing to lock up the system.
743 * 756 *
744 * The tracing version of rcu_dereference_raw() must not call 757 * The no-tracing version of rcu_dereference_raw() must not call
745 * rcu_read_lock_held(). 758 * rcu_read_lock_held().
746 */ 759 */
747#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) 760#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 4c1aaf9cce7b..64809aea661c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -181,6 +181,14 @@ static inline void rcu_irq_enter(void)
181{ 181{
182} 182}
183 183
184static inline void rcu_irq_exit_irqson(void)
185{
186}
187
188static inline void rcu_irq_enter_irqson(void)
189{
190}
191
184static inline void rcu_irq_exit(void) 192static inline void rcu_irq_exit(void)
185{ 193{
186} 194}
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 60d15a080d7c..ad1eda9fa4da 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -37,7 +37,7 @@ void rcu_cpu_stall_reset(void);
37/* 37/*
38 * Note a virtualization-based context switch. This is simply a 38 * Note a virtualization-based context switch. This is simply a
39 * wrapper around rcu_note_context_switch(), which allows TINY_RCU 39 * wrapper around rcu_note_context_switch(), which allows TINY_RCU
40 * to save a few bytes. 40 * to save a few bytes. The caller must have disabled interrupts.
41 */ 41 */
42static inline void rcu_virt_note_context_switch(int cpu) 42static inline void rcu_virt_note_context_switch(int cpu)
43{ 43{
@@ -97,6 +97,8 @@ void rcu_idle_enter(void);
97void rcu_idle_exit(void); 97void rcu_idle_exit(void);
98void rcu_irq_enter(void); 98void rcu_irq_enter(void);
99void rcu_irq_exit(void); 99void rcu_irq_exit(void);
100void rcu_irq_enter_irqson(void);
101void rcu_irq_exit_irqson(void);
100 102
101void exit_rcu(void); 103void exit_rcu(void);
102 104
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 696a339c592c..7834a8a8bf1e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -171,8 +171,8 @@ extern void syscall_unregfunc(void);
171 TP_PROTO(data_proto), \ 171 TP_PROTO(data_proto), \
172 TP_ARGS(data_args), \ 172 TP_ARGS(data_args), \
173 TP_CONDITION(cond), \ 173 TP_CONDITION(cond), \
174 rcu_irq_enter(), \ 174 rcu_irq_enter_irqson(), \
175 rcu_irq_exit()); \ 175 rcu_irq_exit_irqson()); \
176 } 176 }
177#else 177#else
178#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) 178#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
diff --git a/init/main.c b/init/main.c
index 9e64d7097f1a..c6ebefafa496 100644
--- a/init/main.c
+++ b/init/main.c
@@ -943,6 +943,8 @@ static int __ref kernel_init(void *unused)
943 943
944 flush_delayed_fput(); 944 flush_delayed_fput();
945 945
946 rcu_end_inkernel_boot();
947
946 if (ramdisk_execute_command) { 948 if (ramdisk_execute_command) {
947 ret = run_init_process(ramdisk_execute_command); 949 ret = run_init_process(ramdisk_execute_command);
948 if (!ret) 950 if (!ret)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e83b26464061..152da4a48867 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -20,7 +20,7 @@
20#include <linux/capability.h> 20#include <linux/capability.h>
21#include <linux/compiler.h> 21#include <linux/compiler.h>
22 22
23#include <linux/rcupdate.h> /* rcu_expedited */ 23#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
24 24
25#define KERNEL_ATTR_RO(_name) \ 25#define KERNEL_ATTR_RO(_name) \
26static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 26static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj,
144} 144}
145KERNEL_ATTR_RO(fscaps); 145KERNEL_ATTR_RO(fscaps);
146 146
147#ifndef CONFIG_TINY_RCU
147int rcu_expedited; 148int rcu_expedited;
148static ssize_t rcu_expedited_show(struct kobject *kobj, 149static ssize_t rcu_expedited_show(struct kobject *kobj,
149 struct kobj_attribute *attr, char *buf) 150 struct kobj_attribute *attr, char *buf)
150{ 151{
151 return sprintf(buf, "%d\n", rcu_expedited); 152 return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
152} 153}
153static ssize_t rcu_expedited_store(struct kobject *kobj, 154static ssize_t rcu_expedited_store(struct kobject *kobj,
154 struct kobj_attribute *attr, 155 struct kobj_attribute *attr,
@@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj,
161} 162}
162KERNEL_ATTR_RW(rcu_expedited); 163KERNEL_ATTR_RW(rcu_expedited);
163 164
165int rcu_normal;
166static ssize_t rcu_normal_show(struct kobject *kobj,
167 struct kobj_attribute *attr, char *buf)
168{
169 return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
170}
171static ssize_t rcu_normal_store(struct kobject *kobj,
172 struct kobj_attribute *attr,
173 const char *buf, size_t count)
174{
175 if (kstrtoint(buf, 0, &rcu_normal))
176 return -EINVAL;
177
178 return count;
179}
180KERNEL_ATTR_RW(rcu_normal);
181#endif /* #ifndef CONFIG_TINY_RCU */
182
164/* 183/*
165 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 184 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
166 */ 185 */
@@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = {
202 &kexec_crash_size_attr.attr, 221 &kexec_crash_size_attr.attr,
203 &vmcoreinfo_attr.attr, 222 &vmcoreinfo_attr.attr,
204#endif 223#endif
224#ifndef CONFIG_TINY_RCU
205 &rcu_expedited_attr.attr, 225 &rcu_expedited_attr.attr,
226 &rcu_normal_attr.attr,
227#endif
206 NULL 228 NULL
207}; 229};
208 230
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d89328e260df..d2988d047d66 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -162,6 +162,27 @@ static int rcu_torture_writer_state;
162#define RTWS_SYNC 7 162#define RTWS_SYNC 7
163#define RTWS_STUTTER 8 163#define RTWS_STUTTER 8
164#define RTWS_STOPPING 9 164#define RTWS_STOPPING 9
165static const char * const rcu_torture_writer_state_names[] = {
166 "RTWS_FIXED_DELAY",
167 "RTWS_DELAY",
168 "RTWS_REPLACE",
169 "RTWS_DEF_FREE",
170 "RTWS_EXP_SYNC",
171 "RTWS_COND_GET",
172 "RTWS_COND_SYNC",
173 "RTWS_SYNC",
174 "RTWS_STUTTER",
175 "RTWS_STOPPING",
176};
177
178static const char *rcu_torture_writer_state_getname(void)
179{
180 unsigned int i = READ_ONCE(rcu_torture_writer_state);
181
182 if (i >= ARRAY_SIZE(rcu_torture_writer_state_names))
183 return "???";
184 return rcu_torture_writer_state_names[i];
185}
165 186
166#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 187#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
167#define RCUTORTURE_RUNNABLE_INIT 1 188#define RCUTORTURE_RUNNABLE_INIT 1
@@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void)
1307 1328
1308 rcutorture_get_gp_data(cur_ops->ttype, 1329 rcutorture_get_gp_data(cur_ops->ttype,
1309 &flags, &gpnum, &completed); 1330 &flags, &gpnum, &completed);
1310 pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", 1331 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
1332 rcu_torture_writer_state_getname(),
1311 rcu_torture_writer_state, 1333 rcu_torture_writer_state,
1312 gpnum, completed, flags); 1334 gpnum, completed, flags);
1313 show_rcu_gp_kthreads(); 1335 show_rcu_gp_kthreads();
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index a63a1ea5a41b..9b9cdd549caa 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
489 */ 489 */
490void synchronize_srcu(struct srcu_struct *sp) 490void synchronize_srcu(struct srcu_struct *sp)
491{ 491{
492 __synchronize_srcu(sp, rcu_gp_is_expedited() 492 __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
493 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT 493 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
494 : SYNCHRONIZE_SRCU_TRYCOUNT); 494 : SYNCHRONIZE_SRCU_TRYCOUNT);
495} 495}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f07343b54fe5..e41dd4131f7a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree");
68 68
69/* Data structures. */ 69/* Data structures. */
70 70
71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
74
75/* 71/*
76 * In order to export the rcu_state name to the tracing tools, it 72 * In order to export the rcu_state name to the tracing tools, it
77 * needs to be added in the __tracepoint_string section. 73 * needs to be added in the __tracepoint_string section.
@@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
246 */ 242 */
247void rcu_sched_qs(void) 243void rcu_sched_qs(void)
248{ 244{
249 unsigned long flags; 245 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
250 246 return;
251 if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { 247 trace_rcu_grace_period(TPS("rcu_sched"),
252 trace_rcu_grace_period(TPS("rcu_sched"), 248 __this_cpu_read(rcu_sched_data.gpnum),
253 __this_cpu_read(rcu_sched_data.gpnum), 249 TPS("cpuqs"));
254 TPS("cpuqs")); 250 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
255 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); 251 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
256 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) 252 return;
257 return; 253 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
258 local_irq_save(flags); 254 rcu_report_exp_rdp(&rcu_sched_state,
259 if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { 255 this_cpu_ptr(&rcu_sched_data), true);
260 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
261 rcu_report_exp_rdp(&rcu_sched_state,
262 this_cpu_ptr(&rcu_sched_data),
263 true);
264 }
265 local_irq_restore(flags);
266 }
267} 256}
268 257
269void rcu_bh_qs(void) 258void rcu_bh_qs(void)
@@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
300 * We inform the RCU core by emulating a zero-duration dyntick-idle 289 * We inform the RCU core by emulating a zero-duration dyntick-idle
301 * period, which we in turn do by incrementing the ->dynticks counter 290 * period, which we in turn do by incrementing the ->dynticks counter
302 * by two. 291 * by two.
292 *
293 * The caller must have disabled interrupts.
303 */ 294 */
304static void rcu_momentary_dyntick_idle(void) 295static void rcu_momentary_dyntick_idle(void)
305{ 296{
306 unsigned long flags;
307 struct rcu_data *rdp; 297 struct rcu_data *rdp;
308 struct rcu_dynticks *rdtp; 298 struct rcu_dynticks *rdtp;
309 int resched_mask; 299 int resched_mask;
310 struct rcu_state *rsp; 300 struct rcu_state *rsp;
311 301
312 local_irq_save(flags);
313
314 /* 302 /*
315 * Yes, we can lose flag-setting operations. This is OK, because 303 * Yes, we can lose flag-setting operations. This is OK, because
316 * the flag will be set again after some delay. 304 * the flag will be set again after some delay.
@@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void)
340 smp_mb__after_atomic(); /* Later stuff after QS. */ 328 smp_mb__after_atomic(); /* Later stuff after QS. */
341 break; 329 break;
342 } 330 }
343 local_irq_restore(flags);
344} 331}
345 332
346/* 333/*
347 * Note a context switch. This is a quiescent state for RCU-sched, 334 * Note a context switch. This is a quiescent state for RCU-sched,
348 * and requires special handling for preemptible RCU. 335 * and requires special handling for preemptible RCU.
349 * The caller must have disabled preemption. 336 * The caller must have disabled interrupts.
350 */ 337 */
351void rcu_note_context_switch(void) 338void rcu_note_context_switch(void)
352{ 339{
@@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
376 */ 363 */
377void rcu_all_qs(void) 364void rcu_all_qs(void)
378{ 365{
366 unsigned long flags;
367
379 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 368 barrier(); /* Avoid RCU read-side critical sections leaking down. */
380 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 369 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
370 local_irq_save(flags);
381 rcu_momentary_dyntick_idle(); 371 rcu_momentary_dyntick_idle();
372 local_irq_restore(flags);
373 }
382 this_cpu_inc(rcu_qs_ctr); 374 this_cpu_inc(rcu_qs_ctr);
383 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 375 barrier(); /* Avoid RCU read-side critical sections leaking up. */
384} 376}
@@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
605 * The caller must have disabled interrupts to prevent races with 597 * The caller must have disabled interrupts to prevent races with
606 * normal callback registry. 598 * normal callback registry.
607 */ 599 */
608static int 600static bool
609cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 601cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
610{ 602{
611 int i; 603 int i;
612 604
613 if (rcu_gp_in_progress(rsp)) 605 if (rcu_gp_in_progress(rsp))
614 return 0; /* No, a grace period is already in progress. */ 606 return false; /* No, a grace period is already in progress. */
615 if (rcu_future_needs_gp(rsp)) 607 if (rcu_future_needs_gp(rsp))
616 return 1; /* Yes, a no-CBs CPU needs one. */ 608 return true; /* Yes, a no-CBs CPU needs one. */
617 if (!rdp->nxttail[RCU_NEXT_TAIL]) 609 if (!rdp->nxttail[RCU_NEXT_TAIL])
618 return 0; /* No, this is a no-CBs (or offline) CPU. */ 610 return false; /* No, this is a no-CBs (or offline) CPU. */
619 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 611 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
620 return 1; /* Yes, this CPU has newly registered callbacks. */ 612 return true; /* Yes, CPU has newly registered callbacks. */
621 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 613 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
622 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 614 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
623 ULONG_CMP_LT(READ_ONCE(rsp->completed), 615 ULONG_CMP_LT(READ_ONCE(rsp->completed),
624 rdp->nxtcompleted[i])) 616 rdp->nxtcompleted[i]))
625 return 1; /* Yes, CBs for future grace period. */ 617 return true; /* Yes, CBs for future grace period. */
626 return 0; /* No grace period needed. */ 618 return false; /* No grace period needed. */
627} 619}
628 620
629/* 621/*
@@ -740,7 +732,7 @@ void rcu_user_enter(void)
740 * 732 *
741 * Exit from an interrupt handler, which might possibly result in entering 733 * Exit from an interrupt handler, which might possibly result in entering
742 * idle mode, in other words, leaving the mode in which read-side critical 734 * idle mode, in other words, leaving the mode in which read-side critical
743 * sections can occur. 735 * sections can occur. The caller must have disabled interrupts.
744 * 736 *
745 * This code assumes that the idle loop never does anything that might 737 * This code assumes that the idle loop never does anything that might
746 * result in unbalanced calls to irq_enter() and irq_exit(). If your 738 * result in unbalanced calls to irq_enter() and irq_exit(). If your
@@ -753,11 +745,10 @@ void rcu_user_enter(void)
753 */ 745 */
754void rcu_irq_exit(void) 746void rcu_irq_exit(void)
755{ 747{
756 unsigned long flags;
757 long long oldval; 748 long long oldval;
758 struct rcu_dynticks *rdtp; 749 struct rcu_dynticks *rdtp;
759 750
760 local_irq_save(flags); 751 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
761 rdtp = this_cpu_ptr(&rcu_dynticks); 752 rdtp = this_cpu_ptr(&rcu_dynticks);
762 oldval = rdtp->dynticks_nesting; 753 oldval = rdtp->dynticks_nesting;
763 rdtp->dynticks_nesting--; 754 rdtp->dynticks_nesting--;
@@ -768,6 +759,17 @@ void rcu_irq_exit(void)
768 else 759 else
769 rcu_eqs_enter_common(oldval, true); 760 rcu_eqs_enter_common(oldval, true);
770 rcu_sysidle_enter(1); 761 rcu_sysidle_enter(1);
762}
763
764/*
765 * Wrapper for rcu_irq_exit() where interrupts are enabled.
766 */
767void rcu_irq_exit_irqson(void)
768{
769 unsigned long flags;
770
771 local_irq_save(flags);
772 rcu_irq_exit();
771 local_irq_restore(flags); 773 local_irq_restore(flags);
772} 774}
773 775
@@ -865,7 +867,7 @@ void rcu_user_exit(void)
865 * 867 *
866 * Enter an interrupt handler, which might possibly result in exiting 868 * Enter an interrupt handler, which might possibly result in exiting
867 * idle mode, in other words, entering the mode in which read-side critical 869 * idle mode, in other words, entering the mode in which read-side critical
868 * sections can occur. 870 * sections can occur. The caller must have disabled interrupts.
869 * 871 *
870 * Note that the Linux kernel is fully capable of entering an interrupt 872 * Note that the Linux kernel is fully capable of entering an interrupt
871 * handler that it never exits, for example when doing upcalls to 873 * handler that it never exits, for example when doing upcalls to
@@ -881,11 +883,10 @@ void rcu_user_exit(void)
881 */ 883 */
882void rcu_irq_enter(void) 884void rcu_irq_enter(void)
883{ 885{
884 unsigned long flags;
885 struct rcu_dynticks *rdtp; 886 struct rcu_dynticks *rdtp;
886 long long oldval; 887 long long oldval;
887 888
888 local_irq_save(flags); 889 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
889 rdtp = this_cpu_ptr(&rcu_dynticks); 890 rdtp = this_cpu_ptr(&rcu_dynticks);
890 oldval = rdtp->dynticks_nesting; 891 oldval = rdtp->dynticks_nesting;
891 rdtp->dynticks_nesting++; 892 rdtp->dynticks_nesting++;
@@ -896,6 +897,17 @@ void rcu_irq_enter(void)
896 else 897 else
897 rcu_eqs_exit_common(oldval, true); 898 rcu_eqs_exit_common(oldval, true);
898 rcu_sysidle_exit(1); 899 rcu_sysidle_exit(1);
900}
901
902/*
903 * Wrapper for rcu_irq_enter() where interrupts are enabled.
904 */
905void rcu_irq_enter_irqson(void)
906{
907 unsigned long flags;
908
909 local_irq_save(flags);
910 rcu_irq_enter();
899 local_irq_restore(flags); 911 local_irq_restore(flags);
900} 912}
901 913
@@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
1187} 1199}
1188 1200
1189/* 1201/*
1202 * Convert a ->gp_state value to a character string.
1203 */
1204static const char *gp_state_getname(short gs)
1205{
1206 if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
1207 return "???";
1208 return gp_state_names[gs];
1209}
1210
1211/*
1190 * Complain about starvation of grace-period kthread. 1212 * Complain about starvation of grace-period kthread.
1191 */ 1213 */
1192static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) 1214static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
@@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1196 1218
1197 j = jiffies; 1219 j = jiffies;
1198 gpa = READ_ONCE(rsp->gp_activity); 1220 gpa = READ_ONCE(rsp->gp_activity);
1199 if (j - gpa > 2 * HZ) 1221 if (j - gpa > 2 * HZ) {
1200 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", 1222 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
1201 rsp->name, j - gpa, 1223 rsp->name, j - gpa,
1202 rsp->gpnum, rsp->completed, 1224 rsp->gpnum, rsp->completed,
1203 rsp->gp_flags, rsp->gp_state, 1225 rsp->gp_flags,
1204 rsp->gp_kthread ? rsp->gp_kthread->state : 0); 1226 gp_state_getname(rsp->gp_state), rsp->gp_state,
1227 rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
1228 if (rsp->gp_kthread)
1229 sched_show_task(rsp->gp_kthread);
1230 }
1205} 1231}
1206 1232
1207/* 1233/*
@@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1214 struct rcu_node *rnp; 1240 struct rcu_node *rnp;
1215 1241
1216 rcu_for_each_leaf_node(rsp, rnp) { 1242 rcu_for_each_leaf_node(rsp, rnp) {
1217 raw_spin_lock_irqsave(&rnp->lock, flags); 1243 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1218 if (rnp->qsmask != 0) { 1244 if (rnp->qsmask != 0) {
1219 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1245 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
1220 if (rnp->qsmask & (1UL << cpu)) 1246 if (rnp->qsmask & (1UL << cpu))
@@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1237 1263
1238 /* Only let one CPU complain about others per time interval. */ 1264 /* Only let one CPU complain about others per time interval. */
1239 1265
1240 raw_spin_lock_irqsave(&rnp->lock, flags); 1266 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1241 delta = jiffies - READ_ONCE(rsp->jiffies_stall); 1267 delta = jiffies - READ_ONCE(rsp->jiffies_stall);
1242 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 1268 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
1243 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1269 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1256 rsp->name); 1282 rsp->name);
1257 print_cpu_stall_info_begin(); 1283 print_cpu_stall_info_begin();
1258 rcu_for_each_leaf_node(rsp, rnp) { 1284 rcu_for_each_leaf_node(rsp, rnp) {
1259 raw_spin_lock_irqsave(&rnp->lock, flags); 1285 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1260 ndetected += rcu_print_task_stall(rnp); 1286 ndetected += rcu_print_task_stall(rnp);
1261 if (rnp->qsmask != 0) { 1287 if (rnp->qsmask != 0) {
1262 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1288 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
1327 1353
1328 rcu_dump_cpu_stacks(rsp); 1354 rcu_dump_cpu_stacks(rsp);
1329 1355
1330 raw_spin_lock_irqsave(&rnp->lock, flags); 1356 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1331 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) 1357 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
1332 WRITE_ONCE(rsp->jiffies_stall, 1358 WRITE_ONCE(rsp->jiffies_stall,
1333 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1359 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
@@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1534 * hold it, acquire the root rcu_node structure's lock in order to 1560 * hold it, acquire the root rcu_node structure's lock in order to
1535 * start one (if needed). 1561 * start one (if needed).
1536 */ 1562 */
1537 if (rnp != rnp_root) { 1563 if (rnp != rnp_root)
1538 raw_spin_lock(&rnp_root->lock); 1564 raw_spin_lock_rcu_node(rnp_root);
1539 smp_mb__after_unlock_lock();
1540 }
1541 1565
1542 /* 1566 /*
1543 * Get a new grace-period number. If there really is no grace 1567 * Get a new grace-period number. If there really is no grace
@@ -1786,11 +1810,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1786 if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && 1810 if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
1787 rdp->completed == READ_ONCE(rnp->completed) && 1811 rdp->completed == READ_ONCE(rnp->completed) &&
1788 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ 1812 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
1789 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1813 !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
1790 local_irq_restore(flags); 1814 local_irq_restore(flags);
1791 return; 1815 return;
1792 } 1816 }
1793 smp_mb__after_unlock_lock();
1794 needwake = __note_gp_changes(rsp, rnp, rdp); 1817 needwake = __note_gp_changes(rsp, rnp, rdp);
1795 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1796 if (needwake) 1819 if (needwake)
@@ -1805,21 +1828,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay)
1805} 1828}
1806 1829
1807/* 1830/*
1808 * Initialize a new grace period. Return 0 if no grace period required. 1831 * Initialize a new grace period. Return false if no grace period required.
1809 */ 1832 */
1810static int rcu_gp_init(struct rcu_state *rsp) 1833static bool rcu_gp_init(struct rcu_state *rsp)
1811{ 1834{
1812 unsigned long oldmask; 1835 unsigned long oldmask;
1813 struct rcu_data *rdp; 1836 struct rcu_data *rdp;
1814 struct rcu_node *rnp = rcu_get_root(rsp); 1837 struct rcu_node *rnp = rcu_get_root(rsp);
1815 1838
1816 WRITE_ONCE(rsp->gp_activity, jiffies); 1839 WRITE_ONCE(rsp->gp_activity, jiffies);
1817 raw_spin_lock_irq(&rnp->lock); 1840 raw_spin_lock_irq_rcu_node(rnp);
1818 smp_mb__after_unlock_lock();
1819 if (!READ_ONCE(rsp->gp_flags)) { 1841 if (!READ_ONCE(rsp->gp_flags)) {
1820 /* Spurious wakeup, tell caller to go back to sleep. */ 1842 /* Spurious wakeup, tell caller to go back to sleep. */
1821 raw_spin_unlock_irq(&rnp->lock); 1843 raw_spin_unlock_irq(&rnp->lock);
1822 return 0; 1844 return false;
1823 } 1845 }
1824 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ 1846 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
1825 1847
@@ -1829,7 +1851,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1829 * Not supposed to be able to happen. 1851 * Not supposed to be able to happen.
1830 */ 1852 */
1831 raw_spin_unlock_irq(&rnp->lock); 1853 raw_spin_unlock_irq(&rnp->lock);
1832 return 0; 1854 return false;
1833 } 1855 }
1834 1856
1835 /* Advance to a new grace period and initialize state. */ 1857 /* Advance to a new grace period and initialize state. */
@@ -1847,8 +1869,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1847 */ 1869 */
1848 rcu_for_each_leaf_node(rsp, rnp) { 1870 rcu_for_each_leaf_node(rsp, rnp) {
1849 rcu_gp_slow(rsp, gp_preinit_delay); 1871 rcu_gp_slow(rsp, gp_preinit_delay);
1850 raw_spin_lock_irq(&rnp->lock); 1872 raw_spin_lock_irq_rcu_node(rnp);
1851 smp_mb__after_unlock_lock();
1852 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 1873 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1853 !rnp->wait_blkd_tasks) { 1874 !rnp->wait_blkd_tasks) {
1854 /* Nothing to do on this leaf rcu_node structure. */ 1875 /* Nothing to do on this leaf rcu_node structure. */
@@ -1904,8 +1925,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1904 */ 1925 */
1905 rcu_for_each_node_breadth_first(rsp, rnp) { 1926 rcu_for_each_node_breadth_first(rsp, rnp) {
1906 rcu_gp_slow(rsp, gp_init_delay); 1927 rcu_gp_slow(rsp, gp_init_delay);
1907 raw_spin_lock_irq(&rnp->lock); 1928 raw_spin_lock_irq_rcu_node(rnp);
1908 smp_mb__after_unlock_lock();
1909 rdp = this_cpu_ptr(rsp->rda); 1929 rdp = this_cpu_ptr(rsp->rda);
1910 rcu_preempt_check_blocked_tasks(rnp); 1930 rcu_preempt_check_blocked_tasks(rnp);
1911 rnp->qsmask = rnp->qsmaskinit; 1931 rnp->qsmask = rnp->qsmaskinit;
@@ -1923,7 +1943,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1923 WRITE_ONCE(rsp->gp_activity, jiffies); 1943 WRITE_ONCE(rsp->gp_activity, jiffies);
1924 } 1944 }
1925 1945
1926 return 1; 1946 return true;
1927} 1947}
1928 1948
1929/* 1949/*
@@ -1973,8 +1993,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
1973 } 1993 }
1974 /* Clear flag to prevent immediate re-entry. */ 1994 /* Clear flag to prevent immediate re-entry. */
1975 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1995 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1976 raw_spin_lock_irq(&rnp->lock); 1996 raw_spin_lock_irq_rcu_node(rnp);
1977 smp_mb__after_unlock_lock();
1978 WRITE_ONCE(rsp->gp_flags, 1997 WRITE_ONCE(rsp->gp_flags,
1979 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 1998 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
1980 raw_spin_unlock_irq(&rnp->lock); 1999 raw_spin_unlock_irq(&rnp->lock);
@@ -1993,8 +2012,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1993 struct rcu_node *rnp = rcu_get_root(rsp); 2012 struct rcu_node *rnp = rcu_get_root(rsp);
1994 2013
1995 WRITE_ONCE(rsp->gp_activity, jiffies); 2014 WRITE_ONCE(rsp->gp_activity, jiffies);
1996 raw_spin_lock_irq(&rnp->lock); 2015 raw_spin_lock_irq_rcu_node(rnp);
1997 smp_mb__after_unlock_lock();
1998 gp_duration = jiffies - rsp->gp_start; 2016 gp_duration = jiffies - rsp->gp_start;
1999 if (gp_duration > rsp->gp_max) 2017 if (gp_duration > rsp->gp_max)
2000 rsp->gp_max = gp_duration; 2018 rsp->gp_max = gp_duration;
@@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2019 * grace period is recorded in any of the rcu_node structures. 2037 * grace period is recorded in any of the rcu_node structures.
2020 */ 2038 */
2021 rcu_for_each_node_breadth_first(rsp, rnp) { 2039 rcu_for_each_node_breadth_first(rsp, rnp) {
2022 raw_spin_lock_irq(&rnp->lock); 2040 raw_spin_lock_irq_rcu_node(rnp);
2023 smp_mb__after_unlock_lock();
2024 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 2041 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
2025 WARN_ON_ONCE(rnp->qsmask); 2042 WARN_ON_ONCE(rnp->qsmask);
2026 WRITE_ONCE(rnp->completed, rsp->gpnum); 2043 WRITE_ONCE(rnp->completed, rsp->gpnum);
@@ -2035,8 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2035 rcu_gp_slow(rsp, gp_cleanup_delay); 2052 rcu_gp_slow(rsp, gp_cleanup_delay);
2036 } 2053 }
2037 rnp = rcu_get_root(rsp); 2054 rnp = rcu_get_root(rsp);
2038 raw_spin_lock_irq(&rnp->lock); 2055 raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
2039 smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
2040 rcu_nocb_gp_set(rnp, nocb); 2056 rcu_nocb_gp_set(rnp, nocb);
2041 2057
2042 /* Declare grace period done. */ 2058 /* Declare grace period done. */
@@ -2284,8 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2284 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2300 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2285 rnp_c = rnp; 2301 rnp_c = rnp;
2286 rnp = rnp->parent; 2302 rnp = rnp->parent;
2287 raw_spin_lock_irqsave(&rnp->lock, flags); 2303 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2288 smp_mb__after_unlock_lock();
2289 oldmask = rnp_c->qsmask; 2304 oldmask = rnp_c->qsmask;
2290 } 2305 }
2291 2306
@@ -2332,8 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2332 gps = rnp->gpnum; 2347 gps = rnp->gpnum;
2333 mask = rnp->grpmask; 2348 mask = rnp->grpmask;
2334 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2349 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2335 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 2350 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
2336 smp_mb__after_unlock_lock();
2337 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); 2351 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
2338} 2352}
2339 2353
@@ -2355,8 +2369,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2355 struct rcu_node *rnp; 2369 struct rcu_node *rnp;
2356 2370
2357 rnp = rdp->mynode; 2371 rnp = rdp->mynode;
2358 raw_spin_lock_irqsave(&rnp->lock, flags); 2372 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2359 smp_mb__after_unlock_lock();
2360 if ((rdp->cpu_no_qs.b.norm && 2373 if ((rdp->cpu_no_qs.b.norm &&
2361 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || 2374 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2362 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || 2375 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
@@ -2582,8 +2595,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2582 rnp = rnp->parent; 2595 rnp = rnp->parent;
2583 if (!rnp) 2596 if (!rnp)
2584 break; 2597 break;
2585 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2598 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
2586 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2587 rnp->qsmaskinit &= ~mask; 2599 rnp->qsmaskinit &= ~mask;
2588 rnp->qsmask &= ~mask; 2600 rnp->qsmask &= ~mask;
2589 if (rnp->qsmaskinit) { 2601 if (rnp->qsmaskinit) {
@@ -2611,8 +2623,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2611 2623
2612 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 2624 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2613 mask = rdp->grpmask; 2625 mask = rdp->grpmask;
2614 raw_spin_lock_irqsave(&rnp->lock, flags); 2626 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
2615 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2616 rnp->qsmaskinitnext &= ~mask; 2627 rnp->qsmaskinitnext &= ~mask;
2617 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2628 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2618} 2629}
@@ -2809,8 +2820,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2809 rcu_for_each_leaf_node(rsp, rnp) { 2820 rcu_for_each_leaf_node(rsp, rnp) {
2810 cond_resched_rcu_qs(); 2821 cond_resched_rcu_qs();
2811 mask = 0; 2822 mask = 0;
2812 raw_spin_lock_irqsave(&rnp->lock, flags); 2823 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2813 smp_mb__after_unlock_lock();
2814 if (rnp->qsmask == 0) { 2824 if (rnp->qsmask == 0) {
2815 if (rcu_state_p == &rcu_sched_state || 2825 if (rcu_state_p == &rcu_sched_state ||
2816 rsp != rcu_state_p || 2826 rsp != rcu_state_p ||
@@ -2881,8 +2891,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2881 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 2891 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
2882 2892
2883 /* Reached the root of the rcu_node tree, acquire lock. */ 2893 /* Reached the root of the rcu_node tree, acquire lock. */
2884 raw_spin_lock_irqsave(&rnp_old->lock, flags); 2894 raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2885 smp_mb__after_unlock_lock();
2886 raw_spin_unlock(&rnp_old->fqslock); 2895 raw_spin_unlock(&rnp_old->fqslock);
2887 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2896 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2888 rsp->n_force_qs_lh++; 2897 rsp->n_force_qs_lh++;
@@ -2914,7 +2923,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2914 /* Does this CPU require a not-yet-started grace period? */ 2923 /* Does this CPU require a not-yet-started grace period? */
2915 local_irq_save(flags); 2924 local_irq_save(flags);
2916 if (cpu_needs_another_gp(rsp, rdp)) { 2925 if (cpu_needs_another_gp(rsp, rdp)) {
2917 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2926 raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
2918 needwake = rcu_start_gp(rsp); 2927 needwake = rcu_start_gp(rsp);
2919 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2928 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2920 if (needwake) 2929 if (needwake)
@@ -3005,8 +3014,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3005 if (!rcu_gp_in_progress(rsp)) { 3014 if (!rcu_gp_in_progress(rsp)) {
3006 struct rcu_node *rnp_root = rcu_get_root(rsp); 3015 struct rcu_node *rnp_root = rcu_get_root(rsp);
3007 3016
3008 raw_spin_lock(&rnp_root->lock); 3017 raw_spin_lock_rcu_node(rnp_root);
3009 smp_mb__after_unlock_lock();
3010 needwake = rcu_start_gp(rsp); 3018 needwake = rcu_start_gp(rsp);
3011 raw_spin_unlock(&rnp_root->lock); 3019 raw_spin_unlock(&rnp_root->lock);
3012 if (needwake) 3020 if (needwake)
@@ -3365,7 +3373,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp)
3365{ 3373{
3366 unsigned long s; 3374 unsigned long s;
3367 3375
3368 smp_mb(); /* Caller's modifications seen first by other CPUs. */
3369 s = (READ_ONCE(*sp) + 3) & ~0x1; 3376 s = (READ_ONCE(*sp) + 3) & ~0x1;
3370 smp_mb(); /* Above access must not bleed into critical section. */ 3377 smp_mb(); /* Above access must not bleed into critical section. */
3371 return s; 3378 return s;
@@ -3392,6 +3399,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
3392} 3399}
3393static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) 3400static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
3394{ 3401{
3402 smp_mb(); /* Caller's modifications seen first by other CPUs. */
3395 return rcu_seq_snap(&rsp->expedited_sequence); 3403 return rcu_seq_snap(&rsp->expedited_sequence);
3396} 3404}
3397static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) 3405static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
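The smp_mb() dropped from rcu_seq_snap() reappears in rcu_exp_gp_seq_snap(), so callers' prior accesses are still ordered before the snapshot is taken. The snapshot arithmetic itself relies on the even/odd convention: the expedited sequence counter is odd while an expedited grace period is in flight and even otherwise, and "(x + 3) & ~0x1" rounds up to the first even value that cannot be reached before a grace period beginning after the snapshot has ended. A worked sketch of that rounding (the demo_ name is made up, not part of the patch):

	/* Same computation as rcu_seq_snap(), shown in isolation. */
	static unsigned long demo_seq_snap(unsigned long cur)
	{
		return (cur + 3) & ~0x1UL;
	}

	/*
	 * cur == 6 (even, no GP in flight):  returns 8, i.e. wait for one
	 *                                    full expedited GP (7 -> 8).
	 * cur == 7 (odd, GP ending at 8):    returns 10, i.e. wait for the
	 *                                    *next* GP, because the one in
	 *                                    flight may predate the caller's
	 *                                    accesses.
	 */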
@@ -3426,8 +3434,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3426 * CPUs for the current rcu_node structure up the rcu_node tree. 3434 * CPUs for the current rcu_node structure up the rcu_node tree.
3427 */ 3435 */
3428 rcu_for_each_leaf_node(rsp, rnp) { 3436 rcu_for_each_leaf_node(rsp, rnp) {
3429 raw_spin_lock_irqsave(&rnp->lock, flags); 3437 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3430 smp_mb__after_unlock_lock();
3431 if (rnp->expmaskinit == rnp->expmaskinitnext) { 3438 if (rnp->expmaskinit == rnp->expmaskinitnext) {
3432 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3439 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3433 continue; /* No new CPUs, nothing to do. */ 3440 continue; /* No new CPUs, nothing to do. */
@@ -3447,8 +3454,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3447 rnp_up = rnp->parent; 3454 rnp_up = rnp->parent;
3448 done = false; 3455 done = false;
3449 while (rnp_up) { 3456 while (rnp_up) {
3450 raw_spin_lock_irqsave(&rnp_up->lock, flags); 3457 raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
3451 smp_mb__after_unlock_lock();
3452 if (rnp_up->expmaskinit) 3458 if (rnp_up->expmaskinit)
3453 done = true; 3459 done = true;
3454 rnp_up->expmaskinit |= mask; 3460 rnp_up->expmaskinit |= mask;
@@ -3472,8 +3478,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
3472 3478
3473 sync_exp_reset_tree_hotplug(rsp); 3479 sync_exp_reset_tree_hotplug(rsp);
3474 rcu_for_each_node_breadth_first(rsp, rnp) { 3480 rcu_for_each_node_breadth_first(rsp, rnp) {
3475 raw_spin_lock_irqsave(&rnp->lock, flags); 3481 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3476 smp_mb__after_unlock_lock();
3477 WARN_ON_ONCE(rnp->expmask); 3482 WARN_ON_ONCE(rnp->expmask);
3478 rnp->expmask = rnp->expmaskinit; 3483 rnp->expmask = rnp->expmaskinit;
3479 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3484 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3531,8 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3531 mask = rnp->grpmask; 3536 mask = rnp->grpmask;
3532 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 3537 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
3533 rnp = rnp->parent; 3538 rnp = rnp->parent;
3534 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 3539 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
3535 smp_mb__after_unlock_lock();
3536 WARN_ON_ONCE(!(rnp->expmask & mask)); 3540 WARN_ON_ONCE(!(rnp->expmask & mask));
3537 rnp->expmask &= ~mask; 3541 rnp->expmask &= ~mask;
3538 } 3542 }
@@ -3549,8 +3553,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
3549{ 3553{
3550 unsigned long flags; 3554 unsigned long flags;
3551 3555
3552 raw_spin_lock_irqsave(&rnp->lock, flags); 3556 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3553 smp_mb__after_unlock_lock();
3554 __rcu_report_exp_rnp(rsp, rnp, wake, flags); 3557 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
3555} 3558}
3556 3559
@@ -3564,8 +3567,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
3564{ 3567{
3565 unsigned long flags; 3568 unsigned long flags;
3566 3569
3567 raw_spin_lock_irqsave(&rnp->lock, flags); 3570 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3568 smp_mb__after_unlock_lock();
3569 if (!(rnp->expmask & mask)) { 3571 if (!(rnp->expmask & mask)) {
3570 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3572 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3571 return; 3573 return;
@@ -3609,7 +3611,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
3609 */ 3611 */
3610static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) 3612static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3611{ 3613{
3612 struct rcu_data *rdp; 3614 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
3613 struct rcu_node *rnp0; 3615 struct rcu_node *rnp0;
3614 struct rcu_node *rnp1 = NULL; 3616 struct rcu_node *rnp1 = NULL;
3615 3617
@@ -3623,7 +3625,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3623 if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { 3625 if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
3624 if (mutex_trylock(&rnp0->exp_funnel_mutex)) { 3626 if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
3625 if (sync_exp_work_done(rsp, rnp0, NULL, 3627 if (sync_exp_work_done(rsp, rnp0, NULL,
3626 &rsp->expedited_workdone0, s)) 3628 &rdp->expedited_workdone0, s))
3627 return NULL; 3629 return NULL;
3628 return rnp0; 3630 return rnp0;
3629 } 3631 }
@@ -3637,14 +3639,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3637 * can be inexact, as it is just promoting locality and is not 3639 * can be inexact, as it is just promoting locality and is not
3638 * strictly needed for correctness. 3640 * strictly needed for correctness.
3639 */ 3641 */
3640 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); 3642 if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
3641 if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
3642 return NULL; 3643 return NULL;
3643 mutex_lock(&rdp->exp_funnel_mutex); 3644 mutex_lock(&rdp->exp_funnel_mutex);
3644 rnp0 = rdp->mynode; 3645 rnp0 = rdp->mynode;
3645 for (; rnp0 != NULL; rnp0 = rnp0->parent) { 3646 for (; rnp0 != NULL; rnp0 = rnp0->parent) {
3646 if (sync_exp_work_done(rsp, rnp1, rdp, 3647 if (sync_exp_work_done(rsp, rnp1, rdp,
3647 &rsp->expedited_workdone2, s)) 3648 &rdp->expedited_workdone2, s))
3648 return NULL; 3649 return NULL;
3649 mutex_lock(&rnp0->exp_funnel_mutex); 3650 mutex_lock(&rnp0->exp_funnel_mutex);
3650 if (rnp1) 3651 if (rnp1)
@@ -3654,7 +3655,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3654 rnp1 = rnp0; 3655 rnp1 = rnp0;
3655 } 3656 }
3656 if (sync_exp_work_done(rsp, rnp1, rdp, 3657 if (sync_exp_work_done(rsp, rnp1, rdp,
3657 &rsp->expedited_workdone3, s)) 3658 &rdp->expedited_workdone3, s))
3658 return NULL; 3659 return NULL;
3659 return rnp1; 3660 return rnp1;
3660} 3661}
@@ -3708,8 +3709,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
3708 3709
3709 sync_exp_reset_tree(rsp); 3710 sync_exp_reset_tree(rsp);
3710 rcu_for_each_leaf_node(rsp, rnp) { 3711 rcu_for_each_leaf_node(rsp, rnp) {
3711 raw_spin_lock_irqsave(&rnp->lock, flags); 3712 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3712 smp_mb__after_unlock_lock();
3713 3713
3714 /* Each pass checks a CPU for identity, offline, and idle. */ 3714 /* Each pass checks a CPU for identity, offline, and idle. */
3715 mask_ofl_test = 0; 3715 mask_ofl_test = 0;
@@ -3741,24 +3741,22 @@ retry_ipi:
3741 ret = smp_call_function_single(cpu, func, rsp, 0); 3741 ret = smp_call_function_single(cpu, func, rsp, 0);
3742 if (!ret) { 3742 if (!ret) {
3743 mask_ofl_ipi &= ~mask; 3743 mask_ofl_ipi &= ~mask;
3744 } else { 3744 continue;
3745 /* Failed, raced with offline. */ 3745 }
3746 raw_spin_lock_irqsave(&rnp->lock, flags); 3746 /* Failed, raced with offline. */
3747 if (cpu_online(cpu) && 3747 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3748 (rnp->expmask & mask)) { 3748 if (cpu_online(cpu) &&
3749 raw_spin_unlock_irqrestore(&rnp->lock, 3749 (rnp->expmask & mask)) {
3750 flags);
3751 schedule_timeout_uninterruptible(1);
3752 if (cpu_online(cpu) &&
3753 (rnp->expmask & mask))
3754 goto retry_ipi;
3755 raw_spin_lock_irqsave(&rnp->lock,
3756 flags);
3757 }
3758 if (!(rnp->expmask & mask))
3759 mask_ofl_ipi &= ~mask;
3760 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3750 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3751 schedule_timeout_uninterruptible(1);
3752 if (cpu_online(cpu) &&
3753 (rnp->expmask & mask))
3754 goto retry_ipi;
3755 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3761 } 3756 }
3757 if (!(rnp->expmask & mask))
3758 mask_ofl_ipi &= ~mask;
3759 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3762 } 3760 }
3763 /* Report quiescent states for those that went offline. */ 3761 /* Report quiescent states for those that went offline. */
3764 mask_ofl_test |= mask_ofl_ipi; 3762 mask_ofl_test |= mask_ofl_ipi;
@@ -3773,6 +3771,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3773 unsigned long jiffies_stall; 3771 unsigned long jiffies_stall;
3774 unsigned long jiffies_start; 3772 unsigned long jiffies_start;
3775 unsigned long mask; 3773 unsigned long mask;
3774 int ndetected;
3776 struct rcu_node *rnp; 3775 struct rcu_node *rnp;
3777 struct rcu_node *rnp_root = rcu_get_root(rsp); 3776 struct rcu_node *rnp_root = rcu_get_root(rsp);
3778 int ret; 3777 int ret;
@@ -3785,7 +3784,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3785 rsp->expedited_wq, 3784 rsp->expedited_wq,
3786 sync_rcu_preempt_exp_done(rnp_root), 3785 sync_rcu_preempt_exp_done(rnp_root),
3787 jiffies_stall); 3786 jiffies_stall);
3788 if (ret > 0) 3787 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
3789 return; 3788 return;
3790 if (ret < 0) { 3789 if (ret < 0) {
3791 /* Hit a signal, disable CPU stall warnings. */ 3790 /* Hit a signal, disable CPU stall warnings. */
@@ -3795,14 +3794,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3795 } 3794 }
3796 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", 3795 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
3797 rsp->name); 3796 rsp->name);
3797 ndetected = 0;
3798 rcu_for_each_leaf_node(rsp, rnp) { 3798 rcu_for_each_leaf_node(rsp, rnp) {
3799 (void)rcu_print_task_exp_stall(rnp); 3799 ndetected = rcu_print_task_exp_stall(rnp);
3800 mask = 1; 3800 mask = 1;
3801 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { 3801 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3802 struct rcu_data *rdp; 3802 struct rcu_data *rdp;
3803 3803
3804 if (!(rnp->expmask & mask)) 3804 if (!(rnp->expmask & mask))
3805 continue; 3805 continue;
3806 ndetected++;
3806 rdp = per_cpu_ptr(rsp->rda, cpu); 3807 rdp = per_cpu_ptr(rsp->rda, cpu);
3807 pr_cont(" %d-%c%c%c", cpu, 3808 pr_cont(" %d-%c%c%c", cpu,
3808 "O."[cpu_online(cpu)], 3809 "O."[cpu_online(cpu)],
@@ -3811,8 +3812,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3811 } 3812 }
3812 mask <<= 1; 3813 mask <<= 1;
3813 } 3814 }
3814 pr_cont(" } %lu jiffies s: %lu\n", 3815 pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
3815 jiffies - jiffies_start, rsp->expedited_sequence); 3816 jiffies - jiffies_start, rsp->expedited_sequence,
3817 rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
3818 if (!ndetected) {
3819 pr_err("blocking rcu_node structures:");
3820 rcu_for_each_node_breadth_first(rsp, rnp) {
3821 if (rnp == rnp_root)
3822 continue; /* printed unconditionally */
3823 if (sync_rcu_preempt_exp_done(rnp))
3824 continue;
3825 pr_cont(" l=%u:%d-%d:%#lx/%c",
3826 rnp->level, rnp->grplo, rnp->grphi,
3827 rnp->expmask,
3828 ".T"[!!rnp->exp_tasks]);
3829 }
3830 pr_cont("\n");
3831 }
3816 rcu_for_each_leaf_node(rsp, rnp) { 3832 rcu_for_each_leaf_node(rsp, rnp) {
3817 mask = 1; 3833 mask = 1;
3818 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { 3834 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
@@ -3847,6 +3863,16 @@ void synchronize_sched_expedited(void)
3847 struct rcu_node *rnp; 3863 struct rcu_node *rnp;
3848 struct rcu_state *rsp = &rcu_sched_state; 3864 struct rcu_state *rsp = &rcu_sched_state;
3849 3865
3866 /* If only one CPU, this is automatically a grace period. */
3867 if (rcu_blocking_is_gp())
3868 return;
3869
3870 /* If expedited grace periods are prohibited, fall back to normal. */
3871 if (rcu_gp_is_normal()) {
3872 wait_rcu_gp(call_rcu_sched);
3873 return;
3874 }
3875
3850 /* Take a snapshot of the sequence number. */ 3876 /* Take a snapshot of the sequence number. */
3851 s = rcu_exp_gp_seq_snap(rsp); 3877 s = rcu_exp_gp_seq_snap(rsp);
3852 3878
@@ -4135,7 +4161,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
4135 rnp = rnp->parent; 4161 rnp = rnp->parent;
4136 if (rnp == NULL) 4162 if (rnp == NULL)
4137 return; 4163 return;
4138 raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ 4164 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
4139 rnp->qsmaskinit |= mask; 4165 rnp->qsmaskinit |= mask;
4140 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ 4166 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
4141 } 4167 }
@@ -4152,7 +4178,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
4152 struct rcu_node *rnp = rcu_get_root(rsp); 4178 struct rcu_node *rnp = rcu_get_root(rsp);
4153 4179
4154 /* Set up local state, ensuring consistent view of global state. */ 4180 /* Set up local state, ensuring consistent view of global state. */
4155 raw_spin_lock_irqsave(&rnp->lock, flags); 4181 raw_spin_lock_irqsave_rcu_node(rnp, flags);
4156 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 4182 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
4157 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 4183 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
4158 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 4184 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
@@ -4179,7 +4205,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
4179 struct rcu_node *rnp = rcu_get_root(rsp); 4205 struct rcu_node *rnp = rcu_get_root(rsp);
4180 4206
4181 /* Set up local state, ensuring consistent view of global state. */ 4207 /* Set up local state, ensuring consistent view of global state. */
4182 raw_spin_lock_irqsave(&rnp->lock, flags); 4208 raw_spin_lock_irqsave_rcu_node(rnp, flags);
4183 rdp->qlen_last_fqs_check = 0; 4209 rdp->qlen_last_fqs_check = 0;
4184 rdp->n_force_qs_snap = rsp->n_force_qs; 4210 rdp->n_force_qs_snap = rsp->n_force_qs;
4185 rdp->blimit = blimit; 4211 rdp->blimit = blimit;
@@ -4198,8 +4224,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
4198 */ 4224 */
4199 rnp = rdp->mynode; 4225 rnp = rdp->mynode;
4200 mask = rdp->grpmask; 4226 mask = rdp->grpmask;
4201 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 4227 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
4202 smp_mb__after_unlock_lock();
4203 rnp->qsmaskinitnext |= mask; 4228 rnp->qsmaskinitnext |= mask;
4204 rnp->expmaskinitnext |= mask; 4229 rnp->expmaskinitnext |= mask;
4205 if (!rdp->beenonline) 4230 if (!rdp->beenonline)
@@ -4327,14 +4352,14 @@ static int __init rcu_spawn_gp_kthread(void)
4327 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); 4352 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
4328 BUG_ON(IS_ERR(t)); 4353 BUG_ON(IS_ERR(t));
4329 rnp = rcu_get_root(rsp); 4354 rnp = rcu_get_root(rsp);
4330 raw_spin_lock_irqsave(&rnp->lock, flags); 4355 raw_spin_lock_irqsave_rcu_node(rnp, flags);
4331 rsp->gp_kthread = t; 4356 rsp->gp_kthread = t;
4332 if (kthread_prio) { 4357 if (kthread_prio) {
4333 sp.sched_priority = kthread_prio; 4358 sp.sched_priority = kthread_prio;
4334 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 4359 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
4335 } 4360 }
4336 wake_up_process(t);
4337 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4361 raw_spin_unlock_irqrestore(&rnp->lock, flags);
4362 wake_up_process(t);
4338 } 4363 }
4339 rcu_spawn_nocb_kthreads(); 4364 rcu_spawn_nocb_kthreads();
4340 rcu_spawn_boost_kthreads(); 4365 rcu_spawn_boost_kthreads();
@@ -4385,12 +4410,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
4385/* 4410/*
4386 * Helper function for rcu_init() that initializes one rcu_state structure. 4411 * Helper function for rcu_init() that initializes one rcu_state structure.
4387 */ 4412 */
4388static void __init rcu_init_one(struct rcu_state *rsp, 4413static void __init rcu_init_one(struct rcu_state *rsp)
4389 struct rcu_data __percpu *rda)
4390{ 4414{
4391 static const char * const buf[] = RCU_NODE_NAME_INIT; 4415 static const char * const buf[] = RCU_NODE_NAME_INIT;
4392 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4416 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4393 static const char * const exp[] = RCU_EXP_NAME_INIT; 4417 static const char * const exp[] = RCU_EXP_NAME_INIT;
4418 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4419 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4420 static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
4394 static u8 fl_mask = 0x1; 4421 static u8 fl_mask = 0x1;
4395 4422
4396 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ 4423 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4576,8 +4603,8 @@ void __init rcu_init(void)
4576 4603
4577 rcu_bootup_announce(); 4604 rcu_bootup_announce();
4578 rcu_init_geometry(); 4605 rcu_init_geometry();
4579 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 4606 rcu_init_one(&rcu_bh_state);
4580 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 4607 rcu_init_one(&rcu_sched_state);
4581 if (dump_tree) 4608 if (dump_tree)
4582 rcu_dump_rcu_node_tree(&rcu_sched_state); 4609 rcu_dump_rcu_node_tree(&rcu_sched_state);
4583 __rcu_init_preempt(); 4610 __rcu_init_preempt();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9fb4e238d4dc..83360b4f4352 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -178,6 +178,8 @@ struct rcu_node {
178 /* beginning of each expedited GP. */ 178 /* beginning of each expedited GP. */
179 unsigned long expmaskinitnext; 179 unsigned long expmaskinitnext;
180 /* Online CPUs for next expedited GP. */ 180 /* Online CPUs for next expedited GP. */
181 /* Any CPU that has ever been online will */
182 /* have its bit set. */
181 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 183 unsigned long grpmask; /* Mask to apply to parent qsmask. */
182 /* Only one bit will be set in this mask. */ 184 /* Only one bit will be set in this mask. */
183 int grplo; /* lowest-numbered CPU or group here. */ 185 int grplo; /* lowest-numbered CPU or group here. */
@@ -384,6 +386,10 @@ struct rcu_data {
384 struct rcu_head oom_head; 386 struct rcu_head oom_head;
385#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 387#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
386 struct mutex exp_funnel_mutex; 388 struct mutex exp_funnel_mutex;
389 atomic_long_t expedited_workdone0; /* # done by others #0. */
390 atomic_long_t expedited_workdone1; /* # done by others #1. */
391 atomic_long_t expedited_workdone2; /* # done by others #2. */
392 atomic_long_t expedited_workdone3; /* # done by others #3. */
387 393
388 /* 7) Callback offloading. */ 394 /* 7) Callback offloading. */
389#ifdef CONFIG_RCU_NOCB_CPU 395#ifdef CONFIG_RCU_NOCB_CPU
@@ -498,10 +504,6 @@ struct rcu_state {
498 /* End of fields guarded by barrier_mutex. */ 504 /* End of fields guarded by barrier_mutex. */
499 505
500 unsigned long expedited_sequence; /* Take a ticket. */ 506 unsigned long expedited_sequence; /* Take a ticket. */
501 atomic_long_t expedited_workdone0; /* # done by others #0. */
502 atomic_long_t expedited_workdone1; /* # done by others #1. */
503 atomic_long_t expedited_workdone2; /* # done by others #2. */
504 atomic_long_t expedited_workdone3; /* # done by others #3. */
505 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 507 atomic_long_t expedited_normal; /* # fallbacks to normal. */
506 atomic_t expedited_need_qs; /* # CPUs left to check in. */ 508 atomic_t expedited_need_qs; /* # CPUs left to check in. */
507 wait_queue_head_t expedited_wq; /* Wait for check-ins. */ 509 wait_queue_head_t expedited_wq; /* Wait for check-ins. */
@@ -545,6 +547,18 @@ struct rcu_state {
545#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ 547#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
546#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ 548#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
547 549
550#ifndef RCU_TREE_NONCORE
551static const char * const gp_state_names[] = {
552 "RCU_GP_IDLE",
553 "RCU_GP_WAIT_GPS",
554 "RCU_GP_DONE_GPS",
555 "RCU_GP_WAIT_FQS",
556 "RCU_GP_DOING_FQS",
557 "RCU_GP_CLEANUP",
558 "RCU_GP_CLEANED",
559};
560#endif /* #ifndef RCU_TREE_NONCORE */
561
548extern struct list_head rcu_struct_flavors; 562extern struct list_head rcu_struct_flavors;
549 563
550/* Sequence through rcu_state structures for each RCU flavor. */ 564/* Sequence through rcu_state structures for each RCU flavor. */
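The new gp_state_names[] array is indexed by the RCU_GP_* values defined just above it, letting tracing code print a human-readable grace-period-kthread state instead of a bare number. A lookup helper might look like the following sketch (the helper name and bounds check are illustrative, not taken from this patch):

	/* Illustrative only; gp_state_names[] is the array added above. */
	static const char *demo_gp_state_getname(unsigned long gs)
	{
		if (gs >= ARRAY_SIZE(gp_state_names))
			return "???";
		return gp_state_names[gs];
	}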
@@ -664,3 +678,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
664#else /* #ifdef CONFIG_PPC */ 678#else /* #ifdef CONFIG_PPC */
665#define smp_mb__after_unlock_lock() do { } while (0) 679#define smp_mb__after_unlock_lock() do { } while (0)
666#endif /* #else #ifdef CONFIG_PPC */ 680#endif /* #else #ifdef CONFIG_PPC */
681
682/*
683 * Wrappers for the rcu_node::lock acquire.
684 *
685 * Because the rcu_nodes form a tree, the tree traversal locking will observe
686 * different lock values, this in turn means that an UNLOCK of one level
687 * followed by a LOCK of another level does not imply a full memory barrier;
688 * and most importantly transitivity is lost.
689 *
690 * In order to restore full ordering between tree levels, augment the regular
691 * lock acquire functions with smp_mb__after_unlock_lock().
692 */
693static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
694{
695 raw_spin_lock(&rnp->lock);
696 smp_mb__after_unlock_lock();
697}
698
699static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
700{
701 raw_spin_lock_irq(&rnp->lock);
702 smp_mb__after_unlock_lock();
703}
704
705#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
706do { \
707 typecheck(unsigned long, flags); \
708 raw_spin_lock_irqsave(&(rnp)->lock, flags); \
709 smp_mb__after_unlock_lock(); \
710} while (0)
711
712static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
713{
714 bool locked = raw_spin_trylock(&rnp->lock);
715
716 if (locked)
717 smp_mb__after_unlock_lock();
718 return locked;
719}
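These four wrappers are the heart of the series: every call site that previously paired a raw_spin_lock*() on an rcu_node ->lock with an explicit smp_mb__after_unlock_lock() is converted to the matching *_rcu_node() helper, as the tree.c and tree_plugin.h hunks in this diff show. A minimal sketch of the conversion, using a hypothetical caller:

	/* Hypothetical caller; only the locking pattern is the point. */
	static void demo_scan_node(struct rcu_node *rnp)
	{
		unsigned long flags;

		/* Old style: the barrier must be remembered at every call site. */
		raw_spin_lock_irqsave(&rnp->lock, flags);
		smp_mb__after_unlock_lock();	/* GP memory ordering. */
		raw_spin_unlock_irqrestore(&rnp->lock, flags);

		/* New style: the wrapper supplies the barrier automatically. */
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}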
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 630c19772630..9467a8b7e756 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
63 63
64/* 64/*
65 * Check the RCU kernel configuration parameters and print informative 65 * Check the RCU kernel configuration parameters and print informative
66 * messages about anything out of the ordinary. If you like #ifdef, you 66 * messages about anything out of the ordinary.
67 * will love this function.
68 */ 67 */
69static void __init rcu_bootup_announce_oddness(void) 68static void __init rcu_bootup_announce_oddness(void)
70{ 69{
@@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void)
147 * the corresponding expedited grace period will also be the end of the 146 * the corresponding expedited grace period will also be the end of the
148 * normal grace period. 147 * normal grace period.
149 */ 148 */
150static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, 149static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
151 unsigned long flags) __releases(rnp->lock) 150 __releases(rnp->lock) /* But leaves rrupts disabled. */
152{ 151{
153 int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + 152 int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
154 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + 153 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
@@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
236 rnp->gp_tasks = &t->rcu_node_entry; 235 rnp->gp_tasks = &t->rcu_node_entry;
237 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) 236 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
238 rnp->exp_tasks = &t->rcu_node_entry; 237 rnp->exp_tasks = &t->rcu_node_entry;
239 raw_spin_unlock(&rnp->lock); 238 raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
240 239
241 /* 240 /*
242 * Report the quiescent state for the expedited GP. This expedited 241 * Report the quiescent state for the expedited GP. This expedited
@@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
251 } else { 250 } else {
252 WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); 251 WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
253 } 252 }
254 local_irq_restore(flags);
255} 253}
256 254
257/* 255/*
@@ -286,12 +284,11 @@ static void rcu_preempt_qs(void)
286 * predating the current grace period drain, in other words, until 284 * predating the current grace period drain, in other words, until
287 * rnp->gp_tasks becomes NULL. 285 * rnp->gp_tasks becomes NULL.
288 * 286 *
289 * Caller must disable preemption. 287 * Caller must disable interrupts.
290 */ 288 */
291static void rcu_preempt_note_context_switch(void) 289static void rcu_preempt_note_context_switch(void)
292{ 290{
293 struct task_struct *t = current; 291 struct task_struct *t = current;
294 unsigned long flags;
295 struct rcu_data *rdp; 292 struct rcu_data *rdp;
296 struct rcu_node *rnp; 293 struct rcu_node *rnp;
297 294
@@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void)
301 /* Possibly blocking in an RCU read-side critical section. */ 298 /* Possibly blocking in an RCU read-side critical section. */
302 rdp = this_cpu_ptr(rcu_state_p->rda); 299 rdp = this_cpu_ptr(rcu_state_p->rda);
303 rnp = rdp->mynode; 300 rnp = rdp->mynode;
304 raw_spin_lock_irqsave(&rnp->lock, flags); 301 raw_spin_lock_rcu_node(rnp);
305 smp_mb__after_unlock_lock();
306 t->rcu_read_unlock_special.b.blocked = true; 302 t->rcu_read_unlock_special.b.blocked = true;
307 t->rcu_blocked_node = rnp; 303 t->rcu_blocked_node = rnp;
308 304
@@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void)
318 (rnp->qsmask & rdp->grpmask) 314 (rnp->qsmask & rdp->grpmask)
319 ? rnp->gpnum 315 ? rnp->gpnum
320 : rnp->gpnum + 1); 316 : rnp->gpnum + 1);
321 rcu_preempt_ctxt_queue(rnp, rdp, flags); 317 rcu_preempt_ctxt_queue(rnp, rdp);
322 } else if (t->rcu_read_lock_nesting < 0 && 318 } else if (t->rcu_read_lock_nesting < 0 &&
323 t->rcu_read_unlock_special.s) { 319 t->rcu_read_unlock_special.s) {
324 320
@@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t)
450 446
451 /* 447 /*
452 * Remove this task from the list it blocked on. The task 448 * Remove this task from the list it blocked on. The task
453 * now remains queued on the rcu_node corresponding to 449 * now remains queued on the rcu_node corresponding to the
454 * the CPU it first blocked on, so the first attempt to 450 * CPU it first blocked on, so there is no longer any need
455 * acquire the task's rcu_node's ->lock will succeed. 451 * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
456 * Keep the loop and add a WARN_ON() out of sheer paranoia.
457 */ 452 */
458 for (;;) { 453 rnp = t->rcu_blocked_node;
459 rnp = t->rcu_blocked_node; 454 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
460 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 455 WARN_ON_ONCE(rnp != t->rcu_blocked_node);
461 smp_mb__after_unlock_lock();
462 if (rnp == t->rcu_blocked_node)
463 break;
464 WARN_ON_ONCE(1);
465 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
466 }
467 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 456 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
468 empty_exp = sync_rcu_preempt_exp_done(rnp); 457 empty_exp = sync_rcu_preempt_exp_done(rnp);
469 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 458 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
527 unsigned long flags; 516 unsigned long flags;
528 struct task_struct *t; 517 struct task_struct *t;
529 518
530 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave_rcu_node(rnp, flags);
531 if (!rcu_preempt_blocked_readers_cgp(rnp)) { 520 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
532 raw_spin_unlock_irqrestore(&rnp->lock, flags); 521 raw_spin_unlock_irqrestore(&rnp->lock, flags);
533 return; 522 return;
@@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void)
748 struct rcu_state *rsp = rcu_state_p; 737 struct rcu_state *rsp = rcu_state_p;
749 unsigned long s; 738 unsigned long s;
750 739
740 /* If expedited grace periods are prohibited, fall back to normal. */
741 if (rcu_gp_is_normal()) {
742 wait_rcu_gp(call_rcu);
743 return;
744 }
745
751 s = rcu_exp_gp_seq_snap(rsp); 746 s = rcu_exp_gp_seq_snap(rsp);
752 747
753 rnp_unlock = exp_funnel_lock(rsp, s); 748 rnp_unlock = exp_funnel_lock(rsp, s);
@@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
788 */ 783 */
789static void __init __rcu_init_preempt(void) 784static void __init __rcu_init_preempt(void)
790{ 785{
791 rcu_init_one(rcu_state_p, rcu_data_p); 786 rcu_init_one(rcu_state_p);
792} 787}
793 788
794/* 789/*
@@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp)
989 READ_ONCE(rnp->boost_tasks) == NULL) 984 READ_ONCE(rnp->boost_tasks) == NULL)
990 return 0; /* Nothing left to boost. */ 985 return 0; /* Nothing left to boost. */
991 986
992 raw_spin_lock_irqsave(&rnp->lock, flags); 987 raw_spin_lock_irqsave_rcu_node(rnp, flags);
993 smp_mb__after_unlock_lock();
994 988
995 /* 989 /*
996 * Recheck under the lock: all tasks in need of boosting 990 * Recheck under the lock: all tasks in need of boosting
@@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1176 "rcub/%d", rnp_index); 1170 "rcub/%d", rnp_index);
1177 if (IS_ERR(t)) 1171 if (IS_ERR(t))
1178 return PTR_ERR(t); 1172 return PTR_ERR(t);
1179 raw_spin_lock_irqsave(&rnp->lock, flags); 1173 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1180 smp_mb__after_unlock_lock();
1181 rnp->boost_kthread_task = t; 1174 rnp->boost_kthread_task = t;
1182 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1175 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 sp.sched_priority = kthread_prio; 1176 sp.sched_priority = kthread_prio;
@@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void)
1524 struct rcu_state *rsp; 1517 struct rcu_state *rsp;
1525 int tne; 1518 int tne;
1526 1519
1527 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) 1520 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
1521 rcu_is_nocb_cpu(smp_processor_id()))
1528 return; 1522 return;
1529 1523
1530 /* Handle nohz enablement switches conservatively. */ 1524 /* Handle nohz enablement switches conservatively. */
@@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void)
1538 if (!tne) 1532 if (!tne)
1539 return; 1533 return;
1540 1534
1541 /* If this is a no-CBs CPU, no callbacks, just return. */
1542 if (rcu_is_nocb_cpu(smp_processor_id()))
1543 return;
1544
1545 /* 1535 /*
1546 * If a non-lazy callback arrived at a CPU having only lazy 1536 * If a non-lazy callback arrived at a CPU having only lazy
1547 * callbacks, invoke RCU core for the side-effect of recalculating 1537 * callbacks, invoke RCU core for the side-effect of recalculating
@@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void)
1567 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1557 if (!*rdp->nxttail[RCU_DONE_TAIL])
1568 continue; 1558 continue;
1569 rnp = rdp->mynode; 1559 rnp = rdp->mynode;
1570 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1560 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1571 smp_mb__after_unlock_lock();
1572 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 1561 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1573 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1562 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1574 if (needwake) 1563 if (needwake)
@@ -2068,8 +2057,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2068 bool needwake; 2057 bool needwake;
2069 struct rcu_node *rnp = rdp->mynode; 2058 struct rcu_node *rnp = rdp->mynode;
2070 2059
2071 raw_spin_lock_irqsave(&rnp->lock, flags); 2060 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2072 smp_mb__after_unlock_lock();
2073 needwake = rcu_start_future_gp(rnp, rdp, &c); 2061 needwake = rcu_start_future_gp(rnp, rdp, &c);
2074 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2062 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2075 if (needwake) 2063 if (needwake)
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index ef7093cc9b5c..1088e64f01ad 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update tracing for classic implementation 2 * Read-Copy Update tracing for hierarchical implementation.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -16,6 +16,7 @@
16 * http://www.gnu.org/licenses/gpl-2.0.html. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * Author: Paul E. McKenney
19 * 20 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 21 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 22 *
@@ -33,9 +34,7 @@
33#include <linux/sched.h> 34#include <linux/sched.h>
34#include <linux/atomic.h> 35#include <linux/atomic.h>
35#include <linux/bitops.h> 36#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h> 37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h> 38#include <linux/percpu.h>
40#include <linux/notifier.h> 39#include <linux/notifier.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
@@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = {
183 182
184static int show_rcuexp(struct seq_file *m, void *v) 183static int show_rcuexp(struct seq_file *m, void *v)
185{ 184{
185 int cpu;
186 struct rcu_state *rsp = (struct rcu_state *)m->private; 186 struct rcu_state *rsp = (struct rcu_state *)m->private;
187 187 struct rcu_data *rdp;
188 unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
189
190 for_each_possible_cpu(cpu) {
191 rdp = per_cpu_ptr(rsp->rda, cpu);
192 s0 += atomic_long_read(&rdp->expedited_workdone0);
193 s1 += atomic_long_read(&rdp->expedited_workdone1);
194 s2 += atomic_long_read(&rdp->expedited_workdone2);
195 s3 += atomic_long_read(&rdp->expedited_workdone3);
196 }
188 seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", 197 seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
189 rsp->expedited_sequence, 198 rsp->expedited_sequence, s0, s1, s2, s3,
190 atomic_long_read(&rsp->expedited_workdone0),
191 atomic_long_read(&rsp->expedited_workdone1),
192 atomic_long_read(&rsp->expedited_workdone2),
193 atomic_long_read(&rsp->expedited_workdone3),
194 atomic_long_read(&rsp->expedited_normal), 199 atomic_long_read(&rsp->expedited_normal),
195 atomic_read(&rsp->expedited_need_qs), 200 atomic_read(&rsp->expedited_need_qs),
196 rsp->expedited_sequence / 2); 201 rsp->expedited_sequence / 2);
@@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
319 unsigned long gpmax; 324 unsigned long gpmax;
320 struct rcu_node *rnp = &rsp->node[0]; 325 struct rcu_node *rnp = &rsp->node[0];
321 326
322 raw_spin_lock_irqsave(&rnp->lock, flags); 327 raw_spin_lock_irqsave_rcu_node(rnp, flags);
323 completed = READ_ONCE(rsp->completed); 328 completed = READ_ONCE(rsp->completed);
324 gpnum = READ_ONCE(rsp->gpnum); 329 gpnum = READ_ONCE(rsp->gpnum);
325 if (completed == gpnum) 330 if (completed == gpnum)
@@ -487,16 +492,4 @@ free_out:
487 debugfs_remove_recursive(rcudir); 492 debugfs_remove_recursive(rcudir);
488 return 1; 493 return 1;
489} 494}
490 495device_initcall(rcutree_trace_init);
491static void __exit rcutree_trace_cleanup(void)
492{
493 debugfs_remove_recursive(rcudir);
494}
495
496
497module_init(rcutree_trace_init);
498module_exit(rcutree_trace_cleanup);
499
500MODULE_AUTHOR("Paul E. McKenney");
501MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
502MODULE_LICENSE("GPL");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5f748c5a40f0..76b94e19430b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate");
60#endif 60#endif
61#define MODULE_PARAM_PREFIX "rcupdate." 61#define MODULE_PARAM_PREFIX "rcupdate."
62 62
63#ifndef CONFIG_TINY_RCU
63module_param(rcu_expedited, int, 0); 64module_param(rcu_expedited, int, 0);
65module_param(rcu_normal, int, 0);
66static int rcu_normal_after_boot;
67module_param(rcu_normal_after_boot, int, 0);
68#endif /* #ifndef CONFIG_TINY_RCU */
64 69
65#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) 70#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
66/** 71/**
@@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
113 118
114#ifndef CONFIG_TINY_RCU 119#ifndef CONFIG_TINY_RCU
115 120
121/*
122 * Should expedited grace-period primitives always fall back to their
123 * non-expedited counterparts? Intended for use within RCU. Note
124 * that if the user specifies both rcu_expedited and rcu_normal, then
125 * rcu_normal wins.
126 */
127bool rcu_gp_is_normal(void)
128{
129 return READ_ONCE(rcu_normal);
130}
131
116static atomic_t rcu_expedited_nesting = 132static atomic_t rcu_expedited_nesting =
117 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); 133 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
118 134
@@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void)
157} 173}
158EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); 174EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
159 175
160#endif /* #ifndef CONFIG_TINY_RCU */
161
162/* 176/*
163 * Inform RCU of the end of the in-kernel boot sequence. 177 * Inform RCU of the end of the in-kernel boot sequence.
164 */ 178 */
@@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void)
166{ 180{
167 if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) 181 if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
168 rcu_unexpedite_gp(); 182 rcu_unexpedite_gp();
183 if (rcu_normal_after_boot)
184 WRITE_ONCE(rcu_normal, 1);
169} 185}
170 186
187#endif /* #ifndef CONFIG_TINY_RCU */
188
171#ifdef CONFIG_PREEMPT_RCU 189#ifdef CONFIG_PREEMPT_RCU
172 190
173/* 191/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 732e993b564b..1ef0d7aeab47 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3109,7 +3109,6 @@ static void __sched notrace __schedule(bool preempt)
3109 3109
3110 cpu = smp_processor_id(); 3110 cpu = smp_processor_id();
3111 rq = cpu_rq(cpu); 3111 rq = cpu_rq(cpu);
3112 rcu_note_context_switch();
3113 prev = rq->curr; 3112 prev = rq->curr;
3114 3113
3115 /* 3114 /*
@@ -3128,13 +3127,16 @@ static void __sched notrace __schedule(bool preempt)
3128 if (sched_feat(HRTICK)) 3127 if (sched_feat(HRTICK))
3129 hrtick_clear(rq); 3128 hrtick_clear(rq);
3130 3129
3130 local_irq_disable();
3131 rcu_note_context_switch();
3132
3131 /* 3133 /*
3132 * Make sure that signal_pending_state()->signal_pending() below 3134 * Make sure that signal_pending_state()->signal_pending() below
3133 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3135 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3134 * done by the caller to avoid the race with signal_wake_up(). 3136 * done by the caller to avoid the race with signal_wake_up().
3135 */ 3137 */
3136 smp_mb__before_spinlock(); 3138 smp_mb__before_spinlock();
3137 raw_spin_lock_irq(&rq->lock); 3139 raw_spin_lock(&rq->lock);
3138 lockdep_pin_lock(&rq->lock); 3140 lockdep_pin_lock(&rq->lock);
3139 3141
3140 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3142 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
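This scheduler hunk pairs with the tree_plugin.h changes above: rcu_preempt_ctxt_queue() no longer restores interrupts itself and rcu_preempt_note_context_switch() now requires its caller to disable interrupts, so __schedule() turns interrupts off before telling RCU about the context switch and can then take rq->lock with a plain raw_spin_lock(). Condensed, the new prologue reads roughly as follows (a fragment, not a self-contained function):

	local_irq_disable();		/* irqs off before RCU hears of the switch */
	rcu_note_context_switch();	/* may queue the outgoing task on its rcu_node */
	...
	smp_mb__before_spinlock();
	raw_spin_lock(&rq->lock);	/* the _irq variant is no longer needed */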
diff --git a/lib/list_debug.c b/lib/list_debug.c
index c24c2f7e296f..3859bf63561c 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -37,7 +37,7 @@ void __list_add(struct list_head *new,
37 next->prev = new; 37 next->prev = new;
38 new->next = next; 38 new->next = next;
39 new->prev = prev; 39 new->prev = prev;
40 prev->next = new; 40 WRITE_ONCE(prev->next, new);
41} 41}
42EXPORT_SYMBOL(__list_add); 42EXPORT_SYMBOL(__list_add);
43 43
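The one-line change to __list_add() uses WRITE_ONCE(), presumably because some callers inspect a list head's ->next pointer without holding the list lock (an unlocked emptiness probe, for example), so the store that publishes a new entry should be a single untorn write. A rough sketch of the sort of lockless check being accommodated (the helper is hypothetical):

	/* Hypothetical lockless probe; the READ_ONCE()/WRITE_ONCE() pairing is the point. */
	static bool demo_have_pending(struct list_head *head)
	{
		return READ_ONCE(head->next) != head;	/* racy, but never sees a torn pointer */
	}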
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 5236e073919d..0f80eefb0bfd 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -38,8 +38,6 @@
38# 38#
39# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 39# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
40 40
41grace=120
42
43T=/tmp/kvm-test-1-run.sh.$$ 41T=/tmp/kvm-test-1-run.sh.$$
44trap 'rm -rf $T' 0 42trap 'rm -rf $T' 0
45touch $T 43touch $T
@@ -152,7 +150,7 @@ fi
152qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" 150qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
153 151
154# Generate architecture-specific and interaction-specific qemu arguments 152# Generate architecture-specific and interaction-specific qemu arguments
155qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`" 153qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`"
156 154
157# Generate qemu -append arguments 155# Generate qemu -append arguments
158qemu_append="`identify_qemu_append "$QEMU"`" 156qemu_append="`identify_qemu_append "$QEMU"`"
@@ -168,7 +166,7 @@ then
168 touch $resdir/buildonly 166 touch $resdir/buildonly
169 exit 0 167 exit 0
170fi 168fi
171echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log 169echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
172echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd 170echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
173( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & 171( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) &
174qemu_pid=$! 172qemu_pid=$!
@@ -214,7 +212,7 @@ then
214 else 212 else
215 break 213 break
216 fi 214 fi
217 if test $kruntime -ge $((seconds + grace)) 215 if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE))
218 then 216 then
219 echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 217 echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1
220 kill -KILL $qemu_pid 218 kill -KILL $qemu_pid
@@ -224,6 +222,5 @@ then
224 done 222 done
225fi 223fi
226 224
227cp $builddir/console.log $resdir
228parse-torture.sh $resdir/console.log $title 225parse-torture.sh $resdir/console.log $title
229parse-console.sh $resdir/console.log $title 226parse-console.sh $resdir/console.log $title
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index f6483609ebc2..4a431767f77a 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -42,6 +42,7 @@ TORTURE_DEFCONFIG=defconfig
42TORTURE_BOOT_IMAGE="" 42TORTURE_BOOT_IMAGE=""
43TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD 43TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
44TORTURE_KMAKE_ARG="" 44TORTURE_KMAKE_ARG=""
45TORTURE_SHUTDOWN_GRACE=180
45TORTURE_SUITE=rcu 46TORTURE_SUITE=rcu
46resdir="" 47resdir=""
47configs="" 48configs=""
@@ -149,6 +150,11 @@ do
149 resdir=$2 150 resdir=$2
150 shift 151 shift
151 ;; 152 ;;
153 --shutdown-grace)
154 checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error'
155 TORTURE_SHUTDOWN_GRACE=$2
156 shift
157 ;;
152 --torture) 158 --torture)
153 checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' 159 checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--'
154 TORTURE_SUITE=$2 160 TORTURE_SUITE=$2
@@ -266,6 +272,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG
266TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD 272TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
267TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE 273TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
268TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC 274TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC
275TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE
269TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE 276TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE
270if ! test -e $resdir 277if ! test -e $resdir
271then 278then
@@ -307,10 +314,10 @@ awk < $T/cfgcpu.pack \
307} 314}
308 315
309# Dump out the scripting required to run one test batch. 316# Dump out the scripting required to run one test batch.
310function dump(first, pastlast) 317function dump(first, pastlast, batchnum)
311{ 318{
312 print "echo ----Start batch: `date`"; 319 print "echo ----Start batch " batchnum ": `date`";
313 print "echo ----Start batch: `date` >> " rd "/log"; 320 print "echo ----Start batch " batchnum ": `date` >> " rd "/log";
314 jn=1 321 jn=1
315 for (j = first; j < pastlast; j++) { 322 for (j = first; j < pastlast; j++) {
316 builddir=KVM "/b" jn 323 builddir=KVM "/b" jn
@@ -371,25 +378,28 @@ END {
 	njobs = i;
 	nc = ncpus;
 	first = 0;
+	batchnum = 1;
 
 	# Each pass through the following loop considers one test.
 	for (i = 0; i < njobs; i++) {
 		if (ncpus == 0) {
 			# Sequential test specified, each test its own batch.
-			dump(i, i + 1);
+			dump(i, i + 1, batchnum);
 			first = i;
+			batchnum++;
 		} else if (nc < cpus[i] && i != 0) {
 			# Out of CPUs, dump out a batch.
-			dump(first, i);
+			dump(first, i, batchnum);
 			first = i;
 			nc = ncpus;
+			batchnum++;
 		}
 		# Account for the CPUs needed by the current test.
 		nc -= cpus[i];
 	}
 	# Dump the last batch.
 	if (ncpus != 0)
-		dump(first, i);
+		dump(first, i, batchnum);
 }' >> $T/script
 
 cat << ___EOF___ >> $T/script
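
For illustration, a run of kvm.sh that makes use of the new --shutdown-grace argument
might be invoked as follows; the CPU count and scenario name here are arbitrary examples,
not something specified by the patch. The batch numbering added to dump() only changes
the "----Start batch" progress messages, so no new command-line control is needed for it.

	tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 8 \
		--configs TREE01 --shutdown-grace 300	# allow 300 s instead of the 180 s default
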
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index d8f35cf116be..844787a0d7be 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -24,9 +24,6 @@
 #
 # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 
-T=/tmp/abat-chk-badness.sh.$$
-trap 'rm -f $T' 0
-
 file="$1"
 title="$2"
 
@@ -36,9 +33,41 @@ if grep -Pq '\x00' < $file
 then
 	print_warning Console output contains nul bytes, old qemu still running?
 fi
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T
-if test -s $T
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
+if test -s $1.diags
 then
 	print_warning Assertion failure in $file $title
-	cat $T
+	# cat $1.diags
+	summary=""
+	n_badness=`grep -c Badness $1`
+	if test "$n_badness" -ne 0
+	then
+		summary="$summary Badness: $n_badness"
+	fi
+	n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'`
+	if test "$n_warn" -ne 0
+	then
+		summary="$summary Warnings: $n_warn"
+	fi
+	n_bugs=`egrep -c 'BUG|Oops:' $1`
+	if test "$n_bugs" -ne 0
+	then
+		summary="$summary Bugs: $n_bugs"
+	fi
+	n_calltrace=`grep -c 'Call Trace:' $1`
+	if test "$n_calltrace" -ne 0
+	then
+		summary="$summary Call Traces: $n_calltrace"
+	fi
+	n_lockdep=`grep -c =========== $1`
+	if test "$n_lockdep" -ne 0
+	then
+		summary="$summary lockdep: $n_lockdep"
+	fi
+	n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1`
+	if test "$n_stalls" -ne 0
+	then
+		summary="$summary Stalls: $n_stalls"
+	fi
+	print_warning Summary: $summary
 fi
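
For illustration, the same diagnostic classes can be counted in isolation against any
console log. This is a minimal sketch, not part of parse-console.sh; console.log is an
arbitrary example file name, and only two of the counters are shown.

	#!/bin/sh
	file=console.log	# hypothetical console output to scan
	# Count warnings, excluding the benign "unable to open an initial console" message.
	n_warn=`grep -v 'Warning: unable to open an initial console' $file | egrep -c 'WARNING:|Warn'`
	# Count RCU CPU stall warnings, including stalls whose state dump was truncated.
	n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $file`
	echo "Summary: Warnings: $n_warn Stalls: $n_stalls"
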
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
index 9ef33a743b73..24396ae8355b 100644
--- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -20,7 +20,6 @@ CONFIG_PROVE_RCU
 
 CONFIG_NO_HZ_FULL_SYSIDLE
 CONFIG_RCU_NOCB_CPU
-CONFIG_RCU_USER_QS
 
 	Meaningless for TINY_RCU.
 
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 657f3a035488..4e2b1893d40d 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -72,10 +72,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE
 
 	Always used in KVM testing.
 
-CONFIG_RCU_USER_QS
-
-	Redundant with CONFIG_NO_HZ_FULL.
-
 CONFIG_PREEMPT_RCU
 CONFIG_TREE_RCU
 