author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2015-10-07 16:32:08 -0400
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2015-12-05 15:19:07 -0500
commit     649e4368ff786e3d02eb2a06b1493fb217d74408 (patch)
tree       779b8db626afb75e0fa023265b43f9260ae73f12 /Documentation/RCU
parent     6cf10081220ae21175a867d446b3167bcbcb937b (diff)
documentation: Record RCU requirements
This commit adds RCU requirements as published in a 2015 LWN series.
Bringing these requirements in-tree allows them to be updated as changes
are discovered.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Updates to charset and URLs as suggested by Josh Triplett. ]
Diffstat (limited to 'Documentation/RCU')
-rw-r--r--  Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png   bin 0 -> 100825 bytes
-rw-r--r--  Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg  374
-rw-r--r--  Documentation/RCU/Design/Requirements/RCUApplicability.svg     237
-rw-r--r--  Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg  639
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.html        2799
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.htmlx       2643
-rwxr-xr-x  Documentation/RCU/Design/htmlqqz.sh                            108
7 files changed, 6800 insertions, 0 deletions
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
new file mode 100644
index 000000000000..7496a55e4e7b
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
Binary files differ
diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
new file mode 100644
index 000000000000..4b4014fda770
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
@@ -0,0 +1,374 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3
4<svg
5 xmlns:dc="http://purl.org/dc/elements/1.1/"
6 xmlns:cc="http://creativecommons.org/ns#"
7 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
8 xmlns:svg="http://www.w3.org/2000/svg"
9 xmlns="http://www.w3.org/2000/svg"
10 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
11 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
12 width="447.99197"
13 height="428.19299"
14 id="svg2"
15 version="1.1"
16 inkscape:version="0.48.3.1 r9886"
17 sodipodi:docname="GPpartitionReaders1.svg">
18 <defs
19 id="defs4">
20 <marker
21 inkscape:stockid="Arrow2Lend"
22 orient="auto"
23 refY="0"
24 refX="0"
25 id="Arrow2Lend"
26 style="overflow:visible">
27 <path
28 id="path3792"
29 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
30 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
31 transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
32 inkscape:connector-curvature="0" />
33 </marker>
34 <marker
35 inkscape:stockid="Arrow2Lstart"
36 orient="auto"
37 refY="0"
38 refX="0"
39 id="Arrow2Lstart"
40 style="overflow:visible">
41 <path
42 id="path3789"
43 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
44 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
45 transform="matrix(1.1,0,0,1.1,1.1,0)"
46 inkscape:connector-curvature="0" />
47 </marker>
48 </defs>
49 <sodipodi:namedview
50 id="base"
51 pagecolor="#ffffff"
52 bordercolor="#666666"
53 borderopacity="1.0"
54 inkscape:pageopacity="0.0"
55 inkscape:pageshadow="2"
56 inkscape:zoom="1.6184291"
57 inkscape:cx="223.99599"
58 inkscape:cy="214.0965"
59 inkscape:document-units="px"
60 inkscape:current-layer="layer1"
61 showgrid="false"
62 inkscape:window-width="979"
63 inkscape:window-height="836"
64 inkscape:window-x="571"
65 inkscape:window-y="335"
66 inkscape:window-maximized="0"
67 fit-margin-top="5"
68 fit-margin-left="5"
69 fit-margin-right="5"
70 fit-margin-bottom="5" />
71 <metadata
72 id="metadata7">
73 <rdf:RDF>
74 <cc:Work
75 rdf:about="">
76 <dc:format>image/svg+xml</dc:format>
77 <dc:type
78 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
79 <dc:title></dc:title>
80 </cc:Work>
81 </rdf:RDF>
82 </metadata>
83 <g
84 inkscape:label="Layer 1"
85 inkscape:groupmode="layer"
86 id="layer1"
87 transform="translate(-28.441125,-185.60612)">
88 <flowRoot
89 xml:space="preserve"
90 id="flowRoot2985"
91 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
92 id="flowRegion2987"><rect
93 id="rect2989"
94 width="82.85714"
95 height="11.428572"
96 x="240"
97 y="492.36218" /></flowRegion><flowPara
98 id="flowPara2991"></flowPara></flowRoot> <g
99 id="g4433"
100 transform="translate(2,0)">
101 <text
102 sodipodi:linespacing="125%"
103 id="text2993"
104 y="-261.66608"
105 x="412.12299"
106 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
107 xml:space="preserve"
108 transform="matrix(0,1,-1,0,0,0)"><tspan
109 y="-261.66608"
110 x="412.12299"
111 id="tspan2995"
112 sodipodi:role="line">synchronize_rcu()</tspan></text>
113 <g
114 id="g4417"
115 transform="matrix(0,1,-1,0,730.90257,222.4928)">
116 <path
117 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
118 d="m 97.580736,477.4048 183.140664,0"
119 id="path2997"
120 inkscape:connector-curvature="0"
121 sodipodi:nodetypes="cc" />
122 <path
123 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
124 d="m 96.752718,465.38398 0,22.62742"
125 id="path4397"
126 inkscape:connector-curvature="0"
127 sodipodi:nodetypes="cc" />
128 <path
129 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
130 d="m 281.54942,465.38397 0,22.62742"
131 id="path4397-5"
132 inkscape:connector-curvature="0"
133 sodipodi:nodetypes="cc" />
134 </g>
135 </g>
136 <text
137 xml:space="preserve"
138 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
139 x="112.04738"
140 y="268.18076"
141 id="text4429"
142 sodipodi:linespacing="125%"><tspan
143 sodipodi:role="line"
144 id="tspan4431"
145 x="112.04738"
146 y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
147 <text
148 xml:space="preserve"
149 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
150 x="112.04738"
151 y="439.13766"
152 id="text4441"
153 sodipodi:linespacing="125%"><tspan
154 sodipodi:role="line"
155 id="tspan4443"
156 x="112.04738"
157 y="439.13766">WRITE_ONCE(b, 1);</tspan></text>
158 <text
159 xml:space="preserve"
160 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
161 x="255.60869"
162 y="309.29346"
163 id="text4445"
164 sodipodi:linespacing="125%"><tspan
165 sodipodi:role="line"
166 id="tspan4447"
167 x="255.60869"
168 y="309.29346">r1 = READ_ONCE(a);</tspan></text>
169 <text
170 xml:space="preserve"
171 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
172 x="255.14423"
173 y="520.61786"
174 id="text4449"
175 sodipodi:linespacing="125%"><tspan
176 sodipodi:role="line"
177 id="tspan4451"
178 x="255.14423"
179 y="520.61786">WRITE_ONCE(c, 1);</tspan></text>
180 <text
181 xml:space="preserve"
182 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
183 x="396.10254"
184 y="384.71124"
185 id="text4453"
186 sodipodi:linespacing="125%"><tspan
187 sodipodi:role="line"
188 id="tspan4455"
189 x="396.10254"
190 y="384.71124">r2 = READ_ONCE(b);</tspan></text>
191 <text
192 xml:space="preserve"
193 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
194 x="396.10254"
195 y="582.13617"
196 id="text4457"
197 sodipodi:linespacing="125%"><tspan
198 sodipodi:role="line"
199 id="tspan4459"
200 x="396.10254"
201 y="582.13617">r3 = READ_ONCE(c);</tspan></text>
202 <text
203 xml:space="preserve"
204 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
205 x="112.08231"
206 y="213.91006"
207 id="text4461"
208 sodipodi:linespacing="125%"><tspan
209 sodipodi:role="line"
210 id="tspan4463"
211 x="112.08231"
212 y="213.91006">thread0()</tspan></text>
213 <text
214 xml:space="preserve"
215 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
216 x="252.34512"
217 y="213.91006"
218 id="text4461-6"
219 sodipodi:linespacing="125%"><tspan
220 sodipodi:role="line"
221 id="tspan4463-0"
222 x="252.34512"
223 y="213.91006">thread1()</tspan></text>
224 <text
225 xml:space="preserve"
226 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
227 x="396.42557"
228 y="213.91006"
229 id="text4461-2"
230 sodipodi:linespacing="125%"><tspan
231 sodipodi:role="line"
232 id="tspan4463-2"
233 x="396.42557"
234 y="213.91006">thread2()</tspan></text>
235 <rect
236 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
237 id="rect4495"
238 width="436.28488"
239 height="416.4859"
240 x="34.648232"
241 y="191.10612" />
242 <path
243 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
244 d="m 183.14066,191.10612 0,417.193 -0.70711,0"
245 id="path4497"
246 inkscape:connector-curvature="0" />
247 <path
248 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
249 d="m 325.13867,191.10612 0,417.193 -0.70711,0"
250 id="path4497-5"
251 inkscape:connector-curvature="0" />
252 <text
253 xml:space="preserve"
254 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
255 x="111.75929"
256 y="251.53981"
257 id="text4429-8"
258 sodipodi:linespacing="125%"><tspan
259 sodipodi:role="line"
260 id="tspan4431-9"
261 x="111.75929"
262 y="251.53981">rcu_read_lock();</tspan></text>
263 <text
264 xml:space="preserve"
265 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
266 x="396.10254"
267 y="367.91556"
268 id="text4429-8-9"
269 sodipodi:linespacing="125%"><tspan
270 sodipodi:role="line"
271 id="tspan4431-9-4"
272 x="396.10254"
273 y="367.91556">rcu_read_lock();</tspan></text>
274 <text
275 xml:space="preserve"
276 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
277 x="396.10254"
278 y="597.40289"
279 id="text4429-8-9-3"
280 sodipodi:linespacing="125%"><tspan
281 sodipodi:role="line"
282 id="tspan4431-9-4-4"
283 x="396.10254"
284 y="597.40289">rcu_read_unlock();</tspan></text>
285 <text
286 xml:space="preserve"
287 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
288 x="111.75929"
289 y="453.15311"
290 id="text4429-8-9-3-1"
291 sodipodi:linespacing="125%"><tspan
292 sodipodi:role="line"
293 id="tspan4431-9-4-4-6"
294 x="111.75929"
295 y="453.15311">rcu_read_unlock();</tspan></text>
296 <path
297 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
298 d="m 33.941125,227.87568 436.284885,0 0,0.7071"
299 id="path4608"
300 inkscape:connector-curvature="0" />
301 <text
302 xml:space="preserve"
303 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
304 x="394.94427"
305 y="345.66351"
306 id="text4648"
307 sodipodi:linespacing="125%"><tspan
308 sodipodi:role="line"
309 id="tspan4650"
310 x="394.94427"
311 y="345.66351">QS</tspan></text>
312 <path
313 sodipodi:type="arc"
314 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
315 id="path4652"
316 sodipodi:cx="358.85669"
317 sodipodi:cy="142.87541"
318 sodipodi:rx="10.960155"
319 sodipodi:ry="10.253048"
320 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
321 transform="translate(36.441125,199.60612)"
322 sodipodi:start="4.7135481"
323 sodipodi:end="10.994651"
324 sodipodi:open="true" />
325 <text
326 xml:space="preserve"
327 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
328 x="112.11968"
329 y="475.77856"
330 id="text4648-4"
331 sodipodi:linespacing="125%"><tspan
332 sodipodi:role="line"
333 id="tspan4650-4"
334 x="112.11968"
335 y="475.77856">QS</tspan></text>
336 <path
337 sodipodi:type="arc"
338 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
339 id="path4652-7"
340 sodipodi:cx="358.85669"
341 sodipodi:cy="142.87541"
342 sodipodi:rx="10.960155"
343 sodipodi:ry="10.253048"
344 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
345 transform="translate(-246.38346,329.72117)"
346 sodipodi:start="4.7135481"
347 sodipodi:end="10.994651"
348 sodipodi:open="true" />
349 <path
350 sodipodi:type="arc"
351 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
352 id="path4652-7-7"
353 sodipodi:cx="358.85669"
354 sodipodi:cy="142.87541"
355 sodipodi:rx="10.960155"
356 sodipodi:ry="10.253048"
357 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
358 transform="translate(-103.65246,202.90878)"
359 sodipodi:start="4.7135481"
360 sodipodi:end="10.994651"
361 sodipodi:open="true" />
362 <text
363 xml:space="preserve"
364 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
365 x="254.85066"
366 y="348.96619"
367 id="text4648-4-3"
368 sodipodi:linespacing="125%"><tspan
369 sodipodi:role="line"
370 id="tspan4650-4-5"
371 x="254.85066"
372 y="348.96619">QS</tspan></text>
373 </g>
374</svg>
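
The GPpartitionReaders1.svg figure added above encodes a three-thread litmus test
showing how a grace period can partition a pair of RCU read-side critical sections.
Read off the figure (so treat it as a sketch rather than authoritative code, with
a, b, and c initially zero and variable declarations and the thread-creation
harness omitted), the scenario is roughly:

	void thread0(void)
	{
		rcu_read_lock();
		WRITE_ONCE(a, 1);
		WRITE_ONCE(b, 1);
		rcu_read_unlock();
	}

	void thread1(void)
	{
		r1 = READ_ONCE(a);
		synchronize_rcu();	/* Waits for all pre-existing readers. */
		WRITE_ONCE(c, 1);
	}

	void thread2(void)
	{
		rcu_read_lock();
		r2 = READ_ONCE(b);
		r3 = READ_ONCE(c);
		rcu_read_unlock();
	}

If r1 == 1, thread0()'s critical section began before thread1()'s grace period,
so that grace period cannot end (and c cannot become 1) until thread0() has also
stored to b. The grace period thereby orders thread0()'s critical section before
any critical section, such as thread2()'s, that follows the grace period, which
is the partitioning the figure illustrates.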
diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
new file mode 100644
index 000000000000..ebcbeee391ed
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
@@ -0,0 +1,237 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Creator: fig2dev Version 3.2 Patchlevel 5d -->
3
4<!-- CreationDate: Tue Mar 4 18:34:25 2014 -->
5
6<!-- Magnification: 3.000 -->
7
8<svg
9 xmlns:dc="http://purl.org/dc/elements/1.1/"
10 xmlns:cc="http://creativecommons.org/ns#"
11 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
12 xmlns:svg="http://www.w3.org/2000/svg"
13 xmlns="http://www.w3.org/2000/svg"
14 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
15 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
16 width="1089.1382"
17 height="668.21368"
18 viewBox="-2121 -36 14554.634 8876.4061"
19 id="svg2"
20 version="1.1"
21 inkscape:version="0.48.3.1 r9886"
22 sodipodi:docname="RCUApplicability.svg">
23 <metadata
24 id="metadata40">
25 <rdf:RDF>
26 <cc:Work
27 rdf:about="">
28 <dc:format>image/svg+xml</dc:format>
29 <dc:type
30 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
31 <dc:title />
32 </cc:Work>
33 </rdf:RDF>
34 </metadata>
35 <defs
36 id="defs38" />
37 <sodipodi:namedview
38 pagecolor="#ffffff"
39 bordercolor="#666666"
40 borderopacity="1"
41 objecttolerance="10"
42 gridtolerance="10"
43 guidetolerance="10"
44 inkscape:pageopacity="0"
45 inkscape:pageshadow="2"
46 inkscape:window-width="849"
47 inkscape:window-height="639"
48 id="namedview36"
49 showgrid="false"
50 inkscape:zoom="0.51326165"
51 inkscape:cx="544.56912"
52 inkscape:cy="334.10686"
53 inkscape:window-x="149"
54 inkscape:window-y="448"
55 inkscape:window-maximized="0"
56 inkscape:current-layer="g4"
57 fit-margin-top="5"
58 fit-margin-left="5"
59 fit-margin-right="5"
60 fit-margin-bottom="5" />
61 <g
62 style="fill:none;stroke-width:0.025in"
63 id="g4"
64 transform="translate(-2043.6828,14.791398)">
65 <!-- Line: box -->
66 <rect
67 x="0"
68 y="0"
69 width="14400"
70 height="8775"
71 rx="0"
72 style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
73 id="rect6" />
74 <!-- Line: box -->
75 <rect
76 x="1350"
77 y="0"
78 width="11700"
79 height="6075"
80 rx="0"
81 style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
82 id="rect8" />
83 <!-- Line: box -->
84 <rect
85 x="2700"
86 y="0"
87 width="9000"
88 height="4275"
89 rx="0"
90 style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
91 id="rect10" />
92 <!-- Line: box -->
93 <rect
94 x="4050"
95 y="0"
96 width="6300"
97 height="2475"
98 rx="0"
99 style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
100 id="rect12" />
101 <!-- Text -->
102 <text
103 xml:space="preserve"
104 x="7200"
105 y="900"
106 font-style="normal"
107 font-weight="normal"
108 font-size="324"
109 id="text14"
110 sodipodi:linespacing="125%"
111 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
112 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
113 id="tspan3017">Read-Mostly, Stale &amp;</tspan></text>
114 <!-- Text -->
115 <text
116 xml:space="preserve"
117 x="7200"
118 y="1350"
119 font-style="normal"
120 font-weight="normal"
121 font-size="324"
122 id="text16"
123 sodipodi:linespacing="125%"
124 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
125 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
126 id="tspan3019">Inconsistent Data OK</tspan></text>
127 <!-- Text -->
128 <text
129 xml:space="preserve"
130 x="7200"
131 y="1800"
132 font-style="normal"
133 font-weight="normal"
134 font-size="324"
135 id="text18"
136 sodipodi:linespacing="125%"
137 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
138 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
139 id="tspan3021">(RCU Works Great!!!)</tspan></text>
140 <!-- Text -->
141 <text
142 xml:space="preserve"
143 x="7200"
144 y="3825"
145 font-style="normal"
146 font-weight="normal"
147 font-size="324"
148 id="text20"
149 sodipodi:linespacing="125%"
150 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
151 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
152 id="tspan3023">(RCU Works Well)</tspan></text>
153 <!-- Text -->
154 <text
155 xml:space="preserve"
156 x="7200"
157 y="3375"
158 font-style="normal"
159 font-weight="normal"
160 font-size="324"
161 id="text22"
162 sodipodi:linespacing="125%"
163 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
164 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
165 id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text>
166 <!-- Text -->
167 <text
168 xml:space="preserve"
169 x="7200"
170 y="5175"
171 font-style="normal"
172 font-weight="normal"
173 font-size="324"
174 id="text24"
175 sodipodi:linespacing="125%"
176 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
177 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
178 id="tspan3027">Read-Write, Need Consistent Data</tspan></text>
179 <!-- Text -->
180 <text
181 xml:space="preserve"
182 x="7200"
183 y="6975"
184 font-style="normal"
185 font-weight="normal"
186 font-size="324"
187 id="text26"
188 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
189 sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text>
190 <!-- Text -->
191 <text
192 xml:space="preserve"
193 x="7200"
194 y="5625"
195 font-style="normal"
196 font-weight="normal"
197 font-size="324"
198 id="text28"
199 sodipodi:linespacing="125%"
200 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
201 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
202 id="tspan3029">(RCU Might Be OK...)</tspan></text>
203 <!-- Text -->
204 <text
205 xml:space="preserve"
206 x="7200"
207 y="7875"
208 font-style="normal"
209 font-weight="normal"
210 font-size="324"
211 id="text30"
212 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
213 sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text>
214 <!-- Text -->
215 <text
216 xml:space="preserve"
217 x="7200"
218 y="8325"
219 font-style="normal"
220 font-weight="normal"
221 font-size="324"
222 id="text32"
223 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
224 sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text>
225 <!-- Text -->
226 <text
227 xml:space="preserve"
228 x="7200"
229 y="7425"
230 font-style="normal"
231 font-weight="normal"
232 font-size="324"
233 id="text34"
234 style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
235 sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text>
236 </g>
237</svg>
diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
new file mode 100644
index 000000000000..48cd1623d4d4
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
@@ -0,0 +1,639 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3
4<svg
5 xmlns:dc="http://purl.org/dc/elements/1.1/"
6 xmlns:cc="http://creativecommons.org/ns#"
7 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
8 xmlns:svg="http://www.w3.org/2000/svg"
9 xmlns="http://www.w3.org/2000/svg"
10 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
11 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
12 width="735.25"
13 height="516.21875"
14 id="svg2"
15 version="1.1"
16 inkscape:version="0.48.3.1 r9886"
17 sodipodi:docname="ReadersPartitionGP1.svg">
18 <defs
19 id="defs4">
20 <marker
21 inkscape:stockid="Arrow2Lend"
22 orient="auto"
23 refY="0"
24 refX="0"
25 id="Arrow2Lend"
26 style="overflow:visible">
27 <path
28 id="path3792"
29 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
30 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
31 transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
32 inkscape:connector-curvature="0" />
33 </marker>
34 <marker
35 inkscape:stockid="Arrow2Lstart"
36 orient="auto"
37 refY="0"
38 refX="0"
39 id="Arrow2Lstart"
40 style="overflow:visible">
41 <path
42 id="path3789"
43 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
44 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
45 transform="matrix(1.1,0,0,1.1,1.1,0)"
46 inkscape:connector-curvature="0" />
47 </marker>
48 <marker
49 inkscape:stockid="Arrow2Lstart"
50 orient="auto"
51 refY="0"
52 refX="0"
53 id="Arrow2Lstart-4"
54 style="overflow:visible">
55 <path
56 id="path3789-9"
57 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
58 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
59 transform="matrix(1.1,0,0,1.1,1.1,0)"
60 inkscape:connector-curvature="0" />
61 </marker>
62 <marker
63 inkscape:stockid="Arrow2Lend"
64 orient="auto"
65 refY="0"
66 refX="0"
67 id="Arrow2Lend-4"
68 style="overflow:visible">
69 <path
70 id="path3792-4"
71 style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
72 d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
73 transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
74 inkscape:connector-curvature="0" />
75 </marker>
76 </defs>
77 <sodipodi:namedview
78 id="base"
79 pagecolor="#ffffff"
80 bordercolor="#666666"
81 borderopacity="1.0"
82 inkscape:pageopacity="0.0"
83 inkscape:pageshadow="2"
84 inkscape:zoom="1.3670394"
85 inkscape:cx="367.26465"
86 inkscape:cy="258.46182"
87 inkscape:document-units="px"
88 inkscape:current-layer="g4433-6"
89 showgrid="false"
90 inkscape:window-width="1351"
91 inkscape:window-height="836"
92 inkscape:window-x="438"
93 inkscape:window-y="335"
94 inkscape:window-maximized="0"
95 fit-margin-top="5"
96 fit-margin-left="5"
97 fit-margin-right="5"
98 fit-margin-bottom="5" />
99 <metadata
100 id="metadata7">
101 <rdf:RDF>
102 <cc:Work
103 rdf:about="">
104 <dc:format>image/svg+xml</dc:format>
105 <dc:type
106 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
107 <dc:title />
108 </cc:Work>
109 </rdf:RDF>
110 </metadata>
111 <g
112 inkscape:label="Layer 1"
113 inkscape:groupmode="layer"
114 id="layer1"
115 transform="translate(-29.15625,-185.59375)">
116 <flowRoot
117 xml:space="preserve"
118 id="flowRoot2985"
119 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
120 id="flowRegion2987"><rect
121 id="rect2989"
122 width="82.85714"
123 height="11.428572"
124 x="240"
125 y="492.36218" /></flowRegion><flowPara
126 id="flowPara2991" /></flowRoot> <g
127 id="g4433"
128 transform="translate(2,-12)">
129 <text
130 sodipodi:linespacing="125%"
131 id="text2993"
132 y="-261.66608"
133 x="436.12299"
134 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
135 xml:space="preserve"
136 transform="matrix(0,1,-1,0,0,0)"><tspan
137 y="-261.66608"
138 x="436.12299"
139 id="tspan2995"
140 sodipodi:role="line">synchronize_rcu()</tspan></text>
141 <g
142 id="g4417"
143 transform="matrix(0,1,-1,0,730.90257,222.4928)">
144 <path
145 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
146 d="M 97.580736,477.4048 327.57913,476.09759"
147 id="path2997"
148 inkscape:connector-curvature="0"
149 sodipodi:nodetypes="cc" />
150 <path
151 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
152 d="m 96.752718,465.38398 0,22.62742"
153 id="path4397"
154 inkscape:connector-curvature="0"
155 sodipodi:nodetypes="cc" />
156 <path
157 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
158 d="m 328.40703,465.38397 0,22.62742"
159 id="path4397-5"
160 inkscape:connector-curvature="0"
161 sodipodi:nodetypes="cc" />
162 </g>
163 </g>
164 <text
165 xml:space="preserve"
166 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
167 x="112.04738"
168 y="268.18076"
169 id="text4429"
170 sodipodi:linespacing="125%"><tspan
171 sodipodi:role="line"
172 id="tspan4431"
173 x="112.04738"
174 y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
175 <text
176 xml:space="preserve"
177 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
178 x="112.04738"
179 y="487.13766"
180 id="text4441"
181 sodipodi:linespacing="125%"><tspan
182 sodipodi:role="line"
183 id="tspan4443"
184 x="112.04738"
185 y="487.13766">WRITE_ONCE(b, 1);</tspan></text>
186 <text
187 xml:space="preserve"
188 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
189 x="255.60869"
190 y="297.29346"
191 id="text4445"
192 sodipodi:linespacing="125%"><tspan
193 sodipodi:role="line"
194 id="tspan4447"
195 x="255.60869"
196 y="297.29346">r1 = READ_ONCE(a);</tspan></text>
197 <text
198 xml:space="preserve"
199 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
200 x="255.14423"
201 y="554.61786"
202 id="text4449"
203 sodipodi:linespacing="125%"><tspan
204 sodipodi:role="line"
205 id="tspan4451"
206 x="255.14423"
207 y="554.61786">WRITE_ONCE(c, 1);</tspan></text>
208 <text
209 xml:space="preserve"
210 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
211 x="396.10254"
212 y="370.71124"
213 id="text4453"
214 sodipodi:linespacing="125%"><tspan
215 sodipodi:role="line"
216 id="tspan4455"
217 x="396.10254"
218 y="370.71124">WRITE_ONCE(d, 1);</tspan></text>
219 <text
220 xml:space="preserve"
221 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
222 x="396.10254"
223 y="572.13617"
224 id="text4457"
225 sodipodi:linespacing="125%"><tspan
226 sodipodi:role="line"
227 id="tspan4459"
228 x="396.10254"
229 y="572.13617">r2 = READ_ONCE(c);</tspan></text>
230 <text
231 xml:space="preserve"
232 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
233 x="112.08231"
234 y="213.91006"
235 id="text4461"
236 sodipodi:linespacing="125%"><tspan
237 sodipodi:role="line"
238 id="tspan4463"
239 x="112.08231"
240 y="213.91006">thread0()</tspan></text>
241 <text
242 xml:space="preserve"
243 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
244 x="252.34512"
245 y="213.91006"
246 id="text4461-6"
247 sodipodi:linespacing="125%"><tspan
248 sodipodi:role="line"
249 id="tspan4463-0"
250 x="252.34512"
251 y="213.91006">thread1()</tspan></text>
252 <text
253 xml:space="preserve"
254 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
255 x="396.42557"
256 y="213.91006"
257 id="text4461-2"
258 sodipodi:linespacing="125%"><tspan
259 sodipodi:role="line"
260 id="tspan4463-2"
261 x="396.42557"
262 y="213.91006">thread2()</tspan></text>
263 <rect
264 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
265 id="rect4495"
266 width="724.25244"
267 height="505.21201"
268 x="34.648232"
269 y="191.10612" />
270 <path
271 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
272 d="m 183.14066,191.10612 0,504.24243"
273 id="path4497"
274 inkscape:connector-curvature="0"
275 sodipodi:nodetypes="cc" />
276 <path
277 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
278 d="m 325.13867,191.10612 0,504.24243"
279 id="path4497-5"
280 inkscape:connector-curvature="0"
281 sodipodi:nodetypes="cc" />
282 <text
283 xml:space="preserve"
284 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
285 x="111.75929"
286 y="251.53981"
287 id="text4429-8"
288 sodipodi:linespacing="125%"><tspan
289 sodipodi:role="line"
290 id="tspan4431-9"
291 x="111.75929"
292 y="251.53981">rcu_read_lock();</tspan></text>
293 <text
294 xml:space="preserve"
295 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
296 x="396.10254"
297 y="353.91556"
298 id="text4429-8-9"
299 sodipodi:linespacing="125%"><tspan
300 sodipodi:role="line"
301 id="tspan4431-9-4"
302 x="396.10254"
303 y="353.91556">rcu_read_lock();</tspan></text>
304 <text
305 xml:space="preserve"
306 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
307 x="396.10254"
308 y="587.40289"
309 id="text4429-8-9-3"
310 sodipodi:linespacing="125%"><tspan
311 sodipodi:role="line"
312 id="tspan4431-9-4-4"
313 x="396.10254"
314 y="587.40289">rcu_read_unlock();</tspan></text>
315 <text
316 xml:space="preserve"
317 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
318 x="111.75929"
319 y="501.15311"
320 id="text4429-8-9-3-1"
321 sodipodi:linespacing="125%"><tspan
322 sodipodi:role="line"
323 id="tspan4431-9-4-4-6"
324 x="111.75929"
325 y="501.15311">rcu_read_unlock();</tspan></text>
326 <path
327 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
328 d="m 33.941125,227.87568 724.941765,0"
329 id="path4608"
330 inkscape:connector-curvature="0"
331 sodipodi:nodetypes="cc" />
332 <text
333 xml:space="preserve"
334 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
335 x="394.94427"
336 y="331.66351"
337 id="text4648"
338 sodipodi:linespacing="125%"><tspan
339 sodipodi:role="line"
340 id="tspan4650"
341 x="394.94427"
342 y="331.66351">QS</tspan></text>
343 <path
344 sodipodi:type="arc"
345 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
346 id="path4652"
347 sodipodi:cx="358.85669"
348 sodipodi:cy="142.87541"
349 sodipodi:rx="10.960155"
350 sodipodi:ry="10.253048"
351 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
352 transform="translate(36.441125,185.60612)"
353 sodipodi:start="4.7135481"
354 sodipodi:end="10.994651"
355 sodipodi:open="true" />
356 <text
357 xml:space="preserve"
358 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
359 x="112.11968"
360 y="523.77856"
361 id="text4648-4"
362 sodipodi:linespacing="125%"><tspan
363 sodipodi:role="line"
364 id="tspan4650-4"
365 x="112.11968"
366 y="523.77856">QS</tspan></text>
367 <path
368 sodipodi:type="arc"
369 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
370 id="path4652-7"
371 sodipodi:cx="358.85669"
372 sodipodi:cy="142.87541"
373 sodipodi:rx="10.960155"
374 sodipodi:ry="10.253048"
375 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
376 transform="translate(-246.38346,377.72117)"
377 sodipodi:start="4.7135481"
378 sodipodi:end="10.994651"
379 sodipodi:open="true" />
380 <path
381 sodipodi:type="arc"
382 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
383 id="path4652-7-7"
384 sodipodi:cx="358.85669"
385 sodipodi:cy="142.87541"
386 sodipodi:rx="10.960155"
387 sodipodi:ry="10.253048"
388 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
389 transform="translate(-103.65246,190.90878)"
390 sodipodi:start="4.7135481"
391 sodipodi:end="10.994651"
392 sodipodi:open="true" />
393 <text
394 xml:space="preserve"
395 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
396 x="254.85066"
397 y="336.96619"
398 id="text4648-4-3"
399 sodipodi:linespacing="125%"><tspan
400 sodipodi:role="line"
401 id="tspan4650-4-5"
402 x="254.85066"
403 y="336.96619">QS</tspan></text>
404 <path
405 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
406 d="m 470.93311,190.39903 0,504.24243"
407 id="path4497-5-6"
408 inkscape:connector-curvature="0"
409 sodipodi:nodetypes="cc" />
410 <path
411 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
412 d="m 616.22755,190.38323 0,504.24243"
413 id="path4497-5-2"
414 inkscape:connector-curvature="0"
415 sodipodi:nodetypes="cc" />
416 <g
417 id="g4433-6"
418 transform="translate(288.0964,78.32827)">
419 <text
420 sodipodi:linespacing="125%"
421 id="text2993-7"
422 y="-261.66608"
423 x="440.12299"
424 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
425 xml:space="preserve"
426 transform="matrix(0,1,-1,0,0,0)"><tspan
427 y="-261.66608"
428 x="440.12299"
429 id="tspan2995-1"
430 sodipodi:role="line">synchronize_rcu()</tspan></text>
431 <g
432 id="g4417-1"
433 transform="matrix(0,1,-1,0,730.90257,222.4928)">
434 <path
435 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
436 d="M 97.580736,477.4048 328.5624,477.07246"
437 id="path2997-2"
438 inkscape:connector-curvature="0"
439 sodipodi:nodetypes="cc" />
440 <path
441 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
442 d="m 96.752718,465.38398 0,22.62742"
443 id="path4397-3"
444 inkscape:connector-curvature="0"
445 sodipodi:nodetypes="cc" />
446 <path
447 style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
448 d="m 329.39039,465.38397 0,22.62742"
449 id="path4397-5-4"
450 inkscape:connector-curvature="0"
451 sodipodi:nodetypes="cc" />
452 </g>
453 </g>
454 <text
455 xml:space="preserve"
456 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
457 x="541.70508"
458 y="387.6217"
459 id="text4445-0"
460 sodipodi:linespacing="125%"><tspan
461 sodipodi:role="line"
462 id="tspan4447-5"
463 x="541.70508"
464 y="387.6217">r3 = READ_ONCE(d);</tspan></text>
465 <text
466 xml:space="preserve"
467 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
468 x="541.2406"
469 y="646.94611"
470 id="text4449-6"
471 sodipodi:linespacing="125%"><tspan
472 sodipodi:role="line"
473 id="tspan4451-6"
474 x="541.2406"
475 y="646.94611">WRITE_ONCE(e, 1);</tspan></text>
476 <path
477 sodipodi:type="arc"
478 style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
479 id="path4652-7-7-5"
480 sodipodi:cx="358.85669"
481 sodipodi:cy="142.87541"
482 sodipodi:rx="10.960155"
483 sodipodi:ry="10.253048"
484 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
485 transform="translate(182.44393,281.23704)"
486 sodipodi:start="4.7135481"
487 sodipodi:end="10.994651"
488 sodipodi:open="true" />
489 <text
490 xml:space="preserve"
491 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
492 x="540.94702"
493 y="427.29443"
494 id="text4648-4-3-1"
495 sodipodi:linespacing="125%"><tspan
496 sodipodi:role="line"
497 id="tspan4650-4-5-7"
498 x="540.94702"
499 y="427.29443">QS</tspan></text>
500 <text
501 xml:space="preserve"
502 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
503 x="686.27747"
504 y="461.83929"
505 id="text4453-7"
506 sodipodi:linespacing="125%"><tspan
507 sodipodi:role="line"
508 id="tspan4455-1"
509 x="686.27747"
510 y="461.83929">r4 = READ_ONCE(b);</tspan></text>
511 <text
512 xml:space="preserve"
513 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
514 x="686.27747"
515 y="669.26422"
516 id="text4457-9"
517 sodipodi:linespacing="125%"><tspan
518 sodipodi:role="line"
519 id="tspan4459-2"
520 x="686.27747"
521 y="669.26422">r5 = READ_ONCE(e);</tspan></text>
522 <text
523 xml:space="preserve"
524 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
525 x="686.27747"
526 y="445.04358"
527 id="text4429-8-9-33"
528 sodipodi:linespacing="125%"><tspan
529 sodipodi:role="line"
530 id="tspan4431-9-4-2"
531 x="686.27747"
532 y="445.04358">rcu_read_lock();</tspan></text>
533 <text
534 xml:space="preserve"
535 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
536 x="686.27747"
537 y="684.53094"
538 id="text4429-8-9-3-8"
539 sodipodi:linespacing="125%"><tspan
540 sodipodi:role="line"
541 id="tspan4431-9-4-4-5"
542 x="686.27747"
543 y="684.53094">rcu_read_unlock();</tspan></text>
544 <text
545 xml:space="preserve"
546 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
547 x="685.11914"
548 y="422.79153"
549 id="text4648-9"
550 sodipodi:linespacing="125%"><tspan
551 sodipodi:role="line"
552 id="tspan4650-7"
553 x="685.11914"
554 y="422.79153">QS</tspan></text>
555 <path
556 sodipodi:type="arc"
557 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
558 id="path4652-8"
559 sodipodi:cx="358.85669"
560 sodipodi:cy="142.87541"
561 sodipodi:rx="10.960155"
562 sodipodi:ry="10.253048"
563 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
564 transform="translate(326.61602,276.73415)"
565 sodipodi:start="4.7135481"
566 sodipodi:end="10.994651"
567 sodipodi:open="true" />
568 <text
569 xml:space="preserve"
570 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
571 x="397.85934"
572 y="609.59003"
573 id="text4648-5"
574 sodipodi:linespacing="125%"><tspan
575 sodipodi:role="line"
576 id="tspan4650-77"
577 x="397.85934"
578 y="609.59003">QS</tspan></text>
579 <path
580 sodipodi:type="arc"
581 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
582 id="path4652-80"
583 sodipodi:cx="358.85669"
584 sodipodi:cy="142.87541"
585 sodipodi:rx="10.960155"
586 sodipodi:ry="10.253048"
587 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
588 transform="translate(39.356201,463.53264)"
589 sodipodi:start="4.7135481"
590 sodipodi:end="10.994651"
591 sodipodi:open="true" />
592 <text
593 xml:space="preserve"
594 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
595 x="256.75986"
596 y="586.99133"
597 id="text4648-5-2"
598 sodipodi:linespacing="125%"><tspan
599 sodipodi:role="line"
600 id="tspan4650-77-7"
601 x="256.75986"
602 y="586.99133">QS</tspan></text>
603 <path
604 sodipodi:type="arc"
605 style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
606 id="path4652-80-5"
607 sodipodi:cx="358.85669"
608 sodipodi:cy="142.87541"
609 sodipodi:rx="10.960155"
610 sodipodi:ry="10.253048"
611 d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
612 transform="translate(-101.74328,440.93395)"
613 sodipodi:start="4.7135481"
614 sodipodi:end="10.994651"
615 sodipodi:open="true" />
616 <text
617 xml:space="preserve"
618 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
619 x="546.22791"
620 y="213.91006"
621 id="text4461-2-5"
622 sodipodi:linespacing="125%"><tspan
623 sodipodi:role="line"
624 id="tspan4463-2-6"
625 x="546.22791"
626 y="213.91006">thread3()</tspan></text>
627 <text
628 xml:space="preserve"
629 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
630 x="684.00067"
631 y="213.91006"
632 id="text4461-2-1"
633 sodipodi:linespacing="125%"><tspan
634 sodipodi:role="line"
635 id="tspan4463-2-0"
636 x="684.00067"
637 y="213.91006">thread4()</tspan></text>
638 </g>
639</svg>
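
The ReadersPartitionGP1.svg figure just added extends the same idea to five
threads, using two grace periods and two additional read-side critical sections.
A rough plain-C transcription of the figure (all variables initially zero,
declarations and thread harness again omitted) is:

	void thread0(void)
	{
		rcu_read_lock();
		WRITE_ONCE(a, 1);
		WRITE_ONCE(b, 1);
		rcu_read_unlock();
	}

	void thread1(void)
	{
		r1 = READ_ONCE(a);
		synchronize_rcu();
		WRITE_ONCE(c, 1);
	}

	void thread2(void)
	{
		rcu_read_lock();
		WRITE_ONCE(d, 1);
		r2 = READ_ONCE(c);
		rcu_read_unlock();
	}

	void thread3(void)
	{
		r3 = READ_ONCE(d);
		synchronize_rcu();
		WRITE_ONCE(e, 1);
	}

	void thread4(void)
	{
		rcu_read_lock();
		r4 = READ_ONCE(b);
		r5 = READ_ONCE(e);
		rcu_read_unlock();
	}

As drawn, given r1 == 1 and r3 == 1, each synchronize_rcu() must wait for the
critical section to its left, so the two grace periods taken together order
thread0()'s critical section before thread4()'s; a run in which r1, r2, r3, and
r5 are all 1 should therefore also find r4 == 1.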
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
new file mode 100644
index 000000000000..36de7aaa941e
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -0,0 +1,2799 @@
1<!-- DO NOT HAND EDIT. -->
2<!-- Instead, edit Requirements.htmlx and run 'sh htmlqqz.sh Requirements' -->
3<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
4 "http://www.w3.org/TR/html4/loose.dtd">
5 <html>
6 <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
7 <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
8
9<h1>A Tour Through RCU's Requirements</h1>
10
11<p>Copyright IBM Corporation, 2015</p>
12<p>Author: Paul E.&nbsp;McKenney</p>
13<p><i>The initial version of this document appeared in the
14<a href="https://lwn.net/">LWN</a> articles
15<a href="https://lwn.net/Articles/652156/">here</a>,
16<a href="https://lwn.net/Articles/652677/">here</a>, and
17<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
18
19<h2>Introduction</h2>
20
21<p>
22Read-copy update (RCU) is a synchronization mechanism that is often
23used as a replacement for reader-writer locking.
24RCU is unusual in that updaters do not block readers,
25which means that RCU's read-side primitives can be exceedingly fast
26and scalable.
27In addition, updaters can make useful forward progress concurrently
28with readers.
29However, all this concurrency between RCU readers and updaters does raise
30the question of exactly what RCU readers are doing, which in turn
31raises the question of exactly what RCU's requirements are.
32
33<p>
34This document therefore summarizes RCU's requirements, and can be thought
35of as an informal, high-level specification for RCU.
36It is important to understand that RCU's specification is primarily
37empirical in nature;
38in fact, I learned about many of these requirements the hard way.
39This situation might cause some consternation; however, not only
40has this learning process been a lot of fun, but it has also been
41a great privilege to work with so many people willing to apply
42technologies in interesting new ways.
43
44<p>
45All that aside, here are the categories of currently known RCU requirements:
46</p>
47
48<ol>
49<li> <a href="#Fundamental Requirements">
50 Fundamental Requirements</a>
51<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
52<li> <a href="#Parallelism Facts of Life">
53 Parallelism Facts of Life</a>
54<li> <a href="#Quality-of-Implementation Requirements">
55 Quality-of-Implementation Requirements</a>
56<li> <a href="#Linux Kernel Complications">
57 Linux Kernel Complications</a>
58<li> <a href="#Software-Engineering Requirements">
59 Software-Engineering Requirements</a>
60<li> <a href="#Other RCU Flavors">
61 Other RCU Flavors</a>
62<li> <a href="#Possible Future Changes">
63 Possible Future Changes</a>
64</ol>
65
66<p>
67This is followed by a <a href="#Summary">summary</a>,
68which is in turn followed by the inevitable
69<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
70
71<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
72
73<p>
74RCU's fundamental requirements are the closest thing RCU has to hard
75mathematical requirements.
76These are:
77
78<ol>
79<li> <a href="#Grace-Period Guarantee">
80 Grace-Period Guarantee</a>
81<li> <a href="#Publish-Subscribe Guarantee">
82 Publish-Subscribe Guarantee</a>
83<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
84 RCU Primitives Guaranteed to Execute Unconditionally</a>
85<li> <a href="#Guaranteed Read-to-Write Upgrade">
86 Guaranteed Read-to-Write Upgrade</a>
87</ol>
88
89<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
90
91<p>
92RCU's grace-period guarantee is unusual in being premeditated:
93Jack Slingwine and I had this guarantee firmly in mind when we started
94work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
95That said, the past two decades of experience with RCU have produced
96a much more detailed understanding of this guarantee.
97
98<p>
99RCU's grace-period guarantee allows updaters to wait for the completion
100of all pre-existing RCU read-side critical sections.
101An RCU read-side critical section
102begins with the marker <tt>rcu_read_lock()</tt> and ends with
103the marker <tt>rcu_read_unlock()</tt>.
104These markers may be nested, and RCU treats a nested set as one
105big RCU read-side critical section.
106Production-quality implementations of <tt>rcu_read_lock()</tt> and
107<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
108fact have exactly zero overhead in Linux kernels built for production
109use with <tt>CONFIG_PREEMPT=n</tt>.
110
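<p>
For example, in the following minimal sketch (the
<tt>do_this_locklessly()</tt> and <tt>do_that_locklessly()</tt>
helpers are hypothetical), the inner
<tt>rcu_read_lock()</tt>/<tt>rcu_read_unlock()</tt> pair is nested
within the outer pair, and RCU treats everything between the
outermost markers as a single RCU read-side critical section:

<blockquote>
<pre>
 1 void nested_reader(void)
 2 {
 3   rcu_read_lock();        /* outermost marker */
 4   do_this_locklessly();
 5   rcu_read_lock();        /* nested: no new critical section */
 6   do_that_locklessly();
 7   rcu_read_unlock();      /* does not end the critical section */
 8   do_this_locklessly();
 9   rcu_read_unlock();      /* outermost marker ends it */
10 }
</pre>
</blockquote>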
111<p>
112This guarantee allows ordering to be enforced with extremely low
113overhead to readers, for example:
114
115<blockquote>
116<pre>
117 1 int x, y;
118 2
119 3 void thread0(void)
120 4 {
121 5 rcu_read_lock();
122 6 r1 = READ_ONCE(x);
123 7 r2 = READ_ONCE(y);
124 8 rcu_read_unlock();
125 9 }
12610
12711 void thread1(void)
12812 {
12913 WRITE_ONCE(x, 1);
13014 synchronize_rcu();
13115 WRITE_ONCE(y, 1);
13216 }
133</pre>
134</blockquote>
135
136<p>
137Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
138all pre-existing readers, any instance of <tt>thread0()</tt> that
139loads a value of zero from <tt>x</tt> must complete before
140<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
141also load a value of zero from <tt>y</tt>.
142Similarly, any instance of <tt>thread0()</tt> that loads a value of
143one from <tt>y</tt> must have started after the
144<tt>synchronize_rcu()</tt> started, and must therefore also load
145a value of one from <tt>x</tt>.
146Therefore, the outcome:
147<blockquote>
148<pre>
149(r1 == 0 &amp;&amp; r2 == 1)
150</pre>
151</blockquote>
152cannot happen.
153
154<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a>
155Wait a minute!
156You said that updaters can make useful forward progress concurrently
157with readers, but pre-existing readers will block
158<tt>synchronize_rcu()</tt>!!!
159Just who are you trying to fool???
160<br><a href="#qq1answer">Answer</a>
161
162<p>
163This scenario resembles one of the first uses of RCU in
164<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
165which managed a distributed lock manager's transition into
166a state suitable for handling recovery from node failure,
167more or less as follows:
168
169<blockquote>
170<pre>
171 1 #define STATE_NORMAL 0
172 2 #define STATE_WANT_RECOVERY 1
173 3 #define STATE_RECOVERING 2
174 4 #define STATE_WANT_NORMAL 3
175 5
176 6 int state = STATE_NORMAL;
177 7
178 8 void do_something_dlm(void)
179 9 {
18010 int state_snap;
18111
18212 rcu_read_lock();
18313 state_snap = READ_ONCE(state);
18414 if (state_snap == STATE_NORMAL)
18515 do_something();
18616 else
18717 do_something_carefully();
18818 rcu_read_unlock();
18919 }
19020
19121 void start_recovery(void)
19222 {
19323 WRITE_ONCE(state, STATE_WANT_RECOVERY);
19424 synchronize_rcu();
19525 WRITE_ONCE(state, STATE_RECOVERING);
19626 recovery();
19727 WRITE_ONCE(state, STATE_WANT_NORMAL);
19828 synchronize_rcu();
19929 WRITE_ONCE(state, STATE_NORMAL);
20030 }
201</pre>
202</blockquote>
203
204<p>
205The RCU read-side critical section in <tt>do_something_dlm()</tt>
206works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
207to guarantee that <tt>do_something()</tt> never runs concurrently
208with <tt>recovery()</tt>, but with little or no synchronization
209overhead in <tt>do_something_dlm()</tt>.
210
211<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a>
212Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
213<br><a href="#qq2answer">Answer</a>
214
215<p>
216In order to avoid fatal problems such as deadlocks,
217an RCU read-side critical section must not contain calls to
218<tt>synchronize_rcu()</tt>.
219Similarly, an RCU read-side critical section must not
220contain anything that waits, directly or indirectly, on completion of
221an invocation of <tt>synchronize_rcu()</tt>.
222
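<p>
The following deliberately broken sketch (the <tt>do_something()</tt>
calls are placeholders) shows why: the <tt>synchronize_rcu()</tt>
on line&nbsp;5 waits for all pre-existing readers, including the very
read-side critical section that contains it, so it can never return.

<blockquote>
<pre>
 1 void buggy_reader(void)
 2 {
 3   rcu_read_lock();
 4   do_something();
 5   synchronize_rcu();  /* BUG: waits for the enclosing reader to finish */
 6   do_something_else();
 7   rcu_read_unlock();
 8 }
</pre>
</blockquote>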
223<p>
224Although RCU's grace-period guarantee is useful in and of itself, with
225<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
226it would be good to be able to use RCU to coordinate read-side
227access to linked data structures.
228For this, the grace-period guarantee is not sufficient, as can
229be seen in function <tt>add_gp_buggy()</tt> below.
230We will look at the reader's code later, but in the meantime, just think of
231the reader as locklessly picking up the <tt>gp</tt> pointer,
232and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
233<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
234
235<blockquote>
236<pre>
237 1 bool add_gp_buggy(int a, int b)
238 2 {
239 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
240 4 if (!p)
241 5 return -ENOMEM;
242 6 spin_lock(&amp;gp_lock);
243 7 if (rcu_access_pointer(gp)) {
244 8 spin_unlock(&amp;gp_lock);
245 9 return false;
24610 }
24711 p-&gt;a = a;
24812   p-&gt;b = b;
24913 gp = p; /* ORDERING BUG */
25014 spin_unlock(&amp;gp_lock);
25115 return true;
25216 }
253</pre>
254</blockquote>
255
256<p>
257The problem is that both the compiler and weakly ordered CPUs are within
258their rights to reorder this code as follows:
259
260<blockquote>
261<pre>
262 1 bool add_gp_buggy_optimized(int a, int b)
263 2 {
264 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
265 4 if (!p)
266 5 return -ENOMEM;
267 6 spin_lock(&amp;gp_lock);
268 7 if (rcu_access_pointer(gp)) {
269 8 spin_unlock(&amp;gp_lock);
270 9 return false;
27110 }
272<b>11 gp = p; /* ORDERING BUG */
27312 p-&gt;a = a;
27413   p-&gt;b = b;</b>
27514 spin_unlock(&amp;gp_lock);
27615 return true;
27716 }
278</pre>
279</blockquote>
280
281<p>
282If an RCU reader fetches <tt>gp</tt> just after
283<tt>add_gp_buggy_optimized</tt> executes line&nbsp;11,
284it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
285fields.
286And this is but one of many ways in which compiler and hardware optimizations
287could cause trouble.
288Therefore, we clearly need some way to prevent the compiler and the CPU from
289reordering in this manner, which brings us to the publish-subscribe
290guarantee discussed in the next section.
291
292<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3>
293
294<p>
295RCU's publish-subscribe guarantee allows data to be inserted
296into a linked data structure without disrupting RCU readers.
297The updater uses <tt>rcu_assign_pointer()</tt> to insert the
298new data, and readers use <tt>rcu_dereference()</tt> to
299access data, whether new or old.
300The following shows an example of insertion:
301
302<blockquote>
303<pre>
304 1 bool add_gp(int a, int b)
305 2 {
306 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
307 4 if (!p)
308 5 return -ENOMEM;
309 6 spin_lock(&amp;gp_lock);
310 7 if (rcu_access_pointer(gp)) {
311 8 spin_unlock(&amp;gp_lock);
312 9 return false;
31310 }
31411 p-&gt;a = a;
31512   p-&gt;b = b;
31613 rcu_assign_pointer(gp, p);
31714 spin_unlock(&amp;gp_lock);
31815 return true;
31916 }
320</pre>
321</blockquote>
322
323<p>
324The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
325equivalent to a simple assignment statement, but also guarantees
326that its assignment will
327happen after the two assignments in lines&nbsp;11 and&nbsp;12,
328similar to the C11 <tt>memory_order_release</tt> store operation.
329It also prevents any number of &ldquo;interesting&rdquo; compiler
330optimizations, for example, the use of <tt>gp</tt> as a scratch
331location immediately preceding the assignment.
332
333<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a>
334But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
335two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
336from being reordered.
337Can't that also cause problems?
338<br><a href="#qq3answer">Answer</a>
339
340<p>
341It is tempting to assume that the reader need not do anything special
342to control its accesses to the RCU-protected data,
343as shown in <tt>do_something_gp_buggy()</tt> below:
344
345<blockquote>
346<pre>
347 1 bool do_something_gp_buggy(void)
348 2 {
349 3 rcu_read_lock();
350 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
351 5 if (p) {
352 6 do_something(p-&gt;a, p-&gt;b);
353 7 rcu_read_unlock();
354 8 return true;
355 9 }
35610 rcu_read_unlock();
35711 return false;
35812 }
359</pre>
360</blockquote>
361
362<p>
363However, this temptation must be resisted because there are a
364surprisingly large number of ways that the compiler
365(to say nothing of
366<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
367can trip this code up.
368For but one example, if the compiler were short of registers, it
369might choose to refetch from <tt>gp</tt> rather than keeping
370a separate copy in <tt>p</tt> as follows:
371
372<blockquote>
373<pre>
374 1 bool do_something_gp_buggy_optimized(void)
375 2 {
376 3 rcu_read_lock();
377 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
378<b> 5 do_something(gp-&gt;a, gp-&gt;b);</b>
379 6 rcu_read_unlock();
380 7 return true;
381 8 }
382 9 rcu_read_unlock();
38310 return false;
38411 }
385</pre>
386</blockquote>
387
388<p>
389If this function ran concurrently with a series of updates that
390replaced the current structure with a new one,
391the fetches of <tt>gp-&gt;a</tt>
392and <tt>gp-&gt;b</tt> might well come from two different structures,
393which could cause serious confusion.
394To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
395<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
396
397<blockquote>
398<pre>
399 1 bool do_something_gp(void)
400 2 {
401 3 rcu_read_lock();
402 4 p = rcu_dereference(gp);
403 5 if (p) {
404 6 do_something(p-&gt;a, p-&gt;b);
405 7 rcu_read_unlock();
406 8 return true;
407 9 }
40810 rcu_read_unlock();
40911 return false;
41012 }
411</pre>
412</blockquote>
413
414<p>
415The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
416memory barriers in the Linux kernel.
417Should a
418<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
419ever appear, then <tt>rcu_dereference()</tt> could be implemented
420as a <tt>memory_order_consume</tt> load.
421Regardless of the exact implementation, a pointer fetched by
422<tt>rcu_dereference()</tt> may not be used outside of the
423outermost RCU read-side critical section containing that
424<tt>rcu_dereference()</tt>, unless protection of
425the corresponding data element has been passed from RCU to some
426other synchronization mechanism, most commonly locking or
427<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
428
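<p>
As a rough sketch of such a handoff, the following (which assumes
that <tt>struct foo</tt> has been given an <tt>atomic_t</tt>
<tt>refcnt</tt> field, and that some other code drops that reference
and frees the structure only after a grace period) acquires a
reference within the read-side critical section so that the pointer
may be used after <tt>rcu_read_unlock()</tt>:

<blockquote>
<pre>
 1 struct foo *get_gp_reference(void)
 2 {
 3   struct foo *p;
 4
 5   rcu_read_lock();
 6   p = rcu_dereference(gp);
 7   if (p &amp;&amp; !atomic_inc_not_zero(&amp;p-&gt;refcnt))
 8     p = NULL;  /* structure is on its way out, pretend it was absent */
 9   rcu_read_unlock();
10   return p;    /* the reference, not RCU, now protects *p */
11 }
</pre>
</blockquote>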
429<p>
430In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
431use <tt>rcu_dereference()</tt>, and these two RCU API elements
432work together to ensure that readers have a consistent view of
433newly added data elements.
434
435<p>
436Of course, it is also necessary to remove elements from RCU-protected
437data structures, for example, using the following process:
438
439<ol>
440<li> Remove the data element from the enclosing structure.
441<li> Wait for all pre-existing RCU read-side critical sections
442 to complete (because only pre-existing readers can possibly have
443 a reference to the newly removed data element).
444<li> At this point, only the updater has a reference to the
445 newly removed data element, so it can safely reclaim
446 the data element, for example, by passing it to <tt>kfree()</tt>.
447</ol>
448
449This process is implemented by <tt>remove_gp_synchronous()</tt>:
450
451<blockquote>
452<pre>
453 1 bool remove_gp_synchronous(void)
454 2 {
455 3 struct foo *p;
456 4
457 5 spin_lock(&amp;gp_lock);
458 6 p = rcu_access_pointer(gp);
459 7 if (!p) {
460 8 spin_unlock(&amp;gp_lock);
461 9 return false;
46210 }
46311 rcu_assign_pointer(gp, NULL);
46412 spin_unlock(&amp;gp_lock);
46513 synchronize_rcu();
46614 kfree(p);
46715 return true;
46816 }
469</pre>
470</blockquote>
471
472<p>
473This function is straightforward, with line&nbsp;13 waiting for a grace
474period before line&nbsp;14 frees the old data element.
475This waiting ensures that readers will reach line&nbsp;7 of
476<tt>do_something_gp()</tt> before the data element referenced by
477<tt>p</tt> is freed.
478The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
479<tt>rcu_dereference()</tt>, except that:
480
481<ol>
482<li> The value returned by <tt>rcu_access_pointer()</tt>
483 cannot be dereferenced.
484 If you want to access the value pointed to as well as
485 the pointer itself, use <tt>rcu_dereference()</tt>
486 instead of <tt>rcu_access_pointer()</tt>.
487<li> The call to <tt>rcu_access_pointer()</tt> need not be
488 protected.
489 In contrast, <tt>rcu_dereference()</tt> must either be
490 within an RCU read-side critical section or in a code
491 segment where the pointer cannot change, for example, in
492 code protected by the corresponding update-side lock.
493</ol>
494
495<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a>
496Without the <tt>rcu_dereference()</tt> or the
497<tt>rcu_access_pointer()</tt>, what destructive optimizations
498might the compiler make use of?
499<br><a href="#qq4answer">Answer</a>
500
501<p>
502This simple linked-data-structure scenario clearly demonstrates the need
503for RCU's stringent memory-ordering guarantees on systems with more than
504one CPU:
505
506<ol>
507<li> Each CPU that has an RCU read-side critical section that
508 begins before <tt>synchronize_rcu()</tt> starts is
509 guaranteed to execute a full memory barrier between the time
510 that the RCU read-side critical section ends and the time that
511 <tt>synchronize_rcu()</tt> returns.
512 Without this guarantee, a pre-existing RCU read-side critical section
513 might hold a reference to the newly removed <tt>struct foo</tt>
514 after the <tt>kfree()</tt> on line&nbsp;14 of
515 <tt>remove_gp_synchronous()</tt>.
516<li> Each CPU that has an RCU read-side critical section that ends
517 after <tt>synchronize_rcu()</tt> returns is guaranteed
518 to execute a full memory barrier between the time that
519 <tt>synchronize_rcu()</tt> begins and the time that the RCU
520 read-side critical section begins.
521 Without this guarantee, a later RCU read-side critical section
522 running after the <tt>kfree()</tt> on line&nbsp;14 of
523 <tt>remove_gp_synchronous()</tt> might
524 later run <tt>do_something_gp()</tt> and find the
525 newly deleted <tt>struct foo</tt>.
526<li> If the task invoking <tt>synchronize_rcu()</tt> remains
527 on a given CPU, then that CPU is guaranteed to execute a full
528 memory barrier sometime during the execution of
529 <tt>synchronize_rcu()</tt>.
530 This guarantee ensures that the <tt>kfree()</tt> on
531 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
532 execute after the removal on line&nbsp;11.
533<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
534 among a group of CPUs during that invocation, then each of the
535 CPUs in that group is guaranteed to execute a full memory barrier
536 sometime during the execution of <tt>synchronize_rcu()</tt>.
537 This guarantee also ensures that the <tt>kfree()</tt> on
538 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
539 execute after the removal on
540 line&nbsp;11, but also in the case where the thread executing the
541 <tt>synchronize_rcu()</tt> migrates in the meantime.
542</ol>
543
544<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a>
545Given that multiple CPUs can start RCU read-side critical sections
546at any time without any ordering whatsoever, how can RCU possibly tell whether
547or not a given RCU read-side critical section starts before a
548given instance of <tt>synchronize_rcu()</tt>?
549<br><a href="#qq5answer">Answer</a>
550
551<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a>
552The first and second guarantees require unbelievably strict ordering!
553Are all these memory barriers <i> really</i> required?
554<br><a href="#qq6answer">Answer</a>
555
556<p>
557In short, RCU's publish-subscribe guarantee is provided by the combination
558of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
559This guarantee allows data elements to be safely added to RCU-protected
560linked data structures without disrupting RCU readers.
561This guarantee can be used in combination with the grace-period
562guarantee to also allow data elements to be removed from RCU-protected
563linked data structures, again without disrupting RCU readers.
564
565<p>
566This guarantee was only partially premeditated.
567DYNIX/ptx used an explicit memory barrier for publication, but had nothing
568resembling <tt>rcu_dereference()</tt> for subscription, nor did it
569have anything resembling the <tt>smp_read_barrier_depends()</tt>
570that was later subsumed into <tt>rcu_dereference()</tt>.
571The need for these operations made itself known quite suddenly at a
572late-1990s meeting with the DEC Alpha architects, back in the days when
573DEC was still a free-standing company.
574It took the Alpha architects a good hour to convince me that any sort
575of barrier would ever be needed, and it then took me a good <i>two</i> hours
576to convince them that their documentation did not make this point clear.
577More recent work with the C and C++ standards committees has provided
578much education on tricks and traps from the compiler.
579In short, compilers were much less tricky in the early 1990s, but in
5802015, don't even think about omitting <tt>rcu_dereference()</tt>!
581
582<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
583
584<p>
585The common-case RCU primitives are unconditional.
586They are invoked, they do their job, and they return, with no possibility
587of error, and no need to retry.
588This is a key RCU design philosophy.
589
590<p>
591However, this philosophy is pragmatic rather than pigheaded.
592If someone comes up with a good justification for a particular conditional
593RCU primitive, it might well be implemented and added.
594After all, this guarantee was reverse-engineered, not premeditated.
595The unconditional nature of the RCU primitives was initially an
596accident of implementation, and later experience with other synchronization
597primitives that do provide conditional operations caused me to elevate this
598accident to a guarantee.
599Therefore, the justification for adding a conditional primitive to
600RCU would need to be based on detailed and compelling use cases.
601
602<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
603
604<p>
605As far as RCU is concerned, it is always possible to carry out an
606update within an RCU read-side critical section.
607For example, that RCU read-side critical section might search for
608a given data element, and then might acquire the update-side
609spinlock in order to update that element, all while remaining
610in that RCU read-side critical section.
611Of course, it is necessary to exit the RCU read-side critical section
612before invoking <tt>synchronize_rcu()</tt>; however, this
613inconvenience can be avoided through use of the
614<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
615described later in this document.
616
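<p>
A minimal sketch of such an upgrade, reusing <tt>gp</tt> and
<tt>gp_lock</tt> from the earlier examples (and a hypothetical
<tt>needs_update()</tt> predicate), might look as follows:

<blockquote>
<pre>
 1 void maybe_update_gp(int new_a)
 2 {
 3   struct foo *p;
 4
 5   rcu_read_lock();
 6   p = rcu_dereference(gp);
 7   if (p &amp;&amp; needs_update(p)) {
 8     spin_lock(&amp;gp_lock);             /* upgrade to writer... */
 9     if (p == rcu_access_pointer(gp))   /* ...and recheck under the lock */
10       WRITE_ONCE(p-&gt;a, new_a);
11     spin_unlock(&amp;gp_lock);
12   }
13   rcu_read_unlock();                   /* reader exits after the update */
14 }
</pre>
</blockquote>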
617<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a>
618But how does the upgrade-to-write operation exclude other readers?
619<br><a href="#qq7answer">Answer</a>
620
621<p>
622This guarantee allows lookup code to be shared between read-side
623and update-side code, and was premeditated, appearing in the earliest
624DYNIX/ptx RCU documentation.
625
626<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
627
628<p>
629RCU provides extremely lightweight readers, and its read-side guarantees,
630though quite useful, are correspondingly lightweight.
631It is therefore all too easy to assume that RCU is guaranteeing more
632than it really is.
633Of course, the list of things that RCU does not guarantee is infinitely
634long; however, the following sections list a few non-guarantees that
635have caused confusion.
636Except where otherwise noted, these non-guarantees were premeditated.
637
638<ol>
639<li> <a href="#Readers Impose Minimal Ordering">
640 Readers Impose Minimal Ordering</a>
641<li> <a href="#Readers Do Not Exclude Updaters">
642 Readers Do Not Exclude Updaters</a>
643<li> <a href="#Updaters Only Wait For Old Readers">
644 Updaters Only Wait For Old Readers</a>
645<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
646 Grace Periods Don't Partition Read-Side Critical Sections</a>
647<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
648 Read-Side Critical Sections Don't Partition Grace Periods</a>
649<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
650 Disabling Preemption Does Not Block Grace Periods</a>
651</ol>
652
653<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
654
655<p>
656Reader-side markers such as <tt>rcu_read_lock()</tt> and
657<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
658except through their interaction with the grace-period APIs such as
659<tt>synchronize_rcu()</tt>.
660To see this, consider the following pair of threads:
661
662<blockquote>
663<pre>
664 1 void thread0(void)
665 2 {
666 3 rcu_read_lock();
667 4 WRITE_ONCE(x, 1);
668 5 rcu_read_unlock();
669 6 rcu_read_lock();
670 7 WRITE_ONCE(y, 1);
671 8 rcu_read_unlock();
672 9 }
67310
67411 void thread1(void)
67512 {
67613 rcu_read_lock();
67714 r1 = READ_ONCE(y);
67815 rcu_read_unlock();
67916 rcu_read_lock();
68017 r2 = READ_ONCE(x);
68118 rcu_read_unlock();
68219 }
683</pre>
684</blockquote>
685
686<p>
687After <tt>thread0()</tt> and <tt>thread1()</tt> execute
688concurrently, it is quite possible to have
689
690<blockquote>
691<pre>
692(r1 == 1 &amp;&amp; r2 == 0)
693</pre>
694</blockquote>
695
696(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
697which would not be possible if <tt>rcu_read_lock()</tt> and
698<tt>rcu_read_unlock()</tt> had much in the way of ordering
699properties.
700But they do not, so the CPU is within its rights
701to do significant reordering.
702This is by design: Any significant ordering constraints would slow down
703these fast-path APIs.
704
705<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a>
706Can't the compiler also reorder this code?
707<br><a href="#qq8answer">Answer</a>
708
709<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
710
711<p>
712Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
713exclude updates.
714All they do is to prevent grace periods from ending.
715The following example illustrates this:
716
717<blockquote>
718<pre>
719 1 void thread0(void)
720 2 {
721 3 rcu_read_lock();
722 4 r1 = READ_ONCE(y);
723 5 if (r1) {
724 6 do_something_with_nonzero_x();
725 7 r2 = READ_ONCE(x);
726 8 WARN_ON(!r2); /* BUG!!! */
727 9 }
72810 rcu_read_unlock();
72911 }
73012
73113 void thread1(void)
73214 {
73315 spin_lock(&amp;my_lock);
73416 WRITE_ONCE(x, 1);
73517 WRITE_ONCE(y, 1);
73618 spin_unlock(&amp;my_lock);
73719 }
738</pre>
739</blockquote>
740
741<p>
742If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
743excluded the <tt>thread1()</tt> function's update,
744the <tt>WARN_ON()</tt> could never fire.
745But the fact is that <tt>rcu_read_lock()</tt> does not exclude
746much of anything aside from subsequent grace periods, of which
747<tt>thread1()</tt> has none, so the
748<tt>WARN_ON()</tt> can and does fire.
749
750<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
751
752<p>
753It might be tempting to assume that after <tt>synchronize_rcu()</tt>
754completes, there are no readers executing.
755This temptation must be avoided because
756new readers can start immediately after <tt>synchronize_rcu()</tt>
757starts, and <tt>synchronize_rcu()</tt> is under no
758obligation to wait for these new readers.
759
760<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a>
761Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
762Would the updater be able to rely on this?
763<br><a href="#qq9answer">Answer</a>
764
765<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
766Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
767
768<p>
769It is tempting to assume that if any part of one RCU read-side critical
770section precedes a given grace period, and if any part of another RCU
771read-side critical section follows that same grace period, then all of
772the first RCU read-side critical section must precede all of the second.
773However, this just isn't the case: A single grace period does not
774partition the set of RCU read-side critical sections.
775This situation can be illustrated as follows, where
776<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
777
778<blockquote>
779<pre>
780 1 void thread0(void)
781 2 {
782 3 rcu_read_lock();
783 4 WRITE_ONCE(a, 1);
784 5 WRITE_ONCE(b, 1);
785 6 rcu_read_unlock();
786 7 }
787 8
788 9 void thread1(void)
78910 {
79011 r1 = READ_ONCE(a);
79112 synchronize_rcu();
79213 WRITE_ONCE(c, 1);
79314 }
79415
79516 void thread2(void)
79617 {
79718 rcu_read_lock();
79819 r2 = READ_ONCE(b);
79920 r3 = READ_ONCE(c);
80021 rcu_read_unlock();
80122 }
802</pre>
803</blockquote>
804
805<p>
806It turns out that the outcome:
807
808<blockquote>
809<pre>
810(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
811</pre>
812</blockquote>
813
814is entirely possible.
815The following figure shows how this can happen, with each circled
816<tt>QS</tt> indicating the point at which RCU recorded a
817<i>quiescent state</i> for each thread, that is, a state in which
818RCU knows that the thread cannot be in the midst of an RCU read-side
819critical section that started before the current grace period:
820
821<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
822
823<p>
824If it is necessary to partition RCU read-side critical sections in this
825manner, it is necessary to use two grace periods, where the first
826grace period is known to end before the second grace period starts:
827
828<blockquote>
829<pre>
830 1 void thread0(void)
831 2 {
832 3 rcu_read_lock();
833 4 WRITE_ONCE(a, 1);
834 5 WRITE_ONCE(b, 1);
835 6 rcu_read_unlock();
836 7 }
837 8
838 9 void thread1(void)
83910 {
84011 r1 = READ_ONCE(a);
84112 synchronize_rcu();
84213 WRITE_ONCE(c, 1);
84314 }
84415
84516 void thread2(void)
84617 {
84718 r2 = READ_ONCE(c);
84819 synchronize_rcu();
84920 WRITE_ONCE(d, 1);
85021 }
85122
85223 void thread3(void)
85324 {
85425 rcu_read_lock();
85526 r3 = READ_ONCE(b);
85627 r4 = READ_ONCE(d);
85728 rcu_read_unlock();
85829 }
859</pre>
860</blockquote>
861
862<p>
863Here, if <tt>(r1 == 1)</tt>, then
864<tt>thread0()</tt>'s write to <tt>b</tt> must happen
865before the end of <tt>thread1()</tt>'s grace period.
866If in addition <tt>(r4 == 1)</tt>, then
867<tt>thread3()</tt>'s read from <tt>b</tt> must happen
868after the beginning of <tt>thread2()</tt>'s grace period.
869If it is also the case that <tt>(r2 == 1)</tt>, then the
870end of <tt>thread1()</tt>'s grace period must precede the
871beginning of <tt>thread2()</tt>'s grace period.
872This means that the two RCU read-side critical sections cannot overlap,
873guaranteeing that <tt>(r3 == 1)</tt>.
874As a result, the outcome:
875
876<blockquote>
877<pre>
878(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
879</pre>
880</blockquote>
881
882cannot happen.
883
884<p>
885This non-requirement was also non-premeditated, but became apparent
886when studying RCU's interaction with memory ordering.
887
888<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
889Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
890
891<p>
892It is also tempting to assume that if an RCU read-side critical section
893happens between a pair of grace periods, then those grace periods cannot
894overlap.
895However, this temptation leads nowhere good, as can be illustrated by
896the following, with all variables initially zero:
897
898<blockquote>
899<pre>
900 1 void thread0(void)
901 2 {
902 3 rcu_read_lock();
903 4 WRITE_ONCE(a, 1);
904 5 WRITE_ONCE(b, 1);
905 6 rcu_read_unlock();
906 7 }
907 8
908 9 void thread1(void)
90910 {
91011 r1 = READ_ONCE(a);
91112 synchronize_rcu();
91213 WRITE_ONCE(c, 1);
91314 }
91415
91516 void thread2(void)
91617 {
91718 rcu_read_lock();
91819 WRITE_ONCE(d, 1);
91920 r2 = READ_ONCE(c);
92021 rcu_read_unlock();
92122 }
92223
92324 void thread3(void)
92425 {
92526 r3 = READ_ONCE(d);
92627 synchronize_rcu();
92728 WRITE_ONCE(e, 1);
92829 }
92930
93031 void thread4(void)
93132 {
93233 rcu_read_lock();
93334 r4 = READ_ONCE(b);
93435 r5 = READ_ONCE(e);
93536 rcu_read_unlock();
93637 }
937</pre>
938</blockquote>
939
940<p>
941In this case, the outcome:
942
943<blockquote>
944<pre>
945(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
946</pre>
947</blockquote>
948
949is entirely possible, as illustrated below:
950
951<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
952
953<p>
954Again, an RCU read-side critical section can overlap almost all of a
955given grace period, just so long as it does not overlap the entire
956grace period.
957As a result, an RCU read-side critical section cannot partition a pair
958of RCU grace periods.
959
960<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a>
961How long a sequence of grace periods, each separated by an RCU read-side
962critical section, would be required to partition the RCU read-side
963critical sections at the beginning and end of the chain?
964<br><a href="#qq10answer">Answer</a>
965
966<h3><a name="Disabling Preemption Does Not Block Grace Periods">
967Disabling Preemption Does Not Block Grace Periods</a></h3>
968
969<p>
970There was a time when disabling preemption on any given CPU would block
971subsequent grace periods.
972However, this was an accident of implementation and is not a requirement.
973And in the current Linux-kernel implementation, disabling preemption
974on a given CPU in fact does not block grace periods, as Oleg Nesterov
975<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
976
977<p>
978If you need a preempt-disable region to block grace periods, you need to add
979<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
980as follows:
981
982<blockquote>
983<pre>
984 1 preempt_disable();
985 2 rcu_read_lock();
986 3 do_something();
987 4 rcu_read_unlock();
988 5 preempt_enable();
989 6
990 7 /* Spinlocks implicitly disable preemption. */
991 8 spin_lock(&amp;mylock);
992 9 rcu_read_lock();
99310 do_something();
99411 rcu_read_unlock();
99512 spin_unlock(&amp;mylock);
996</pre>
997</blockquote>
998
999<p>
1000In theory, you could enter the RCU read-side critical section first,
1001but it is more efficient to keep the entire RCU read-side critical
1002section contained in the preempt-disable region as shown above.
1003Of course, RCU read-side critical sections that extend outside of
1004preempt-disable regions will work correctly, but such critical sections
1005can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
1006more work.
1007And no, this is <i>not</i> an invitation to enclose all of your RCU
1008read-side critical sections within preempt-disable regions, because
1009doing so would degrade real-time response.
1010
1011<p>
1012This non-requirement appeared with preemptible RCU.
1013If you need a grace period that waits on non-preemptible code regions, use
1014<a href="#Sched Flavor">RCU-sched</a>.
1015
1016<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
1017
1018<p>
1019These parallelism facts of life are by no means specific to RCU, but
1020the RCU implementation must abide by them.
1021They therefore bear repeating:
1022
1023<ol>
1024<li> Any CPU or task may be delayed at any time,
1025 and any attempts to avoid these delays by disabling
1026 preemption, interrupts, or whatever are completely futile.
1027 This is most obvious in preemptible user-level
1028 environments and in virtualized environments (where
1029 a given guest OS's VCPUs can be preempted at any time by
1030 the underlying hypervisor), but can also happen in bare-metal
1031 environments due to ECC errors, NMIs, and other hardware
1032 events.
1033 Although a delay of more than about 20 seconds can result
1034 in splats, the RCU implementation is obligated to use
1035 algorithms that can tolerate extremely long delays, but where
1036 &ldquo;extremely long&rdquo; is not long enough to allow
1037 wrap-around when incrementing a 64-bit counter.
1038<li> Both the compiler and the CPU can reorder memory accesses.
1039 Where it matters, RCU must use compiler directives and
1040 memory-barrier instructions to preserve ordering.
1041<li> Conflicting writes to memory locations in any given cache line
1042 will result in expensive cache misses.
1043 Greater numbers of concurrent writes and more-frequent
1044 concurrent writes will result in more dramatic slowdowns.
1045 RCU is therefore obligated to use algorithms that have
1046 sufficient locality to avoid significant performance and
1047 scalability problems.
1048<li> As a rough rule of thumb, only one CPU's worth of processing
1049 may be carried out under the protection of any given exclusive
1050 lock.
1051 RCU must therefore use scalable locking designs.
1052<li> Counters are finite, especially on 32-bit systems.
1053 RCU's use of counters must therefore tolerate counter wrap,
1054 or be designed such that counter wrap would take way more
1055 time than a single system is likely to run.
1056 An uptime of ten years is quite possible, a runtime
1057 of a century much less so.
1058 As an example of the latter, RCU's dyntick-idle nesting counter
1059 allows 54 bits for interrupt nesting level (this counter
1060 is 64 bits even on a 32-bit system).
1061 Overflowing this counter requires 2<sup>54</sup>
1062 half-interrupts on a given CPU without that CPU ever going idle.
1063 If a half-interrupt happened every microsecond, it would take
1064 570 years of runtime to overflow this counter, which is currently
1065 believed to be an acceptably long time.
1066<li> Linux systems can have thousands of CPUs running a single
1067 Linux kernel in a single shared-memory environment.
1068 RCU must therefore pay close attention to high-end scalability.
1069</ol>
1070
1071<p>
1072This last parallelism fact of life means that RCU must pay special
1073attention to the preceding facts of life.
1074The idea that Linux might scale to systems with thousands of CPUs would
1075have been met with some skepticism in the 1990s, but these requirements
1076would otherwise have been unsurprising, even in the early 1990s.
1077
1078<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
1079
1080<p>
1081These sections list quality-of-implementation requirements.
1082Although an RCU implementation that ignores these requirements could
1083still be used, it would likely be subject to limitations that would
1084make it inappropriate for industrial-strength production use.
1085Classes of quality-of-implementation requirements are as follows:
1086
1087<ol>
1088<li> <a href="#Specialization">Specialization</a>
1089<li> <a href="#Performance and Scalability">Performance and Scalability</a>
1090<li> <a href="#Composability">Composability</a>
1091<li> <a href="#Corner Cases">Corner Cases</a>
1092</ol>
1093
1094<p>
1095These classes are covered in the following sections.
1096
1097<h3><a name="Specialization">Specialization</a></h3>
1098
1099<p>
1100RCU is and always has been intended primarily for read-mostly situations, as
1101illustrated by the following figure.
1102This means that RCU's read-side primitives are optimized, often at the
1103expense of its update-side primitives.
1104
1105<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
1106
1107<p>
1108This focus on read-mostly situations means that RCU must interoperate
1109with other synchronization primitives.
1110For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
1111examples discussed earlier use RCU to protect readers and locking to
1112coordinate updaters.
1113However, the need extends much farther, requiring that a variety of
1114synchronization primitives be legal within RCU read-side critical sections,
1115including spinlocks, sequence locks, atomic operations, reference
1116counters, and memory barriers.
1117
1118<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a>
1119What about sleeping locks?
1120<br><a href="#qq11answer">Answer</a>
1121
1122<p>
1123It often comes as a surprise that many algorithms do not require a
1124consistent view of data, but instead can function quite well on inconsistent data,
1125with network routing being the poster child.
1126Internet routing algorithms take significant time to propagate
1127updates, so that by the time an update arrives at a given system,
1128that system has been sending network traffic the wrong way for
1129a considerable length of time.
1130Having a few threads continue to send traffic the wrong way for a
1131few more milliseconds is clearly not a problem: In the worst case,
1132TCP retransmissions will eventually get the data where it needs to go.
1133In general, when tracking the state of the universe outside of the
1134computer, some level of inconsistency must be tolerated due to
1135speed-of-light delays if nothing else.
1136
1137<p>
1138Furthermore, uncertainty about external state is inherent in many cases.
1139For example, a pair of veterinarians might use heartbeat to determine
1140whether or not a given cat was alive.
1141But how long should they wait after the last heartbeat to decide that
1142the cat is in fact dead?
1143Waiting less than 400 milliseconds makes no sense because this would
1144mean that a relaxed cat would be considered to cycle between death
1145and life more than 100 times per minute.
1146Moreover, just as with human beings, a cat's heart might stop for
1147some period of time, so the exact wait period is a judgment call.
1148One of our pair of veterinarians might wait 30 seconds before pronouncing
1149the cat dead, while the other might insist on waiting a full minute.
1150The two veterinarians would then disagree on the state of the cat during
1151the final 30 seconds of the minute following the last heartbeat, as
1152fancifully illustrated below:
1153
1154<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
1155
1156<p>
1157Interestingly enough, this same situation applies to hardware.
1158When push comes to shove, how do we tell whether or not some
1159external server has failed?
1160We send messages to it periodically, and declare it failed if we
1161don't receive a response within a given period of time.
1162Policy decisions can usually tolerate short
1163periods of inconsistency.
1164The policy was decided some time ago, and is only now being put into
1165effect, so a few milliseconds of delay is normally inconsequential.
1166
1167<p>
1168However, there are algorithms that absolutely must see consistent data.
1169For example, the translation between a user-level SystemV semaphore
1170ID to the corresponding in-kernel data structure is protected by RCU,
1171but it is absolutely forbidden to update a semaphore that has just been
1172removed.
1173In the Linux kernel, this need for consistency is accommodated by acquiring
1174spinlocks located in the in-kernel data structure from within
1175the RCU read-side critical section, and this is indicated by the
1176green box in the figure above.
1177Many other techniques may be used, and are in fact used within the
1178Linux kernel.
1179
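<p>
The pattern, though not the actual semaphore code, can be sketched
roughly as follows, with <tt>lookup_object()</tt>,
<tt>do_update()</tt>, and the structure itself being illustrative
stand-ins:

<blockquote>
<pre>
 1 struct obj {
 2   spinlock_t lock;
 3   bool deleted;
 4   /* payload */
 5 };
 6
 7 int update_object(int id)
 8 {
 9   struct obj *p;
10   int ret = -EINVAL;
11
12   rcu_read_lock();
13   p = lookup_object(id);    /* RCU-protected lookup */
14   if (p) {
15     spin_lock(&amp;p-&gt;lock);    /* spinlock acquired within the reader */
16     if (!p-&gt;deleted) {      /* consistent view: skip removed objects */
17       do_update(p);
18       ret = 0;
19     }
20     spin_unlock(&amp;p-&gt;lock);
21   }
22   rcu_read_unlock();
23   return ret;
24 }
</pre>
</blockquote>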
1180<p>
1181In short, RCU is not required to maintain consistency, and other
1182mechanisms may be used in concert with RCU when consistency is required.
1183RCU's specialization allows it to do its job extremely well, and its
1184ability to interoperate with other synchronization mechanisms allows
1185the right mix of synchronization tools to be used for a given job.
1186
1187<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
1188
1189<p>
1190Energy efficiency is a critical component of performance today,
1191and Linux-kernel RCU implementations must therefore avoid unnecessarily
1192awakening idle CPUs.
1193I cannot claim that this requirement was premeditated.
1194In fact, I learned of it during a telephone conversation in which I
1195was given &ldquo;frank and open&rdquo; feedback on the importance
1196of energy efficiency in battery-powered systems and on specific
1197energy-efficiency shortcomings of the Linux-kernel RCU implementation.
1198In my experience, the battery-powered embedded community will consider
1199any unnecessary wakeups to be extremely unfriendly acts.
1200So much so that mere Linux-kernel-mailing-list posts are
1201insufficient to vent their ire.
1202
1203<p>
1204Memory consumption is not particularly important in most
1205situations, and has become decreasingly
1206so as memory sizes have expanded and memory
1207costs have plummeted.
1208However, as I learned from Matt Mackall's
1209<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
1210efforts, memory footprint is critically important on single-CPU systems with
1211non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
1212<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
1213was born.
1214Josh Triplett has since taken over the small-memory banner with his
1215<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
1216project, which resulted in
1217<a href="#Sleepable RCU">SRCU</a>
1218becoming optional for those kernels not needing it.
1219
1220<p>
1221The remaining performance requirements are, for the most part,
1222unsurprising.
1223For example, in keeping with RCU's read-side specialization,
1224<tt>rcu_dereference()</tt> should have negligible overhead (for
1225example, suppression of a few minor compiler optimizations).
1226Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
1227<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
1228
1229<p>
1230In preemptible environments, in the case where the RCU read-side
1231critical section was not preempted (as will be the case for the
1232highest-priority real-time process), <tt>rcu_read_lock()</tt> and
1233<tt>rcu_read_unlock()</tt> should have minimal overhead.
1234In particular, they should not contain atomic read-modify-write
1235operations, memory-barrier instructions, preemption disabling,
1236interrupt disabling, or backwards branches.
1237However, in the case where the RCU read-side critical section was preempted,
1238<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
1239This is why it is better to nest an RCU read-side critical section
1240within a preempt-disable region than vice versa, at least in cases
1241where that critical section is short enough to avoid unduly degrading
1242real-time latencies.
1243
1244<p>
1245The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
1246optimized for throughput.
1247It may therefore incur several milliseconds of latency in addition to
1248the duration of the longest RCU read-side critical section.
1249On the other hand, multiple concurrent invocations of
1250<tt>synchronize_rcu()</tt> are required to use batching optimizations
1251so that they can be satisfied by a single underlying grace-period-wait
1252operation.
1253For example, in the Linux kernel, it is not unusual for a single
1254grace-period-wait operation to serve more than
1255<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
1256of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
1257overhead down to nearly zero.
1258However, the grace-period optimization is also required to avoid
1259measurable degradation of real-time scheduling and interrupt latencies.
1260
1261<p>
1262In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
1263latencies are unacceptable.
1264In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
1265instead, reducing the grace-period latency down to a few tens of
1266microseconds on small systems, at least in cases where the RCU read-side
1267critical sections are short.
1268There are currently no special latency requirements for
1269<tt>synchronize_rcu_expedited()</tt> on large systems, but,
1270consistent with the empirical nature of the RCU specification,
1271that is subject to change.
1272However, there most definitely are scalability requirements:
1273A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
1274CPUs should at least make reasonable forward progress.
1275In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
1276is permitted to impose modest degradation of real-time latency
1277on non-idle online CPUs.
1278That said, it will likely be necessary to take further steps to reduce this
1279degradation, hopefully to roughly that of a scheduling-clock interrupt.
1280
1281<p>
1282There are a number of situations where even
1283<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
1284latency is unacceptable.
1285In these situations, the asynchronous <tt>call_rcu()</tt> can be
1286used in place of <tt>synchronize_rcu()</tt> as follows:
1287
1288<blockquote>
1289<pre>
1290 1 struct foo {
1291 2 int a;
1292 3 int b;
1293 4 struct rcu_head rh;
1294 5 };
1295 6
1296 7 static void remove_gp_cb(struct rcu_head *rhp)
1297 8 {
1298 9 struct foo *p = container_of(rhp, struct foo, rh);
129910
130011 kfree(p);
130112 }
130213
130314 bool remove_gp_asynchronous(void)
130415 {
130516 struct foo *p;
130617
130718 spin_lock(&amp;gp_lock);
130819   p = rcu_access_pointer(gp);
130920 if (!p) {
131021 spin_unlock(&amp;gp_lock);
131122 return false;
131223 }
131324 rcu_assign_pointer(gp, NULL);
131425 call_rcu(&amp;p-&gt;rh, remove_gp_cb);
131526 spin_unlock(&amp;gp_lock);
131627 return true;
131728 }
1318</pre>
1319</blockquote>
1320
1321<p>
1322A definition of <tt>struct foo</tt> is finally needed, and appears
1323on lines&nbsp;1-5.
1324The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
1325on line&nbsp;25, and will be invoked after the end of a subsequent
1326grace period.
1327This gets the same effect as <tt>remove_gp_synchronous()</tt>,
1328but without forcing the updater to wait for a grace period to elapse.
1329The <tt>call_rcu()</tt> function may be used in a number of
1330situations where neither <tt>synchronize_rcu()</tt> nor
1331<tt>synchronize_rcu_expedited()</tt> would be legal,
1332including within preempt-disable code, <tt>local_bh_disable()</tt> code,
1333interrupt-disable code, and interrupt handlers.
1334However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
1335The callback function (<tt>remove_gp_cb()</tt> in this case) will be
1336executed within a softirq (software interrupt) environment in the
1337Linux kernel,
1338either within a real softirq handler or under the protection
1339of <tt>local_bh_disable()</tt>.
1340In both the Linux kernel and userspace, it is bad practice to
1341write an RCU callback function that takes too long.
1342Long-running operations should be relegated to separate threads or
1343(in the Linux kernel) workqueues.
1344
1345<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a>
1346Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
1347After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
1348structure, which would interact badly with concurrent insertions.
1349Doesn't this mean that <tt>rcu_dereference()</tt> is required?
1350<br><a href="#qq12answer">Answer</a>
1351
1352<p>
1353However, all that <tt>remove_gp_cb()</tt> is doing is
1354invoking <tt>kfree()</tt> on the data element.
1355This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
1356which allows &ldquo;fire and forget&rdquo; operation as shown below:
1357
1358<blockquote>
1359<pre>
1360 1 struct foo {
1361 2 int a;
1362 3 int b;
1363 4 struct rcu_head rh;
1364 5 };
1365 6
1366 7 bool remove_gp_faf(void)
1367 8 {
1368 9 struct foo *p;
136910
137011 spin_lock(&amp;gp_lock);
137112 p = rcu_dereference(gp);
137213 if (!p) {
137314 spin_unlock(&amp;gp_lock);
137415 return false;
137516 }
137617 rcu_assign_pointer(gp, NULL);
137718 kfree_rcu(p, rh);
137819 spin_unlock(&amp;gp_lock);
137920 return true;
138021 }
1381</pre>
1382</blockquote>
1383
1384<p>
1385Note that <tt>remove_gp_faf()</tt> simply invokes
1386<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
1387further attention to the subsequent grace period and <tt>kfree()</tt>.
1388It is permissible to invoke <tt>kfree_rcu()</tt> from the same
1389environments as for <tt>call_rcu()</tt>.
1390Interestingly enough, DYNIX/ptx had the equivalents of
1391<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
1392<tt>synchronize_rcu()</tt>.
1393This was due to the fact that RCU was not heavily used within DYNIX/ptx,
1394so the very few places that needed something like
1395<tt>synchronize_rcu()</tt> simply open-coded it.
1396
1397<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a>
1398Earlier it was claimed that <tt>call_rcu()</tt> and
1399<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
1400by readers.
1401But how can that be correct, given that the invocation of the callback
1402and the freeing of the memory (respectively) must still wait for
1403a grace period to elapse?
1404<br><a href="#qq13answer">Answer</a>
1405
1406<p>
1407But what if the updater must wait for the completion of code to be
1408executed after the end of the grace period, but has other tasks
1409that can be carried out in the meantime?
1410The polling-style <tt>get_state_synchronize_rcu()</tt> and
1411<tt>cond_synchronize_rcu()</tt> functions may be used for this
1412purpose, as shown below:
1413
1414<blockquote>
1415<pre>
1416 1 bool remove_gp_poll(void)
1417 2 {
1418 3 struct foo *p;
1419 4 unsigned long s;
1420 5
1421 6 spin_lock(&amp;gp_lock);
1422 7 p = rcu_access_pointer(gp);
1423 8 if (!p) {
1424 9 spin_unlock(&amp;gp_lock);
142510 return false;
142611 }
142712 rcu_assign_pointer(gp, NULL);
142813 spin_unlock(&amp;gp_lock);
142914 s = get_state_synchronize_rcu();
143015 do_something_while_waiting();
143116 cond_synchronize_rcu(s);
143217 kfree(p);
143318 return true;
143419 }
1435</pre>
1436</blockquote>
1437
1438<p>
1439On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
1440&ldquo;cookie&rdquo; from RCU,
1441then line&nbsp;15 carries out other tasks,
1442and finally, line&nbsp;16 returns immediately if a grace period has
1443elapsed in the meantime, but otherwise waits as required.
1444The need for <tt>get_state_synchronize_rcu()</tt> and
1445<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
1446so it is too early to tell whether they will stand the test of time.
1447
1448<p>
1449RCU thus provides a range of tools to allow updaters to strike the
1450required tradeoff between latency, flexibility and CPU overhead.
1451
1452<h3><a name="Composability">Composability</a></h3>
1453
1454<p>
1455Composability has received much attention in recent years, perhaps in part
1456due to the collision of multicore hardware with object-oriented techniques
1457designed in single-threaded environments for single-threaded use.
1458And in theory, RCU read-side critical sections may be composed, and in
1459fact may be nested arbitrarily deeply.
1460In practice, as with all real-world implementations of composable
1461constructs, there are limitations.
1462
1463<p>
1464Implementations of RCU for which <tt>rcu_read_lock()</tt>
1465and <tt>rcu_read_unlock()</tt> generate no code, such as
1466Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
1467nested arbitrarily deeply.
1468After all, there is no overhead.
1469Except that if all these instances of <tt>rcu_read_lock()</tt>
1470and <tt>rcu_read_unlock()</tt> are visible to the compiler,
1471compilation will eventually fail due to exhausting memory,
1472mass storage, or user patience, whichever comes first.
1473If the nesting is not visible to the compiler, as is the case with
1474mutually recursive functions each in its own translation unit,
1475stack overflow will result.
1476If the nesting takes the form of loops, either the control variable
1477will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
1478Nevertheless, this class of RCU implementations is one
1479of the most composable constructs in existence.
1480
1481<p>
1482RCU implementations that explicitly track nesting depth
1483are limited by the nesting-depth counter.
1484For example, the Linux kernel's preemptible RCU limits nesting to
1485<tt>INT_MAX</tt>.
1486This should suffice for almost all practical purposes.
1487That said, a consecutive pair of RCU read-side critical sections
1488between which there is an operation that waits for a grace period
1489cannot be enclosed in another RCU read-side critical section.
1490This is because it is not legal to wait for a grace period within
1491an RCU read-side critical section: To do so would result either
1492in deadlock or
1493in RCU implicitly splitting the enclosing RCU read-side critical
1494section, neither of which is conducive to a long-lived and prosperous
1495kernel.
1496
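<p>
The following sketch shows this forbidden structure, with the hypothetical
<tt>do_something_1()</tt> and <tt>do_something_2()</tt> standing in for
arbitrary reader code:

<blockquote>
<pre>
 1 rcu_read_lock();      /* Outer critical section. */
 2 rcu_read_lock();
 3 do_something_1();
 4 rcu_read_unlock();
 5 synchronize_rcu();    /* BUG: grace-period wait within a */
 6                       /*      read-side critical section. */
 7 rcu_read_lock();
 8 do_something_2();
 9 rcu_read_unlock();
10 rcu_read_unlock();    /* End of outer critical section. */
</pre>
</blockquote>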
1497<p>
1498In short, although RCU read-side critical sections are highly composable,
1499care is required in some situations, just as is the case for any other
1500composable synchronization mechanism.
1501
1502<h3><a name="Corner Cases">Corner Cases</a></h3>
1503
1504<p>
1505A given RCU workload might have an endless and intense stream of
1506RCU read-side critical sections, perhaps even so intense that there
1507was never a point in time during which there was not at least one
1508RCU read-side critical section in flight.
1509RCU cannot allow this situation to block grace periods: As long as
1510all the RCU read-side critical sections are finite, grace periods
1511must also be finite.
1512
1513<p>
1514That said, preemptible RCU implementations could potentially result
1515in RCU read-side critical sections being preempted for long durations,
1516which has the effect of creating a long-duration RCU read-side
1517critical section.
1518This situation can arise only in heavily loaded systems, but systems using
1519real-time priorities are of course more vulnerable.
1520Therefore, RCU priority boosting is provided to help deal with this
1521case.
1522That said, the exact requirements on RCU priority boosting will likely
1523evolve as more experience accumulates.
1524
1525<p>
1526Other workloads might have very high update rates.
1527Although one can argue that such workloads should instead use
1528something other than RCU, the fact remains that RCU must
1529handle such workloads gracefully.
1530This requirement is another factor driving batching of grace periods,
1531but it is also the driving force behind the checks for large numbers
1532of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
1533Finally, high update rates should not delay RCU read-side critical
1534sections, although some read-side delays can occur when using
1535<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
1536of <tt>try_stop_cpus()</tt>.
1537(In the future, <tt>synchronize_rcu_expedited()</tt> will be
1538converted to use lighter-weight inter-processor interrupts (IPIs),
1539but this will still disturb readers, though to a much smaller degree.)
1540
1541<p>
1542Although all three of these corner cases were understood in the early
15431990s, a simple user-level test consisting of <tt>close(open(path))</tt>
1544in a tight loop
1545in the early 2000s suddenly provided a much deeper appreciation of the
1546high-update-rate corner case.
1547This test also motivated the addition of some RCU code to react to high update
1548rates.  For example, if a given CPU finds itself with more than 10,000
1549RCU callbacks queued, RCU will take evasive action by
1550more aggressively starting grace periods and more aggressively forcing
1551completion of grace-period processing.
1552This evasive action causes the grace period to complete more quickly,
1553but at the cost of restricting RCU's batching optimizations, thus
1554increasing the CPU overhead incurred by that grace period.
1555
1556<h2><a name="Software-Engineering Requirements">
1557Software-Engineering Requirements</a></h2>
1558
1559<p>
1560Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
1561guard against mishaps and misuse:
1562
1563<ol>
1564<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
1565 everywhere that it is needed, so kernels built with
1566	<tt>CONFIG_PROVE_RCU=y</tt> will splat if
1567 <tt>rcu_dereference()</tt> is used outside of an
1568 RCU read-side critical section.
1569 Update-side code can use <tt>rcu_dereference_protected()</tt>,
1570 which takes a
1571 <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
1572 to indicate what is providing the protection.
1573 If the indicated protection is not provided, a lockdep splat
1574 is emitted.
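	(The sketch following this list pulls several of these APIs together.)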
1575
1576 <p>
1577 Code shared between readers and updaters can use
1578 <tt>rcu_dereference_check()</tt>, which also takes a
1579 lockdep expression, and emits a lockdep splat if neither
1580 <tt>rcu_read_lock()</tt> nor the indicated protection
1581 is in place.
1582 In addition, <tt>rcu_dereference_raw()</tt> is used in those
1583 (hopefully rare) cases where the required protection cannot
1584 be easily described.
1585 Finally, <tt>rcu_read_lock_held()</tt> is provided to
1586 allow a function to verify that it has been invoked within
1587 an RCU read-side critical section.
1588 I was made aware of this set of requirements shortly after Thomas
1589 Gleixner audited a number of RCU uses.
1590<li> A given function might wish to check for RCU-related preconditions
1591 upon entry, before using any other RCU API.
1592	The <tt>rcu_lockdep_assert()</tt> macro does this job,
1593 asserting the expression in kernels having lockdep enabled
1594 and doing nothing otherwise.
1595<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
1596 and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
1597 substituting a simple assignment.
1598 To catch this sort of error, a given RCU-protected pointer may be
1599 tagged with <tt>__rcu</tt>, after which running sparse
1600 with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
1601 about simple-assignment accesses to that pointer.
1602 Arnd Bergmann made me aware of this requirement, and also
1603 supplied the needed
1604 <a href="https://lwn.net/Articles/376011/">patch series</a>.
1605<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
1606 will splat if a data element is passed to <tt>call_rcu()</tt>
1607 twice in a row, without a grace period in between.
1608 (This error is similar to a double free.)
1609 The corresponding <tt>rcu_head</tt> structures that are
1610 dynamically allocated are automatically tracked, but
1611 <tt>rcu_head</tt> structures allocated on the stack
1612 must be initialized with <tt>init_rcu_head_on_stack()</tt>
1613 and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
1614 Similarly, statically allocated non-stack <tt>rcu_head</tt>
1615 structures must be initialized with <tt>init_rcu_head()</tt>
1616 and cleaned up with <tt>destroy_rcu_head()</tt>.
1617 Mathieu Desnoyers made me aware of this requirement, and also
1618 supplied the needed
1619 <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
1620<li> An infinite loop in an RCU read-side critical section will
1621 eventually trigger an RCU CPU stall warning splat.
1622 However, RCU is not obligated to produce this splat
1623 unless there is a grace period waiting on that particular
1624 RCU read-side critical section.
1625 This requirement made itself known in the early 1990s, pretty
1626 much the first time that it was necessary to debug a CPU stall.
1627<li> Although it would be very good to detect pointers leaking out
1628 of RCU read-side critical sections, there is currently no
1629 good way of doing this.
1630 One complication is the need to distinguish between pointers
1631 leaking and pointers that have been handed off from RCU to
1632 some other synchronization mechanism, for example, reference
1633 counting.
1634<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
1635 information is provided via both debugfs and event tracing.
1636<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
1637 <tt>rcu_dereference()</tt> to create typical linked
1638 data structures can be surprisingly error-prone.
1639 Therefore, RCU-protected
1640 <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
1641 and, more recently, RCU-protected
1642 <a href="https://lwn.net/Articles/612100/">hash tables</a>
1643 are available.
1644 Many other special-purpose RCU-protected data structures are
1645 available in the Linux kernel and the userspace RCU library.
1646<li> Some linked structures are created at compile time, but still
1647 require <tt>__rcu</tt> checking.
1648 The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
1649 purpose.
1650<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
1651 when creating linked structures that are to be published via
1652 a single external pointer.
1653 The <tt>RCU_INIT_POINTER()</tt> macro is provided for
1654 this task and also for assigning <tt>NULL</tt> pointers
1655 at runtime.
1656</ol>
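<p>
The following sketch pulls several of the above APIs together.
The <tt>struct foo</tt>, <tt>foo_lock</tt>, and <tt>foo_ptr</tt> names
are purely illustrative:

<blockquote>
<pre>
 1 struct foo {
 2   int a;
 3   struct rcu_head rh;
 4 };
 5 static DEFINE_SPINLOCK(foo_lock);
 6 static struct foo __rcu *foo_ptr; /* __rcu enables sparse checking. */
 7
 8 /* Update side: foo_lock, not rcu_read_lock(), provides protection. */
 9 void foo_update(struct foo *newp)
10 {
11   struct foo *oldp;
12
13   spin_lock(&amp;foo_lock);
14   oldp = rcu_dereference_protected(foo_ptr,
15                                    lockdep_is_held(&amp;foo_lock));
16   rcu_assign_pointer(foo_ptr, newp);
17   spin_unlock(&amp;foo_lock);
18   if (oldp)
19     kfree_rcu(oldp, rh); /* Free only after a grace period elapses. */
20 }
21
22 /* Code shared between readers and updaters. */
23 int foo_get_a(void)
24 {
25   struct foo *p;
26
27   p = rcu_dereference_check(foo_ptr,
28                             lockdep_is_held(&amp;foo_lock));
29   return p ? p-&gt;a : -1;
30 }
</pre>
</blockquote>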
1657
1658<p>
1659This is not a hard-and-fast list: RCU's diagnostic capabilities will
1660continue to be guided by the number and type of usage bugs found
1661in real-world RCU usage.
1662
1663<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
1664
1665<p>
1666The Linux kernel provides an interesting environment for all kinds of
1667software, including RCU.
1668Some of the relevant points of interest are as follows:
1669
1670<ol>
1671<li> <a href="#Configuration">Configuration</a>.
1672<li> <a href="#Firmware Interface">Firmware Interface</a>.
1673<li> <a href="#Early Boot">Early Boot</a>.
1674<li> <a href="#Interrupts and NMIs">
1675 Interrupts and non-maskable interrupts (NMIs)</a>.
1676<li> <a href="#Loadable Modules">Loadable Modules</a>.
1677<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
1678<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
1679<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
1680<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
1681<li> <a href="#Performance, Scalability, Response Time, and Reliability">
1682 Performance, Scalability, Response Time, and Reliability</a>.
1683</ol>
1684
1685<p>
1686This list is probably incomplete, but it does give a feel for the
1687most notable Linux-kernel complications.
1688Each of the following sections covers one of the above topics.
1689
1690<h3><a name="Configuration">Configuration</a></h3>
1691
1692<p>
1693RCU's goal is automatic configuration, so that almost nobody
1694needs to worry about RCU's <tt>Kconfig</tt> options.
1695And for almost all users, RCU does in fact work well
1696&ldquo;out of the box.&rdquo;
1697
1698<p>
1699However, there are specialized use cases that are handled by
1700kernel boot parameters and <tt>Kconfig</tt> options.
1701Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
1702about new <tt>Kconfig</tt> options, which requires almost all of them
1703be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
1704
1705<p>
1706This all should be quite obvious, but the fact remains that
1707Linus Torvalds recently had to
1708<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
1709me of this requirement.
1710
1711<h3><a name="Firmware Interface">Firmware Interface</a></h3>
1712
1713<p>
1714In many cases, the kernel obtains information about the system from the
1715firmware, and sometimes things are lost in translation.
1716Or the translation is accurate, but the original message is bogus.
1717
1718<p>
1719For example, some systems' firmware overreports the number of CPUs,
1720sometimes by a large factor.
1721If RCU naively believed the firmware, as it used to do,
1722it would create too many per-CPU kthreads.
1723Although the resulting system will still run correctly, the extra
1724kthreads needlessly consume memory and can cause confusion
1725when they show up in <tt>ps</tt> listings.
1726
1727<p>
1728RCU must therefore wait for a given CPU to actually come online before
1729it can allow itself to believe that the CPU actually exists.
1730The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
1731come online) cause a number of
1732<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
1733
1734<h3><a name="Early Boot">Early Boot</a></h3>
1735
1736<p>
1737The Linux kernel's boot sequence is an interesting process,
1738and RCU is used early, even before <tt>rcu_init()</tt>
1739is invoked.
1740In fact, a number of RCU's primitives can be used as soon as the
1741initial task's <tt>task_struct</tt> is available and the
1742boot CPU's per-CPU variables are set up.
1743The read-side primitives (<tt>rcu_read_lock()</tt>,
1744<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
1745and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
1746as will <tt>rcu_assign_pointer()</tt>.
1747
1748<p>
1749Although <tt>call_rcu()</tt> may be invoked at any
1750time during boot, callbacks are not guaranteed to be invoked until after
1751the scheduler is fully up and running.
1752This delay in callback invocation is due to the fact that RCU does not
1753invoke callbacks until it is fully initialized, and this full initialization
1754cannot occur until after the scheduler has initialized itself to the
1755point where RCU can spawn and run its kthreads.
1756In theory, it would be possible to invoke callbacks earlier;
1757however, this is not a panacea because there would be severe restrictions
1758on what operations those callbacks could invoke.
1759
1760<p>
1761Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
1762<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
1763(<a href="#Bottom-Half Flavor">discussed below</a>),
1764and
1765<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
1766will all operate normally
1767during very early boot, the reason being that there is only one CPU
1768and preemption is disabled.
1769This means that the call to <tt>synchronize_rcu()</tt> (or friends)
1770itself is a quiescent
1771state and thus a grace period, so the early-boot implementation can
1772be a no-op.
1773
1774<p>
1775Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
1776continue to operate normally through the remainder of boot, courtesy
1777of the fact that preemption is disabled across their RCU read-side
1778critical sections and also courtesy of the fact that there is still
1779only one CPU.
1780However, once the scheduler starts initializing, preemption is enabled.
1781There is still only a single CPU, but the fact that preemption is enabled
1782means that the no-op implementation of <tt>synchronize_rcu()</tt> no
1783longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
1784Therefore, as soon as the scheduler starts initializing, the early-boot
1785fastpath is disabled.
1786This means that <tt>synchronize_rcu()</tt> switches to its runtime
1787mode of operation where it posts callbacks, which in turn means that
1788any call to <tt>synchronize_rcu()</tt> will block until the corresponding
1789callback is invoked.
1790Unfortunately, the callback cannot be invoked until RCU's runtime
1791grace-period machinery is up and running, which cannot happen until
1792the scheduler has initialized itself sufficiently to allow RCU's
1793kthreads to be spawned.
1794Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
1795initialization can result in deadlock.
1796
1797<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a>
1798So what happens with <tt>synchronize_rcu()</tt> during
1799scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
1800kernels?
1801<br><a href="#qq14answer">Answer</a>
1802
1803<p>
1804I learned of these boot-time requirements as a result of a series of
1805system hangs.
1806
1807<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
1808
1809<p>
1810The Linux kernel has interrupts, and RCU read-side critical sections are
1811legal within interrupt handlers and within interrupt-disabled regions
1812of code, as are invocations of <tt>call_rcu()</tt>.
1813
1814<p>
1815Some Linux-kernel architectures can enter an interrupt handler from
1816non-idle process context, and then just never leave it, instead stealthily
1817transitioning back to process context.
1818This trick is sometimes used to invoke system calls from inside the kernel.
1819These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
1820about how it counts interrupt nesting levels.
1821I learned of this requirement the hard way during a rewrite
1822of RCU's dyntick-idle code.
1823
1824<p>
1825The Linux kernel has non-maskable interrupts (NMIs), and
1826RCU read-side critical sections are legal within NMI handlers.
1827Thankfully, RCU update-side primitives, including
1828<tt>call_rcu()</tt>, are prohibited within NMI handlers.
1829
1830<p>
1831The name notwithstanding, some Linux-kernel architectures
1832can have nested NMIs, which RCU must handle correctly.
1833Andy Lutomirski
1834<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
1835with this requirement;
1836he also kindly surprised me with
1837<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
1838that meets this requirement.
1839
1840<h3><a name="Loadable Modules">Loadable Modules</a></h3>
1841
1842<p>
1843The Linux kernel has loadable modules, and these modules can
1844also be unloaded.
1845After a given module has been unloaded, any attempt to call
1846one of its functions results in a segmentation fault.
1847The module-unload functions must therefore cancel any
1848delayed calls to loadable-module functions, for example,
1849any outstanding <tt>mod_timer()</tt> must be dealt with
1850via <tt>del_timer_sync()</tt> or similar.
1851
1852<p>
1853Unfortunately, there is no way to cancel an RCU callback;
1854once you invoke <tt>call_rcu()</tt>, the callback function is
1855going to eventually be invoked, unless the system goes down first.
1856Because it is normally considered socially irresponsible to crash the system
1857in response to a module unload request, we need some other way
1858to deal with in-flight RCU callbacks.
1859
1860<p>
1861RCU therefore provides
1862<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
1863which waits until all in-flight RCU callbacks have been invoked.
1864If a module uses <tt>call_rcu()</tt>, its exit function should therefore
1865prevent any future invocation of <tt>call_rcu()</tt>, then invoke
1866<tt>rcu_barrier()</tt>.
1867In theory, the underlying module-unload code could invoke
1868<tt>rcu_barrier()</tt> unconditionally, but in practice this would
1869incur unacceptable latencies.
1870
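<p>
A module's exit function might therefore look something like the
following sketch, in which <tt>foo_unregister()</tt> is a hypothetical
function that prevents any further <tt>call_rcu()</tt> invocations on
the module's data:

<blockquote>
<pre>
 1 static void __exit foo_exit(void)
 2 {
 3   foo_unregister(); /* No new call_rcu() invocations after this. */
 4   rcu_barrier();    /* Wait for in-flight callbacks to finish. */
 5   /* Now it is safe to free remaining data and unload the module. */
 6 }
 7 module_exit(foo_exit);
</pre>
</blockquote>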
1871<p>
1872Nikita Danilov noted this requirement for an analogous filesystem-unmount
1873situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
1874The need for <tt>rcu_barrier()</tt> for module unloading became
1875apparent later.
1876
1877<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
1878
1879<p>
1880The Linux kernel supports CPU hotplug, which means that CPUs
1881can come and go.
1882It is of course illegal to use any RCU API member from an offline CPU.
1883This requirement was present from day one in DYNIX/ptx, but
1884on the other hand, the Linux kernel's CPU-hotplug implementation
1885is &ldquo;interesting.&rdquo;
1886
1887<p>
1888The Linux-kernel CPU-hotplug implementation has notifiers that
1889are used to allow the various kernel subsystems (including RCU)
1890to respond appropriately to a given CPU-hotplug operation.
1891Most RCU operations may be invoked from CPU-hotplug notifiers,
1892including even normal synchronous grace-period operations
1893such as <tt>synchronize_rcu()</tt>.
1894However, expedited grace-period operations such as
1895<tt>synchronize_rcu_expedited()</tt> are not supported,
1896due to the fact that current implementations block CPU-hotplug
1897operations, which could result in deadlock.
1898
1899<p>
1900In addition, all-callback-wait operations such as
1901<tt>rcu_barrier()</tt> are also not supported, due to the
1902fact that there are phases of CPU-hotplug operations where
1903the outgoing CPU's callbacks will not be invoked until after
1904the CPU-hotplug operation ends, which could also result in deadlock.
1905
1906<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
1907
1908<p>
1909RCU depends on the scheduler, and the scheduler uses RCU to
1910protect some of its data structures.
1911This means the scheduler is forbidden from acquiring
1912the runqueue locks and the priority-inheritance locks
1913in the middle of an outermost RCU read-side critical section unless
1914it also releases them before exiting that same
1915RCU read-side critical section.
1916This same prohibition also applies to any lock that is acquired
1917while holding any lock to which this prohibition applies.
1918Violating this rule results in deadlock.
1919
1920<p>
1921For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
1922implementation must be written carefully to avoid similar deadlocks.
1923In particular, <tt>rcu_read_unlock()</tt> must tolerate an
1924interrupt where the interrupt handler invokes both
1925<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
1926This possibility requires <tt>rcu_read_unlock()</tt> to use
1927negative nesting levels to avoid destructive recursion via
1928the interrupt handler's use of RCU.
1929
1930<p>
1931This pair of mutual scheduler-RCU requirements came as a
1932<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
1933
1934<p>
1935As noted above, RCU makes use of kthreads, and it is necessary to
1936avoid excessive CPU-time accumulation by these kthreads.
1937This requirement was no surprise, but RCU's violation of it
1938when running context-switch-heavy workloads when built with
1939<tt>CONFIG_NO_HZ_FULL=y</tt>
1940<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
1941RCU has made good progress towards meeting this requirement, even
1942for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
1943but there is room for further improvement.
1944
1945<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
1946
1947<p>
1948It is possible to use tracing on RCU code, but tracing itself
1949uses RCU.
1950For this reason, <tt>rcu_dereference_raw_notrace()</tt>
1951is provided for use by tracing, which avoids the destructive
1952recursion that could otherwise ensue.
1953This API is also used by virtualization in some architectures,
1954where RCU readers execute in environments in which tracing
1955cannot be used.
1956The tracing folks both located the requirement and provided the
1957needed fix, so this surprise requirement was relatively painless.
1958
1959<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
1960
1961<p>
1962Interrupting idle CPUs is considered socially unacceptable,
1963especially by people with battery-powered embedded systems.
1964RCU therefore conserves energy by detecting which CPUs are
1965idle, including tracking CPUs that have been interrupted from idle.
1966This is a large part of the energy-efficiency requirement,
1967so I learned of this via an irate phone call.
1968
1969<p>
1970Because RCU avoids interrupting idle CPUs, it is illegal to
1971execute an RCU read-side critical section on an idle CPU.
1972(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
1973if you try it.)
1974The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
1975event tracing are provided to work around this restriction.
1976In addition, <tt>rcu_is_watching()</tt> may be used to
1977test whether or not it is currently legal to run RCU read-side
1978critical sections on this CPU.
1979I learned of the need for diagnostics on the one hand
1980and <tt>RCU_NONIDLE()</tt> on the other while inspecting
1981idle-loop code.
1982Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
1983which is used quite heavily in the idle loop.
1984
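<p>
For example, idle-loop code that must briefly use RCU might do something
like the following sketch, in which <tt>do_something_with_rcu()</tt> is a
hypothetical placeholder:

<blockquote>
<pre>
 1 /* In the idle loop, where RCU is not watching: */
 2 RCU_NONIDLE(do_something_with_rcu());
 3
 4 /* Idle-loop tracepoints instead use the _rcuidle variants: */
 5 trace_cpu_idle_rcuidle(state, smp_processor_id());
</pre>
</blockquote>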
1985<p>
1986It is similarly socially unacceptable to interrupt an
1987<tt>nohz_full</tt> CPU running in userspace.
1988RCU must therefore track <tt>nohz_full</tt> userspace
1989execution.
1990And in
1991<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
1992kernels, RCU must separately track idle CPUs on the one hand and
1993CPUs that are either idle or executing in userspace on the other.
1994In both cases, RCU must be able to sample state at two points in
1995time, and be able to determine whether or not some other CPU spent
1996any time idle and/or executing in userspace.
1997
1998<p>
1999These energy-efficiency requirements have proven quite difficult to
2000understand and to meet; for example, there have been more than five
2001clean-sheet rewrites of RCU's energy-efficiency code, the last of
2002which was finally able to demonstrate
2003<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
2004As noted earlier,
2005I learned of many of these requirements via angry phone calls:
2006Flaming me on the Linux-kernel mailing list was apparently not
2007sufficient to fully vent their ire at RCU's energy-efficiency bugs!
2008
2009<h3><a name="Performance, Scalability, Response Time, and Reliability">
2010Performance, Scalability, Response Time, and Reliability</a></h3>
2011
2012<p>
2013Expanding on the
2014<a href="#Performance and Scalability">earlier discussion</a>,
2015RCU is used heavily by hot code paths in performance-critical
2016portions of the Linux kernel's networking, security, virtualization,
2017and scheduling code paths.
2018RCU must therefore use efficient implementations, especially in its
2019read-side primitives.
2020To that end, it would be good if preemptible RCU's implementation
2021of <tt>rcu_read_lock()</tt> could be inlined; however, doing
2022this requires resolving <tt>#include</tt> issues with the
2023<tt>task_struct</tt> structure.
2024
2025<p>
2026The Linux kernel supports hardware configurations with up to
20274096 CPUs, which means that RCU must be extremely scalable.
2028Algorithms that involve frequent acquisitions of global locks or
2029frequent atomic operations on global variables simply cannot be
2030tolerated within the RCU implementation.
2031RCU therefore makes heavy use of a combining tree based on the
2032<tt>rcu_node</tt> structure.
2033RCU is required to tolerate all CPUs continuously invoking any
2034combination of RCU's runtime primitives with minimal per-operation
2035overhead.
2036In fact, in many cases, increasing load must <i>decrease</i> the
2037per-operation overhead, witness the batching optimizations for
2038<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
2039<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
2040As a general rule, RCU must cheerfully accept whatever the
2041rest of the Linux kernel decides to throw at it.
2042
2043<p>
2044The Linux kernel is used for real-time workloads, especially
2045in conjunction with the
2046<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
2047The real-time-latency response requirements are such that the
2048traditional approach of disabling preemption across RCU
2049read-side critical sections is inappropriate.
2050Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
2051use an RCU implementation that allows RCU read-side critical
2052sections to be preempted.
2053This requirement made its presence known after users made it
2054clear that an earlier
2055<a href="https://lwn.net/Articles/107930/">real-time patch</a>
2056did not meet their needs, in conjunction with some
2057<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
2058encountered by a very early version of the -rt patchset.
2059
2060<p>
2061In addition, RCU must make do with a sub-100-microsecond real-time latency
2062budget.
2063In fact, on smaller systems with the -rt patchset, the Linux kernel
2064provides sub-20-microsecond real-time latencies for the whole kernel,
2065including RCU.
2066RCU's scalability and latency must therefore be sufficient for
2067these sorts of configurations.
2068To my surprise, the sub-100-microsecond real-time latency budget
2069<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
2070applies to even the largest systems [PDF]</a>,
2071up to and including systems with 4096 CPUs.
2072This real-time requirement motivated the grace-period kthread, which
2073also simplified handling of a number of race conditions.
2074
2075<p>
2076Finally, RCU's status as a synchronization primitive means that
2077any RCU failure can result in arbitrary memory corruption that can be
2078extremely difficult to debug.
2079This means that RCU must be extremely reliable, which in
2080practice also means that RCU must have an aggressive stress-test
2081suite.
2082This stress-test suite is called <tt>rcutorture</tt>.
2083
2084<p>
2085Although the need for <tt>rcutorture</tt> was no surprise,
2086the current immense popularity of the Linux kernel is posing
2087interesting&mdash;and perhaps unprecedented&mdash;validation
2088challenges.
2089To see this, keep in mind that there are well over one billion
2090instances of the Linux kernel running today, given Android
2091smartphones, Linux-powered televisions, and servers.
2092This number can be expected to increase sharply with the advent of
2093the celebrated Internet of Things.
2094
2095<p>
2096Suppose that RCU contains a race condition that manifests on average
2097once per million years of runtime.
2098This bug will be occurring about three times per <i>day</i> across
2099the installed base.
2100RCU could simply hide behind hardware error rates, given that no one
2101should really expect their smartphone to last for a million years.
2102However, anyone taking too much comfort from this thought should
2103consider the fact that in most jurisdictions, a successful multi-year
2104test of a given mechanism, which might include a Linux kernel,
2105suffices for a number of types of safety-critical certifications.
2106In fact, rumor has it that the Linux kernel is already being used
2107in production for safety-critical applications.
2108I don't know about you, but I would feel quite bad if a bug in RCU
2109killed someone.
2110Which might explain my recent focus on validation and verification.
2111
2112<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
2113
2114<p>
2115One of the more surprising things about RCU is that there are now
2116no fewer than five <i>flavors</i>, or API families.
2117In addition, the primary flavor that has been the sole focus up to
2118this point has two different implementations, non-preemptible and
2119preemptible.
2120The other four flavors are listed below, with requirements for each
2121described in a separate section.
2122
2123<ol>
2124<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
2125<li> <a href="#Sched Flavor">Sched Flavor</a>
2126<li> <a href="#Sleepable RCU">Sleepable RCU</a>
2127<li> <a href="#Tasks RCU">Tasks RCU</a>
2128</ol>
2129
2130<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
2131
2132<p>
2133The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
2134hence the &ldquo;_bh&rdquo; abbreviations)
2135flavor of RCU, or <i>RCU-bh</i>, was developed by
2136Dipankar Sarma to provide a flavor of RCU that could withstand the
2137network-based denial-of-service attacks researched by Robert
2138Olsson.
2139These attacks placed so much networking load on the system
2140that some of the CPUs never exited softirq execution,
2141which in turn prevented those CPUs from ever executing a context switch,
2142which, in the RCU implementation of that time, prevented grace periods
2143from ever ending.
2144The result was an out-of-memory condition and a system hang.
2145
2146<p>
2147The solution was the creation of RCU-bh, which does
2148<tt>local_bh_disable()</tt>
2149across its read-side critical sections, and which uses the transition
2150from one type of softirq processing to another as a quiescent state
2151in addition to context switch, idle, user mode, and offline.
2152This means that RCU-bh grace periods can complete even when some of
2153the CPUs execute in softirq indefinitely, thus allowing algorithms
2154based on RCU-bh to withstand network-based denial-of-service attacks.
2155
2156<p>
2157Because
2158<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
2159disable and re-enable softirq handlers, any attempt to start a softirq
2160handler during the
2161RCU-bh read-side critical section will be deferred.
2162In this case, <tt>rcu_read_unlock_bh()</tt>
2163will invoke softirq processing, which can take considerable time.
2164One can of course argue that this softirq overhead should be associated
2165with the code following the RCU-bh read-side critical section rather
2166than <tt>rcu_read_unlock_bh()</tt>, but the fact
2167is that most profiling tools cannot be expected to make this sort
2168of fine distinction.
2169For example, suppose that a three-millisecond-long RCU-bh read-side
2170critical section executes during a time of heavy networking load.
2171There will very likely be an attempt to invoke at least one softirq
2172handler during that three milliseconds, but any such invocation will
2173be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
2174This can of course make it appear at first glance as if
2175<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
2176
2177<p>
2178The
2179<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
2180includes
2181<tt>rcu_read_lock_bh()</tt>,
2182<tt>rcu_read_unlock_bh()</tt>,
2183<tt>rcu_dereference_bh()</tt>,
2184<tt>rcu_dereference_bh_check()</tt>,
2185<tt>synchronize_rcu_bh()</tt>,
2186<tt>synchronize_rcu_bh_expedited()</tt>,
2187<tt>call_rcu_bh()</tt>,
2188<tt>rcu_barrier_bh()</tt>, and
2189<tt>rcu_read_lock_bh_held()</tt>.
2190
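<p>
A minimal RCU-bh reader therefore looks something like the following
sketch, assuming a hypothetical RCU-protected pointer <tt>gp</tt>:

<blockquote>
<pre>
 1 rcu_read_lock_bh();
 2 p = rcu_dereference_bh(gp);
 3 if (p)
 4   do_something_with(p);
 5 rcu_read_unlock_bh(); /* May run deferred softirq handlers. */
</pre>
</blockquote>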
2191<h3><a name="Sched Flavor">Sched Flavor</a></h3>
2192
2193<p>
2194Before preemptible RCU, waiting for an RCU grace period had the
2195side effect of also waiting for all pre-existing interrupt
2196and NMI handlers.
2197However, there are legitimate preemptible-RCU implementations that
2198do not have this property, given that any point in the code outside
2199of an RCU read-side critical section can be a quiescent state.
2200Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
2201RCU in that an RCU-sched grace period waits for pre-existing
2202interrupt and NMI handlers.
2203In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
2204APIs have identical implementations, while kernels built with
2205<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
2206
2207<p>
2208Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
2209<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
2210disable and re-enable preemption, respectively.
2211This means that if there was a preemption attempt during the
2212RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
2213will enter the scheduler, with all the latency and overhead entailed.
2214Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
2215as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
2216However, the highest-priority task won't be preempted, so that task
2217will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
2218
2219<p>
2220The
2221<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
2222includes
2223<tt>rcu_read_lock_sched()</tt>,
2224<tt>rcu_read_unlock_sched()</tt>,
2225<tt>rcu_read_lock_sched_notrace()</tt>,
2226<tt>rcu_read_unlock_sched_notrace()</tt>,
2227<tt>rcu_dereference_sched()</tt>,
2228<tt>rcu_dereference_sched_check()</tt>,
2229<tt>synchronize_sched()</tt>,
2230<tt>synchronize_sched_expedited()</tt>,
2231<tt>call_rcu_sched()</tt>,
2232<tt>rcu_barrier_sched()</tt>, and
2233<tt>rcu_read_lock_sched_held()</tt>.
2234However, anything that disables preemption also marks an RCU-sched
2235read-side critical section, including
2236<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
2237<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
2238and so on.
2239
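<p>
In other words, the following sketch shows two equivalent ways of marking
an RCU-sched read-side critical section, either of which will be waited on
by <tt>synchronize_sched()</tt> (again assuming a hypothetical
RCU-protected pointer <tt>gp</tt>):

<blockquote>
<pre>
 1 /* Explicit markers: */
 2 rcu_read_lock_sched();
 3 p = rcu_dereference_sched(gp);
 4 do_something_with(p);
 5 rcu_read_unlock_sched();
 6
 7 /* Equivalent implicit markers via preemption disabling: */
 8 preempt_disable();
 9 p = rcu_dereference_sched(gp);
10 do_something_with(p);
11 preempt_enable();
</pre>
</blockquote>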
2240<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
2241
2242<p>
2243For well over a decade, someone saying &ldquo;I need to block within
2244an RCU read-side critical section&rdquo; was a reliable indication
2245that this someone did not understand RCU.
2246After all, if you are always blocking in an RCU read-side critical
2247section, you can probably afford to use a higher-overhead synchronization
2248mechanism.
2249However, that changed with the advent of the Linux kernel's notifiers,
2250whose RCU read-side critical
2251sections almost never sleep, but sometimes need to.
2252This resulted in the introduction of
2253<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
2254or <i>SRCU</i>.
2255
2256<p>
2257SRCU allows different domains to be defined, with each such domain
2258defined by an instance of an <tt>srcu_struct</tt> structure.
2259A pointer to this structure must be passed in to each SRCU function,
2260for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
2261<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
2262The key benefit of these domains is that a slow SRCU reader in one
2263domain does not delay an SRCU grace period in some other domain.
2264That said, one consequence of these domains is that read-side code
2265must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
2266to <tt>srcu_read_unlock()</tt>, for example, as follows:
2267
2268<blockquote>
2269<pre>
2270 1 int idx;
2271 2
2272 3 idx = srcu_read_lock(&amp;ss);
2273 4 do_something();
2274 5 srcu_read_unlock(&amp;ss, idx);
2275</pre>
2276</blockquote>
2277
2278<p>
2279As noted above, it is legal to block within SRCU read-side critical sections;
2280however, with great power comes great responsibility.
2281If you block forever in one of a given domain's SRCU read-side critical
2282sections, then that domain's grace periods will also be blocked forever.
2283Of course, one good way to block forever is to deadlock, which can
2284happen if any operation in a given domain's SRCU read-side critical
2285section can block waiting, either directly or indirectly, for that domain's
2286grace period to elapse.
2287For example, this results in a self-deadlock:
2288
2289<blockquote>
2290<pre>
2291 1 int idx;
2292 2
2293 3 idx = srcu_read_lock(&amp;ss);
2294 4 do_something();
2295 5 synchronize_srcu(&amp;ss);
2296 6 srcu_read_unlock(&amp;ss, idx);
2297</pre>
2298</blockquote>
2299
2300<p>
2301However, if line&nbsp;5 acquired a mutex that was held across
2302a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
2303deadlock would still be possible.
2304Furthermore, if line&nbsp;5 acquired a mutex that was held across
2305a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
2306and if an <tt>ss1</tt>-domain SRCU read-side critical section
2307acquired another mutex that was held across an <tt>ss</tt>-domain
2308<tt>synchronize_srcu()</tt>,
2309deadlock would again be possible.
2310Such a deadlock cycle could extend across an arbitrarily large number
2311of different SRCU domains.
2312Again, with great power comes great responsibility.
2313
2314<p>
2315Unlike the other RCU flavors, SRCU read-side critical sections can
2316run on idle and even offline CPUs.
2317This ability requires that <tt>srcu_read_lock()</tt> and
2318<tt>srcu_read_unlock()</tt> contain memory barriers, which means
2319that SRCU readers will run a bit slower than would RCU readers.
2320It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
2321API, which, in combination with <tt>srcu_read_unlock()</tt>,
2322guarantees a full memory barrier.
2323
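<p>
For example, a reader needing a full memory barrier after its SRCU
read-side critical section might combine these two APIs as in the
following sketch:

<blockquote>
<pre>
 1 idx = srcu_read_lock(&amp;ss);
 2 do_something();
 3 srcu_read_unlock(&amp;ss, idx);
 4 smp_mb__after_srcu_read_unlock(); /* With the unlock, a full barrier. */
</pre>
</blockquote>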
2324<p>
2325The
2326<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
2327includes
2328<tt>srcu_read_lock()</tt>,
2329<tt>srcu_read_unlock()</tt>,
2330<tt>srcu_dereference()</tt>,
2331<tt>srcu_dereference_check()</tt>,
2332<tt>synchronize_srcu()</tt>,
2333<tt>synchronize_srcu_expedited()</tt>,
2334<tt>call_srcu()</tt>,
2335<tt>srcu_barrier()</tt>, and
2336<tt>srcu_read_lock_held()</tt>.
2337It also includes
2338<tt>DEFINE_SRCU()</tt>,
2339<tt>DEFINE_STATIC_SRCU()</tt>, and
2340<tt>init_srcu_struct()</tt>
2341APIs for defining and initializing <tt>srcu_struct</tt> structures.
2342
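<p>
For example, an SRCU domain might be set up in either of the following
ways, with the <tt>my_srcu</tt> names chosen purely for illustration:

<blockquote>
<pre>
 1 /* Compile-time definition and initialization: */
 2 DEFINE_STATIC_SRCU(my_srcu);
 3
 4 /* Run-time initialization: */
 5 struct srcu_struct my_dynamic_srcu;
 6
 7 int my_init(void)
 8 {
 9   return init_srcu_struct(&amp;my_dynamic_srcu); /* Zero on success. */
10 }
</pre>
</blockquote>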
2343<h3><a name="Tasks RCU">Tasks RCU</a></h3>
2344
2345<p>
2346Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
2347binary rewriting required to install different types of probes.
2348It would be good to be able to free old trampolines, which sounds
2349like a job for some form of RCU.
2350However, because it is necessary to be able to install a trace
2351anywhere in the code, it is not possible to use read-side markers
2352such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
2353In addition, it does not work to have these markers in the trampoline
2354itself, because there would need to be instructions following
2355<tt>rcu_read_unlock()</tt>.
2356Although <tt>synchronize_rcu()</tt> would guarantee that execution
2357reached the <tt>rcu_read_unlock()</tt>, it would not be able to
2358guarantee that execution had completely left the trampoline.
2359
2360<p>
2361The solution, in the form of
2362<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
2363is to have implicit
2364read-side critical sections that are delimited by voluntary context
2365switches, that is, calls to <tt>schedule()</tt>,
2366<tt>cond_resched_rcu_qs()</tt>, and
2367<tt>synchronize_rcu_tasks()</tt>.
2368In addition, transitions to and from userspace execution also delimit
2369tasks-RCU read-side critical sections.
2370
2371<p>
2372The tasks-RCU API is quite compact, consisting only of
2373<tt>call_rcu_tasks()</tt>,
2374<tt>synchronize_rcu_tasks()</tt>, and
2375<tt>rcu_barrier_tasks()</tt>.
2376
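<p>
For example, a tracing subsystem might retire an old trampoline using
something like the following sketch, in which <tt>struct tramp</tt> and
its fields are hypothetical:

<blockquote>
<pre>
 1 struct tramp {
 2   void *insns;
 3   struct rcu_head rh;
 4 };
 5
 6 static void free_tramp(struct rcu_head *rhp)
 7 {
 8   struct tramp *tp = container_of(rhp, struct tramp, rh);
 9
10   vfree(tp-&gt;insns);
11   kfree(tp);
12 }
13
14 void retire_tramp(struct tramp *old_tramp)
15 {
16   /* free_tramp() runs only after no task is executing in old_tramp. */
17   call_rcu_tasks(&amp;old_tramp-&gt;rh, free_tramp);
18 }
</pre>
</blockquote>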
2377<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
2378
2379<p>
2380One of the tricks that RCU uses to attain update-side scalability is
2381to increase grace-period latency with increasing numbers of CPUs.
2382If this becomes a serious problem, it will be necessary to rework the
2383grace-period state machine so as to avoid the need for the additional
2384latency.
2385
2386<p>
2387Expedited grace periods scan the CPUs, so their latency and overhead
2388increases with increasing numbers of CPUs.
2389If this becomes a serious problem on large systems, it will be necessary
2390to do some redesign to avoid this scalability problem.
2391
2392<p>
2393RCU disables CPU hotplug in a few places, perhaps most notably in the
2394expedited grace-period and <tt>rcu_barrier()</tt> operations.
2395If there is a strong reason to use expedited grace periods in CPU-hotplug
2396notifiers, it will be necessary to avoid disabling CPU hotplug.
2397This would introduce some complexity, so there had better be a <i>very</i>
2398good reason.
2399
2400<p>
2401The tradeoff between grace-period latency on the one hand and interruptions
2402of other CPUs on the other hand may need to be re-examined.
2403The desire is of course for zero grace-period latency as well as zero
2404interprocessor interrupts undertaken during an expedited grace period
2405operation.
2406While this ideal is unlikely to be achievable, it is quite possible that
2407further improvements can be made.
2408
2409<p>
2410The multiprocessor implementations of RCU use a combining tree that
2411groups CPUs so as to reduce lock contention and increase cache locality.
2412However, this combining tree does not spread its memory across NUMA
2413nodes nor does it align the CPU groups with hardware features such
2414as sockets or cores.
2415Such spreading and alignment is currently believed to be unnecessary
2416because the hotpath read-side primitives do not access the combining
2417tree, nor does <tt>call_rcu()</tt> in the common case.
2418If you believe that your architecture needs such spreading and alignment,
2419then your architecture should also benefit from the
2420<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
2421to the number of CPUs in a socket, NUMA node, or whatever.
2422If the number of CPUs is too large, use a fraction of the number of
2423CPUs.
2424If the number of CPUs is a large prime number, well, that certainly
2425is an &ldquo;interesting&rdquo; architectural choice!
2426More flexible arrangements might be considered, but only if
2427<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
2428if the inadequacy has been demonstrated by a carefully run and
2429realistic system-level workload.
2430
2431<p>
2432Please note that arrangements that require RCU to remap CPU numbers will
2433require extremely good demonstration of need and full exploration of
2434alternatives.
2435
2436<p>
2437There is an embarrassingly large number of flavors of RCU, and this
2438number has been increasing over time.
2439Perhaps it will be possible to combine some at some future date.
2440
2441<p>
2442RCU's various kthreads are reasonably recent additions.
2443It is quite likely that adjustments will be required to more gracefully
2444handle extreme loads.
2445It might also be necessary to be able to relate CPU utilization by
2446RCU's kthreads and softirq handlers to the code that instigated this
2447CPU utilization.
2448For example, RCU callback overhead might be charged back to the
2449originating <tt>call_rcu()</tt> instance, though probably not
2450in production kernels.
2451
2452<h2><a name="Summary">Summary</a></h2>
2453
2454<p>
2455This document has presented more than two decades' worth of RCU
2456requirements.
2457Given that the requirements keep changing, this will not be the last
2458word on this subject, but at least it serves to get an important
2459subset of the requirements set forth.
2460
2461<h2><a name="Acknowledgments">Acknowledgments</a></h2>
2462
2463I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
2464Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
2465Andy Lutomirski for their help in rendering
2466this article human readable, and to Michelle Rankin for her support
2467of this effort.
2468Other contributions are acknowledged in the Linux kernel's git archive.
2469The cartoon is copyright (c) 2013 by Melissa Broussard,
2470and is provided
2471under the terms of the Creative Commons Attribution-Share Alike 3.0
2472United States license.
2473
2474<h3><a name="Answers to Quick Quizzes">
2475Answers to Quick Quizzes</a></h3>
2476
2477<a name="qq1answer"></a>
2478<p><b>Quick Quiz 1</b>:
2479Wait a minute!
2480You said that updaters can make useful forward progress concurrently
2481with readers, but pre-existing readers will block
2482<tt>synchronize_rcu()</tt>!!!
2483Just who are you trying to fool???
2484
2485
2486</p><p><b>Answer</b>:
2487First, if updaters do not wish to be blocked by readers, they can use
2488<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
2489be discussed later.
2490Second, even when using <tt>synchronize_rcu()</tt>, the other
2491update-side code does run concurrently with readers, whether pre-existing
2492or not.
2493
2494
2495</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a>
2496
2497<a name="qq2answer"></a>
2498<p><b>Quick Quiz 2</b>:
2499Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
2500
2501
2502</p><p><b>Answer</b>:
2503Without that extra grace period, memory reordering could result in
2504<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
2505concurrently with the last bits of <tt>recovery()</tt>.
2506
2507
2508</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a>
2509
2510<a name="qq3answer"></a>
2511<p><b>Quick Quiz 3</b>:
2512But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
2513two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
2514from being reordered.
2515Can't that also cause problems?
2516
2517
2518</p><p><b>Answer</b>:
2519No, it cannot.
2520The readers cannot see either of these two fields until
2521the assignment to <tt>gp</tt>, by which time both fields are
2522fully initialized.
2523So reordering the assignments
2524to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
2525cause any problems.
2526
2527
2528</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a>
2529
2530<a name="qq4answer"></a>
2531<p><b>Quick Quiz 4</b>:
2532Without the <tt>rcu_dereference()</tt> or the
2533<tt>rcu_access_pointer()</tt>, what destructive optimizations
2534might the compiler make use of?
2535
2536
2537</p><p><b>Answer</b>:
2538Let's start with what happens to <tt>do_something_gp()</tt>
2539if it fails to use <tt>rcu_dereference()</tt>.
2540It could reuse a value formerly fetched from this same pointer.
2541It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
2542manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
2543mash-up of two distinct pointer values.
2544It might even use value-speculation optimizations, where it makes a wrong
2545guess, but by the time it gets around to checking the value, an update
2546has changed the pointer to match the wrong guess.
2547Too bad about any dereferences that returned pre-initialization garbage
2548in the meantime!
2549
2550<p>
2551For <tt>remove_gp_synchronous()</tt>, as long as all modifications
2552to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
2553the above optimizations are harmless.
2554However,
2555with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
2556<tt>sparse</tt> will complain if you
2557define <tt>gp</tt> with <tt>__rcu</tt> and then
2558access it without using
2559either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
2560
2561
2562</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a>
2563
2564<a name="qq5answer"></a>
2565<p><b>Quick Quiz 5</b>:
2566Given that multiple CPUs can start RCU read-side critical sections
2567at any time without any ordering whatsoever, how can RCU possibly tell whether
2568or not a given RCU read-side critical section starts before a
2569given instance of <tt>synchronize_rcu()</tt>?
2570
2571
2572</p><p><b>Answer</b>:
2573If RCU cannot tell whether or not a given
2574RCU read-side critical section starts before a
2575given instance of <tt>synchronize_rcu()</tt>,
2576then it must assume that the RCU read-side critical section
2577started first.
2578In other words, a given instance of <tt>synchronize_rcu()</tt>
2579can avoid waiting on a given RCU read-side critical section only
2580if it can prove that <tt>synchronize_rcu()</tt> started first.
2581
2582
2583</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a>
2584
2585<a name="qq6answer"></a>
2586<p><b>Quick Quiz 6</b>:
2587The first and second guarantees require unbelievably strict ordering!
2588Are all these memory barriers <i> really</i> required?
2589
2590
2591</p><p><b>Answer</b>:
2592Yes, they really are required.
2593To see why the first guarantee is required, consider the following
2594sequence of events:
2595
2596<ol>
2597<li> CPU 1: <tt>rcu_read_lock()</tt>
2598<li> CPU 1: <tt>q = rcu_dereference(gp);
2599 /* Very likely to return p. */</tt>
2600<li> CPU 0: <tt>list_del_rcu(p);</tt>
2601<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
2602<li> CPU 1: <tt>do_something_with(q-&gt;a);
2603 /* No smp_mb(), so might happen after kfree(). */</tt>
2604<li> CPU 1: <tt>rcu_read_unlock()</tt>
2605<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
2606<li> CPU 0: <tt>kfree(p);</tt>
2607</ol>
2608
2609<p>
2610Therefore, there absolutely must be a full memory barrier between the
2611end of the RCU read-side critical section and the end of the
2612grace period.
2613
2614<p>
2615The sequence of events demonstrating the necessity of the second rule
2616is roughly similar:
2617
2618<ol>
2619<li> CPU 0: <tt>list_del_rcu(p);</tt>
2620<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
2621<li> CPU 1: <tt>rcu_read_lock()</tt>
2622<li> CPU 1: <tt>q = rcu_dereference(gp);
2623 /* Might return p if no memory barrier. */</tt>
2624<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
2625<li> CPU 0: <tt>kfree(p);</tt>
2626<li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
2627<li> CPU 1: <tt>rcu_read_unlock()</tt>
2628</ol>
2629
2630<p>
2631And similarly, without a memory barrier between the beginning of the
2632grace period and the beginning of the RCU read-side critical section,
2633CPU&nbsp;1 might end up accessing the freelist.
2634
2635<p>
2636The &ldquo;as if&rdquo; rule of course applies, so that any implementation
2637that acts as if the appropriate memory barriers were in place is a
2638correct implementation.
2639That said, it is much easier to fool yourself into believing that you have
2640adhered to the as-if rule than it is to actually adhere to it!
2641
2642
2643</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a>
2644
2645<a name="qq7answer"></a>
2646<p><b>Quick Quiz 7</b>:
2647But how does the upgrade-to-write operation exclude other readers?
2648
2649
2650</p><p><b>Answer</b>:
2651It doesn't, just like normal RCU updates, which also do not exclude
2652RCU readers.
2653
2654
2655</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a>
2656
2657<a name="qq8answer"></a>
2658<p><b>Quick Quiz 8</b>:
2659Can't the compiler also reorder this code?
2660
2661
2662</p><p><b>Answer</b>:
2663No, the volatile casts in <tt>READ_ONCE()</tt> and
2664<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
2665this particular case.
2666
2667
2668</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a>
2669
2670<a name="qq9answer"></a>
2671<p><b>Quick Quiz 9</b>:
2672Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
2673Would the updater be able to rely on this?
2674
2675
2676</p><p><b>Answer</b>:
2677No.
2678Even if <tt>synchronize_rcu()</tt> were to wait until
2679all readers had completed, a new reader might start immediately after
2680<tt>synchronize_rcu()</tt> completed.
2681Therefore, the code following
2682<tt>synchronize_rcu()</tt> cannot rely on there being no readers
2683in any case.
2684
2685
2686</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a>
2687
2688<a name="qq10answer"></a>
2689<p><b>Quick Quiz 10</b>:
2690How long a sequence of grace periods, each separated by an RCU read-side
2691critical section, would be required to partition the RCU read-side
2692critical sections at the beginning and end of the chain?
2693
2694
2695</p><p><b>Answer</b>:
2696In theory, an infinite number.
2697In practice, an unknown number that is sensitive to both implementation
2698details and timing considerations.
2699Therefore, even in practice, RCU users must abide by the theoretical rather
2700than the practical answer.
2701
2702
2703</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a>
2704
2705<a name="qq11answer"></a>
2706<p><b>Quick Quiz 11</b>:
2707What about sleeping locks?
2708
2709
2710</p><p><b>Answer</b>:
2711These are forbidden within Linux-kernel RCU read-side critical sections
2712because it is not legal to place a quiescent state (in this case,
2713voluntary context switch) within an RCU read-side critical section.
2714However, sleeping locks may be used within userspace RCU read-side critical
2715sections, and also within Linux-kernel sleepable RCU
2716<a href="#Sleepable RCU">(SRCU)</a>
2717read-side critical sections.
2718In addition, the -rt patchset turns spinlocks into sleeping locks so
2719that the corresponding critical sections can be preempted, which
2720also means that these sleeplockified spinlocks (but not other sleeping locks!)
2721may be acquired within -rt-Linux-kernel RCU read-side critical sections.
2722
2723<p>
2724Note that it <i>is</i> legal for a normal RCU read-side critical section
2725to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
2726but only as long as it does not loop indefinitely attempting to
2727conditionally acquire that sleeping lock.
2728The key point is that things like <tt>mutex_trylock()</tt>
2729either return with the mutex held, or return an error indication if
2730the mutex was not immediately available.
2731Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
2732
2733
2734</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a>
2735
2736<a name="qq12answer"></a>
2737<p><b>Quick Quiz 12</b>:
2738Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
2739After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
2740structure, which would interact badly with concurrent insertions.
2741Doesn't this mean that <tt>rcu_dereference()</tt> is required?
2742
2743
2744</p><p><b>Answer</b>:
2745Presumably the <tt>gp_lock</tt> acquired on line&nbsp;18 excludes
2746any changes, including any insertions that <tt>rcu_dereference()</tt>
2747would protect against.
2748Therefore, any insertions will be delayed until after <tt>gp_lock</tt>
2749is released on line&nbsp;26, which in turn means that
2750<tt>rcu_access_pointer()</tt> suffices.
2751
2752
2753</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a>
2754
2755<a name="qq13answer"></a>
2756<p><b>Quick Quiz 13</b>:
2757Earlier it was claimed that <tt>call_rcu()</tt> and
2758<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
2759by readers.
2760But how can that be correct, given that the invocation of the callback
2761and the freeing of the memory (respectively) must still wait for
2762a grace period to elapse?
2763
2764
2765</p><p><b>Answer</b>:
2766We could define things this way, but keep in mind that this sort of
2767definition would say that updates in garbage-collected languages
2768cannot complete until the next time the garbage collector runs,
2769which does not seem at all reasonable.
2770The key point is that in most cases, an updater using either
2771<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
2772next update as soon as it has invoked <tt>call_rcu()</tt> or
2773<tt>kfree_rcu()</tt>, without having to wait for a subsequent
2774grace period.
2775
2776
2777</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a>
2778
2779<a name="qq14answer"></a>
2780<p><b>Quick Quiz 14</b>:
2781So what happens with <tt>synchronize_rcu()</tt> during
2782scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
2783kernels?
2784
2785
2786</p><p><b>Answer</b>:
2787In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
2788maps directly to <tt>synchronize_sched()</tt>.
2789Therefore, <tt>synchronize_rcu()</tt> works normally throughout
2790boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
2791However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
2792so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
2793during scheduler initialization.
2794
2795
2796</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a>
2797
2798
2799</body></html>
diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx
new file mode 100644
index 000000000000..1168010c39fe
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx
@@ -0,0 +1,2643 @@
1<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2 "http://www.w3.org/TR/html4/loose.dtd">
3 <html>
4 <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
5 <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
6
7<h1>A Tour Through RCU's Requirements</h1>
8
9<p>Copyright IBM Corporation, 2015</p>
10<p>Author: Paul E.&nbsp;McKenney</p>
11<p><i>The initial version of this document appeared in the
12<a href="https://lwn.net/">LWN</a> articles
13<a href="https://lwn.net/Articles/652156/">here</a>,
14<a href="https://lwn.net/Articles/652677/">here</a>, and
15<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
16
17<h2>Introduction</h2>
18
19<p>
20Read-copy update (RCU) is a synchronization mechanism that is often
21used as a replacement for reader-writer locking.
22RCU is unusual in that updaters do not block readers,
23which means that RCU's read-side primitives can be exceedingly fast
24and scalable.
25In addition, updaters can make useful forward progress concurrently
26with readers.
27However, all this concurrency between RCU readers and updaters does raise
28the question of exactly what RCU readers are doing, which in turn
29raises the question of exactly what RCU's requirements are.
30
31<p>
32This document therefore summarizes RCU's requirements, and can be thought
33of as an informal, high-level specification for RCU.
34It is important to understand that RCU's specification is primarily
35empirical in nature;
36in fact, I learned about many of these requirements the hard way.
37This situation might cause some consternation. However, not only
38has this learning process been a lot of fun, but it has also been
39a great privilege to work with so many people willing to apply
40technologies in interesting new ways.
41
42<p>
43All that aside, here are the categories of currently known RCU requirements:
44</p>
45
46<ol>
47<li> <a href="#Fundamental Requirements">
48 Fundamental Requirements</a>
49<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
50<li> <a href="#Parallelism Facts of Life">
51 Parallelism Facts of Life</a>
52<li> <a href="#Quality-of-Implementation Requirements">
53 Quality-of-Implementation Requirements</a>
54<li> <a href="#Linux Kernel Complications">
55 Linux Kernel Complications</a>
56<li> <a href="#Software-Engineering Requirements">
57 Software-Engineering Requirements</a>
58<li> <a href="#Other RCU Flavors">
59 Other RCU Flavors</a>
60<li> <a href="#Possible Future Changes">
61 Possible Future Changes</a>
62</ol>
63
64<p>
65This is followed by a <a href="#Summary">summary</a>,
66which is in turn followed by the inevitable
67<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
68
69<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
70
71<p>
72RCU's fundamental requirements are the closest thing RCU has to hard
73mathematical requirements.
74These are:
75
76<ol>
77<li> <a href="#Grace-Period Guarantee">
78 Grace-Period Guarantee</a>
79<li> <a href="#Publish-Subscribe Guarantee">
80 Publish-Subscribe Guarantee</a>
81<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
82 RCU Primitives Guaranteed to Execute Unconditionally</a>
83<li> <a href="#Guaranteed Read-to-Write Upgrade">
84 Guaranteed Read-to-Write Upgrade</a>
85</ol>
86
87<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
88
89<p>
90RCU's grace-period guarantee is unusual in being premeditated:
91Jack Slingwine and I had this guarantee firmly in mind when we started
92work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
93That said, the past two decades of experience with RCU have produced
94a much more detailed understanding of this guarantee.
95
96<p>
97RCU's grace-period guarantee allows updaters to wait for the completion
98of all pre-existing RCU read-side critical sections.
99An RCU read-side critical section
100begins with the marker <tt>rcu_read_lock()</tt> and ends with
101the marker <tt>rcu_read_unlock()</tt>.
102These markers may be nested, and RCU treats a nested set as one
103big RCU read-side critical section.
104Production-quality implementations of <tt>rcu_read_lock()</tt> and
105<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
106fact have exactly zero overhead in Linux kernels built for production
107use with <tt>CONFIG_PREEMPT=n</tt>.
108
109<p>
110This guarantee allows ordering to be enforced with extremely low
111overhead to readers, for example:
112
113<blockquote>
114<pre>
115 1 int x, y;
116 2
117 3 void thread0(void)
118 4 {
119 5 rcu_read_lock();
120 6 r1 = READ_ONCE(x);
121 7 r2 = READ_ONCE(y);
122 8 rcu_read_unlock();
123 9 }
12410
12511 void thread1(void)
12612 {
12713 WRITE_ONCE(x, 1);
12814 synchronize_rcu();
12915 WRITE_ONCE(y, 1);
13016 }
131</pre>
132</blockquote>
133
134<p>
135Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
136all pre-existing readers, any instance of <tt>thread0()</tt> that
137loads a value of zero from <tt>x</tt> must complete before
138<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
139also load a value of zero from <tt>y</tt>.
140Similarly, any instance of <tt>thread0()</tt> that loads a value of
141one from <tt>y</tt> must have started after the
142<tt>synchronize_rcu()</tt> started, and must therefore also load
143a value of one from <tt>x</tt>.
144Therefore, the outcome:
145<blockquote>
146<pre>
147(r1 == 0 &amp;&amp; r2 == 1)
148</pre>
149</blockquote>
150cannot happen.
151
152<p>@@QQ@@
153Wait a minute!
154You said that updaters can make useful forward progress concurrently
155with readers, but pre-existing readers will block
156<tt>synchronize_rcu()</tt>!!!
157Just who are you trying to fool???
158<p>@@QQA@@
159First, if updaters do not wish to be blocked by readers, they can use
160<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
161be discussed later.
162Second, even when using <tt>synchronize_rcu()</tt>, the other
163update-side code does run concurrently with readers, whether pre-existing
164or not.
165<p>@@QQE@@
166
167<p>
168This scenario resembles one of the first uses of RCU in
169<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
170which managed a distributed lock manager's transition into
171a state suitable for handling recovery from node failure,
172more or less as follows:
173
174<blockquote>
175<pre>
176 1 #define STATE_NORMAL 0
177 2 #define STATE_WANT_RECOVERY 1
178 3 #define STATE_RECOVERING 2
179 4 #define STATE_WANT_NORMAL 3
180 5
181 6 int state = STATE_NORMAL;
182 7
183 8 void do_something_dlm(void)
184 9 {
18510 int state_snap;
18611
18712 rcu_read_lock();
18813 state_snap = READ_ONCE(state);
18914 if (state_snap == STATE_NORMAL)
19015 do_something();
19116 else
19217 do_something_carefully();
19318 rcu_read_unlock();
19419 }
19520
19621 void start_recovery(void)
19722 {
19823 WRITE_ONCE(state, STATE_WANT_RECOVERY);
19924 synchronize_rcu();
20025 WRITE_ONCE(state, STATE_RECOVERING);
20126 recovery();
20227 WRITE_ONCE(state, STATE_WANT_NORMAL);
20328 synchronize_rcu();
20429 WRITE_ONCE(state, STATE_NORMAL);
20530 }
206</pre>
207</blockquote>
208
209<p>
210The RCU read-side critical section in <tt>do_something_dlm()</tt>
211works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
212to guarantee that <tt>do_something()</tt> never runs concurrently
213with <tt>recovery()</tt>, but with little or no synchronization
214overhead in <tt>do_something_dlm()</tt>.
215
216<p>@@QQ@@
217Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
218<p>@@QQA@@
219Without that extra grace period, memory reordering could result in
220<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
221concurrently with the last bits of <tt>recovery()</tt>.
222<p>@@QQE@@
223
224<p>
225In order to avoid fatal problems such as deadlocks,
226an RCU read-side critical section must not contain calls to
227<tt>synchronize_rcu()</tt>.
228Similarly, an RCU read-side critical section must not
229contain anything that waits, directly or indirectly, on completion of
230an invocation of <tt>synchronize_rcu()</tt>.
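<p>
For example, the following sketch shows the forbidden pattern, which
results in a self-deadlock because the grace period cannot end until
this very read-side critical section completes:

<blockquote>
<pre>
rcu_read_lock();
synchronize_rcu();  /* BUG: waits for a grace period that cannot end
                     * until this reader exits its critical section. */
rcu_read_unlock();
</pre>
</blockquote>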
231
232<p>
233Although RCU's grace-period guarantee is useful in and of itself, with
234<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
235it would be good to be able to use RCU to coordinate read-side
236access to linked data structures.
237For this, the grace-period guarantee is not sufficient, as can
238be seen in function <tt>add_gp_buggy()</tt> below.
239We will look at the reader's code later, but in the meantime, just think of
240the reader as locklessly picking up the <tt>gp</tt> pointer,
241and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
242<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
243
244<blockquote>
245<pre>
246 1 bool add_gp_buggy(int a, int b)
247 2 {
248 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
249 4 if (!p)
250 5 return -ENOMEM;
251 6 spin_lock(&amp;gp_lock);
252 7 if (rcu_access_pointer(gp)) {
253 8 spin_unlock(&amp;gp_lock);
254 9 return false;
25510 }
25611 p-&gt;a = a;
25712   p-&gt;b = b;
25813 gp = p; /* ORDERING BUG */
25914 spin_unlock(&amp;gp_lock);
26015 return true;
26116 }
262</pre>
263</blockquote>
264
265<p>
266The problem is that both the compiler and weakly ordered CPUs are within
267their rights to reorder this code as follows:
268
269<blockquote>
270<pre>
271 1 bool add_gp_buggy_optimized(int a, int b)
272 2 {
273 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
274 4 if (!p)
275 5 return -ENOMEM;
276 6 spin_lock(&amp;gp_lock);
277 7 if (rcu_access_pointer(gp)) {
278 8 spin_unlock(&amp;gp_lock);
279 9 return false;
28010 }
281<b>11 gp = p; /* ORDERING BUG */
28212 p-&gt;a = a;
28313   p-&gt;b = b;</b>
28414 spin_unlock(&amp;gp_lock);
28515 return true;
28616 }
287</pre>
288</blockquote>
289
290<p>
291If an RCU reader fetches <tt>gp</tt> just after
292<tt>add_gp_buggy_optimized</tt> executes line&nbsp;11,
293it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
294fields.
295And this is but one of many ways in which compiler and hardware optimizations
296could cause trouble.
297Therefore, we clearly need some way to prevent the compiler and the CPU from
298reordering in this manner, which brings us to the publish-subscribe
299guarantee discussed in the next section.
300
301<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3>
302
303<p>
304RCU's publish-subscribe guarantee allows data to be inserted
305into a linked data structure without disrupting RCU readers.
306The updater uses <tt>rcu_assign_pointer()</tt> to insert the
307new data, and readers use <tt>rcu_dereference()</tt> to
308access data, whether new or old.
309The following shows an example of insertion:
310
311<blockquote>
312<pre>
313 1 bool add_gp(int a, int b)
314 2 {
315 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
316 4 if (!p)
317 5 return -ENOMEM;
318 6 spin_lock(&amp;gp_lock);
319 7 if (rcu_access_pointer(gp)) {
320 8 spin_unlock(&amp;gp_lock);
321 9 return false;
32210 }
32311 p-&gt;a = a;
32412   p-&gt;b = b;
32513 rcu_assign_pointer(gp, p);
32614 spin_unlock(&amp;gp_lock);
32715 return true;
32816 }
329</pre>
330</blockquote>
331
332<p>
333The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
334equivalent to a simple assignment statement, but also guarantees
335that its assignment will
336happen after the two assignments in lines&nbsp;11 and&nbsp;12,
337similar to the C11 <tt>memory_order_release</tt> store operation.
338It also prevents any number of &ldquo;interesting&rdquo; compiler
339optimizations, for example, the use of <tt>gp</tt> as a scratch
340location immediately preceding the assignment.
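<p>
As a rough sketch (ignoring <tt>sparse</tt> type-checking and the
special-casing of compile-time <tt>NULL</tt> assignments), the ordering
provided by <tt>rcu_assign_pointer()</tt> is that of a release store:

<blockquote>
<pre>
/* Illustrative approximation only, not the actual kernel definition. */
#define sketch_rcu_assign_pointer(p, v) \
        smp_store_release(&amp;(p), (v))
</pre>
</blockquote>

This release ordering is what guarantees that readers dereferencing the
new <tt>gp</tt> value will see the assignments on lines&nbsp;11 and&nbsp;12.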
341
342<p>@@QQ@@
343But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
344two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
345from being reordered.
346Can't that also cause problems?
347<p>@@QQA@@
348No, it cannot.
349The readers cannot see either of these two fields until
350the assignment to <tt>gp</tt>, by which time both fields are
351fully initialized.
352So reordering the assignments
353to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
354cause any problems.
355<p>@@QQE@@
356
357<p>
358It is tempting to assume that the reader need not do anything special
359to control its accesses to the RCU-protected data,
360as shown in <tt>do_something_gp_buggy()</tt> below:
361
362<blockquote>
363<pre>
364 1 bool do_something_gp_buggy(void)
365 2 {
366 3 rcu_read_lock();
367 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
368 5 if (p) {
369 6 do_something(p-&gt;a, p-&gt;b);
370 7 rcu_read_unlock();
371 8 return true;
372 9 }
37310 rcu_read_unlock();
37411 return false;
37512 }
376</pre>
377</blockquote>
378
379<p>
380However, this temptation must be resisted because there are a
381surprisingly large number of ways that the compiler
382(to say nothing of
383<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
384can trip this code up.
385For but one example, if the compiler were short of registers, it
386might choose to refetch from <tt>gp</tt> rather than keeping
387a separate copy in <tt>p</tt> as follows:
388
389<blockquote>
390<pre>
391 1 bool do_something_gp_buggy_optimized(void)
392 2 {
393 3 rcu_read_lock();
394 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
395<b> 5 do_something(gp-&gt;a, gp-&gt;b);</b>
396 6 rcu_read_unlock();
397 7 return true;
398 8 }
399 9 rcu_read_unlock();
40010 return false;
40111 }
402</pre>
403</blockquote>
404
405<p>
406If this function ran concurrently with a series of updates that
407replaced the current structure with a new one,
408the fetches of <tt>gp-&gt;a</tt>
409and <tt>gp-&gt;b</tt> might well come from two different structures,
410which could cause serious confusion.
411To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
412<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
413
414<blockquote>
415<pre>
416 1 bool do_something_gp(void)
417 2 {
418 3 rcu_read_lock();
419 4 p = rcu_dereference(gp);
420 5 if (p) {
421 6 do_something(p-&gt;a, p-&gt;b);
422 7 rcu_read_unlock();
423 8 return true;
424 9 }
42510 rcu_read_unlock();
42611 return false;
42712 }
428</pre>
429</blockquote>
430
431<p>
432The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
433memory barriers in the Linux kernel.
434Should a
435<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
436ever appear, then <tt>rcu_dereference()</tt> could be implemented
437as a <tt>memory_order_consume</tt> load.
438Regardless of the exact implementation, a pointer fetched by
439<tt>rcu_dereference()</tt> may not be used outside of the
440outermost RCU read-side critical section containing that
441<tt>rcu_dereference()</tt>, unless protection of
442the corresponding data element has been passed from RCU to some
443other synchronization mechanism, most commonly locking or
444<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
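<p>
One common way of handing protection off to a reference count is shown
in the following sketch, which assumes a variant of <tt>struct foo</tt>
with an added <tt>atomic_t refcount</tt> field and a matching
(hypothetical) <tt>foo_put()</tt> release function:

<blockquote>
<pre>
struct foo *get_gp_reference(void)
{
        struct foo *p;

        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p &amp;&amp; !atomic_inc_not_zero(&amp;p-&gt;refcount))
                p = NULL;  /* Element already being freed, do not touch it. */
        rcu_read_unlock();
        return p;  /* If non-NULL, usable until foo_put(p) drops the reference. */
}
</pre>
</blockquote>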
445
446<p>
447In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
448use <tt>rcu_dereference()</tt>, and these two RCU API elements
449work together to ensure that readers have a consistent view of
450newly added data elements.
451
452<p>
453Of course, it is also necessary to remove elements from RCU-protected
454data structures, for example, using the following process:
455
456<ol>
457<li> Remove the data element from the enclosing structure.
458<li> Wait for all pre-existing RCU read-side critical sections
459 to complete (because only pre-existing readers can possibly have
460 a reference to the newly removed data element).
461<li> At this point, only the updater has a reference to the
462 newly removed data element, so it can safely reclaim
463 the data element, for example, by passing it to <tt>kfree()</tt>.
464</ol>
465
466This process is implemented by <tt>remove_gp_synchronous()</tt>:
467
468<blockquote>
469<pre>
470 1 bool remove_gp_synchronous(void)
471 2 {
472 3 struct foo *p;
473 4
474 5 spin_lock(&amp;gp_lock);
475 6 p = rcu_access_pointer(gp);
476 7 if (!p) {
477 8 spin_unlock(&amp;gp_lock);
478 9 return false;
47910 }
48011 rcu_assign_pointer(gp, NULL);
48112 spin_unlock(&amp;gp_lock);
48213 synchronize_rcu();
48314 kfree(p);
48415 return true;
48516 }
486</pre>
487</blockquote>
488
489<p>
490This function is straightforward, with line&nbsp;13 waiting for a grace
491period before line&nbsp;14 frees the old data element.
492This waiting ensures that readers will reach line&nbsp;7 of
493<tt>do_something_gp()</tt> before the data element referenced by
494<tt>p</tt> is freed.
495The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
496<tt>rcu_dereference()</tt>, except that:
497
498<ol>
499<li> The value returned by <tt>rcu_access_pointer()</tt>
500 cannot be dereferenced.
501 If you want to access the value pointed to as well as
502 the pointer itself, use <tt>rcu_dereference()</tt>
503 instead of <tt>rcu_access_pointer()</tt>.
504<li> The call to <tt>rcu_access_pointer()</tt> need not be
505 protected.
506 In contrast, <tt>rcu_dereference()</tt> must either be
507 within an RCU read-side critical section or in a code
508 segment where the pointer cannot change, for example, in
509 code protected by the corresponding update-side lock.
510</ol>
511
512<p>@@QQ@@
513Without the <tt>rcu_dereference()</tt> or the
514<tt>rcu_access_pointer()</tt>, what destructive optimizations
515might the compiler make use of?
516<p>@@QQA@@
517Let's start with what happens to <tt>do_something_gp()</tt>
518if it fails to use <tt>rcu_dereference()</tt>.
519It could reuse a value formerly fetched from this same pointer.
520It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
521manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
522mash-up of two distinct pointer values.
523It might even use value-speculation optimizations, where it makes a wrong
524guess, but by the time it gets around to checking the value, an update
525has changed the pointer to match the wrong guess.
526Too bad about any dereferences that returned pre-initialization garbage
527in the meantime!
528
529<p>
530For <tt>remove_gp_synchronous()</tt>, as long as all modifications
531to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
532the above optimizations are harmless.
533However,
534with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
535<tt>sparse</tt> will complain if you
536define <tt>gp</tt> with <tt>__rcu</tt> and then
537access it without using
538either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
539<p>@@QQE@@
540
541<p>
542This simple linked-data-structure scenario clearly demonstrates the need
543for RCU's stringent memory-ordering guarantees on systems with more than
544one CPU:
545
546<ol>
547<li> Each CPU that has an RCU read-side critical section that
548 begins before <tt>synchronize_rcu()</tt> starts is
549 guaranteed to execute a full memory barrier between the time
550 that the RCU read-side critical section ends and the time that
551 <tt>synchronize_rcu()</tt> returns.
552 Without this guarantee, a pre-existing RCU read-side critical section
553 might hold a reference to the newly removed <tt>struct foo</tt>
554 after the <tt>kfree()</tt> on line&nbsp;14 of
555 <tt>remove_gp_synchronous()</tt>.
556<li> Each CPU that has an RCU read-side critical section that ends
557 after <tt>synchronize_rcu()</tt> returns is guaranteed
558 to execute a full memory barrier between the time that
559 <tt>synchronize_rcu()</tt> begins and the time that the RCU
560 read-side critical section begins.
561 Without this guarantee, a later RCU read-side critical section
562 running after the <tt>kfree()</tt> on line&nbsp;14 of
563 <tt>remove_gp_synchronous()</tt> might
564 later run <tt>do_something_gp()</tt> and find the
565 newly deleted <tt>struct foo</tt>.
566<li> If the task invoking <tt>synchronize_rcu()</tt> remains
567 on a given CPU, then that CPU is guaranteed to execute a full
568 memory barrier sometime during the execution of
569 <tt>synchronize_rcu()</tt>.
570 This guarantee ensures that the <tt>kfree()</tt> on
571 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
572 execute after the removal on line&nbsp;11.
573<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
574 among a group of CPUs during that invocation, then each of the
575 CPUs in that group is guaranteed to execute a full memory barrier
576 sometime during the execution of <tt>synchronize_rcu()</tt>.
577 This guarantee also ensures that the <tt>kfree()</tt> on
578 line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
579 execute after the removal on
580	line&nbsp;11, even in the case where the thread executing the
581 <tt>synchronize_rcu()</tt> migrates in the meantime.
582</ol>
583
584<p>@@QQ@@
585Given that multiple CPUs can start RCU read-side critical sections
586at any time without any ordering whatsoever, how can RCU possibly tell whether
587or not a given RCU read-side critical section starts before a
588given instance of <tt>synchronize_rcu()</tt>?
589<p>@@QQA@@
590If RCU cannot tell whether or not a given
591RCU read-side critical section starts before a
592given instance of <tt>synchronize_rcu()</tt>,
593then it must assume that the RCU read-side critical section
594started first.
595In other words, a given instance of <tt>synchronize_rcu()</tt>
596can avoid waiting on a given RCU read-side critical section only
597if it can prove that <tt>synchronize_rcu()</tt> started first.
598<p>@@QQE@@
599
600<p>@@QQ@@
601The first and second guarantees require unbelievably strict ordering!
602Are all these memory barriers <i> really</i> required?
603<p>@@QQA@@
604Yes, they really are required.
605To see why the first guarantee is required, consider the following
606sequence of events:
607
608<ol>
609<li> CPU 1: <tt>rcu_read_lock()</tt>
610<li> CPU 1: <tt>q = rcu_dereference(gp);
611 /* Very likely to return p. */</tt>
612<li> CPU 0: <tt>list_del_rcu(p);</tt>
613<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
614<li> CPU 1: <tt>do_something_with(q-&gt;a);
615 /* No smp_mb(), so might happen after kfree(). */</tt>
616<li> CPU 1: <tt>rcu_read_unlock()</tt>
617<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
618<li> CPU 0: <tt>kfree(p);</tt>
619</ol>
620
621<p>
622Therefore, there absolutely must be a full memory barrier between the
623end of the RCU read-side critical section and the end of the
624grace period.
625
626<p>
627The sequence of events demonstrating the necessity of the second rule
628is roughly similar:
629
630<ol>
631<li> CPU 0: <tt>list_del_rcu(p);</tt>
632<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
633<li> CPU 1: <tt>rcu_read_lock()</tt>
634<li> CPU 1: <tt>q = rcu_dereference(gp);
635 /* Might return p if no memory barrier. */</tt>
636<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
637<li> CPU 0: <tt>kfree(p);</tt>
638<li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
639<li> CPU 1: <tt>rcu_read_unlock()</tt>
640</ol>
641
642<p>
643And similarly, without a memory barrier between the beginning of the
644grace period and the beginning of the RCU read-side critical section,
645CPU&nbsp;1 might end up accessing the freelist.
646
647<p>
648The &ldquo;as if&rdquo; rule of course applies, so that any implementation
649that acts as if the appropriate memory barriers were in place is a
650correct implementation.
651That said, it is much easier to fool yourself into believing that you have
652adhered to the as-if rule than it is to actually adhere to it!
653<p>@@QQE@@
654
655<p>
656In short, RCU's publish-subscribe guarantee is provided by the combination
657of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
658This guarantee allows data elements to be safely added to RCU-protected
659linked data structures without disrupting RCU readers.
660This guarantee can be used in combination with the grace-period
661guarantee to also allow data elements to be removed from RCU-protected
662linked data structures, again without disrupting RCU readers.
663
664<p>
665This guarantee was only partially premeditated.
666DYNIX/ptx used an explicit memory barrier for publication, but had nothing
667resembling <tt>rcu_dereference()</tt> for subscription, nor did it
668have anything resembling the <tt>smp_read_barrier_depends()</tt>
669that was later subsumed into <tt>rcu_dereference()</tt>.
670The need for these operations made itself known quite suddenly at a
671late-1990s meeting with the DEC Alpha architects, back in the days when
672DEC was still a free-standing company.
673It took the Alpha architects a good hour to convince me that any sort
674of barrier would ever be needed, and it then took me a good <i>two</i> hours
675to convince them that their documentation did not make this point clear.
676More recent work with the C and C++ standards committees has provided
677much education on tricks and traps from the compiler.
678In short, compilers were much less tricky in the early 1990s, but in
6792015, don't even think about omitting <tt>rcu_dereference()</tt>!
680
681<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
682
683<p>
684The common-case RCU primitives are unconditional.
685They are invoked, they do their job, and they return, with no possibility
686of error, and no need to retry.
687This is a key RCU design philosophy.
688
689<p>
690However, this philosophy is pragmatic rather than pigheaded.
691If someone comes up with a good justification for a particular conditional
692RCU primitive, it might well be implemented and added.
693After all, this guarantee was reverse-engineered, not premeditated.
694The unconditional nature of the RCU primitives was initially an
695accident of implementation, and later experience with synchronization
696primitives having conditional forms caused me to elevate this
697accident to a guarantee.
698Therefore, the justification for adding a conditional primitive to
699RCU would need to be based on detailed and compelling use cases.
700
701<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
702
703<p>
704As far as RCU is concerned, it is always possible to carry out an
705update within an RCU read-side critical section.
706For example, that RCU read-side critical section might search for
707a given data element, and then might acquire the update-side
708spinlock in order to update that element, all while remaining
709in that RCU read-side critical section.
710Of course, it is necessary to exit the RCU read-side critical section
711before invoking <tt>synchronize_rcu()</tt>; however, this
712inconvenience can be avoided through use of the
713<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
714described later in this document.
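<p>
A minimal sketch of such an upgrade, assuming (purely for illustration)
that each element carries its own <tt>-&gt;lock</tt> and a
<tt>-&gt;dead</tt> flag that is set by whatever code removes the element:

<blockquote>
<pre>
rcu_read_lock();
p = rcu_dereference(gp);
if (p) {
        spin_lock(&amp;p-&gt;lock);   /* Upgrade to writer... */
        if (!p-&gt;dead)           /* ...but check for concurrent removal. */
                p-&gt;a++;
        spin_unlock(&amp;p-&gt;lock);
}
rcu_read_unlock();
</pre>
</blockquote>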
715
716<p>@@QQ@@
717But how does the upgrade-to-write operation exclude other readers?
718<p>@@QQA@@
719It doesn't, just like normal RCU updates, which also do not exclude
720RCU readers.
721<p>@@QQE@@
722
723<p>
724This guarantee allows lookup code to be shared between read-side
725and update-side code, and was premeditated, appearing in the earliest
726DYNIX/ptx RCU documentation.
727
728<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
729
730<p>
731RCU provides extremely lightweight readers, and its read-side guarantees,
732though quite useful, are correspondingly lightweight.
733It is therefore all too easy to assume that RCU is guaranteeing more
734than it really is.
735Of course, the list of things that RCU does not guarantee is infinitely
736long; however, the following sections list a few non-guarantees that
737have caused confusion.
738Except where otherwise noted, these non-guarantees were premeditated.
739
740<ol>
741<li> <a href="#Readers Impose Minimal Ordering">
742 Readers Impose Minimal Ordering</a>
743<li> <a href="#Readers Do Not Exclude Updaters">
744 Readers Do Not Exclude Updaters</a>
745<li> <a href="#Updaters Only Wait For Old Readers">
746 Updaters Only Wait For Old Readers</a>
747<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
748 Grace Periods Don't Partition Read-Side Critical Sections</a>
749<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
750 Read-Side Critical Sections Don't Partition Grace Periods</a>
751<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
752 Disabling Preemption Does Not Block Grace Periods</a>
753</ol>
754
755<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
756
757<p>
758Reader-side markers such as <tt>rcu_read_lock()</tt> and
759<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
760except through their interaction with the grace-period APIs such as
761<tt>synchronize_rcu()</tt>.
762To see this, consider the following pair of threads:
763
764<blockquote>
765<pre>
766 1 void thread0(void)
767 2 {
768 3 rcu_read_lock();
769 4 WRITE_ONCE(x, 1);
770 5 rcu_read_unlock();
771 6 rcu_read_lock();
772 7 WRITE_ONCE(y, 1);
773 8 rcu_read_unlock();
774 9 }
77510
77611 void thread1(void)
77712 {
77813 rcu_read_lock();
77914 r1 = READ_ONCE(y);
78015 rcu_read_unlock();
78116 rcu_read_lock();
78217 r2 = READ_ONCE(x);
78318 rcu_read_unlock();
78419 }
785</pre>
786</blockquote>
787
788<p>
789After <tt>thread0()</tt> and <tt>thread1()</tt> execute
790concurrently, it is quite possible to have
791
792<blockquote>
793<pre>
794(r1 == 1 &amp;&amp; r2 == 0)
795</pre>
796</blockquote>
797
798(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
799which would not be possible if <tt>rcu_read_lock()</tt> and
800<tt>rcu_read_unlock()</tt> had much in the way of ordering
801properties.
802But they do not, so the CPU is within its rights
803to do significant reordering.
804This is by design: Any significant ordering constraints would slow down
805these fast-path APIs.
806
807<p>@@QQ@@
808Can't the compiler also reorder this code?
809<p>@@QQA@@
810No, the volatile casts in <tt>READ_ONCE()</tt> and
811<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
812this particular case.
813<p>@@QQE@@
814
815<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
816
817<p>
818Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
819exclude updates.
820All they do is prevent grace periods from ending.
821The following example illustrates this:
822
823<blockquote>
824<pre>
825 1 void thread0(void)
826 2 {
827 3 rcu_read_lock();
828 4 r1 = READ_ONCE(y);
829 5 if (r1) {
830 6 do_something_with_nonzero_x();
831 7 r2 = READ_ONCE(x);
832 8 WARN_ON(!r2); /* BUG!!! */
833 9 }
83410 rcu_read_unlock();
83511 }
83612
83713 void thread1(void)
83814 {
83915 spin_lock(&amp;my_lock);
84016 WRITE_ONCE(x, 1);
84117 WRITE_ONCE(y, 1);
84218 spin_unlock(&amp;my_lock);
84319 }
844</pre>
845</blockquote>
846
847<p>
848If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
849excluded the <tt>thread1()</tt> function's update,
850the <tt>WARN_ON()</tt> could never fire.
851But the fact is that <tt>rcu_read_lock()</tt> does not exclude
852much of anything aside from subsequent grace periods, of which
853<tt>thread1()</tt> has none, so the
854<tt>WARN_ON()</tt> can and does fire.
855
856<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
857
858<p>
859It might be tempting to assume that after <tt>synchronize_rcu()</tt>
860completes, there are no readers executing.
861This temptation must be avoided because
862new readers can start immediately after <tt>synchronize_rcu()</tt>
863starts, and <tt>synchronize_rcu()</tt> is under no
864obligation to wait for these new readers.
865
866<p>@@QQ@@
867Suppose that synchronize_rcu() did wait until all readers had completed.
868Would the updater be able to rely on this?
869<p>@@QQA@@
870No.
871Even if <tt>synchronize_rcu()</tt> were to wait until
872all readers had completed, a new reader might start immediately after
873<tt>synchronize_rcu()</tt> completed.
874Therefore, the code following
875<tt>synchronize_rcu()</tt> cannot rely on there being no readers
876in any case.
877<p>@@QQE@@
878
879<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
880Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
881
882<p>
883It is tempting to assume that if any part of one RCU read-side critical
884section precedes a given grace period, and if any part of another RCU
885read-side critical section follows that same grace period, then all of
886the first RCU read-side critical section must precede all of the second.
887However, this just isn't the case: A single grace period does not
888partition the set of RCU read-side critical sections.
889An example of this situation can be illustrated as follows, where
890<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
891
892<blockquote>
893<pre>
894 1 void thread0(void)
895 2 {
896 3 rcu_read_lock();
897 4 WRITE_ONCE(a, 1);
898 5 WRITE_ONCE(b, 1);
899 6 rcu_read_unlock();
900 7 }
901 8
902 9 void thread1(void)
90310 {
90411 r1 = READ_ONCE(a);
90512 synchronize_rcu();
90613 WRITE_ONCE(c, 1);
90714 }
90815
90916 void thread2(void)
91017 {
91118 rcu_read_lock();
91219 r2 = READ_ONCE(b);
91320 r3 = READ_ONCE(c);
91421 rcu_read_unlock();
91522 }
916</pre>
917</blockquote>
918
919<p>
920It turns out that the outcome:
921
922<blockquote>
923<pre>
924(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
925</pre>
926</blockquote>
927
928is entirely possible.
929The following figure shows how this can happen, with each circled
930<tt>QS</tt> indicating the point at which RCU recorded a
931<i>quiescent state</i> for each thread, that is, a state in which
932RCU knows that the thread cannot be in the midst of an RCU read-side
933critical section that started before the current grace period:
934
935<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
936
937<p>
938If it is necessary to partition RCU read-side critical sections in this
939manner, it is necessary to use two grace periods, where the first
940grace period is known to end before the second grace period starts:
941
942<blockquote>
943<pre>
944 1 void thread0(void)
945 2 {
946 3 rcu_read_lock();
947 4 WRITE_ONCE(a, 1);
948 5 WRITE_ONCE(b, 1);
949 6 rcu_read_unlock();
950 7 }
951 8
952 9 void thread1(void)
95310 {
95411 r1 = READ_ONCE(a);
95512 synchronize_rcu();
95613 WRITE_ONCE(c, 1);
95714 }
95815
95916 void thread2(void)
96017 {
96118 r2 = READ_ONCE(c);
96219 synchronize_rcu();
96320 WRITE_ONCE(d, 1);
96421 }
96522
96623 void thread3(void)
96724 {
96825 rcu_read_lock();
96926 r3 = READ_ONCE(b);
97027 r4 = READ_ONCE(d);
97128 rcu_read_unlock();
97229 }
973</pre>
974</blockquote>
975
976<p>
977Here, if <tt>(r1 == 1)</tt>, then
978<tt>thread0()</tt>'s write to <tt>b</tt> must happen
979before the end of <tt>thread1()</tt>'s grace period.
980If in addition <tt>(r4 == 1)</tt>, then
981<tt>thread3()</tt>'s read from <tt>b</tt> must happen
982after the beginning of <tt>thread2()</tt>'s grace period.
983If it is also the case that <tt>(r2 == 1)</tt>, then the
984end of <tt>thread1()</tt>'s grace period must precede the
985beginning of <tt>thread2()</tt>'s grace period.
986This means that the two RCU read-side critical sections cannot overlap,
987guaranteeing that <tt>(r3 == 1)</tt>.
988As a result, the outcome:
989
990<blockquote>
991<pre>
992(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
993</pre>
994</blockquote>
995
996cannot happen.
997
998<p>
999This non-requirement was also non-premeditated, but became apparent
1000when studying RCU's interaction with memory ordering.
1001
1002<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
1003Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
1004
1005<p>
1006It is also tempting to assume that if an RCU read-side critical section
1007happens between a pair of grace periods, then those grace periods cannot
1008overlap.
1009However, this temptation leads nowhere good, as can be illustrated by
1010the following, with all variables initially zero:
1011
1012<blockquote>
1013<pre>
1014 1 void thread0(void)
1015 2 {
1016 3 rcu_read_lock();
1017 4 WRITE_ONCE(a, 1);
1018 5 WRITE_ONCE(b, 1);
1019 6 rcu_read_unlock();
1020 7 }
1021 8
1022 9 void thread1(void)
102310 {
102411 r1 = READ_ONCE(a);
102512 synchronize_rcu();
102613 WRITE_ONCE(c, 1);
102714 }
102815
102916 void thread2(void)
103017 {
103118 rcu_read_lock();
103219 WRITE_ONCE(d, 1);
103320 r2 = READ_ONCE(c);
103421 rcu_read_unlock();
103522 }
103623
103724 void thread3(void)
103825 {
103926 r3 = READ_ONCE(d);
104027 synchronize_rcu();
104128 WRITE_ONCE(e, 1);
104229 }
104330
104431 void thread4(void)
104532 {
104633 rcu_read_lock();
104734 r4 = READ_ONCE(b);
104835 r5 = READ_ONCE(e);
104936 rcu_read_unlock();
105037 }
1051</pre>
1052</blockquote>
1053
1054<p>
1055In this case, the outcome:
1056
1057<blockquote>
1058<pre>
1059(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
1060</pre>
1061</blockquote>
1062
1063is entirely possible, as illustrated below:
1064
1065<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
1066
1067<p>
1068Again, an RCU read-side critical section can overlap almost all of a
1069given grace period, just so long as it does not overlap the entire
1070grace period.
1071As a result, an RCU read-side critical section cannot partition a pair
1072of RCU grace periods.
1073
1074<p>@@QQ@@
1075How long a sequence of grace periods, each separated by an RCU read-side
1076critical section, would be required to partition the RCU read-side
1077critical sections at the beginning and end of the chain?
1078<p>@@QQA@@
1079In theory, an infinite number.
1080In practice, an unknown number that is sensitive to both implementation
1081details and timing considerations.
1082Therefore, even in practice, RCU users must abide by the theoretical rather
1083than the practical answer.
1084<p>@@QQE@@
1085
1086<h3><a name="Disabling Preemption Does Not Block Grace Periods">
1087Disabling Preemption Does Not Block Grace Periods</a></h3>
1088
1089<p>
1090There was a time when disabling preemption on any given CPU would block
1091subsequent grace periods.
1092However, this was an accident of implementation and is not a requirement.
1093And in the current Linux-kernel implementation, disabling preemption
1094on a given CPU in fact does not block grace periods, as Oleg Nesterov
1095<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
1096
1097<p>
1098If you need a preempt-disable region to block grace periods, you need to add
1099<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
1100as follows:
1101
1102<blockquote>
1103<pre>
1104 1 preempt_disable();
1105 2 rcu_read_lock();
1106 3 do_something();
1107 4 rcu_read_unlock();
1108 5 preempt_enable();
1109 6
1110 7 /* Spinlocks implicitly disable preemption. */
1111 8 spin_lock(&amp;mylock);
1112 9 rcu_read_lock();
111310 do_something();
111411 rcu_read_unlock();
111512 spin_unlock(&amp;mylock);
1116</pre>
1117</blockquote>
1118
1119<p>
1120In theory, you could enter the RCU read-side critical section first,
1121but it is more efficient to keep the entire RCU read-side critical
1122section contained in the preempt-disable region as shown above.
1123Of course, RCU read-side critical sections that extend outside of
1124preempt-disable regions will work correctly, but such critical sections
1125can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
1126more work.
1127And no, this is <i>not</i> an invitation to enclose all of your RCU
1128read-side critical sections within preempt-disable regions, because
1129doing so would degrade real-time response.
1130
1131<p>
1132This non-requirement appeared with preemptible RCU.
1133If you need a grace period that waits on non-preemptible code regions, use
1134<a href="#Sched Flavor">RCU-sched</a>.
1135
1136<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
1137
1138<p>
1139These parallelism facts of life are by no means specific to RCU, but
1140the RCU implementation must abide by them.
1141They therefore bear repeating:
1142
1143<ol>
1144<li> Any CPU or task may be delayed at any time,
1145 and any attempts to avoid these delays by disabling
1146 preemption, interrupts, or whatever are completely futile.
1147 This is most obvious in preemptible user-level
1148 environments and in virtualized environments (where
1149 a given guest OS's VCPUs can be preempted at any time by
1150 the underlying hypervisor), but can also happen in bare-metal
1151 environments due to ECC errors, NMIs, and other hardware
1152 events.
1153 Although a delay of more than about 20 seconds can result
1154 in splats, the RCU implementation is obligated to use
1155 algorithms that can tolerate extremely long delays, but where
1156 &ldquo;extremely long&rdquo; is not long enough to allow
1157 wrap-around when incrementing a 64-bit counter.
1158<li> Both the compiler and the CPU can reorder memory accesses.
1159 Where it matters, RCU must use compiler directives and
1160 memory-barrier instructions to preserve ordering.
1161<li> Conflicting writes to memory locations in any given cache line
1162 will result in expensive cache misses.
1163 Greater numbers of concurrent writes and more-frequent
1164 concurrent writes will result in more dramatic slowdowns.
1165 RCU is therefore obligated to use algorithms that have
1166 sufficient locality to avoid significant performance and
1167 scalability problems.
1168<li> As a rough rule of thumb, only one CPU's worth of processing
1169 may be carried out under the protection of any given exclusive
1170 lock.
1171 RCU must therefore use scalable locking designs.
1172<li> Counters are finite, especially on 32-bit systems.
1173 RCU's use of counters must therefore tolerate counter wrap,
1174 or be designed such that counter wrap would take way more
1175 time than a single system is likely to run.
1176 An uptime of ten years is quite possible, a runtime
1177 of a century much less so.
1178 As an example of the latter, RCU's dyntick-idle nesting counter
1179 allows 54 bits for interrupt nesting level (this counter
1180 is 64 bits even on a 32-bit system).
1181 Overflowing this counter requires 2<sup>54</sup>
1182 half-interrupts on a given CPU without that CPU ever going idle.
1183 If a half-interrupt happened every microsecond, it would take
1184 570 years of runtime to overflow this counter, which is currently
1185 believed to be an acceptably long time.
1186<li> Linux systems can have thousands of CPUs running a single
1187 Linux kernel in a single shared-memory environment.
1188 RCU must therefore pay close attention to high-end scalability.
1189</ol>
1190
1191<p>
1192This last parallelism fact of life means that RCU must pay special
1193attention to the preceding facts of life.
1194The idea that Linux might scale to systems with thousands of CPUs would
1195have been met with some skepticism in the 1990s, but these requirements
1196would otherwise have been unsurprising, even in the early 1990s.
1197
1198<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
1199
1200<p>
1201These sections list quality-of-implementation requirements.
1202Although an RCU implementation that ignores these requirements could
1203still be used, it would likely be subject to limitations that would
1204make it inappropriate for industrial-strength production use.
1205Classes of quality-of-implementation requirements are as follows:
1206
1207<ol>
1208<li> <a href="#Specialization">Specialization</a>
1209<li> <a href="#Performance and Scalability">Performance and Scalability</a>
1210<li> <a href="#Composability">Composability</a>
1211<li> <a href="#Corner Cases">Corner Cases</a>
1212</ol>
1213
1214<p>
1215These classes are covered in the following sections.
1216
1217<h3><a name="Specialization">Specialization</a></h3>
1218
1219<p>
1220RCU is and always has been intended primarily for read-mostly situations, as
1221illustrated by the following figure.
1222This means that RCU's read-side primitives are optimized, often at the
1223expense of its update-side primitives.
1224
1225<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
1226
1227<p>
1228This focus on read-mostly situations means that RCU must interoperate
1229with other synchronization primitives.
1230For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
1231examples discussed earlier use RCU to protect readers and locking to
1232coordinate updaters.
1233However, the need extends much farther, requiring that a variety of
1234synchronization primitives be legal within RCU read-side critical sections,
1235including spinlocks, sequence locks, atomic operations, reference
1236counters, and memory barriers.
1237
1238<p>@@QQ@@
1239What about sleeping locks?
1240<p>@@QQA@@
1241These are forbidden within Linux-kernel RCU read-side critical sections
1242because it is not legal to place a quiescent state (in this case,
1243voluntary context switch) within an RCU read-side critical section.
1244However, sleeping locks may be used within userspace RCU read-side critical
1245sections, and also within Linux-kernel sleepable RCU
1246<a href="#Sleepable RCU">(SRCU)</a>
1247read-side critical sections.
1248In addition, the -rt patchset turns spinlocks into sleeping locks so
1249that the corresponding critical sections can be preempted, which
1250also means that these sleeplockified spinlocks (but not other sleeping locks!)
1251may be acquired within -rt-Linux-kernel RCU read-side critical sections.
1252
1253<p>
1254Note that it <i>is</i> legal for a normal RCU read-side critical section
1255to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
1256but only as long as it does not loop indefinitely attempting to
1257conditionally acquire that sleeping lock.
1258The key point is that things like <tt>mutex_trylock()</tt>
1259either return with the mutex held, or return an error indication if
1260the mutex was not immediately available.
1261Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
1262<p>@@QQE@@
1263
1264<p>
1265It often comes as a surprise that many algorithms do not require a
1266consistent view of data, but can instead operate on inconsistent data,
1267with network routing being the poster child.
1268Internet routing algorithms take significant time to propagate
1269updates, so that by the time an update arrives at a given system,
1270that system has been sending network traffic the wrong way for
1271a considerable length of time.
1272Having a few threads continue to send traffic the wrong way for a
1273few more milliseconds is clearly not a problem: In the worst case,
1274TCP retransmissions will eventually get the data where it needs to go.
1275In general, when tracking the state of the universe outside of the
1276computer, some level of inconsistency must be tolerated due to
1277speed-of-light delays if nothing else.
1278
1279<p>
1280Furthermore, uncertainty about external state is inherent in many cases.
1281For example, a pair of veterinarians might use heartbeat to determine
1282whether or not a given cat was alive.
1283But how long should they wait after the last heartbeat to decide that
1284the cat is in fact dead?
1285Waiting less than 400 milliseconds makes no sense because this would
1286mean that a relaxed cat would be considered to cycle between death
1287and life more than 100 times per minute.
1288Moreover, just as with human beings, a cat's heart might stop for
1289some period of time, so the exact wait period is a judgment call.
1290One of our pair of veterinarians might wait 30 seconds before pronouncing
1291the cat dead, while the other might insist on waiting a full minute.
1292The two veterinarians would then disagree on the state of the cat during
1293the final 30 seconds of the minute following the last heartbeat, as
1294fancifully illustrated below:
1295
1296<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
1297
1298<p>
1299Interestingly enough, this same situation applies to hardware.
1300When push comes to shove, how do we tell whether or not some
1301external server has failed?
1302We send messages to it periodically, and declare it failed if we
1303don't receive a response within a given period of time.
1304Policy decisions can usually tolerate short
1305periods of inconsistency.
1306The policy was decided some time ago, and is only now being put into
1307effect, so a few milliseconds of delay is normally inconsequential.
1308
1309<p>
1310However, there are algorithms that absolutely must see consistent data.
1311For example, the translation between a user-level SystemV semaphore
1312ID to the corresponding in-kernel data structure is protected by RCU,
1313but it is absolutely forbidden to update a semaphore that has just been
1314removed.
1315In the Linux kernel, this need for consistency is accommodated by acquiring
1316spinlocks located in the in-kernel data structure from within
1317the RCU read-side critical section, and this is indicated by the
1318green box in the figure above.
1319Many other techniques may be used, and are in fact used within the
1320Linux kernel.
1321
1322<p>
1323In short, RCU is not required to maintain consistency, and other
1324mechanisms may be used in concert with RCU when consistency is required.
1325RCU's specialization allows it to do its job extremely well, and its
1326ability to interoperate with other synchronization mechanisms allows
1327the right mix of synchronization tools to be used for a given job.
1328
1329<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
1330
1331<p>
1332Energy efficiency is a critical component of performance today,
1333and Linux-kernel RCU implementations must therefore avoid unnecessarily
1334awakening idle CPUs.
1335I cannot claim that this requirement was premeditated.
1336In fact, I learned of it during a telephone conversation in which I
1337was given &ldquo;frank and open&rdquo; feedback on the importance
1338of energy efficiency in battery-powered systems and on specific
1339energy-efficiency shortcomings of the Linux-kernel RCU implementation.
1340In my experience, the battery-powered embedded community will consider
1341any unnecessary wakeups to be extremely unfriendly acts.
1342So much so that mere Linux-kernel-mailing-list posts are
1343insufficient to vent their ire.
1344
1345<p>
1346Memory consumption is not particularly important in most
1347situations, and has become decreasingly
1348so as memory sizes have expanded and memory
1349costs have plummeted.
1350However, as I learned from Matt Mackall's
1351<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
1352efforts, memory footprint is critically important on single-CPU systems with
1353non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
1354<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
1355was born.
1356Josh Triplett has since taken over the small-memory banner with his
1357<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
1358project, which resulted in
1359<a href="#Sleepable RCU">SRCU</a>
1360becoming optional for those kernels not needing it.
1361
1362<p>
1363The remaining performance requirements are, for the most part,
1364unsurprising.
1365For example, in keeping with RCU's read-side specialization,
1366<tt>rcu_dereference()</tt> should have negligible overhead (for
1367example, suppression of a few minor compiler optimizations).
1368Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
1369<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
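<p>
As a rough illustration of the zero-overhead point, and ignoring lockdep
and other debugging instrumentation, a non-preemptible build can reduce
these markers to nothing more than compiler barriers, conceptually:

<blockquote>
<pre>
/* Conceptual sketch only, not the actual kernel definitions. */
static inline void sketch_rcu_read_lock(void)
{
        barrier();  /* Compiler barrier: no instructions emitted. */
}

static inline void sketch_rcu_read_unlock(void)
{
        barrier();
}
</pre>
</blockquote>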
1370
1371<p>
1372In preemptible environments, in the case where the RCU read-side
1373critical section was not preempted (as will be the case for the
1374highest-priority real-time process), <tt>rcu_read_lock()</tt> and
1375<tt>rcu_read_unlock()</tt> should have minimal overhead.
1376In particular, they should not contain atomic read-modify-write
1377operations, memory-barrier instructions, preemption disabling,
1378interrupt disabling, or backwards branches.
1379However, in the case where the RCU read-side critical section was preempted,
1380<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
1381This is why it is better to nest an RCU read-side critical section
1382within a preempt-disable region than vice versa, at least in cases
1383where that critical section is short enough to avoid unduly degrading
1384real-time latencies.
1385
1386<p>
1387The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
1388optimized for throughput.
1389It may therefore incur several milliseconds of latency in addition to
1390the duration of the longest RCU read-side critical section.
1391On the other hand, multiple concurrent invocations of
1392<tt>synchronize_rcu()</tt> are required to use batching optimizations
1393so that they can be satisfied by a single underlying grace-period-wait
1394operation.
1395For example, in the Linux kernel, it is not unusual for a single
1396grace-period-wait operation to serve more than
1397<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
1398of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
1399overhead down to nearly zero.
1400However, the grace-period optimization is also required to avoid
1401measurable degradation of real-time scheduling and interrupt latencies.
1402
1403<p>
1404In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
1405latencies are unacceptable.
1406In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
1407instead, reducing the grace-period latency down to a few tens of
1408microseconds on small systems, at least in cases where the RCU read-side
1409critical sections are short.
1410There are currently no special latency requirements for
1411<tt>synchronize_rcu_expedited()</tt> on large systems, but,
1412consistent with the empirical nature of the RCU specification,
1413that is subject to change.
1414However, there most definitely are scalability requirements:
1415A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
1416CPUs should at least make reasonable forward progress.
1417In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
1418is permitted to impose modest degradation of real-time latency
1419on non-idle online CPUs.
1420That said, it will likely be necessary to take further steps to reduce this
1421degradation, hopefully to roughly that of a scheduling-clock interrupt.
1422
1423<p>
1424There are a number of situations where even
1425<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
1426latency is unacceptable.
1427In these situations, the asynchronous <tt>call_rcu()</tt> can be
1428used in place of <tt>synchronize_rcu()</tt> as follows:
1429
1430<blockquote>
1431<pre>
1432 1 struct foo {
1433 2 int a;
1434 3 int b;
1435 4 struct rcu_head rh;
1436 5 };
1437 6
1438 7 static void remove_gp_cb(struct rcu_head *rhp)
1439 8 {
1440 9 struct foo *p = container_of(rhp, struct foo, rh);
144110
144211 kfree(p);
144312 }
144413
144514 bool remove_gp_asynchronous(void)
144615 {
144716 struct foo *p;
144817
144918 spin_lock(&amp;gp_lock);
145019 p = rcu_dereference(gp);
145120 if (!p) {
145221 spin_unlock(&amp;gp_lock);
145322 return false;
145423 }
145524 rcu_assign_pointer(gp, NULL);
145625 call_rcu(&amp;p-&gt;rh, remove_gp_cb);
145726 spin_unlock(&amp;gp_lock);
145827 return true;
145928 }
1460</pre>
1461</blockquote>
1462
1463<p>
1464A definition of <tt>struct foo</tt> is finally needed, and appears
1465on lines&nbsp;1-5.
1466The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
1467on line&nbsp;25, and will be invoked after the end of a subsequent
1468grace period.
1469This gets the same effect as <tt>remove_gp_synchronous()</tt>,
1470but without forcing the updater to wait for a grace period to elapse.
1471The <tt>call_rcu()</tt> function may be used in a number of
1472situations where neither <tt>synchronize_rcu()</tt> nor
1473<tt>synchronize_rcu_expedited()</tt> would be legal,
1474including within preempt-disable code, <tt>local_bh_disable()</tt> code,
1475interrupt-disable code, and interrupt handlers.
1476However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
1477The callback function (<tt>remove_gp_cb()</tt> in this case) will be
1478executed within a softirq (software interrupt) environment within the
1479Linux kernel,
1480either within a real softirq handler or under the protection
1481of <tt>local_bh_disable()</tt>.
1482In both the Linux kernel and in userspace, it is bad practice to
1483write an RCU callback function that takes too long.
1484Long-running operations should be relegated to separate threads or
1485(in the Linux kernel) workqueues.
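
<p>
For example, the following sketch (a variant of the earlier
<tt>struct foo</tt> with an added <tt>work_struct</tt>, and a hypothetical
<tt>do_expensive_cleanup()</tt> function) keeps the RCU callback short by
handing the long-running work off to a workqueue:

<blockquote>
<pre>
struct foo {
  int a;
  struct work_struct work;
  struct rcu_head rh;
};

static void foo_work_func(struct work_struct *wp)
{
  struct foo *p = container_of(wp, struct foo, work);

  do_expensive_cleanup(p); /* Long-running, but now in process context. */
  kfree(p);
}

static void foo_rcu_cb(struct rcu_head *rhp)
{
  struct foo *p = container_of(rhp, struct foo, rh);

  INIT_WORK(&amp;p-&gt;work, foo_work_func);
  schedule_work(&amp;p-&gt;work); /* The softirq-context callback stays short. */
}
</pre>
</blockquote>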
1486
1487<p>@@QQ@@
1488Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
1489After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
1490structure, which would interact badly with concurrent insertions.
1491Doesn't this mean that <tt>rcu_dereference()</tt> is required?
1492<p>@@QQA@@
1493Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
1494any changes, including any insertions that <tt>rcu_dereference()</tt>
1495would protect against.
1496Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
1497is released on line&nbsp;26, which in turn means that
1498<tt>rcu_access_pointer()</tt> suffices.
1499<p>@@QQE@@
1500
1501<p>
1502However, all that <tt>remove_gp_cb()</tt> is doing is
1503invoking <tt>kfree()</tt> on the data element.
1504This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
1505which allows &ldquo;fire and forget&rdquo; operation as shown below:
1506
1507<blockquote>
1508<pre>
1509 1 struct foo {
1510 2 int a;
1511 3 int b;
1512 4 struct rcu_head rh;
1513 5 };
1514 6
1515 7 bool remove_gp_faf(void)
1516 8 {
1517 9 struct foo *p;
151810
151911 spin_lock(&amp;gp_lock);
152012 p = rcu_dereference(gp);
152113 if (!p) {
152214 spin_unlock(&amp;gp_lock);
152315 return false;
152416 }
152517 rcu_assign_pointer(gp, NULL);
152618 kfree_rcu(p, rh);
152719 spin_unlock(&amp;gp_lock);
152820 return true;
152921 }
1530</pre>
1531</blockquote>
1532
1533<p>
1534Note that <tt>remove_gp_faf()</tt> simply invokes
1535<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
1536further attention to the subsequent grace period and <tt>kfree()</tt>.
1537It is permissible to invoke <tt>kfree_rcu()</tt> from the same
1538environments as for <tt>call_rcu()</tt>.
1539Interestingly enough, DYNIX/ptx had the equivalents of
1540<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
1541<tt>synchronize_rcu()</tt>.
1542This was due to the fact that RCU was not heavily used within DYNIX/ptx,
1543so the very few places that needed something like
1544<tt>synchronize_rcu()</tt> simply open-coded it.
1545
1546<p>@@QQ@@
1547Earlier it was claimed that <tt>call_rcu()</tt> and
1548<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
1549by readers.
1550But how can that be correct, given that the invocation of the callback
1551and the freeing of the memory (respectively) must still wait for
1552a grace period to elapse?
1553<p>@@QQA@@
1554We could define things this way, but keep in mind that this sort of
1555definition would say that updates in garbage-collected languages
1556cannot complete until the next time the garbage collector runs,
1557which does not seem at all reasonable.
1558The key point is that in most cases, an updater using either
1559<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
1560next update as soon as it has invoked <tt>call_rcu()</tt> or
1561<tt>kfree_rcu()</tt>, without having to wait for a subsequent
1562grace period.
1563<p>@@QQE@@
1564
1565<p>
1566But what if the updater must wait for the completion of code to be
1567executed after the end of the grace period, but has other tasks
1568that can be carried out in the meantime?
1569The polling-style <tt>get_state_synchronize_rcu()</tt> and
1570<tt>cond_synchronize_rcu()</tt> functions may be used for this
1571purpose, as shown below:
1572
1573<blockquote>
1574<pre>
1575 1 bool remove_gp_poll(void)
1576 2 {
1577 3 struct foo *p;
1578 4 unsigned long s;
1579 5
1580 6 spin_lock(&amp;gp_lock);
1581 7 p = rcu_access_pointer(gp);
1582 8 if (!p) {
1583 9 spin_unlock(&amp;gp_lock);
158410 return false;
158511 }
158612 rcu_assign_pointer(gp, NULL);
158713 spin_unlock(&amp;gp_lock);
158814 s = get_state_synchronize_rcu();
158915 do_something_while_waiting();
159016 cond_synchronize_rcu(s);
159117 kfree(p);
159218 return true;
159319 }
1594</pre>
1595</blockquote>
1596
1597<p>
1598On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
1599&ldquo;cookie&rdquo; from RCU,
1600then line&nbsp;15 carries out other tasks,
1601and finally, line&nbsp;16 returns immediately if a grace period has
1602elapsed in the meantime, but otherwise waits as required.
1603The need for <tt>get_state_synchronize_rcu()</tt> and
1604<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
1605so it is too early to tell whether they will stand the test of time.
1606
1607<p>
1608RCU thus provides a range of tools to allow updaters to strike the
1609required tradeoff between latency, flexibility and CPU overhead.
1610
1611<h3><a name="Composability">Composability</a></h3>
1612
1613<p>
1614Composability has received much attention in recent years, perhaps in part
1615due to the collision of multicore hardware with object-oriented techniques
1616designed in single-threaded environments for single-threaded use.
1617And in theory, RCU read-side critical sections may be composed, and in
1618fact may be nested arbitrarily deeply.
1619In practice, as with all real-world implementations of composable
1620constructs, there are limitations.
1621
1622<p>
1623Implementations of RCU for which <tt>rcu_read_lock()</tt>
1624and <tt>rcu_read_unlock()</tt> generate no code, such as
1625Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
1626nested arbitrarily deeply.
1627After all, there is no overhead.
1628Except that if all these instances of <tt>rcu_read_lock()</tt>
1629and <tt>rcu_read_unlock()</tt> are visible to the compiler,
1630compilation will eventually fail due to exhausting memory,
1631mass storage, or user patience, whichever comes first.
1632If the nesting is not visible to the compiler, as is the case with
1633mutually recursive functions each in its own translation unit,
1634stack overflow will result.
1635If the nesting takes the form of loops, either the control variable
1636will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
1637Nevertheless, this class of RCU implementations is one
1638of the most composable constructs in existence.
1639
1640<p>
1641RCU implementations that explicitly track nesting depth
1642are limited by the nesting-depth counter.
1643For example, the Linux kernel's preemptible RCU limits nesting to
1644<tt>INT_MAX</tt>.
1645This should suffice for almost all practical purposes.
1646That said, a consecutive pair of RCU read-side critical sections
1647between which there is an operation that waits for a grace period
1648cannot be enclosed in another RCU read-side critical section.
1649This is because it is not legal to wait for a grace period within
1650an RCU read-side critical section: To do so would result either
1651in deadlock or
1652in RCU implicitly splitting the enclosing RCU read-side critical
1653section, neither of which is conducive to a long-lived and prosperous
1654kernel.
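
<p>
As a concrete illustration, consider the following sketch (with
hypothetical <tt>do_something_1()</tt> and <tt>do_something_2()</tt>
functions).
Each inner critical section is legal, as is an intervening
<tt>synchronize_rcu()</tt> executed outside of any reader, but enclosing
the whole sequence in an outer RCU read-side critical section produces
exactly the forbidden grace-period wait within a reader:

<blockquote>
<pre>
rcu_read_lock();     /* Enclosing RCU read-side critical section. */
rcu_read_lock();
do_something_1();
rcu_read_unlock();
synchronize_rcu();   /* BUG: waits for a grace period within a reader. */
rcu_read_lock();
do_something_2();
rcu_read_unlock();
rcu_read_unlock();
</pre>
</blockquote>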
1655
1656<p>
1657In short, although RCU read-side critical sections are highly composable,
1658care is required in some situations, just as is the case for any other
1659composable synchronization mechanism.
1660
1661<h3><a name="Corner Cases">Corner Cases</a></h3>
1662
1663<p>
1664A given RCU workload might have an endless and intense stream of
1665RCU read-side critical sections, perhaps even so intense that there
1666was never a point in time during which there was not at least one
1667RCU read-side critical section in flight.
1668RCU cannot allow this situation to block grace periods: As long as
1669all the RCU read-side critical sections are finite, grace periods
1670must also be finite.
1671
1672<p>
1673That said, preemptible RCU implementations could potentially result
1674in RCU read-side critical sections being preempted for long durations,
1675which has the effect of creating a long-duration RCU read-side
1676critical section.
1677This situation can arise only in heavily loaded systems, but systems using
1678real-time priorities are of course more vulnerable.
1679Therefore, RCU priority boosting is provided to help deal with this
1680case.
1681That said, the exact requirements on RCU priority boosting will likely
1682evolve as more experience accumulates.
1683
1684<p>
1685Other workloads might have very high update rates.
1686Although one can argue that such workloads should instead use
1687something other than RCU, the fact remains that RCU must
1688handle such workloads gracefully.
1689This requirement is another factor driving batching of grace periods,
1690but it is also the driving force behind the checks for large numbers
1691of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
1692Finally, high update rates should not delay RCU read-side critical
1693sections, although some read-side delays can occur when using
1694<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
1695of <tt>try_stop_cpus()</tt>.
1696(In the future, <tt>synchronize_rcu_expedited()</tt> will be
1697converted to use lighter-weight inter-processor interrupts (IPIs),
1698but this will still disturb readers, though to a much smaller degree.)
1699
1700<p>
1701Although all three of these corner cases were understood in the early
17021990s, a simple user-level test consisting of <tt>close(open(path))</tt>
1703in a tight loop
1704in the early 2000s suddenly provided a much deeper appreciation of the
1705high-update-rate corner case.
1706This test also motivated addition of some RCU code to react to high update
1707rates; for example, if a given CPU finds itself with more than 10,000
1708RCU callbacks queued, it will cause RCU to take evasive action by
1709more aggressively starting grace periods and more aggressively forcing
1710completion of grace-period processing.
1711This evasive action causes the grace period to complete more quickly,
1712but at the cost of restricting RCU's batching optimizations, thus
1713increasing the CPU overhead incurred by that grace period.
1714
1715<h2><a name="Software-Engineering Requirements">
1716Software-Engineering Requirements</a></h2>
1717
1718<p>
1719Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
1720guard against mishaps and misuse:
1721
1722<ol>
1723<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
1724 everywhere that it is needed, so kernels built with
1725 <tt>CONFIG_PROVE_RCU=y</tt> will splat if
1726 <tt>rcu_dereference()</tt> is used outside of an
1727 RCU read-side critical section.
1728 Update-side code can use <tt>rcu_dereference_protected()</tt>,
1729 which takes a
1730 <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
1731 to indicate what is providing the protection.
1732 If the indicated protection is not provided, a lockdep splat
1733 is emitted.
1734
1735 <p>
1736 Code shared between readers and updaters can use
1737 <tt>rcu_dereference_check()</tt>, which also takes a
1738 lockdep expression, and emits a lockdep splat if neither
1739 <tt>rcu_read_lock()</tt> nor the indicated protection
1740 is in place.
1741 In addition, <tt>rcu_dereference_raw()</tt> is used in those
1742 (hopefully rare) cases where the required protection cannot
1743 be easily described.
1744 Finally, <tt>rcu_read_lock_held()</tt> is provided to
1745 allow a function to verify that it has been invoked within
1746 an RCU read-side critical section.
1747 I was made aware of this set of requirements shortly after Thomas
1748 Gleixner audited a number of RCU uses.
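 (A sketch combining several of these lockdep-based checks appears
 after this list.)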
1749<li> A given function might wish to check for RCU-related preconditions
1750 upon entry, before using any other RCU API.
1751 The <tt>rcu_lockdep_assert()</tt> macro does this job,
1752 asserting the expression in kernels having lockdep enabled
1753 and doing nothing otherwise.
1754<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
1755 and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
1756 substituting a simple assignment.
1757 To catch this sort of error, a given RCU-protected pointer may be
1758 tagged with <tt>__rcu</tt>, after which running sparse
1759 with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
1760 about simple-assignment accesses to that pointer.
1761 Arnd Bergmann made me aware of this requirement, and also
1762 supplied the needed
1763 <a href="https://lwn.net/Articles/376011/">patch series</a>.
1764<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
1765 will splat if a data element is passed to <tt>call_rcu()</tt>
1766 twice in a row, without a grace period in between.
1767 (This error is similar to a double free.)
1768 The corresponding <tt>rcu_head</tt> structures that are
1769 dynamically allocated are automatically tracked, but
1770 <tt>rcu_head</tt> structures allocated on the stack
1771 must be initialized with <tt>init_rcu_head_on_stack()</tt>
1772 and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
1773 Similarly, statically allocated non-stack <tt>rcu_head</tt>
1774 structures must be initialized with <tt>init_rcu_head()</tt>
1775 and cleaned up with <tt>destroy_rcu_head()</tt>.
1776 Mathieu Desnoyers made me aware of this requirement, and also
1777 supplied the needed
1778 <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
1779<li> An infinite loop in an RCU read-side critical section will
1780 eventually trigger an RCU CPU stall warning splat.
1781 However, RCU is not obligated to produce this splat
1782 unless there is a grace period waiting on that particular
1783 RCU read-side critical section.
1784 This requirement made itself known in the early 1990s, pretty
1785 much the first time that it was necessary to debug a CPU stall.
1786<li> Although it would be very good to detect pointers leaking out
1787 of RCU read-side critical sections, there is currently no
1788 good way of doing this.
1789 One complication is the need to distinguish between pointers
1790 leaking and pointers that have been handed off from RCU to
1791 some other synchronization mechanism, for example, reference
1792 counting.
1793<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
1794 information is provided via both debugfs and event tracing.
1795<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
1796 <tt>rcu_dereference()</tt> to create typical linked
1797 data structures can be surprisingly error-prone.
1798 Therefore, RCU-protected
1799 <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
1800 and, more recently, RCU-protected
1801 <a href="https://lwn.net/Articles/612100/">hash tables</a>
1802 are available.
1803 Many other special-purpose RCU-protected data structures are
1804 available in the Linux kernel and the userspace RCU library.
1805<li> Some linked structures are created at compile time, but still
1806 require <tt>__rcu</tt> checking.
1807 The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
1808 purpose.
1809<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
1810 when creating linked structures that are to be published via
1811 a single external pointer.
1812 The <tt>RCU_INIT_POINTER()</tt> macro is provided for
1813 this task and also for assigning <tt>NULL</tt> pointers
1814 at runtime.
1815</ol>
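
<p>
As promised above, the following sketch shows how several of these
checks might fit together, reusing the <tt>gp</tt> pointer and
<tt>gp_lock</tt> from the earlier examples and showing the
<tt>__rcu</tt> tag explicitly:

<blockquote>
<pre>
struct foo __rcu *gp; /* Checked by sparse with CONFIG_SPARSE_RCU_POINTER=y. */

/* Update-side access: protection comes from gp_lock, not rcu_read_lock(). */
static struct foo *get_gp_update_side(void)
{
  return rcu_dereference_protected(gp, lockdep_is_held(&amp;gp_lock));
}

/* Code shared between readers and updaters: either form of protection is OK. */
static struct foo *get_gp_any(void)
{
  return rcu_dereference_check(gp, lockdep_is_held(&amp;gp_lock));
}
</pre>
</blockquote>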
1816
1817<p>
1818This is not a hard-and-fast list: RCU's diagnostic capabilities will
1819continue to be guided by the number and type of usage bugs found
1820in real-world RCU usage.
1821
1822<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
1823
1824<p>
1825The Linux kernel provides an interesting environment for all kinds of
1826software, including RCU.
1827Some of the relevant points of interest are as follows:
1828
1829<ol>
1830<li> <a href="#Configuration">Configuration</a>.
1831<li> <a href="#Firmware Interface">Firmware Interface</a>.
1832<li> <a href="#Early Boot">Early Boot</a>.
1833<li> <a href="#Interrupts and NMIs">
1834 Interrupts and non-maskable interrupts (NMIs)</a>.
1835<li> <a href="#Loadable Modules">Loadable Modules</a>.
1836<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
1837<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
1838<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
1839<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
1840<li> <a href="#Performance, Scalability, Response Time, and Reliability">
1841 Performance, Scalability, Response Time, and Reliability</a>.
1842</ol>
1843
1844<p>
1845This list is probably incomplete, but it does give a feel for the
1846most notable Linux-kernel complications.
1847Each of the following sections covers one of the above topics.
1848
1849<h3><a name="Configuration">Configuration</a></h3>
1850
1851<p>
1852RCU's goal is automatic configuration, so that almost nobody
1853needs to worry about RCU's <tt>Kconfig</tt> options.
1854And for almost all users, RCU does in fact work well
1855&ldquo;out of the box.&rdquo;
1856
1857<p>
1858However, there are specialized use cases that are handled by
1859kernel boot parameters and <tt>Kconfig</tt> options.
1860Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
1861about new <tt>Kconfig</tt> options, which requires that almost all of them
1862be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
1863
1864<p>
1865This all should be quite obvious, but the fact remains that
1866Linus Torvalds recently had to
1867<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
1868me of this requirement.
1869
1870<h3><a name="Firmware Interface">Firmware Interface</a></h3>
1871
1872<p>
1873In many cases, the kernel obtains information about the system from the
1874firmware, and sometimes things are lost in translation.
1875Or the translation is accurate, but the original message is bogus.
1876
1877<p>
1878For example, some systems' firmware overreports the number of CPUs,
1879sometimes by a large factor.
1880If RCU naively believed the firmware, as it used to do,
1881it would create too many per-CPU kthreads.
1882Although the resulting system will still run correctly, the extra
1883kthreads needlessly consume memory and can cause confusion
1884when they show up in <tt>ps</tt> listings.
1885
1886<p>
1887RCU must therefore wait for a given CPU to actually come online before
1888it can allow itself to believe that the CPU actually exists.
1889The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
1890come online) cause a number of
1891<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
1892
1893<h3><a name="Early Boot">Early Boot</a></h3>
1894
1895<p>
1896The Linux kernel's boot sequence is an interesting process,
1897and RCU is used early, even before <tt>rcu_init()</tt>
1898is invoked.
1899In fact, a number of RCU's primitives can be used as soon as the
1900initial task's <tt>task_struct</tt> is available and the
1901boot CPU's per-CPU variables are set up.
1902The read-side primitives (<tt>rcu_read_lock()</tt>,
1903<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
1904and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
1905as will <tt>rcu_assign_pointer()</tt>.
1906
1907<p>
1908Although <tt>call_rcu()</tt> may be invoked at any
1909time during boot, callbacks are not guaranteed to be invoked until after
1910the scheduler is fully up and running.
1911This delay in callback invocation is due to the fact that RCU does not
1912invoke callbacks until it is fully initialized, and this full initialization
1913cannot occur until after the scheduler has initialized itself to the
1914point where RCU can spawn and run its kthreads.
1915In theory, it would be possible to invoke callbacks earlier;
1916however, this is not a panacea because there would be severe restrictions
1917on what operations those callbacks could invoke.
1918
1919<p>
1920Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
1921<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
1922(<a href="#Bottom-Half Flavor">discussed below</a>),
1923and
1924<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
1925will all operate normally
1926during very early boot, the reason being that there is only one CPU
1927and preemption is disabled.
1928This means that a call to <tt>synchronize_rcu()</tt> (or friends)
1929is itself a quiescent
1930state and thus a grace period, so the early-boot implementation can
1931be a no-op.
1932
1933<p>
1934Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
1935continue to operate normally through the remainder of boot, courtesy
1936of the fact that preemption is disabled across their RCU read-side
1937critical sections and also courtesy of the fact that there is still
1938only one CPU.
1939However, once the scheduler starts initializing, preemption is enabled.
1940There is still only a single CPU, but the fact that preemption is enabled
1941means that the no-op implementation of <tt>synchronize_rcu()</tt> no
1942longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
1943Therefore, as soon as the scheduler starts initializing, the early-boot
1944fastpath is disabled.
1945This means that <tt>synchronize_rcu()</tt> switches to its runtime
1946mode of operation where it posts callbacks, which in turn means that
1947any call to <tt>synchronize_rcu()</tt> will block until the corresponding
1948callback is invoked.
1949Unfortunately, the callback cannot be invoked until RCU's runtime
1950grace-period machinery is up and running, which cannot happen until
1951the scheduler has initialized itself sufficiently to allow RCU's
1952kthreads to be spawned.
1953Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
1954initialization can result in deadlock.
1955
1956<p>@@QQ@@
1957So what happens with <tt>synchronize_rcu()</tt> during
1958scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
1959kernels?
1960<p>@@QQA@@
1961In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
1962maps directly to <tt>synchronize_sched()</tt>.
1963Therefore, <tt>synchronize_rcu()</tt> works normally throughout
1964boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
1965However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
1966so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
1967during scheduler initialization.
1968<p>@@QQE@@
1969
1970<p>
1971I learned of these boot-time requirements as a result of a series of
1972system hangs.
1973
1974<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
1975
1976<p>
1977The Linux kernel has interrupts, and RCU read-side critical sections are
1978legal within interrupt handlers and within interrupt-disabled regions
1979of code, as are invocations of <tt>call_rcu()</tt>.
1980
1981<p>
1982Some Linux-kernel architectures can enter an interrupt handler from
1983non-idle process context, and then just never leave it, instead stealthily
1984transitioning back to process context.
1985This trick is sometimes used to invoke system calls from inside the kernel.
1986These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
1987about how it counts interrupt nesting levels.
1988I learned of this requirement the hard way during a rewrite
1989of RCU's dyntick-idle code.
1990
1991<p>
1992The Linux kernel has non-maskable interrupts (NMIs), and
1993RCU read-side critical sections are legal within NMI handlers.
1994Thankfully, RCU update-side primitives, including
1995<tt>call_rcu()</tt>, are prohibited within NMI handlers.
1996
1997<p>
1998The name notwithstanding, some Linux-kernel architectures
1999can have nested NMIs, which RCU must handle correctly.
2000Andy Lutomirski
2001<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
2002with this requirement;
2003he also kindly surprised me with
2004<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
2005that meets this requirement.
2006
2007<h3><a name="Loadable Modules">Loadable Modules</a></h3>
2008
2009<p>
2010The Linux kernel has loadable modules, and these modules can
2011also be unloaded.
2012After a given module has been unloaded, any attempt to call
2013one of its functions results in a segmentation fault.
2014The module-unload functions must therefore cancel any
2015delayed calls to loadable-module functions, for example,
2016any outstanding <tt>mod_timer()</tt> must be dealt with
2017via <tt>del_timer_sync()</tt> or similar.
2018
2019<p>
2020Unfortunately, there is no way to cancel an RCU callback;
2021once you invoke <tt>call_rcu()</tt>, the callback function is
2022going to eventually be invoked, unless the system goes down first.
2023Because it is normally considered socially irresponsible to crash the system
2024in response to a module unload request, we need some other way
2025to deal with in-flight RCU callbacks.
2026
2027<p>
2028RCU therefore provides
2029<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
2030which waits until all in-flight RCU callbacks have been invoked.
2031If a module uses <tt>call_rcu()</tt>, its exit function should therefore
2032prevent any future invocation of <tt>call_rcu()</tt>, then invoke
2033<tt>rcu_barrier()</tt>.
2034In theory, the underlying module-unload code could invoke
2035<tt>rcu_barrier()</tt> unconditionally, but in practice this would
2036incur unacceptable latencies.
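
<p>
A module's exit function might therefore look something like the
following sketch, where <tt>stop_queueing_callbacks()</tt> is a
hypothetical function that prevents any further invocations of
<tt>call_rcu()</tt> by this module:

<blockquote>
<pre>
static void __exit foo_exit(void)
{
  stop_queueing_callbacks(); /* No more call_rcu() after this point. */
  rcu_barrier();             /* Wait for all in-flight callbacks to finish. */
  /* Only now is it safe for the module's code and data to vanish. */
}
module_exit(foo_exit);
</pre>
</blockquote>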
2037
2038<p>
2039Nikita Danilov noted this requirement for an analogous filesystem-unmount
2040situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
2041The need for <tt>rcu_barrier()</tt> for module unloading became
2042apparent later.
2043
2044<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
2045
2046<p>
2047The Linux kernel supports CPU hotplug, which means that CPUs
2048can come and go.
2049It is of course illegal to use any RCU API member from an offline CPU.
2050This requirement was present from day one in DYNIX/ptx, but
2051on the other hand, the Linux kernel's CPU-hotplug implementation
2052is &ldquo;interesting.&rdquo;
2053
2054<p>
2055The Linux-kernel CPU-hotplug implementation has notifiers that
2056are used to allow the various kernel subsystems (including RCU)
2057to respond appropriately to a given CPU-hotplug operation.
2058Most RCU operations may be invoked from CPU-hotplug notifiers,
2059including even normal synchronous grace-period operations
2060such as <tt>synchronize_rcu()</tt>.
2061However, expedited grace-period operations such as
2062<tt>synchronize_rcu_expedited()</tt> are not supported,
2063due to the fact that current implementations block CPU-hotplug
2064operations, which could result in deadlock.
2065
2066<p>
2067In addition, all-callback-wait operations such as
2068<tt>rcu_barrier()</tt> are also not supported, due to the
2069fact that there are phases of CPU-hotplug operations where
2070the outgoing CPU's callbacks will not be invoked until after
2071the CPU-hotplug operation ends, which could also result in deadlock.
2072
2073<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
2074
2075<p>
2076RCU depends on the scheduler, and the scheduler uses RCU to
2077protect some of its data structures.
2078This means the scheduler is forbidden from acquiring
2079the runqueue locks and the priority-inheritance locks
2080in the middle of an outermost RCU read-side critical section unless
2081it also releases them before exiting that same
2082RCU read-side critical section.
2083This same prohibition also applies to any lock that is acquired
2084while holding any lock to which this prohibition applies.
2085Violating this rule results in deadlock.
2086
2087<p>
2088For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
2089implementation must be written carefully to avoid similar deadlocks.
2090In particular, <tt>rcu_read_unlock()</tt> must tolerate an
2091interrupt where the interrupt handler invokes both
2092<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
2093This possibility requires <tt>rcu_read_unlock()</tt> to use
2094negative nesting levels to avoid destructive recursion via
2095the interrupt handler's use of RCU.
2096
2097<p>
2098This pair of mutual scheduler-RCU requirements came as a
2099<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
2100
2101<p>
2102As noted above, RCU makes use of kthreads, and it is necessary to
2103avoid excessive CPU-time accumulation by these kthreads.
2104This requirement was no surprise, but RCU's violation of it
2105when running context-switch-heavy workloads when built with
2106<tt>CONFIG_NO_HZ_FULL=y</tt>
2107<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
2108RCU has made good progress towards meeting this requirement, even
2109for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
2110but there is room for further improvement.
2111
2112<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
2113
2114<p>
2115It is possible to use tracing on RCU code, but tracing itself
2116uses RCU.
2117For this reason, <tt>rcu_dereference_raw_notrace()</tt>
2118is provided for use by tracing, which avoids the destructive
2119recursion that could otherwise ensue.
2120This API is also used by virtualization in some architectures,
2121where RCU readers execute in environments in which tracing
2122cannot be used.
2123The tracing folks both located the requirement and provided the
2124needed fix, so this surprise requirement was relatively painless.
2125
2126<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
2127
2128<p>
2129Interrupting idle CPUs is considered socially unacceptable,
2130especially by people with battery-powered embedded systems.
2131RCU therefore conserves energy by detecting which CPUs are
2132idle, including tracking CPUs that have been interrupted from idle.
2133This is a large part of the energy-efficiency requirement,
2134so I learned of this via an irate phone call.
2135
2136<p>
2137Because RCU avoids interrupting idle CPUs, it is illegal to
2138execute an RCU read-side critical section on an idle CPU.
2139(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
2140if you try it.)
2141The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
2142event tracing are provided to work around this restriction.
2143In addition, <tt>rcu_is_watching()</tt> may be used to
2144test whether or not it is currently legal to run RCU read-side
2145critical sections on this CPU.
2146I learned of the need for diagnostics on the one hand
2147and <tt>RCU_NONIDLE()</tt> on the other while inspecting
2148idle-loop code.
2149Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
2150which is used quite heavily in the idle loop.
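
<p>
For example, code running in the idle loop that must execute a short
RCU read-side critical section might use something like the following
sketch, in which <tt>do_something_with_rcu()</tt> is hypothetical:

<blockquote>
<pre>
/* Tell RCU to pay attention to this CPU for the duration. */
RCU_NONIDLE(do_something_with_rcu());

/* Alternatively, check whether RCU readers are currently legal here. */
if (rcu_is_watching())
  do_something_with_rcu();
</pre>
</blockquote>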
2151
2152<p>
2153It is similarly socially unacceptable to interrupt an
2154<tt>nohz_full</tt> CPU running in userspace.
2155RCU must therefore track <tt>nohz_full</tt> userspace
2156execution.
2157And in
2158<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
2159kernels, RCU must separately track idle CPUs on the one hand and
2160CPUs that are either idle or executing in userspace on the other.
2161In both cases, RCU must be able to sample state at two points in
2162time, and be able to determine whether or not some other CPU spent
2163any time idle and/or executing in userspace.
2164
2165<p>
2166These energy-efficiency requirements have proven quite difficult to
2167understand and to meet; for example, there have been more than five
2168clean-sheet rewrites of RCU's energy-efficiency code, the last of
2169which was finally able to demonstrate
2170<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
2171As noted earlier,
2172I learned of many of these requirements via angry phone calls:
2173Flaming me on the Linux-kernel mailing list was apparently not
2174sufficient to fully vent their ire at RCU's energy-efficiency bugs!
2175
2176<h3><a name="Performance, Scalability, Response Time, and Reliability">
2177Performance, Scalability, Response Time, and Reliability</a></h3>
2178
2179<p>
2180Expanding on the
2181<a href="#Performance and Scalability">earlier discussion</a>,
2182RCU is used heavily by hot code paths in performance-critical
2183portions of the Linux kernel's networking, security, virtualization,
2184and scheduling code paths.
2185RCU must therefore use efficient implementations, especially in its
2186read-side primitives.
2187To that end, it would be good if preemptible RCU's implementation
2188of <tt>rcu_read_lock()</tt> could be inlined; however, doing
2189this requires resolving <tt>#include</tt> issues with the
2190<tt>task_struct</tt> structure.
2191
2192<p>
2193The Linux kernel supports hardware configurations with up to
21944096 CPUs, which means that RCU must be extremely scalable.
2195Algorithms that involve frequent acquisitions of global locks or
2196frequent atomic operations on global variables simply cannot be
2197tolerated within the RCU implementation.
2198RCU therefore makes heavy use of a combining tree based on the
2199<tt>rcu_node</tt> structure.
2200RCU is required to tolerate all CPUs continuously invoking any
2201combination of RCU's runtime primitives with minimal per-operation
2202overhead.
2203In fact, in many cases, increasing load must <i>decrease</i> the
2204per-operation overhead, witness the batching optimizations for
2205<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
2206<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
2207As a general rule, RCU must cheerfully accept whatever the
2208rest of the Linux kernel decides to throw at it.
2209
2210<p>
2211The Linux kernel is used for real-time workloads, especially
2212in conjunction with the
2213<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
2214The real-time-latency response requirements are such that the
2215traditional approach of disabling preemption across RCU
2216read-side critical sections is inappropriate.
2217Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
2218use an RCU implementation that allows RCU read-side critical
2219sections to be preempted.
2220This requirement made its presence known after users made it
2221clear that an earlier
2222<a href="https://lwn.net/Articles/107930/">real-time patch</a>
2223did not meet their needs, in conjunction with some
2224<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
2225encountered by a very early version of the -rt patchset.
2226
2227<p>
2228In addition, RCU must make do with a sub-100-microsecond real-time latency
2229budget.
2230In fact, on smaller systems with the -rt patchset, the Linux kernel
2231provides sub-20-microsecond real-time latencies for the whole kernel,
2232including RCU.
2233RCU's scalability and latency must therefore be sufficient for
2234these sorts of configurations.
2235To my surprise, the sub-100-microsecond real-time latency budget
2236<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
2237applies to even the largest systems [PDF]</a>,
2238up to and including systems with 4096 CPUs.
2239This real-time requirement motivated the grace-period kthread, which
2240also simplified handling of a number of race conditions.
2241
2242<p>
2243Finally, RCU's status as a synchronization primitive means that
2244any RCU failure can result in arbitrary memory corruption that can be
2245extremely difficult to debug.
2246This means that RCU must be extremely reliable, which in
2247practice also means that RCU must have an aggressive stress-test
2248suite.
2249This stress-test suite is called <tt>rcutorture</tt>.
2250
2251<p>
2252Although the need for <tt>rcutorture</tt> was no surprise,
2253the current immense popularity of the Linux kernel is posing
2254interesting&mdash;and perhaps unprecedented&mdash;validation
2255challenges.
2256To see this, keep in mind that there are well over one billion
2257instances of the Linux kernel running today, given Android
2258smartphones, Linux-powered televisions, and servers.
2259This number can be expected to increase sharply with the advent of
2260the celebrated Internet of Things.
2261
2262<p>
2263Suppose that RCU contains a race condition that manifests on average
2264once per million years of runtime.
2265This bug will be occurring about three times per <i>day</i> across
2266the installed base.
2267RCU could simply hide behind hardware error rates, given that no one
2268should really expect their smartphone to last for a million years.
2269However, anyone taking too much comfort from this thought should
2270consider the fact that in most jurisdictions, a successful multi-year
2271test of a given mechanism, which might include a Linux kernel,
2272suffices for a number of types of safety-critical certifications.
2273In fact, rumor has it that the Linux kernel is already being used
2274in production for safety-critical applications.
2275I don't know about you, but I would feel quite bad if a bug in RCU
2276killed someone.
2277Which might explain my recent focus on validation and verification.
2278
2279<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
2280
2281<p>
2282One of the more surprising things about RCU is that there are now
2283no fewer than five <i>flavors</i>, or API families.
2284In addition, the primary flavor that has been the sole focus up to
2285this point has two different implementations, non-preemptible and
2286preemptible.
2287The other four flavors are listed below, with requirements for each
2288described in a separate section.
2289
2290<ol>
2291<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
2292<li> <a href="#Sched Flavor">Sched Flavor</a>
2293<li> <a href="#Sleepable RCU">Sleepable RCU</a>
2294<li> <a href="#Tasks RCU">Tasks RCU</a>
2295</ol>
2296
2297<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
2298
2299<p>
2300The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
2301hence the &ldquo;_bh&rdquo; abbreviations)
2302flavor of RCU, or <i>RCU-bh</i>, was developed by
2303Dipankar Sarma to provide a flavor of RCU that could withstand the
2304network-based denial-of-service attacks researched by Robert
2305Olsson.
2306These attacks placed so much networking load on the system
2307that some of the CPUs never exited softirq execution,
2308which in turn prevented those CPUs from ever executing a context switch,
2309which, in the RCU implementation of that time, prevented grace periods
2310from ever ending.
2311The result was an out-of-memory condition and a system hang.
2312
2313<p>
2314The solution was the creation of RCU-bh, which does
2315<tt>local_bh_disable()</tt>
2316across its read-side critical sections, and which uses the transition
2317from one type of softirq processing to another as a quiescent state
2318in addition to context switch, idle, user mode, and offline.
2319This means that RCU-bh grace periods can complete even when some of
2320the CPUs execute in softirq indefinitely, thus allowing algorithms
2321based on RCU-bh to withstand network-based denial-of-service attacks.
2322
2323<p>
2324Because
2325<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
2326disable and re-enable softirq handlers, any attempt to start a softirq
2327handler during the
2328RCU-bh read-side critical section will be deferred.
2329In this case, <tt>rcu_read_unlock_bh()</tt>
2330will invoke softirq processing, which can take considerable time.
2331One can of course argue that this softirq overhead should be associated
2332with the code following the RCU-bh read-side critical section rather
2333than <tt>rcu_read_unlock_bh()</tt>, but the fact
2334is that most profiling tools cannot be expected to make this sort
2335of fine distinction.
2336For example, suppose that a three-millisecond-long RCU-bh read-side
2337critical section executes during a time of heavy networking load.
2338There will very likely be an attempt to invoke at least one softirq
2339handler during that three milliseconds, but any such invocation will
2340be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
2341This can of course make it appear at first glance as if
2342<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
2343
2344<p>
2345The
2346<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
2347includes
2348<tt>rcu_read_lock_bh()</tt>,
2349<tt>rcu_read_unlock_bh()</tt>,
2350<tt>rcu_dereference_bh()</tt>,
2351<tt>rcu_dereference_bh_check()</tt>,
2352<tt>synchronize_rcu_bh()</tt>,
2353<tt>synchronize_rcu_bh_expedited()</tt>,
2354<tt>call_rcu_bh()</tt>,
2355<tt>rcu_barrier_bh()</tt>, and
2356<tt>rcu_read_lock_bh_held()</tt>.
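
<p>
A minimal RCU-bh reader might therefore look as follows, assuming a
hypothetical <tt>__rcu</tt>-tagged pointer <tt>bh_gp</tt> and a
hypothetical <tt>do_something_with()</tt> function, with the
corresponding updater using <tt>call_rcu_bh()</tt> or
<tt>synchronize_rcu_bh()</tt> to wait for such readers:

<blockquote>
<pre>
struct foo *p;

rcu_read_lock_bh();           /* Also disables softirq handlers. */
p = rcu_dereference_bh(bh_gp);
if (p)
  do_something_with(p);
rcu_read_unlock_bh();         /* May run any deferred softirq work. */
</pre>
</blockquote>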
2357
2358<h3><a name="Sched Flavor">Sched Flavor</a></h3>
2359
2360<p>
2361Before preemptible RCU, waiting for an RCU grace period had the
2362side effect of also waiting for all pre-existing interrupt
2363and NMI handlers.
2364However, there are legitimate preemptible-RCU implementations that
2365do not have this property, given that any point in the code outside
2366of an RCU read-side critical section can be a quiescent state.
2367Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
2368RCU in that an RCU-sched grace period waits for pre-existing
2369interrupt and NMI handlers.
2370In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
2371APIs have identical implementations, while kernels built with
2372<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
2373
2374<p>
2375Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
2376<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
2377disable and re-enable preemption, respectively.
2378This means that if there was a preemption attempt during the
2379RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
2380will enter the scheduler, with all the latency and overhead entailed.
2381Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
2382as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
2383However, the highest-priority task won't be preempted, so that task
2384will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
2385
2386<p>
2387The
2388<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
2389includes
2390<tt>rcu_read_lock_sched()</tt>,
2391<tt>rcu_read_unlock_sched()</tt>,
2392<tt>rcu_read_lock_sched_notrace()</tt>,
2393<tt>rcu_read_unlock_sched_notrace()</tt>,
2394<tt>rcu_dereference_sched()</tt>,
2395<tt>rcu_dereference_sched_check()</tt>,
2396<tt>synchronize_sched()</tt>,
2397<tt>synchronize_rcu_sched_expedited()</tt>,
2398<tt>call_rcu_sched()</tt>,
2399<tt>rcu_barrier_sched()</tt>, and
2400<tt>rcu_read_lock_sched_held()</tt>.
2401However, anything that disables preemption also marks an RCU-sched
2402read-side critical section, including
2403<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
2404<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
2405and so on.
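
<p>
For example, the following sketch (with a hypothetical
<tt>sched_gp</tt> pointer and <tt>do_something_with()</tt> function)
relies on the fact that any preemption-disabled region is an RCU-sched
read-side critical section, so that an updater can wait for it using
<tt>synchronize_sched()</tt> or <tt>call_rcu_sched()</tt>:

<blockquote>
<pre>
struct foo *p;

preempt_disable();            /* Begins an RCU-sched reader. */
p = rcu_dereference_sched(sched_gp);
if (p)
  do_something_with(p);
preempt_enable();             /* Ends the RCU-sched reader. */
</pre>
</blockquote>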
2406
2407<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
2408
2409<p>
2410For well over a decade, someone saying &ldquo;I need to block within
2411an RCU read-side critical section&rdquo; was a reliable indication
2412that this someone did not understand RCU.
2413After all, if you are always blocking in an RCU read-side critical
2414section, you can probably afford to use a higher-overhead synchronization
2415mechanism.
2416However, that changed with the advent of the Linux kernel's notifiers,
2417whose RCU read-side critical
2418sections almost never sleep, but sometimes need to.
2419This resulted in the introduction of
2420<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
2421or <i>SRCU</i>.
2422
2423<p>
2424SRCU allows different domains to be defined, with each such domain
2425defined by an instance of an <tt>srcu_struct</tt> structure.
2426A pointer to this structure must be passed in to each SRCU function,
2427for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
2428<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
2429The key benefit of these domains is that a slow SRCU reader in one
2430domain does not delay an SRCU grace period in some other domain.
2431That said, one consequence of these domains is that read-side code
2432must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
2433to <tt>srcu_read_unlock()</tt>, for example, as follows:
2434
2435<blockquote>
2436<pre>
2437 1 int idx;
2438 2
2439 3 idx = srcu_read_lock(&amp;ss);
2440 4 do_something();
2441 5 srcu_read_unlock(&amp;ss, idx);
2442</pre>
2443</blockquote>
2444
2445<p>
2446As noted above, it is legal to block within SRCU read-side critical sections;
2447however, with great power comes great responsibility.
2448If you block forever in one of a given domain's SRCU read-side critical
2449sections, then that domain's grace periods will also be blocked forever.
2450Of course, one good way to block forever is to deadlock, which can
2451happen if any operation in a given domain's SRCU read-side critical
2452section can block waiting, either directly or indirectly, for that domain's
2453grace period to elapse.
2454For example, this results in a self-deadlock:
2455
2456<blockquote>
2457<pre>
2458 1 int idx;
2459 2
2460 3 idx = srcu_read_lock(&amp;ss);
2461 4 do_something();
2462 5 synchronize_srcu(&amp;ss);
2463 6 srcu_read_unlock(&amp;ss, idx);
2464</pre>
2465</blockquote>
2466
2467<p>
2468However, if line&nbsp;5 acquired a mutex that was held across
2469a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
2470deadlock would still be possible.
2471Furthermore, if line&nbsp;5 acquired a mutex that was held across
2472a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
2473and if an <tt>ss1</tt>-domain SRCU read-side critical section
2474acquired another mutex that was held across an <tt>ss</tt>-domain
2475<tt>synchronize_srcu()</tt>,
2476deadlock would again be possible.
2477Such a deadlock cycle could extend across an arbitrarily large number
2478of different SRCU domains.
2479Again, with great power comes great responsibility.
2480
2481<p>
2482Unlike the other RCU flavors, SRCU read-side critical sections can
2483run on idle and even offline CPUs.
2484This ability requires that <tt>srcu_read_lock()</tt> and
2485<tt>srcu_read_unlock()</tt> contain memory barriers, which means
2486that SRCU readers will run a bit slower than would RCU readers.
2487It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
2488API, which, in combination with <tt>srcu_read_unlock()</tt>,
2489guarantees a full memory barrier.
2490
2491<p>
2492The
2493<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
2494includes
2495<tt>srcu_read_lock()</tt>,
2496<tt>srcu_read_unlock()</tt>,
2497<tt>srcu_dereference()</tt>,
2498<tt>srcu_dereference_check()</tt>,
2499<tt>synchronize_srcu()</tt>,
2500<tt>synchronize_srcu_expedited()</tt>,
2501<tt>call_srcu()</tt>,
2502<tt>srcu_barrier()</tt>, and
2503<tt>srcu_read_lock_held()</tt>.
2504It also includes
2505<tt>DEFINE_SRCU()</tt>,
2506<tt>DEFINE_STATIC_SRCU()</tt>, and
2507<tt>init_srcu_struct()</tt>
2508APIs for defining and initializing <tt>srcu_struct</tt> structures.
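
<p>
For example, a self-contained SRCU domain might be defined and used as
in the following sketch, in which the <tt>srcu_gp</tt> pointer, the
<tt>old</tt> element, and the helper functions are hypothetical:

<blockquote>
<pre>
DEFINE_STATIC_SRCU(ss);       /* One SRCU domain, statically initialized. */

int idx;
struct foo *p;

idx = srcu_read_lock(&amp;ss);
p = srcu_dereference(srcu_gp, &amp;ss); /* This reader is permitted to block. */
if (p)
  do_something_with(p);
srcu_read_unlock(&amp;ss, idx);

/* Updater: asynchronous grace-period wait for this domain only. */
call_srcu(&amp;ss, &amp;old-&gt;rh, free_old_foo);
</pre>
</blockquote>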
2509
2510<h3><a name="Tasks RCU">Tasks RCU</a></h3>
2511
2512<p>
2513Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
2514binary rewriting required to install different types of probes.
2515It would be good to be able to free old trampolines, which sounds
2516like a job for some form of RCU.
2517However, because it is necessary to be able to install a trace
2518anywhere in the code, it is not possible to use read-side markers
2519such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
2520In addition, it does not work to have these markers in the trampoline
2521itself, because there would need to be instructions following
2522<tt>rcu_read_unlock()</tt>.
2523Although <tt>synchronize_rcu()</tt> would guarantee that execution
2524reached the <tt>rcu_read_unlock()</tt>, it would not be able to
2525guarantee that execution had completely left the trampoline.
2526
2527<p>
2528The solution, in the form of
2529<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
2530is to have implicit
2531read-side critical sections that are delimited by voluntary context
2532switches, that is, calls to <tt>schedule()</tt>,
2533<tt>cond_resched_rcu_qs()</tt>, and
2534<tt>synchronize_rcu_tasks()</tt>.
2535In addition, transitions to and from userspace execution also delimit
2536tasks-RCU read-side critical sections.
2537
2538<p>
2539The tasks-RCU API is quite compact, consisting only of
2540<tt>call_rcu_tasks()</tt>,
2541<tt>synchronize_rcu_tasks()</tt>, and
2542<tt>rcu_barrier_tasks()</tt>.
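
<p>
For example, a tracing subsystem might free a no-longer-used trampoline
using something like the following sketch, in which
<tt>struct trampoline</tt>, <tt>old_tp</tt>, and
<tt>free_trampoline()</tt> are hypothetical:

<blockquote>
<pre>
struct trampoline {
  void *text;
  struct rcu_head rh;
};

static void free_trampoline_cb(struct rcu_head *rhp)
{
  struct trampoline *tp = container_of(rhp, struct trampoline, rh);

  free_trampoline(tp); /* No task can still be executing in tp-&gt;text. */
}

/* After unhooking the trampoline from all probe sites: */
call_rcu_tasks(&amp;old_tp-&gt;rh, free_trampoline_cb);
</pre>
</blockquote>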
2543
2544<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
2545
2546<p>
2547One of the tricks that RCU uses to attain update-side scalability is
2548to increase grace-period latency with increasing numbers of CPUs.
2549If this becomes a serious problem, it will be necessary to rework the
2550grace-period state machine so as to avoid the need for the additional
2551latency.
2552
2553<p>
2554Expedited grace periods scan the CPUs, so their latency and overhead
2555increases with increasing numbers of CPUs.
2556If this becomes a serious problem on large systems, it will be necessary
2557to do some redesign to avoid this scalability problem.
2558
2559<p>
2560RCU disables CPU hotplug in a few places, perhaps most notably in the
2561expedited grace-period and <tt>rcu_barrier()</tt> operations.
2562If there is a strong reason to use expedited grace periods in CPU-hotplug
2563notifiers, it will be necessary to avoid disabling CPU hotplug.
2564This would introduce some complexity, so there had better be a <i>very</i>
2565good reason.
2566
2567<p>
2568The tradeoff between grace-period latency on the one hand and interruptions
2569of other CPUs on the other hand may need to be re-examined.
2570The desire is of course for zero grace-period latency as well as zero
2571interprocessor interrupts undertaken during an expedited grace period
2572operation.
2573While this ideal is unlikely to be achievable, it is quite possible that
2574further improvements can be made.
2575
2576<p>
2577The multiprocessor implementations of RCU use a combining tree that
2578groups CPUs so as to reduce lock contention and increase cache locality.
2579However, this combining tree does not spread its memory across NUMA
2580nodes nor does it align the CPU groups with hardware features such
2581as sockets or cores.
2582Such spreading and alignment is currently believed to be unnecessary
2583because the hotpath read-side primitives do not access the combining
2584tree, nor does <tt>call_rcu()</tt> in the common case.
2585If you believe that your architecture needs such spreading and alignment,
2586then your architecture should also benefit from the
2587<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
2588to the number of CPUs in a socket, NUMA node, or whatever.
2589If the number of CPUs is too large, use a fraction of the number of
2590CPUs.
2591If the number of CPUs is a large prime number, well, that certainly
2592is an &ldquo;interesting&rdquo; architectural choice!
2593More flexible arrangements might be considered, but only if
2594<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
2595if the inadequacy has been demonstrated by a carefully run and
2596realistic system-level workload.
2597
2598<p>
2599Please note that arrangements that require RCU to remap CPU numbers will
2600require extremely good demonstration of need and full exploration of
2601alternatives.
2602
2603<p>
2604There is an embarrassingly large number of flavors of RCU, and this
2605number has been increasing over time.
2606Perhaps it will be possible to combine some at some future date.
2607
2608<p>
2609RCU's various kthreads are reasonably recent additions.
2610It is quite likely that adjustments will be required to more gracefully
2611handle extreme loads.
2612It might also be necessary to be able to relate CPU utilization by
2613RCU's kthreads and softirq handlers to the code that instigated this
2614CPU utilization.
2615For example, RCU callback overhead might be charged back to the
2616originating <tt>call_rcu()</tt> instance, though probably not
2617in production kernels.
2618
2619<h2><a name="Summary">Summary</a></h2>
2620
2621<p>
2622This document has presented more than two decades' worth of RCU
2623requirements.
2624Given that the requirements keep changing, this will not be the last
2625word on this subject, but at least it serves to get an important
2626subset of the requirements set forth.
2627
2628<h2><a name="Acknowledgments">Acknowledgments</a></h2>
2629
2630I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
2631Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
2632Andy Lutomirski for their help in rendering
2633this article human readable, and to Michelle Rankin for her support
2634of this effort.
2635Other contributions are acknowledged in the Linux kernel's git archive.
2636The cartoon is copyright (c) 2013 by Melissa Broussard,
2637and is provided
2638under the terms of the Creative Commons Attribution-Share Alike 3.0
2639United States license.
2640
2641<p>@@QQAL@@
2642
2643</body></html>
diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh
new file mode 100755
index 000000000000..d354f069559b
--- /dev/null
+++ b/Documentation/RCU/Design/htmlqqz.sh
@@ -0,0 +1,108 @@
1#!/bin/sh
2#
3# Usage: sh htmlqqz.sh file
4#
5# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
6# Commands, all of which must be on a line by themselves:
7#
8# "<p>@@QQ@@": Start of a quick quiz.
9# "<p>@@QQA@@": Start of a quick-quiz answer.
10# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
11# "<p>@@QQAL@@": Place to put quick-quiz answer list.
12#
13# Places the result in file.html.
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program; if not, you can access it online at
27# http://www.gnu.org/licenses/gpl-2.0.html.
28#
29# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.
30
31fn=$1
32if test ! -r $fn.htmlx
33then
34 echo "Error: $fn.htmlx unreadable."
35 exit 1
36fi
37
38echo "<!-- DO NOT HAND EDIT. -->" > $fn.html
39echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html
40awk < $fn.htmlx >> $fn.html '
41
42state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
43 print $0;
44 if ($0 ~ /^<p>@@QQ/)
45 print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
46 next;
47}
48
49state == "" && $1 == "<p>@@QQ@@" {
50 qqn++;
51 qqlineno = NR;
52 haveqq = 1;
53 state = "qq";
54 print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
55 next;
56}
57
58state == "qq" && $1 != "<p>@@QQA@@" {
59 qq[qqn] = qq[qqn] $0 "\n";
60 print $0
61 if ($0 ~ /^<p>@@QQ/)
62 print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
63 next;
64}
65
66state == "qq" && $1 == "<p>@@QQA@@" {
67 state = "qqa";
68 print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
69 next;
70}
71
72state == "qqa" && $1 != "<p>@@QQE@@" {
73 qqa[qqn] = qqa[qqn] $0 "\n";
74 if ($0 ~ /^<p>@@QQ/)
75 print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
76 next;
77}
78
79state == "qqa" && $1 == "<p>@@QQE@@" {
80 state = "";
81 next;
82}
83
84state == "" && $1 == "<p>@@QQAL@@" {
85 haveqq = "";
86 print "<h3><a name=\"Answers to Quick Quizzes\">"
87 print "Answers to Quick Quizzes</a></h3>"
88 print "";
89 for (i = 1; i <= qqn; i++) {
90 print "<a name=\"qq" i "answer\"></a>"
91 print "<p><b>Quick Quiz " i "</b>:"
92 print qq[i];
93 print "";
94 print "</p><p><b>Answer</b>:"
95 print qqa[i];
96 print "";
97 print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>"
98 print "";
99 }
100 next;
101}
102
103END {
104 if (state != "")
105 print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
106 else if (haveqq)
107 print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
108}'