diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2015-10-07 16:32:08 -0400 |
---|---|---|
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2015-12-05 15:19:07 -0500 |
commit | 649e4368ff786e3d02eb2a06b1493fb217d74408 (patch) | |
tree | 779b8db626afb75e0fa023265b43f9260ae73f12 /Documentation/RCU | |
parent | 6cf10081220ae21175a867d446b3167bcbcb937b (diff) |
documentation: Record RCU requirements
This commit adds RCU requirements as published in a 2015 LWN series.
Bringing these requirements in-tree allows them to be updated as changes
are discovered.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Updates to charset and URLs as suggested by Josh Triplett. ]
Diffstat (limited to 'Documentation/RCU')
-rw-r--r-- | Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png | bin | 0 -> 100825 bytes | |||
-rw-r--r-- | Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg | 374 | ||||
-rw-r--r-- | Documentation/RCU/Design/Requirements/RCUApplicability.svg | 237 | ||||
-rw-r--r-- | Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg | 639 | ||||
-rw-r--r-- | Documentation/RCU/Design/Requirements/Requirements.html | 2799 | ||||
-rw-r--r-- | Documentation/RCU/Design/Requirements/Requirements.htmlx | 2643 | ||||
-rwxr-xr-x | Documentation/RCU/Design/htmlqqz.sh | 108 |
7 files changed, 6800 insertions, 0 deletions
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png new file mode 100644 index 000000000000..7496a55e4e7b --- /dev/null +++ b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png | |||
Binary files differ | |||
diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg new file mode 100644 index 000000000000..4b4014fda770 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg | |||
@@ -0,0 +1,374 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> | ||
2 | <!-- Created with Inkscape (http://www.inkscape.org/) --> | ||
3 | |||
4 | <svg | ||
5 | xmlns:dc="http://purl.org/dc/elements/1.1/" | ||
6 | xmlns:cc="http://creativecommons.org/ns#" | ||
7 | xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" | ||
8 | xmlns:svg="http://www.w3.org/2000/svg" | ||
9 | xmlns="http://www.w3.org/2000/svg" | ||
10 | xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" | ||
11 | xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" | ||
12 | width="447.99197" | ||
13 | height="428.19299" | ||
14 | id="svg2" | ||
15 | version="1.1" | ||
16 | inkscape:version="0.48.3.1 r9886" | ||
17 | sodipodi:docname="GPpartitionReaders1.svg"> | ||
18 | <defs | ||
19 | id="defs4"> | ||
20 | <marker | ||
21 | inkscape:stockid="Arrow2Lend" | ||
22 | orient="auto" | ||
23 | refY="0" | ||
24 | refX="0" | ||
25 | id="Arrow2Lend" | ||
26 | style="overflow:visible"> | ||
27 | <path | ||
28 | id="path3792" | ||
29 | style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" | ||
30 | d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" | ||
31 | transform="matrix(-1.1,0,0,-1.1,-1.1,0)" | ||
32 | inkscape:connector-curvature="0" /> | ||
33 | </marker> | ||
34 | <marker | ||
35 | inkscape:stockid="Arrow2Lstart" | ||
36 | orient="auto" | ||
37 | refY="0" | ||
38 | refX="0" | ||
39 | id="Arrow2Lstart" | ||
40 | style="overflow:visible"> | ||
41 | <path | ||
42 | id="path3789" | ||
43 | style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" | ||
44 | d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" | ||
45 | transform="matrix(1.1,0,0,1.1,1.1,0)" | ||
46 | inkscape:connector-curvature="0" /> | ||
47 | </marker> | ||
48 | </defs> | ||
49 | <sodipodi:namedview | ||
50 | id="base" | ||
51 | pagecolor="#ffffff" | ||
52 | bordercolor="#666666" | ||
53 | borderopacity="1.0" | ||
54 | inkscape:pageopacity="0.0" | ||
55 | inkscape:pageshadow="2" | ||
56 | inkscape:zoom="1.6184291" | ||
57 | inkscape:cx="223.99599" | ||
58 | inkscape:cy="214.0965" | ||
59 | inkscape:document-units="px" | ||
60 | inkscape:current-layer="layer1" | ||
61 | showgrid="false" | ||
62 | inkscape:window-width="979" | ||
63 | inkscape:window-height="836" | ||
64 | inkscape:window-x="571" | ||
65 | inkscape:window-y="335" | ||
66 | inkscape:window-maximized="0" | ||
67 | fit-margin-top="5" | ||
68 | fit-margin-left="5" | ||
69 | fit-margin-right="5" | ||
70 | fit-margin-bottom="5" /> | ||
71 | <metadata | ||
72 | id="metadata7"> | ||
73 | <rdf:RDF> | ||
74 | <cc:Work | ||
75 | rdf:about=""> | ||
76 | <dc:format>image/svg+xml</dc:format> | ||
77 | <dc:type | ||
78 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> | ||
79 | <dc:title></dc:title> | ||
80 | </cc:Work> | ||
81 | </rdf:RDF> | ||
82 | </metadata> | ||
83 | <g | ||
84 | inkscape:label="Layer 1" | ||
85 | inkscape:groupmode="layer" | ||
86 | id="layer1" | ||
87 | transform="translate(-28.441125,-185.60612)"> | ||
88 | <flowRoot | ||
89 | xml:space="preserve" | ||
90 | id="flowRoot2985" | ||
91 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion | ||
92 | id="flowRegion2987"><rect | ||
93 | id="rect2989" | ||
94 | width="82.85714" | ||
95 | height="11.428572" | ||
96 | x="240" | ||
97 | y="492.36218" /></flowRegion><flowPara | ||
98 | id="flowPara2991"></flowPara></flowRoot> <g | ||
99 | id="g4433" | ||
100 | transform="translate(2,0)"> | ||
101 | <text | ||
102 | sodipodi:linespacing="125%" | ||
103 | id="text2993" | ||
104 | y="-261.66608" | ||
105 | x="412.12299" | ||
106 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
107 | xml:space="preserve" | ||
108 | transform="matrix(0,1,-1,0,0,0)"><tspan | ||
109 | y="-261.66608" | ||
110 | x="412.12299" | ||
111 | id="tspan2995" | ||
112 | sodipodi:role="line">synchronize_rcu()</tspan></text> | ||
113 | <g | ||
114 | id="g4417" | ||
115 | transform="matrix(0,1,-1,0,730.90257,222.4928)"> | ||
116 | <path | ||
117 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" | ||
118 | d="m 97.580736,477.4048 183.140664,0" | ||
119 | id="path2997" | ||
120 | inkscape:connector-curvature="0" | ||
121 | sodipodi:nodetypes="cc" /> | ||
122 | <path | ||
123 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
124 | d="m 96.752718,465.38398 0,22.62742" | ||
125 | id="path4397" | ||
126 | inkscape:connector-curvature="0" | ||
127 | sodipodi:nodetypes="cc" /> | ||
128 | <path | ||
129 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
130 | d="m 281.54942,465.38397 0,22.62742" | ||
131 | id="path4397-5" | ||
132 | inkscape:connector-curvature="0" | ||
133 | sodipodi:nodetypes="cc" /> | ||
134 | </g> | ||
135 | </g> | ||
136 | <text | ||
137 | xml:space="preserve" | ||
138 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
139 | x="112.04738" | ||
140 | y="268.18076" | ||
141 | id="text4429" | ||
142 | sodipodi:linespacing="125%"><tspan | ||
143 | sodipodi:role="line" | ||
144 | id="tspan4431" | ||
145 | x="112.04738" | ||
146 | y="268.18076">WRITE_ONCE(a, 1);</tspan></text> | ||
147 | <text | ||
148 | xml:space="preserve" | ||
149 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
150 | x="112.04738" | ||
151 | y="439.13766" | ||
152 | id="text4441" | ||
153 | sodipodi:linespacing="125%"><tspan | ||
154 | sodipodi:role="line" | ||
155 | id="tspan4443" | ||
156 | x="112.04738" | ||
157 | y="439.13766">WRITE_ONCE(b, 1);</tspan></text> | ||
158 | <text | ||
159 | xml:space="preserve" | ||
160 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
161 | x="255.60869" | ||
162 | y="309.29346" | ||
163 | id="text4445" | ||
164 | sodipodi:linespacing="125%"><tspan | ||
165 | sodipodi:role="line" | ||
166 | id="tspan4447" | ||
167 | x="255.60869" | ||
168 | y="309.29346">r1 = READ_ONCE(a);</tspan></text> | ||
169 | <text | ||
170 | xml:space="preserve" | ||
171 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
172 | x="255.14423" | ||
173 | y="520.61786" | ||
174 | id="text4449" | ||
175 | sodipodi:linespacing="125%"><tspan | ||
176 | sodipodi:role="line" | ||
177 | id="tspan4451" | ||
178 | x="255.14423" | ||
179 | y="520.61786">WRITE_ONCE(c, 1);</tspan></text> | ||
180 | <text | ||
181 | xml:space="preserve" | ||
182 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
183 | x="396.10254" | ||
184 | y="384.71124" | ||
185 | id="text4453" | ||
186 | sodipodi:linespacing="125%"><tspan | ||
187 | sodipodi:role="line" | ||
188 | id="tspan4455" | ||
189 | x="396.10254" | ||
190 | y="384.71124">r2 = READ_ONCE(b);</tspan></text> | ||
191 | <text | ||
192 | xml:space="preserve" | ||
193 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
194 | x="396.10254" | ||
195 | y="582.13617" | ||
196 | id="text4457" | ||
197 | sodipodi:linespacing="125%"><tspan | ||
198 | sodipodi:role="line" | ||
199 | id="tspan4459" | ||
200 | x="396.10254" | ||
201 | y="582.13617">r3 = READ_ONCE(c);</tspan></text> | ||
202 | <text | ||
203 | xml:space="preserve" | ||
204 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
205 | x="112.08231" | ||
206 | y="213.91006" | ||
207 | id="text4461" | ||
208 | sodipodi:linespacing="125%"><tspan | ||
209 | sodipodi:role="line" | ||
210 | id="tspan4463" | ||
211 | x="112.08231" | ||
212 | y="213.91006">thread0()</tspan></text> | ||
213 | <text | ||
214 | xml:space="preserve" | ||
215 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
216 | x="252.34512" | ||
217 | y="213.91006" | ||
218 | id="text4461-6" | ||
219 | sodipodi:linespacing="125%"><tspan | ||
220 | sodipodi:role="line" | ||
221 | id="tspan4463-0" | ||
222 | x="252.34512" | ||
223 | y="213.91006">thread1()</tspan></text> | ||
224 | <text | ||
225 | xml:space="preserve" | ||
226 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
227 | x="396.42557" | ||
228 | y="213.91006" | ||
229 | id="text4461-2" | ||
230 | sodipodi:linespacing="125%"><tspan | ||
231 | sodipodi:role="line" | ||
232 | id="tspan4463-2" | ||
233 | x="396.42557" | ||
234 | y="213.91006">thread2()</tspan></text> | ||
235 | <rect | ||
236 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
237 | id="rect4495" | ||
238 | width="436.28488" | ||
239 | height="416.4859" | ||
240 | x="34.648232" | ||
241 | y="191.10612" /> | ||
242 | <path | ||
243 | style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
244 | d="m 183.14066,191.10612 0,417.193 -0.70711,0" | ||
245 | id="path4497" | ||
246 | inkscape:connector-curvature="0" /> | ||
247 | <path | ||
248 | style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
249 | d="m 325.13867,191.10612 0,417.193 -0.70711,0" | ||
250 | id="path4497-5" | ||
251 | inkscape:connector-curvature="0" /> | ||
252 | <text | ||
253 | xml:space="preserve" | ||
254 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
255 | x="111.75929" | ||
256 | y="251.53981" | ||
257 | id="text4429-8" | ||
258 | sodipodi:linespacing="125%"><tspan | ||
259 | sodipodi:role="line" | ||
260 | id="tspan4431-9" | ||
261 | x="111.75929" | ||
262 | y="251.53981">rcu_read_lock();</tspan></text> | ||
263 | <text | ||
264 | xml:space="preserve" | ||
265 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
266 | x="396.10254" | ||
267 | y="367.91556" | ||
268 | id="text4429-8-9" | ||
269 | sodipodi:linespacing="125%"><tspan | ||
270 | sodipodi:role="line" | ||
271 | id="tspan4431-9-4" | ||
272 | x="396.10254" | ||
273 | y="367.91556">rcu_read_lock();</tspan></text> | ||
274 | <text | ||
275 | xml:space="preserve" | ||
276 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
277 | x="396.10254" | ||
278 | y="597.40289" | ||
279 | id="text4429-8-9-3" | ||
280 | sodipodi:linespacing="125%"><tspan | ||
281 | sodipodi:role="line" | ||
282 | id="tspan4431-9-4-4" | ||
283 | x="396.10254" | ||
284 | y="597.40289">rcu_read_unlock();</tspan></text> | ||
285 | <text | ||
286 | xml:space="preserve" | ||
287 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
288 | x="111.75929" | ||
289 | y="453.15311" | ||
290 | id="text4429-8-9-3-1" | ||
291 | sodipodi:linespacing="125%"><tspan | ||
292 | sodipodi:role="line" | ||
293 | id="tspan4431-9-4-4-6" | ||
294 | x="111.75929" | ||
295 | y="453.15311">rcu_read_unlock();</tspan></text> | ||
296 | <path | ||
297 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
298 | d="m 33.941125,227.87568 436.284885,0 0,0.7071" | ||
299 | id="path4608" | ||
300 | inkscape:connector-curvature="0" /> | ||
301 | <text | ||
302 | xml:space="preserve" | ||
303 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
304 | x="394.94427" | ||
305 | y="345.66351" | ||
306 | id="text4648" | ||
307 | sodipodi:linespacing="125%"><tspan | ||
308 | sodipodi:role="line" | ||
309 | id="tspan4650" | ||
310 | x="394.94427" | ||
311 | y="345.66351">QS</tspan></text> | ||
312 | <path | ||
313 | sodipodi:type="arc" | ||
314 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
315 | id="path4652" | ||
316 | sodipodi:cx="358.85669" | ||
317 | sodipodi:cy="142.87541" | ||
318 | sodipodi:rx="10.960155" | ||
319 | sodipodi:ry="10.253048" | ||
320 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
321 | transform="translate(36.441125,199.60612)" | ||
322 | sodipodi:start="4.7135481" | ||
323 | sodipodi:end="10.994651" | ||
324 | sodipodi:open="true" /> | ||
325 | <text | ||
326 | xml:space="preserve" | ||
327 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
328 | x="112.11968" | ||
329 | y="475.77856" | ||
330 | id="text4648-4" | ||
331 | sodipodi:linespacing="125%"><tspan | ||
332 | sodipodi:role="line" | ||
333 | id="tspan4650-4" | ||
334 | x="112.11968" | ||
335 | y="475.77856">QS</tspan></text> | ||
336 | <path | ||
337 | sodipodi:type="arc" | ||
338 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
339 | id="path4652-7" | ||
340 | sodipodi:cx="358.85669" | ||
341 | sodipodi:cy="142.87541" | ||
342 | sodipodi:rx="10.960155" | ||
343 | sodipodi:ry="10.253048" | ||
344 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
345 | transform="translate(-246.38346,329.72117)" | ||
346 | sodipodi:start="4.7135481" | ||
347 | sodipodi:end="10.994651" | ||
348 | sodipodi:open="true" /> | ||
349 | <path | ||
350 | sodipodi:type="arc" | ||
351 | style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
352 | id="path4652-7-7" | ||
353 | sodipodi:cx="358.85669" | ||
354 | sodipodi:cy="142.87541" | ||
355 | sodipodi:rx="10.960155" | ||
356 | sodipodi:ry="10.253048" | ||
357 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
358 | transform="translate(-103.65246,202.90878)" | ||
359 | sodipodi:start="4.7135481" | ||
360 | sodipodi:end="10.994651" | ||
361 | sodipodi:open="true" /> | ||
362 | <text | ||
363 | xml:space="preserve" | ||
364 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
365 | x="254.85066" | ||
366 | y="348.96619" | ||
367 | id="text4648-4-3" | ||
368 | sodipodi:linespacing="125%"><tspan | ||
369 | sodipodi:role="line" | ||
370 | id="tspan4650-4-5" | ||
371 | x="254.85066" | ||
372 | y="348.96619">QS</tspan></text> | ||
373 | </g> | ||
374 | </svg> | ||
diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg new file mode 100644 index 000000000000..ebcbeee391ed --- /dev/null +++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg | |||
@@ -0,0 +1,237 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> | ||
2 | <!-- Creator: fig2dev Version 3.2 Patchlevel 5d --> | ||
3 | |||
4 | <!-- CreationDate: Tue Mar 4 18:34:25 2014 --> | ||
5 | |||
6 | <!-- Magnification: 3.000 --> | ||
7 | |||
8 | <svg | ||
9 | xmlns:dc="http://purl.org/dc/elements/1.1/" | ||
10 | xmlns:cc="http://creativecommons.org/ns#" | ||
11 | xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" | ||
12 | xmlns:svg="http://www.w3.org/2000/svg" | ||
13 | xmlns="http://www.w3.org/2000/svg" | ||
14 | xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" | ||
15 | xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" | ||
16 | width="1089.1382" | ||
17 | height="668.21368" | ||
18 | viewBox="-2121 -36 14554.634 8876.4061" | ||
19 | id="svg2" | ||
20 | version="1.1" | ||
21 | inkscape:version="0.48.3.1 r9886" | ||
22 | sodipodi:docname="RCUApplicability.svg"> | ||
23 | <metadata | ||
24 | id="metadata40"> | ||
25 | <rdf:RDF> | ||
26 | <cc:Work | ||
27 | rdf:about=""> | ||
28 | <dc:format>image/svg+xml</dc:format> | ||
29 | <dc:type | ||
30 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> | ||
31 | <dc:title /> | ||
32 | </cc:Work> | ||
33 | </rdf:RDF> | ||
34 | </metadata> | ||
35 | <defs | ||
36 | id="defs38" /> | ||
37 | <sodipodi:namedview | ||
38 | pagecolor="#ffffff" | ||
39 | bordercolor="#666666" | ||
40 | borderopacity="1" | ||
41 | objecttolerance="10" | ||
42 | gridtolerance="10" | ||
43 | guidetolerance="10" | ||
44 | inkscape:pageopacity="0" | ||
45 | inkscape:pageshadow="2" | ||
46 | inkscape:window-width="849" | ||
47 | inkscape:window-height="639" | ||
48 | id="namedview36" | ||
49 | showgrid="false" | ||
50 | inkscape:zoom="0.51326165" | ||
51 | inkscape:cx="544.56912" | ||
52 | inkscape:cy="334.10686" | ||
53 | inkscape:window-x="149" | ||
54 | inkscape:window-y="448" | ||
55 | inkscape:window-maximized="0" | ||
56 | inkscape:current-layer="g4" | ||
57 | fit-margin-top="5" | ||
58 | fit-margin-left="5" | ||
59 | fit-margin-right="5" | ||
60 | fit-margin-bottom="5" /> | ||
61 | <g | ||
62 | style="fill:none;stroke-width:0.025in" | ||
63 | id="g4" | ||
64 | transform="translate(-2043.6828,14.791398)"> | ||
65 | <!-- Line: box --> | ||
66 | <rect | ||
67 | x="0" | ||
68 | y="0" | ||
69 | width="14400" | ||
70 | height="8775" | ||
71 | rx="0" | ||
72 | style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" | ||
73 | id="rect6" /> | ||
74 | <!-- Line: box --> | ||
75 | <rect | ||
76 | x="1350" | ||
77 | y="0" | ||
78 | width="11700" | ||
79 | height="6075" | ||
80 | rx="0" | ||
81 | style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" | ||
82 | id="rect8" /> | ||
83 | <!-- Line: box --> | ||
84 | <rect | ||
85 | x="2700" | ||
86 | y="0" | ||
87 | width="9000" | ||
88 | height="4275" | ||
89 | rx="0" | ||
90 | style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" | ||
91 | id="rect10" /> | ||
92 | <!-- Line: box --> | ||
93 | <rect | ||
94 | x="4050" | ||
95 | y="0" | ||
96 | width="6300" | ||
97 | height="2475" | ||
98 | rx="0" | ||
99 | style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" | ||
100 | id="rect12" /> | ||
101 | <!-- Text --> | ||
102 | <text | ||
103 | xml:space="preserve" | ||
104 | x="7200" | ||
105 | y="900" | ||
106 | font-style="normal" | ||
107 | font-weight="normal" | ||
108 | font-size="324" | ||
109 | id="text14" | ||
110 | sodipodi:linespacing="125%" | ||
111 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
112 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
113 | id="tspan3017">Read-Mostly, Stale &</tspan></text> | ||
114 | <!-- Text --> | ||
115 | <text | ||
116 | xml:space="preserve" | ||
117 | x="7200" | ||
118 | y="1350" | ||
119 | font-style="normal" | ||
120 | font-weight="normal" | ||
121 | font-size="324" | ||
122 | id="text16" | ||
123 | sodipodi:linespacing="125%" | ||
124 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
125 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
126 | id="tspan3019">Inconsistent Data OK</tspan></text> | ||
127 | <!-- Text --> | ||
128 | <text | ||
129 | xml:space="preserve" | ||
130 | x="7200" | ||
131 | y="1800" | ||
132 | font-style="normal" | ||
133 | font-weight="normal" | ||
134 | font-size="324" | ||
135 | id="text18" | ||
136 | sodipodi:linespacing="125%" | ||
137 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
138 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
139 | id="tspan3021">(RCU Works Great!!!)</tspan></text> | ||
140 | <!-- Text --> | ||
141 | <text | ||
142 | xml:space="preserve" | ||
143 | x="7200" | ||
144 | y="3825" | ||
145 | font-style="normal" | ||
146 | font-weight="normal" | ||
147 | font-size="324" | ||
148 | id="text20" | ||
149 | sodipodi:linespacing="125%" | ||
150 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
151 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
152 | id="tspan3023">(RCU Works Well)</tspan></text> | ||
153 | <!-- Text --> | ||
154 | <text | ||
155 | xml:space="preserve" | ||
156 | x="7200" | ||
157 | y="3375" | ||
158 | font-style="normal" | ||
159 | font-weight="normal" | ||
160 | font-size="324" | ||
161 | id="text22" | ||
162 | sodipodi:linespacing="125%" | ||
163 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
164 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
165 | id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text> | ||
166 | <!-- Text --> | ||
167 | <text | ||
168 | xml:space="preserve" | ||
169 | x="7200" | ||
170 | y="5175" | ||
171 | font-style="normal" | ||
172 | font-weight="normal" | ||
173 | font-size="324" | ||
174 | id="text24" | ||
175 | sodipodi:linespacing="125%" | ||
176 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
177 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
178 | id="tspan3027">Read-Write, Need Consistent Data</tspan></text> | ||
179 | <!-- Text --> | ||
180 | <text | ||
181 | xml:space="preserve" | ||
182 | x="7200" | ||
183 | y="6975" | ||
184 | font-style="normal" | ||
185 | font-weight="normal" | ||
186 | font-size="324" | ||
187 | id="text26" | ||
188 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
189 | sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text> | ||
190 | <!-- Text --> | ||
191 | <text | ||
192 | xml:space="preserve" | ||
193 | x="7200" | ||
194 | y="5625" | ||
195 | font-style="normal" | ||
196 | font-weight="normal" | ||
197 | font-size="324" | ||
198 | id="text28" | ||
199 | sodipodi:linespacing="125%" | ||
200 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan | ||
201 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
202 | id="tspan3029">(RCU Might Be OK...)</tspan></text> | ||
203 | <!-- Text --> | ||
204 | <text | ||
205 | xml:space="preserve" | ||
206 | x="7200" | ||
207 | y="7875" | ||
208 | font-style="normal" | ||
209 | font-weight="normal" | ||
210 | font-size="324" | ||
211 | id="text30" | ||
212 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
213 | sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text> | ||
214 | <!-- Text --> | ||
215 | <text | ||
216 | xml:space="preserve" | ||
217 | x="7200" | ||
218 | y="8325" | ||
219 | font-style="normal" | ||
220 | font-weight="normal" | ||
221 | font-size="324" | ||
222 | id="text32" | ||
223 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
224 | sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text> | ||
225 | <!-- Text --> | ||
226 | <text | ||
227 | xml:space="preserve" | ||
228 | x="7200" | ||
229 | y="7425" | ||
230 | font-style="normal" | ||
231 | font-weight="normal" | ||
232 | font-size="324" | ||
233 | id="text34" | ||
234 | style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" | ||
235 | sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text> | ||
236 | </g> | ||
237 | </svg> | ||
diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg new file mode 100644 index 000000000000..48cd1623d4d4 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg | |||
@@ -0,0 +1,639 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> | ||
2 | <!-- Created with Inkscape (http://www.inkscape.org/) --> | ||
3 | |||
4 | <svg | ||
5 | xmlns:dc="http://purl.org/dc/elements/1.1/" | ||
6 | xmlns:cc="http://creativecommons.org/ns#" | ||
7 | xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" | ||
8 | xmlns:svg="http://www.w3.org/2000/svg" | ||
9 | xmlns="http://www.w3.org/2000/svg" | ||
10 | xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" | ||
11 | xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" | ||
12 | width="735.25" | ||
13 | height="516.21875" | ||
14 | id="svg2" | ||
15 | version="1.1" | ||
16 | inkscape:version="0.48.3.1 r9886" | ||
17 | sodipodi:docname="ReadersPartitionGP1.svg"> | ||
18 | <defs | ||
19 | id="defs4"> | ||
20 | <marker | ||
21 | inkscape:stockid="Arrow2Lend" | ||
22 | orient="auto" | ||
23 | refY="0" | ||
24 | refX="0" | ||
25 | id="Arrow2Lend" | ||
26 | style="overflow:visible"> | ||
27 | <path | ||
28 | id="path3792" | ||
29 | style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" | ||
30 | d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" | ||
31 | transform="matrix(-1.1,0,0,-1.1,-1.1,0)" | ||
32 | inkscape:connector-curvature="0" /> | ||
33 | </marker> | ||
34 | <marker | ||
35 | inkscape:stockid="Arrow2Lstart" | ||
36 | orient="auto" | ||
37 | refY="0" | ||
38 | refX="0" | ||
39 | id="Arrow2Lstart" | ||
40 | style="overflow:visible"> | ||
41 | <path | ||
42 | id="path3789" | ||
43 | style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" | ||
44 | d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" | ||
45 | transform="matrix(1.1,0,0,1.1,1.1,0)" | ||
46 | inkscape:connector-curvature="0" /> | ||
47 | </marker> | ||
48 | <marker | ||
49 | inkscape:stockid="Arrow2Lstart" | ||
50 | orient="auto" | ||
51 | refY="0" | ||
52 | refX="0" | ||
53 | id="Arrow2Lstart-4" | ||
54 | style="overflow:visible"> | ||
55 | <path | ||
56 | id="path3789-9" | ||
57 | style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" | ||
58 | d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" | ||
59 | transform="matrix(1.1,0,0,1.1,1.1,0)" | ||
60 | inkscape:connector-curvature="0" /> | ||
61 | </marker> | ||
62 | <marker | ||
63 | inkscape:stockid="Arrow2Lend" | ||
64 | orient="auto" | ||
65 | refY="0" | ||
66 | refX="0" | ||
67 | id="Arrow2Lend-4" | ||
68 | style="overflow:visible"> | ||
69 | <path | ||
70 | id="path3792-4" | ||
71 | style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" | ||
72 | d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" | ||
73 | transform="matrix(-1.1,0,0,-1.1,-1.1,0)" | ||
74 | inkscape:connector-curvature="0" /> | ||
75 | </marker> | ||
76 | </defs> | ||
77 | <sodipodi:namedview | ||
78 | id="base" | ||
79 | pagecolor="#ffffff" | ||
80 | bordercolor="#666666" | ||
81 | borderopacity="1.0" | ||
82 | inkscape:pageopacity="0.0" | ||
83 | inkscape:pageshadow="2" | ||
84 | inkscape:zoom="1.3670394" | ||
85 | inkscape:cx="367.26465" | ||
86 | inkscape:cy="258.46182" | ||
87 | inkscape:document-units="px" | ||
88 | inkscape:current-layer="g4433-6" | ||
89 | showgrid="false" | ||
90 | inkscape:window-width="1351" | ||
91 | inkscape:window-height="836" | ||
92 | inkscape:window-x="438" | ||
93 | inkscape:window-y="335" | ||
94 | inkscape:window-maximized="0" | ||
95 | fit-margin-top="5" | ||
96 | fit-margin-left="5" | ||
97 | fit-margin-right="5" | ||
98 | fit-margin-bottom="5" /> | ||
99 | <metadata | ||
100 | id="metadata7"> | ||
101 | <rdf:RDF> | ||
102 | <cc:Work | ||
103 | rdf:about=""> | ||
104 | <dc:format>image/svg+xml</dc:format> | ||
105 | <dc:type | ||
106 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> | ||
107 | <dc:title /> | ||
108 | </cc:Work> | ||
109 | </rdf:RDF> | ||
110 | </metadata> | ||
111 | <g | ||
112 | inkscape:label="Layer 1" | ||
113 | inkscape:groupmode="layer" | ||
114 | id="layer1" | ||
115 | transform="translate(-29.15625,-185.59375)"> | ||
116 | <flowRoot | ||
117 | xml:space="preserve" | ||
118 | id="flowRoot2985" | ||
119 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion | ||
120 | id="flowRegion2987"><rect | ||
121 | id="rect2989" | ||
122 | width="82.85714" | ||
123 | height="11.428572" | ||
124 | x="240" | ||
125 | y="492.36218" /></flowRegion><flowPara | ||
126 | id="flowPara2991" /></flowRoot> <g | ||
127 | id="g4433" | ||
128 | transform="translate(2,-12)"> | ||
129 | <text | ||
130 | sodipodi:linespacing="125%" | ||
131 | id="text2993" | ||
132 | y="-261.66608" | ||
133 | x="436.12299" | ||
134 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
135 | xml:space="preserve" | ||
136 | transform="matrix(0,1,-1,0,0,0)"><tspan | ||
137 | y="-261.66608" | ||
138 | x="436.12299" | ||
139 | id="tspan2995" | ||
140 | sodipodi:role="line">synchronize_rcu()</tspan></text> | ||
141 | <g | ||
142 | id="g4417" | ||
143 | transform="matrix(0,1,-1,0,730.90257,222.4928)"> | ||
144 | <path | ||
145 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" | ||
146 | d="M 97.580736,477.4048 327.57913,476.09759" | ||
147 | id="path2997" | ||
148 | inkscape:connector-curvature="0" | ||
149 | sodipodi:nodetypes="cc" /> | ||
150 | <path | ||
151 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
152 | d="m 96.752718,465.38398 0,22.62742" | ||
153 | id="path4397" | ||
154 | inkscape:connector-curvature="0" | ||
155 | sodipodi:nodetypes="cc" /> | ||
156 | <path | ||
157 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
158 | d="m 328.40703,465.38397 0,22.62742" | ||
159 | id="path4397-5" | ||
160 | inkscape:connector-curvature="0" | ||
161 | sodipodi:nodetypes="cc" /> | ||
162 | </g> | ||
163 | </g> | ||
164 | <text | ||
165 | xml:space="preserve" | ||
166 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
167 | x="112.04738" | ||
168 | y="268.18076" | ||
169 | id="text4429" | ||
170 | sodipodi:linespacing="125%"><tspan | ||
171 | sodipodi:role="line" | ||
172 | id="tspan4431" | ||
173 | x="112.04738" | ||
174 | y="268.18076">WRITE_ONCE(a, 1);</tspan></text> | ||
175 | <text | ||
176 | xml:space="preserve" | ||
177 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
178 | x="112.04738" | ||
179 | y="487.13766" | ||
180 | id="text4441" | ||
181 | sodipodi:linespacing="125%"><tspan | ||
182 | sodipodi:role="line" | ||
183 | id="tspan4443" | ||
184 | x="112.04738" | ||
185 | y="487.13766">WRITE_ONCE(b, 1);</tspan></text> | ||
186 | <text | ||
187 | xml:space="preserve" | ||
188 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
189 | x="255.60869" | ||
190 | y="297.29346" | ||
191 | id="text4445" | ||
192 | sodipodi:linespacing="125%"><tspan | ||
193 | sodipodi:role="line" | ||
194 | id="tspan4447" | ||
195 | x="255.60869" | ||
196 | y="297.29346">r1 = READ_ONCE(a);</tspan></text> | ||
197 | <text | ||
198 | xml:space="preserve" | ||
199 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
200 | x="255.14423" | ||
201 | y="554.61786" | ||
202 | id="text4449" | ||
203 | sodipodi:linespacing="125%"><tspan | ||
204 | sodipodi:role="line" | ||
205 | id="tspan4451" | ||
206 | x="255.14423" | ||
207 | y="554.61786">WRITE_ONCE(c, 1);</tspan></text> | ||
208 | <text | ||
209 | xml:space="preserve" | ||
210 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
211 | x="396.10254" | ||
212 | y="370.71124" | ||
213 | id="text4453" | ||
214 | sodipodi:linespacing="125%"><tspan | ||
215 | sodipodi:role="line" | ||
216 | id="tspan4455" | ||
217 | x="396.10254" | ||
218 | y="370.71124">WRITE_ONCE(d, 1);</tspan></text> | ||
219 | <text | ||
220 | xml:space="preserve" | ||
221 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
222 | x="396.10254" | ||
223 | y="572.13617" | ||
224 | id="text4457" | ||
225 | sodipodi:linespacing="125%"><tspan | ||
226 | sodipodi:role="line" | ||
227 | id="tspan4459" | ||
228 | x="396.10254" | ||
229 | y="572.13617">r2 = READ_ONCE(c);</tspan></text> | ||
230 | <text | ||
231 | xml:space="preserve" | ||
232 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
233 | x="112.08231" | ||
234 | y="213.91006" | ||
235 | id="text4461" | ||
236 | sodipodi:linespacing="125%"><tspan | ||
237 | sodipodi:role="line" | ||
238 | id="tspan4463" | ||
239 | x="112.08231" | ||
240 | y="213.91006">thread0()</tspan></text> | ||
241 | <text | ||
242 | xml:space="preserve" | ||
243 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
244 | x="252.34512" | ||
245 | y="213.91006" | ||
246 | id="text4461-6" | ||
247 | sodipodi:linespacing="125%"><tspan | ||
248 | sodipodi:role="line" | ||
249 | id="tspan4463-0" | ||
250 | x="252.34512" | ||
251 | y="213.91006">thread1()</tspan></text> | ||
252 | <text | ||
253 | xml:space="preserve" | ||
254 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
255 | x="396.42557" | ||
256 | y="213.91006" | ||
257 | id="text4461-2" | ||
258 | sodipodi:linespacing="125%"><tspan | ||
259 | sodipodi:role="line" | ||
260 | id="tspan4463-2" | ||
261 | x="396.42557" | ||
262 | y="213.91006">thread2()</tspan></text> | ||
263 | <rect | ||
264 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
265 | id="rect4495" | ||
266 | width="724.25244" | ||
267 | height="505.21201" | ||
268 | x="34.648232" | ||
269 | y="191.10612" /> | ||
270 | <path | ||
271 | style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
272 | d="m 183.14066,191.10612 0,504.24243" | ||
273 | id="path4497" | ||
274 | inkscape:connector-curvature="0" | ||
275 | sodipodi:nodetypes="cc" /> | ||
276 | <path | ||
277 | style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
278 | d="m 325.13867,191.10612 0,504.24243" | ||
279 | id="path4497-5" | ||
280 | inkscape:connector-curvature="0" | ||
281 | sodipodi:nodetypes="cc" /> | ||
282 | <text | ||
283 | xml:space="preserve" | ||
284 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
285 | x="111.75929" | ||
286 | y="251.53981" | ||
287 | id="text4429-8" | ||
288 | sodipodi:linespacing="125%"><tspan | ||
289 | sodipodi:role="line" | ||
290 | id="tspan4431-9" | ||
291 | x="111.75929" | ||
292 | y="251.53981">rcu_read_lock();</tspan></text> | ||
293 | <text | ||
294 | xml:space="preserve" | ||
295 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
296 | x="396.10254" | ||
297 | y="353.91556" | ||
298 | id="text4429-8-9" | ||
299 | sodipodi:linespacing="125%"><tspan | ||
300 | sodipodi:role="line" | ||
301 | id="tspan4431-9-4" | ||
302 | x="396.10254" | ||
303 | y="353.91556">rcu_read_lock();</tspan></text> | ||
304 | <text | ||
305 | xml:space="preserve" | ||
306 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
307 | x="396.10254" | ||
308 | y="587.40289" | ||
309 | id="text4429-8-9-3" | ||
310 | sodipodi:linespacing="125%"><tspan | ||
311 | sodipodi:role="line" | ||
312 | id="tspan4431-9-4-4" | ||
313 | x="396.10254" | ||
314 | y="587.40289">rcu_read_unlock();</tspan></text> | ||
315 | <text | ||
316 | xml:space="preserve" | ||
317 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
318 | x="111.75929" | ||
319 | y="501.15311" | ||
320 | id="text4429-8-9-3-1" | ||
321 | sodipodi:linespacing="125%"><tspan | ||
322 | sodipodi:role="line" | ||
323 | id="tspan4431-9-4-4-6" | ||
324 | x="111.75929" | ||
325 | y="501.15311">rcu_read_unlock();</tspan></text> | ||
326 | <path | ||
327 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
328 | d="m 33.941125,227.87568 724.941765,0" | ||
329 | id="path4608" | ||
330 | inkscape:connector-curvature="0" | ||
331 | sodipodi:nodetypes="cc" /> | ||
332 | <text | ||
333 | xml:space="preserve" | ||
334 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
335 | x="394.94427" | ||
336 | y="331.66351" | ||
337 | id="text4648" | ||
338 | sodipodi:linespacing="125%"><tspan | ||
339 | sodipodi:role="line" | ||
340 | id="tspan4650" | ||
341 | x="394.94427" | ||
342 | y="331.66351">QS</tspan></text> | ||
343 | <path | ||
344 | sodipodi:type="arc" | ||
345 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
346 | id="path4652" | ||
347 | sodipodi:cx="358.85669" | ||
348 | sodipodi:cy="142.87541" | ||
349 | sodipodi:rx="10.960155" | ||
350 | sodipodi:ry="10.253048" | ||
351 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
352 | transform="translate(36.441125,185.60612)" | ||
353 | sodipodi:start="4.7135481" | ||
354 | sodipodi:end="10.994651" | ||
355 | sodipodi:open="true" /> | ||
356 | <text | ||
357 | xml:space="preserve" | ||
358 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
359 | x="112.11968" | ||
360 | y="523.77856" | ||
361 | id="text4648-4" | ||
362 | sodipodi:linespacing="125%"><tspan | ||
363 | sodipodi:role="line" | ||
364 | id="tspan4650-4" | ||
365 | x="112.11968" | ||
366 | y="523.77856">QS</tspan></text> | ||
367 | <path | ||
368 | sodipodi:type="arc" | ||
369 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
370 | id="path4652-7" | ||
371 | sodipodi:cx="358.85669" | ||
372 | sodipodi:cy="142.87541" | ||
373 | sodipodi:rx="10.960155" | ||
374 | sodipodi:ry="10.253048" | ||
375 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
376 | transform="translate(-246.38346,377.72117)" | ||
377 | sodipodi:start="4.7135481" | ||
378 | sodipodi:end="10.994651" | ||
379 | sodipodi:open="true" /> | ||
380 | <path | ||
381 | sodipodi:type="arc" | ||
382 | style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
383 | id="path4652-7-7" | ||
384 | sodipodi:cx="358.85669" | ||
385 | sodipodi:cy="142.87541" | ||
386 | sodipodi:rx="10.960155" | ||
387 | sodipodi:ry="10.253048" | ||
388 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
389 | transform="translate(-103.65246,190.90878)" | ||
390 | sodipodi:start="4.7135481" | ||
391 | sodipodi:end="10.994651" | ||
392 | sodipodi:open="true" /> | ||
393 | <text | ||
394 | xml:space="preserve" | ||
395 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
396 | x="254.85066" | ||
397 | y="336.96619" | ||
398 | id="text4648-4-3" | ||
399 | sodipodi:linespacing="125%"><tspan | ||
400 | sodipodi:role="line" | ||
401 | id="tspan4650-4-5" | ||
402 | x="254.85066" | ||
403 | y="336.96619">QS</tspan></text> | ||
404 | <path | ||
405 | style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
406 | d="m 470.93311,190.39903 0,504.24243" | ||
407 | id="path4497-5-6" | ||
408 | inkscape:connector-curvature="0" | ||
409 | sodipodi:nodetypes="cc" /> | ||
410 | <path | ||
411 | style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
412 | d="m 616.22755,190.38323 0,504.24243" | ||
413 | id="path4497-5-2" | ||
414 | inkscape:connector-curvature="0" | ||
415 | sodipodi:nodetypes="cc" /> | ||
416 | <g | ||
417 | id="g4433-6" | ||
418 | transform="translate(288.0964,78.32827)"> | ||
419 | <text | ||
420 | sodipodi:linespacing="125%" | ||
421 | id="text2993-7" | ||
422 | y="-261.66608" | ||
423 | x="440.12299" | ||
424 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
425 | xml:space="preserve" | ||
426 | transform="matrix(0,1,-1,0,0,0)"><tspan | ||
427 | y="-261.66608" | ||
428 | x="440.12299" | ||
429 | id="tspan2995-1" | ||
430 | sodipodi:role="line">synchronize_rcu()</tspan></text> | ||
431 | <g | ||
432 | id="g4417-1" | ||
433 | transform="matrix(0,1,-1,0,730.90257,222.4928)"> | ||
434 | <path | ||
435 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" | ||
436 | d="M 97.580736,477.4048 328.5624,477.07246" | ||
437 | id="path2997-2" | ||
438 | inkscape:connector-curvature="0" | ||
439 | sodipodi:nodetypes="cc" /> | ||
440 | <path | ||
441 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
442 | d="m 96.752718,465.38398 0,22.62742" | ||
443 | id="path4397-3" | ||
444 | inkscape:connector-curvature="0" | ||
445 | sodipodi:nodetypes="cc" /> | ||
446 | <path | ||
447 | style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" | ||
448 | d="m 329.39039,465.38397 0,22.62742" | ||
449 | id="path4397-5-4" | ||
450 | inkscape:connector-curvature="0" | ||
451 | sodipodi:nodetypes="cc" /> | ||
452 | </g> | ||
453 | </g> | ||
454 | <text | ||
455 | xml:space="preserve" | ||
456 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
457 | x="541.70508" | ||
458 | y="387.6217" | ||
459 | id="text4445-0" | ||
460 | sodipodi:linespacing="125%"><tspan | ||
461 | sodipodi:role="line" | ||
462 | id="tspan4447-5" | ||
463 | x="541.70508" | ||
464 | y="387.6217">r3 = READ_ONCE(d);</tspan></text> | ||
465 | <text | ||
466 | xml:space="preserve" | ||
467 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
468 | x="541.2406" | ||
469 | y="646.94611" | ||
470 | id="text4449-6" | ||
471 | sodipodi:linespacing="125%"><tspan | ||
472 | sodipodi:role="line" | ||
473 | id="tspan4451-6" | ||
474 | x="541.2406" | ||
475 | y="646.94611">WRITE_ONCE(e, 1);</tspan></text> | ||
476 | <path | ||
477 | sodipodi:type="arc" | ||
478 | style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
479 | id="path4652-7-7-5" | ||
480 | sodipodi:cx="358.85669" | ||
481 | sodipodi:cy="142.87541" | ||
482 | sodipodi:rx="10.960155" | ||
483 | sodipodi:ry="10.253048" | ||
484 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
485 | transform="translate(182.44393,281.23704)" | ||
486 | sodipodi:start="4.7135481" | ||
487 | sodipodi:end="10.994651" | ||
488 | sodipodi:open="true" /> | ||
489 | <text | ||
490 | xml:space="preserve" | ||
491 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
492 | x="540.94702" | ||
493 | y="427.29443" | ||
494 | id="text4648-4-3-1" | ||
495 | sodipodi:linespacing="125%"><tspan | ||
496 | sodipodi:role="line" | ||
497 | id="tspan4650-4-5-7" | ||
498 | x="540.94702" | ||
499 | y="427.29443">QS</tspan></text> | ||
500 | <text | ||
501 | xml:space="preserve" | ||
502 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
503 | x="686.27747" | ||
504 | y="461.83929" | ||
505 | id="text4453-7" | ||
506 | sodipodi:linespacing="125%"><tspan | ||
507 | sodipodi:role="line" | ||
508 | id="tspan4455-1" | ||
509 | x="686.27747" | ||
510 | y="461.83929">r4 = READ_ONCE(b);</tspan></text> | ||
511 | <text | ||
512 | xml:space="preserve" | ||
513 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
514 | x="686.27747" | ||
515 | y="669.26422" | ||
516 | id="text4457-9" | ||
517 | sodipodi:linespacing="125%"><tspan | ||
518 | sodipodi:role="line" | ||
519 | id="tspan4459-2" | ||
520 | x="686.27747" | ||
521 | y="669.26422">r5 = READ_ONCE(e);</tspan></text> | ||
522 | <text | ||
523 | xml:space="preserve" | ||
524 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
525 | x="686.27747" | ||
526 | y="445.04358" | ||
527 | id="text4429-8-9-33" | ||
528 | sodipodi:linespacing="125%"><tspan | ||
529 | sodipodi:role="line" | ||
530 | id="tspan4431-9-4-2" | ||
531 | x="686.27747" | ||
532 | y="445.04358">rcu_read_lock();</tspan></text> | ||
533 | <text | ||
534 | xml:space="preserve" | ||
535 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
536 | x="686.27747" | ||
537 | y="684.53094" | ||
538 | id="text4429-8-9-3-8" | ||
539 | sodipodi:linespacing="125%"><tspan | ||
540 | sodipodi:role="line" | ||
541 | id="tspan4431-9-4-4-5" | ||
542 | x="686.27747" | ||
543 | y="684.53094">rcu_read_unlock();</tspan></text> | ||
544 | <text | ||
545 | xml:space="preserve" | ||
546 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
547 | x="685.11914" | ||
548 | y="422.79153" | ||
549 | id="text4648-9" | ||
550 | sodipodi:linespacing="125%"><tspan | ||
551 | sodipodi:role="line" | ||
552 | id="tspan4650-7" | ||
553 | x="685.11914" | ||
554 | y="422.79153">QS</tspan></text> | ||
555 | <path | ||
556 | sodipodi:type="arc" | ||
557 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
558 | id="path4652-8" | ||
559 | sodipodi:cx="358.85669" | ||
560 | sodipodi:cy="142.87541" | ||
561 | sodipodi:rx="10.960155" | ||
562 | sodipodi:ry="10.253048" | ||
563 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
564 | transform="translate(326.61602,276.73415)" | ||
565 | sodipodi:start="4.7135481" | ||
566 | sodipodi:end="10.994651" | ||
567 | sodipodi:open="true" /> | ||
568 | <text | ||
569 | xml:space="preserve" | ||
570 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
571 | x="397.85934" | ||
572 | y="609.59003" | ||
573 | id="text4648-5" | ||
574 | sodipodi:linespacing="125%"><tspan | ||
575 | sodipodi:role="line" | ||
576 | id="tspan4650-77" | ||
577 | x="397.85934" | ||
578 | y="609.59003">QS</tspan></text> | ||
579 | <path | ||
580 | sodipodi:type="arc" | ||
581 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
582 | id="path4652-80" | ||
583 | sodipodi:cx="358.85669" | ||
584 | sodipodi:cy="142.87541" | ||
585 | sodipodi:rx="10.960155" | ||
586 | sodipodi:ry="10.253048" | ||
587 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
588 | transform="translate(39.356201,463.53264)" | ||
589 | sodipodi:start="4.7135481" | ||
590 | sodipodi:end="10.994651" | ||
591 | sodipodi:open="true" /> | ||
592 | <text | ||
593 | xml:space="preserve" | ||
594 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
595 | x="256.75986" | ||
596 | y="586.99133" | ||
597 | id="text4648-5-2" | ||
598 | sodipodi:linespacing="125%"><tspan | ||
599 | sodipodi:role="line" | ||
600 | id="tspan4650-77-7" | ||
601 | x="256.75986" | ||
602 | y="586.99133">QS</tspan></text> | ||
603 | <path | ||
604 | sodipodi:type="arc" | ||
605 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
606 | id="path4652-80-5" | ||
607 | sodipodi:cx="358.85669" | ||
608 | sodipodi:cy="142.87541" | ||
609 | sodipodi:rx="10.960155" | ||
610 | sodipodi:ry="10.253048" | ||
611 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
612 | transform="translate(-101.74328,440.93395)" | ||
613 | sodipodi:start="4.7135481" | ||
614 | sodipodi:end="10.994651" | ||
615 | sodipodi:open="true" /> | ||
616 | <text | ||
617 | xml:space="preserve" | ||
618 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
619 | x="546.22791" | ||
620 | y="213.91006" | ||
621 | id="text4461-2-5" | ||
622 | sodipodi:linespacing="125%"><tspan | ||
623 | sodipodi:role="line" | ||
624 | id="tspan4463-2-6" | ||
625 | x="546.22791" | ||
626 | y="213.91006">thread3()</tspan></text> | ||
627 | <text | ||
628 | xml:space="preserve" | ||
629 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
630 | x="684.00067" | ||
631 | y="213.91006" | ||
632 | id="text4461-2-1" | ||
633 | sodipodi:linespacing="125%"><tspan | ||
634 | sodipodi:role="line" | ||
635 | id="tspan4463-2-0" | ||
636 | x="684.00067" | ||
637 | y="213.91006">thread4()</tspan></text> | ||
638 | </g> | ||
639 | </svg> | ||
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html new file mode 100644 index 000000000000..36de7aaa941e --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.html | |||
@@ -0,0 +1,2799 @@ | |||
1 | <!-- DO NOT HAND EDIT. --> | ||
2 | <!-- Instead, edit Requirements.htmlx and run 'sh htmlqqz.sh Requirements' --> | ||
3 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" | ||
4 | "http://www.w3.org/TR/html4/loose.dtd"> | ||
5 | <html> | ||
6 | <head><title>A Tour Through RCU's Requirements [LWN.net]</title> | ||
7 | <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> | ||
8 | |||
9 | <h1>A Tour Through RCU's Requirements</h1> | ||
10 | |||
11 | <p>Copyright IBM Corporation, 2015</p> | ||
12 | <p>Author: Paul E. McKenney</p> | ||
13 | <p><i>The initial version of this document appeared in the | ||
14 | <a href="https://lwn.net/">LWN</a> articles | ||
15 | <a href="https://lwn.net/Articles/652156/">here</a>, | ||
16 | <a href="https://lwn.net/Articles/652677/">here</a>, and | ||
17 | <a href="https://lwn.net/Articles/653326/">here</a>.</i></p> | ||
18 | |||
19 | <h2>Introduction</h2> | ||
20 | |||
21 | <p> | ||
22 | Read-copy update (RCU) is a synchronization mechanism that is often | ||
23 | used as a replacement for reader-writer locking. | ||
24 | RCU is unusual in that updaters do not block readers, | ||
25 | which means that RCU's read-side primitives can be exceedingly fast | ||
26 | and scalable. | ||
27 | In addition, updaters can make useful forward progress concurrently | ||
28 | with readers. | ||
29 | However, all this concurrency between RCU readers and updaters does raise | ||
30 | the question of exactly what RCU readers are doing, which in turn | ||
31 | raises the question of exactly what RCU's requirements are. | ||
32 | |||
33 | <p> | ||
34 | This document therefore summarizes RCU's requirements, and can be thought | ||
35 | of as an informal, high-level specification for RCU. | ||
36 | It is important to understand that RCU's specification is primarily | ||
37 | empirical in nature; | ||
38 | in fact, I learned about many of these requirements the hard way. | ||
39 | This situation might cause some consternation; however, not only | ||
40 | has this learning process been a lot of fun, but it has also been | ||
41 | a great privilege to work with so many people willing to apply | ||
42 | technologies in interesting new ways. | ||
43 | |||
44 | <p> | ||
45 | All that aside, here are the categories of currently known RCU requirements: | ||
46 | </p> | ||
47 | |||
48 | <ol> | ||
49 | <li> <a href="#Fundamental Requirements"> | ||
50 | Fundamental Requirements</a> | ||
51 | <li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> | ||
52 | <li> <a href="#Parallelism Facts of Life"> | ||
53 | Parallelism Facts of Life</a> | ||
54 | <li> <a href="#Quality-of-Implementation Requirements"> | ||
55 | Quality-of-Implementation Requirements</a> | ||
56 | <li> <a href="#Linux Kernel Complications"> | ||
57 | Linux Kernel Complications</a> | ||
58 | <li> <a href="#Software-Engineering Requirements"> | ||
59 | Software-Engineering Requirements</a> | ||
60 | <li> <a href="#Other RCU Flavors"> | ||
61 | Other RCU Flavors</a> | ||
62 | <li> <a href="#Possible Future Changes"> | ||
63 | Possible Future Changes</a> | ||
64 | </ol> | ||
65 | |||
66 | <p> | ||
67 | This is followed by a <a href="#Summary">summary</a>, | ||
68 | which is in turn followed by the inevitable | ||
69 | <a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. | ||
70 | |||
71 | <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> | ||
72 | |||
73 | <p> | ||
74 | RCU's fundamental requirements are the closest thing RCU has to hard | ||
75 | mathematical requirements. | ||
76 | These are: | ||
77 | |||
78 | <ol> | ||
79 | <li> <a href="#Grace-Period Guarantee"> | ||
80 | Grace-Period Guarantee</a> | ||
81 | <li> <a href="#Publish-Subscribe Guarantee"> | ||
82 | Publish-Subscribe Guarantee</a> | ||
83 | <li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> | ||
84 | RCU Primitives Guaranteed to Execute Unconditionally</a> | ||
85 | <li> <a href="#Guaranteed Read-to-Write Upgrade"> | ||
86 | Guaranteed Read-to-Write Upgrade</a> | ||
87 | </ol> | ||
88 | |||
89 | <h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> | ||
90 | |||
91 | <p> | ||
92 | RCU's grace-period guarantee is unusual in being premeditated: | ||
93 | Jack Slingwine and I had this guarantee firmly in mind when we started | ||
94 | work on RCU (then called “rclock”) in the early 1990s. | ||
95 | That said, the past two decades of experience with RCU have produced | ||
96 | a much more detailed understanding of this guarantee. | ||
97 | |||
98 | <p> | ||
99 | RCU's grace-period guarantee allows updaters to wait for the completion | ||
100 | of all pre-existing RCU read-side critical sections. | ||
101 | An RCU read-side critical section | ||
102 | begins with the marker <tt>rcu_read_lock()</tt> and ends with | ||
103 | the marker <tt>rcu_read_unlock()</tt>. | ||
104 | These markers may be nested, and RCU treats a nested set as one | ||
105 | big RCU read-side critical section. | ||
106 | Production-quality implementations of <tt>rcu_read_lock()</tt> and | ||
107 | <tt>rcu_read_unlock()</tt> are extremely lightweight, and in | ||
108 | fact have exactly zero overhead in Linux kernels built for production | ||
109 | use with <tt>CONFIG_PREEMPT=n</tt>. | ||
110 | |||
111 | <p> | ||
112 | This guarantee allows ordering to be enforced with extremely low | ||
113 | overhead to readers, for example: | ||
114 | |||
115 | <blockquote> | ||
116 | <pre> | ||
117 | 1 int x, y; | ||
118 | 2 | ||
119 | 3 void thread0(void) | ||
120 | 4 { | ||
121 | 5 rcu_read_lock(); | ||
122 | 6 r1 = READ_ONCE(x); | ||
123 | 7 r2 = READ_ONCE(y); | ||
124 | 8 rcu_read_unlock(); | ||
125 | 9 } | ||
126 | 10 | ||
127 | 11 void thread1(void) | ||
128 | 12 { | ||
129 | 13 WRITE_ONCE(x, 1); | ||
130 | 14 synchronize_rcu(); | ||
131 | 15 WRITE_ONCE(y, 1); | ||
132 | 16 } | ||
133 | </pre> | ||
134 | </blockquote> | ||
135 | |||
136 | <p> | ||
137 | Because the <tt>synchronize_rcu()</tt> on line 14 waits for | ||
138 | all pre-existing readers, any instance of <tt>thread0()</tt> that | ||
139 | loads a value of zero from <tt>x</tt> must complete before | ||
140 | <tt>thread1()</tt> stores to <tt>y</tt>, so that instance must | ||
141 | also load a value of zero from <tt>y</tt>. | ||
142 | Similarly, any instance of <tt>thread0()</tt> that loads a value of | ||
143 | one from <tt>y</tt> must have started after the | ||
144 | <tt>synchronize_rcu()</tt> started, and must therefore also load | ||
145 | a value of one from <tt>x</tt>. | ||
146 | Therefore, the outcome: | ||
147 | <blockquote> | ||
148 | <pre> | ||
149 | (r1 == 0 && r2 == 1) | ||
150 | </pre> | ||
151 | </blockquote> | ||
152 | cannot happen. | ||
153 | |||
154 | <p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> | ||
155 | Wait a minute! | ||
156 | You said that updaters can make useful forward progress concurrently | ||
157 | with readers, but pre-existing readers will block | ||
158 | <tt>synchronize_rcu()</tt>!!! | ||
159 | Just who are you trying to fool??? | ||
160 | <br><a href="#qq1answer">Answer</a> | ||
161 | |||
162 | <p> | ||
163 | This scenario resembles one of the first uses of RCU in | ||
164 | <a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, | ||
165 | which managed a distributed lock manager's transition into | ||
166 | a state suitable for handling recovery from node failure, | ||
167 | more or less as follows: | ||
168 | |||
169 | <blockquote> | ||
170 | <pre> | ||
171 | 1 #define STATE_NORMAL 0 | ||
172 | 2 #define STATE_WANT_RECOVERY 1 | ||
173 | 3 #define STATE_RECOVERING 2 | ||
174 | 4 #define STATE_WANT_NORMAL 3 | ||
175 | 5 | ||
176 | 6 int state = STATE_NORMAL; | ||
177 | 7 | ||
178 | 8 void do_something_dlm(void) | ||
179 | 9 { | ||
180 | 10 int state_snap; | ||
181 | 11 | ||
182 | 12 rcu_read_lock(); | ||
183 | 13 state_snap = READ_ONCE(state); | ||
184 | 14 if (state_snap == STATE_NORMAL) | ||
185 | 15 do_something(); | ||
186 | 16 else | ||
187 | 17 do_something_carefully(); | ||
188 | 18 rcu_read_unlock(); | ||
189 | 19 } | ||
190 | 20 | ||
191 | 21 void start_recovery(void) | ||
192 | 22 { | ||
193 | 23 WRITE_ONCE(state, STATE_WANT_RECOVERY); | ||
194 | 24 synchronize_rcu(); | ||
195 | 25 WRITE_ONCE(state, STATE_RECOVERING); | ||
196 | 26 recovery(); | ||
197 | 27 WRITE_ONCE(state, STATE_WANT_NORMAL); | ||
198 | 28 synchronize_rcu(); | ||
199 | 29 WRITE_ONCE(state, STATE_NORMAL); | ||
200 | 30 } | ||
201 | </pre> | ||
202 | </blockquote> | ||
203 | |||
204 | <p> | ||
205 | The RCU read-side critical section in <tt>do_something_dlm()</tt> | ||
206 | works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> | ||
207 | to guarantee that <tt>do_something()</tt> never runs concurrently | ||
208 | with <tt>recovery()</tt>, but with little or no synchronization | ||
209 | overhead in <tt>do_something_dlm()</tt>. | ||
210 | |||
211 | <p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> | ||
212 | Why is the <tt>synchronize_rcu()</tt> on line 28 needed? | ||
213 | <br><a href="#qq2answer">Answer</a> | ||
214 | |||
215 | <p> | ||
216 | In order to avoid fatal problems such as deadlocks, | ||
217 | an RCU read-side critical section must not contain calls to | ||
218 | <tt>synchronize_rcu()</tt>. | ||
219 | Similarly, an RCU read-side critical section must not | ||
220 | contain anything that waits, directly or indirectly, on completion of | ||
221 | an invocation of <tt>synchronize_rcu()</tt>. | ||
222 | |||
223 | <p> | ||
224 | Although RCU's grace-period guarantee is useful in and of itself, with | ||
225 | <a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, | ||
226 | it would be good to be able to use RCU to coordinate read-side | ||
227 | access to linked data structures. | ||
228 | For this, the grace-period guarantee is not sufficient, as can | ||
229 | be seen in function <tt>add_gp_buggy()</tt> below. | ||
230 | We will look at the reader's code later, but in the meantime, just think of | ||
231 | the reader as locklessly picking up the <tt>gp</tt> pointer, | ||
232 | and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the | ||
233 | <tt>->a</tt> and <tt>->b</tt> fields. | ||
234 | |||
235 | <blockquote> | ||
236 | <pre> | ||
237 | 1 bool add_gp_buggy(int a, int b) | ||
238 | 2 { | ||
239 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
240 | 4 if (!p) | ||
241 | 5 return -ENOMEM; | ||
242 | 6 spin_lock(&gp_lock); | ||
243 | 7 if (rcu_access_pointer(gp)) { | ||
244 | 8 spin_unlock(&gp_lock); | ||
245 | 9 return false; | ||
246 | 10 } | ||
247 | 11 p->a = a; | ||
248 | 12 p->b = b; | ||
249 | 13 gp = p; /* ORDERING BUG */ | ||
250 | 14 spin_unlock(&gp_lock); | ||
251 | 15 return true; | ||
252 | 16 } | ||
253 | </pre> | ||
254 | </blockquote> | ||
255 | |||
256 | <p> | ||
257 | The problem is that both the compiler and weakly ordered CPUs are within | ||
258 | their rights to reorder this code as follows: | ||
259 | |||
260 | <blockquote> | ||
261 | <pre> | ||
262 | 1 bool add_gp_buggy_optimized(int a, int b) | ||
263 | 2 { | ||
264 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
265 | 4 if (!p) | ||
266 | 5 return -ENOMEM; | ||
267 | 6 spin_lock(&gp_lock); | ||
268 | 7 if (rcu_access_pointer(gp)) { | ||
269 | 8 spin_unlock(&gp_lock); | ||
270 | 9 return false; | ||
271 | 10 } | ||
272 | <b>11 gp = p; /* ORDERING BUG */ | ||
273 | 12 p->a = a; | ||
274 | 13 p->b = b;</b> | ||
275 | 14 spin_unlock(&gp_lock); | ||
276 | 15 return true; | ||
277 | 16 } | ||
278 | </pre> | ||
279 | </blockquote> | ||
280 | |||
281 | <p> | ||
282 | If an RCU reader fetches <tt>gp</tt> just after | ||
283 | <tt>add_gp_buggy_optimized</tt> executes line 11, | ||
284 | it will see garbage in the <tt>->a</tt> and <tt>->b</tt> | ||
285 | fields. | ||
286 | And this is but one of many ways in which compiler and hardware optimizations | ||
287 | could cause trouble. | ||
288 | Therefore, we clearly need some way to prevent the compiler and the CPU from | ||
289 | reordering in this manner, which brings us to the publish-subscribe | ||
290 | guarantee discussed in the next section. | ||
291 | |||
292 | <h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> | ||
293 | |||
294 | <p> | ||
295 | RCU's publish-subscribe guarantee allows data to be inserted | ||
296 | into a linked data structure without disrupting RCU readers. | ||
297 | The updater uses <tt>rcu_assign_pointer()</tt> to insert the | ||
298 | new data, and readers use <tt>rcu_dereference()</tt> to | ||
299 | access data, whether new or old. | ||
300 | The following shows an example of insertion: | ||
301 | |||
302 | <blockquote> | ||
303 | <pre> | ||
304 | 1 bool add_gp(int a, int b) | ||
305 | 2 { | ||
306 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
307 | 4 if (!p) | ||
308 | 5 return -ENOMEM; | ||
309 | 6 spin_lock(&gp_lock); | ||
310 | 7 if (rcu_access_pointer(gp)) { | ||
311 | 8 spin_unlock(&gp_lock); | ||
312 | 9 return false; | ||
313 | 10 } | ||
314 | 11 p->a = a; | ||
315 | 12 p->b = b; | ||
316 | 13 rcu_assign_pointer(gp, p); | ||
317 | 14 spin_unlock(&gp_lock); | ||
318 | 15 return true; | ||
319 | 16 } | ||
320 | </pre> | ||
321 | </blockquote> | ||
322 | |||
323 | <p> | ||
324 | The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually | ||
325 | equivalent to a simple assignment statement, but also guarantees | ||
326 | that its assignment will | ||
327 | happen after the two assignments in lines 11 and 12, | ||
328 | similar to the C11 <tt>memory_order_release</tt> store operation. | ||
329 | It also prevents any number of “interesting” compiler | ||
330 | optimizations, for example, the use of <tt>gp</tt> as a scratch | ||
331 | location immediately preceding the assignment. | ||
332 | |||
333 | <p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> | ||
334 | But <tt>rcu_assign_pointer()</tt> does nothing to prevent the | ||
335 | two assignments to <tt>p->a</tt> and <tt>p->b</tt> | ||
336 | from being reordered. | ||
337 | Can't that also cause problems? | ||
338 | <br><a href="#qq3answer">Answer</a> | ||
339 | |||
340 | <p> | ||
341 | It is tempting to assume that the reader need not do anything special | ||
342 | to control its accesses to the RCU-protected data, | ||
343 | as shown in <tt>do_something_gp_buggy()</tt> below: | ||
344 | |||
345 | <blockquote> | ||
346 | <pre> | ||
347 | 1 bool do_something_gp_buggy(void) | ||
348 | 2 { | ||
349 | 3 rcu_read_lock(); | ||
350 | 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ | ||
351 | 5 if (p) { | ||
352 | 6 do_something(p->a, p->b); | ||
353 | 7 rcu_read_unlock(); | ||
354 | 8 return true; | ||
355 | 9 } | ||
356 | 10 rcu_read_unlock(); | ||
357 | 11 return false; | ||
358 | 12 } | ||
359 | </pre> | ||
360 | </blockquote> | ||
361 | |||
362 | <p> | ||
363 | However, this temptation must be resisted because there are a | ||
364 | surprisingly large number of ways that the compiler | ||
365 | (to say nothing of | ||
366 | <a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) | ||
367 | can trip this code up. | ||
368 | For but one example, if the compiler were short of registers, it | ||
369 | might choose to refetch from <tt>gp</tt> rather than keeping | ||
370 | a separate copy in <tt>p</tt> as follows: | ||
371 | |||
372 | <blockquote> | ||
373 | <pre> | ||
374 | 1 bool do_something_gp_buggy_optimized(void) | ||
375 | 2 { | ||
376 | 3 rcu_read_lock(); | ||
377 | 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ | ||
378 | <b> 5 do_something(gp->a, gp->b);</b> | ||
379 | 6 rcu_read_unlock(); | ||
380 | 7 return true; | ||
381 | 8 } | ||
382 | 9 rcu_read_unlock(); | ||
383 | 10 return false; | ||
384 | 11 } | ||
385 | </pre> | ||
386 | </blockquote> | ||
387 | |||
388 | <p> | ||
389 | If this function ran concurrently with a series of updates that | ||
390 | replaced the current structure with a new one, | ||
391 | the fetches of <tt>gp->a</tt> | ||
392 | and <tt>gp->b</tt> might well come from two different structures, | ||
393 | which could cause serious confusion. | ||
394 | To prevent this (and much else besides), <tt>do_something_gp()</tt> uses | ||
395 | <tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: | ||
396 | |||
397 | <blockquote> | ||
398 | <pre> | ||
399 | 1 bool do_something_gp(void) | ||
400 | 2 { | ||
401 | 3 rcu_read_lock(); | ||
402 | 4 p = rcu_dereference(gp); | ||
403 | 5 if (p) { | ||
404 | 6 do_something(p->a, p->b); | ||
405 | 7 rcu_read_unlock(); | ||
406 | 8 return true; | ||
407 | 9 } | ||
408 | 10 rcu_read_unlock(); | ||
409 | 11 return false; | ||
410 | 12 } | ||
411 | </pre> | ||
412 | </blockquote> | ||
413 | |||
414 | <p> | ||
415 | The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) | ||
416 | memory barriers in the Linux kernel. | ||
417 | Should a | ||
418 | <a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> | ||
419 | ever appear, then <tt>rcu_dereference()</tt> could be implemented | ||
420 | as a <tt>memory_order_consume</tt> load. | ||
421 | Regardless of the exact implementation, a pointer fetched by | ||
422 | <tt>rcu_dereference()</tt> may not be used outside of the | ||
423 | outermost RCU read-side critical section containing that | ||
424 | <tt>rcu_dereference()</tt>, unless protection of | ||
425 | the corresponding data element has been passed from RCU to some | ||
426 | other synchronization mechanism, most commonly locking or | ||
427 | <a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. | ||
428 | |||
429 | <p> | ||
430 | In short, updaters use <tt>rcu_assign_pointer()</tt> and readers | ||
431 | use <tt>rcu_dereference()</tt>, and these two RCU API elements | ||
432 | work together to ensure that readers have a consistent view of | ||
433 | newly added data elements. | ||
434 | |||
435 | <p> | ||
436 | Of course, it is also necessary to remove elements from RCU-protected | ||
437 | data structures, for example, using the following process: | ||
438 | |||
439 | <ol> | ||
440 | <li> Remove the data element from the enclosing structure. | ||
441 | <li> Wait for all pre-existing RCU read-side critical sections | ||
442 | to complete (because only pre-existing readers can possibly have | ||
443 | a reference to the newly removed data element). | ||
444 | <li> At this point, only the updater has a reference to the | ||
445 | newly removed data element, so it can safely reclaim | ||
446 | the data element, for example, by passing it to <tt>kfree()</tt>. | ||
447 | </ol> | ||
448 | |||
449 | This process is implemented by <tt>remove_gp_synchronous()</tt>: | ||
450 | |||
451 | <blockquote> | ||
452 | <pre> | ||
453 | 1 bool remove_gp_synchronous(void) | ||
454 | 2 { | ||
455 | 3 struct foo *p; | ||
456 | 4 | ||
457 | 5 spin_lock(&gp_lock); | ||
458 | 6 p = rcu_access_pointer(gp); | ||
459 | 7 if (!p) { | ||
460 | 8 spin_unlock(&gp_lock); | ||
461 | 9 return false; | ||
462 | 10 } | ||
463 | 11 rcu_assign_pointer(gp, NULL); | ||
464 | 12 spin_unlock(&gp_lock); | ||
465 | 13 synchronize_rcu(); | ||
466 | 14 kfree(p); | ||
467 | 15 return true; | ||
468 | 16 } | ||
469 | </pre> | ||
470 | </blockquote> | ||
471 | |||
472 | <p> | ||
473 | This function is straightforward, with line 13 waiting for a grace | ||
474 | period before line 14 frees the old data element. | ||
475 | This waiting ensures that readers will reach line 7 of | ||
476 | <tt>do_something_gp()</tt> before the data element referenced by | ||
477 | <tt>p</tt> is freed. | ||
478 | The <tt>rcu_access_pointer()</tt> on line 6 is similar to | ||
479 | <tt>rcu_dereference()</tt>, except that: | ||
480 | |||
481 | <ol> | ||
482 | <li> The value returned by <tt>rcu_access_pointer()</tt> | ||
483 | cannot be dereferenced. | ||
484 | If you want to access the value pointed to as well as | ||
485 | the pointer itself, use <tt>rcu_dereference()</tt> | ||
486 | instead of <tt>rcu_access_pointer()</tt>. | ||
487 | <li> The call to <tt>rcu_access_pointer()</tt> need not be | ||
488 | protected. | ||
489 | In contrast, <tt>rcu_dereference()</tt> must either be | ||
490 | within an RCU read-side critical section or in a code | ||
491 | segment where the pointer cannot change, for example, in | ||
492 | code protected by the corresponding update-side lock. | ||
493 | </ol> | ||
494 | |||
495 | <p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> | ||
496 | Without the <tt>rcu_dereference()</tt> or the | ||
497 | <tt>rcu_access_pointer()</tt>, what destructive optimizations | ||
498 | might the compiler make use of? | ||
499 | <br><a href="#qq4answer">Answer</a> | ||
500 | |||
501 | <p> | ||
502 | This simple linked-data-structure scenario clearly demonstrates the need | ||
503 | for RCU's stringent memory-ordering guarantees on systems with more than | ||
504 | one CPU: | ||
505 | |||
506 | <ol> | ||
507 | <li> Each CPU that has an RCU read-side critical section that | ||
508 | begins before <tt>synchronize_rcu()</tt> starts is | ||
509 | guaranteed to execute a full memory barrier between the time | ||
510 | that the RCU read-side critical section ends and the time that | ||
511 | <tt>synchronize_rcu()</tt> returns. | ||
512 | Without this guarantee, a pre-existing RCU read-side critical section | ||
513 | might hold a reference to the newly removed <tt>struct foo</tt> | ||
514 | after the <tt>kfree()</tt> on line 14 of | ||
515 | <tt>remove_gp_synchronous()</tt>. | ||
516 | <li> Each CPU that has an RCU read-side critical section that ends | ||
517 | after <tt>synchronize_rcu()</tt> returns is guaranteed | ||
518 | to execute a full memory barrier between the time that | ||
519 | <tt>synchronize_rcu()</tt> begins and the time that the RCU | ||
520 | read-side critical section begins. | ||
521 | Without this guarantee, a later RCU read-side critical section | ||
522 | running after the <tt>kfree()</tt> on line 14 of | ||
523 | <tt>remove_gp_synchronous()</tt> might | ||
524 | later run <tt>do_something_gp()</tt> and find the | ||
525 | newly deleted <tt>struct foo</tt>. | ||
526 | <li> If the task invoking <tt>synchronize_rcu()</tt> remains | ||
527 | on a given CPU, then that CPU is guaranteed to execute a full | ||
528 | memory barrier sometime during the execution of | ||
529 | <tt>synchronize_rcu()</tt>. | ||
530 | This guarantee ensures that the <tt>kfree()</tt> on | ||
531 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
532 | execute after the removal on line 11. | ||
533 | <li> If the task invoking <tt>synchronize_rcu()</tt> migrates | ||
534 | among a group of CPUs during that invocation, then each of the | ||
535 | CPUs in that group is guaranteed to execute a full memory barrier | ||
536 | sometime during the execution of <tt>synchronize_rcu()</tt>. | ||
537 | This guarantee also ensures that the <tt>kfree()</tt> on | ||
538 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
539 | execute after the removal on | ||
540 | line 11, but also in the case where the thread executing the | ||
541 | <tt>synchronize_rcu()</tt> migrates in the meantime. | ||
542 | </ol> | ||
543 | |||
544 | <p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> | ||
545 | Given that multiple CPUs can start RCU read-side critical sections | ||
546 | at any time without any ordering whatsoever, how can RCU possibly tell whether | ||
547 | or not a given RCU read-side critical section starts before a | ||
548 | given instance of <tt>synchronize_rcu()</tt>? | ||
549 | <br><a href="#qq5answer">Answer</a> | ||
550 | |||
551 | <p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> | ||
552 | The first and second guarantees require unbelievably strict ordering! | ||
553 | Are all these memory barriers <i>really</i> required? | ||
554 | <br><a href="#qq6answer">Answer</a> | ||
555 | |||
556 | <p> | ||
557 | In short, RCU's publish-subscribe guarantee is provided by the combination | ||
558 | of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. | ||
559 | This guarantee allows data elements to be safely added to RCU-protected | ||
560 | linked data structures without disrupting RCU readers. | ||
561 | This guarantee can be used in combination with the grace-period | ||
562 | guarantee to also allow data elements to be removed from RCU-protected | ||
563 | linked data structures, again without disrupting RCU readers. | ||
564 | |||
565 | <p> | ||
566 | This guarantee was only partially premeditated. | ||
567 | DYNIX/ptx used an explicit memory barrier for publication, but had nothing | ||
568 | resembling <tt>rcu_dereference()</tt> for subscription, nor did it | ||
569 | have anything resembling the <tt>smp_read_barrier_depends()</tt> | ||
570 | that was later subsumed into <tt>rcu_dereference()</tt>. | ||
571 | The need for these operations made itself known quite suddenly at a | ||
572 | late-1990s meeting with the DEC Alpha architects, back in the days when | ||
573 | DEC was still a free-standing company. | ||
574 | It took the Alpha architects a good hour to convince me that any sort | ||
575 | of barrier would ever be needed, and it then took me a good <i>two</i> hours | ||
576 | to convince them that their documentation did not make this point clear. | ||
577 | More recent work with the C and C++ standards committees has provided | ||
578 | much education on tricks and traps from the compiler. | ||
579 | In short, compilers were much less tricky in the early 1990s, but in | ||
580 | 2015, don't even think about omitting <tt>rcu_dereference()</tt>! | ||
581 | |||
582 | <h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> | ||
583 | |||
584 | <p> | ||
585 | The common-case RCU primitives are unconditional. | ||
586 | They are invoked, they do their job, and they return, with no possibility | ||
587 | of error, and no need to retry. | ||
588 | This is a key RCU design philosophy. | ||
589 | |||
590 | <p> | ||
591 | However, this philosophy is pragmatic rather than pigheaded. | ||
592 | If someone comes up with a good justification for a particular conditional | ||
593 | RCU primitive, it might well be implemented and added. | ||
594 | After all, this guarantee was reverse-engineered, not premeditated. | ||
595 | The unconditional nature of the RCU primitives was initially an | ||
596 | accident of implementation, and later experience with synchronization | ||
597 | primitives with conditional primitives caused me to elevate this | ||
598 | accident to a guarantee. | ||
599 | Therefore, the justification for adding a conditional primitive to | ||
600 | RCU would need to be based on detailed and compelling use cases. | ||
601 | |||
602 | <h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> | ||
603 | |||
604 | <p> | ||
605 | As far as RCU is concerned, it is always possible to carry out an | ||
606 | update within an RCU read-side critical section. | ||
607 | For example, that RCU read-side critical section might search for | ||
608 | a given data element, and then might acquire the update-side | ||
609 | spinlock in order to update that element, all while remaining | ||
610 | in that RCU read-side critical section. | ||
611 | Of course, it is necessary to exit the RCU read-side critical section | ||
612 | before invoking <tt>synchronize_rcu()</tt>; however, this | ||
613 | inconvenience can be avoided through use of the | ||
614 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members | ||
615 | described later in this document. | ||
616 | |||
617 | <p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> | ||
618 | But how does the upgrade-to-write operation exclude other readers? | ||
619 | <br><a href="#qq7answer">Answer</a> | ||
620 | |||
621 | <p> | ||
622 | This guarantee allows lookup code to be shared between read-side | ||
623 | and update-side code, and was premeditated, appearing in the earliest | ||
624 | DYNIX/ptx RCU documentation. | ||
625 | |||
626 | <h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> | ||
627 | |||
628 | <p> | ||
629 | RCU provides extremely lightweight readers, and its read-side guarantees, | ||
630 | though quite useful, are correspondingly lightweight. | ||
631 | It is therefore all too easy to assume that RCU is guaranteeing more | ||
632 | than it really is. | ||
633 | Of course, the list of things that RCU does not guarantee is infinitely | ||
634 | long; however, the following sections list a few non-guarantees that | ||
635 | have caused confusion. | ||
636 | Except where otherwise noted, these non-guarantees were premeditated. | ||
637 | |||
638 | <ol> | ||
639 | <li> <a href="#Readers Impose Minimal Ordering"> | ||
640 | Readers Impose Minimal Ordering</a> | ||
641 | <li> <a href="#Readers Do Not Exclude Updaters"> | ||
642 | Readers Do Not Exclude Updaters</a> | ||
643 | <li> <a href="#Updaters Only Wait For Old Readers"> | ||
644 | Updaters Only Wait For Old Readers</a> | ||
645 | <li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> | ||
646 | Grace Periods Don't Partition Read-Side Critical Sections</a> | ||
647 | <li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> | ||
648 | Read-Side Critical Sections Don't Partition Grace Periods</a> | ||
649 | <li> <a href="#Disabling Preemption Does Not Block Grace Periods"> | ||
650 | Disabling Preemption Does Not Block Grace Periods</a> | ||
651 | </ol> | ||
652 | |||
653 | <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> | ||
654 | |||
655 | <p> | ||
656 | Reader-side markers such as <tt>rcu_read_lock()</tt> and | ||
657 | <tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees | ||
658 | except through their interaction with the grace-period APIs such as | ||
659 | <tt>synchronize_rcu()</tt>. | ||
660 | To see this, consider the following pair of threads: | ||
661 | |||
662 | <blockquote> | ||
663 | <pre> | ||
664 | 1 void thread0(void) | ||
665 | 2 { | ||
666 | 3 rcu_read_lock(); | ||
667 | 4 WRITE_ONCE(x, 1); | ||
668 | 5 rcu_read_unlock(); | ||
669 | 6 rcu_read_lock(); | ||
670 | 7 WRITE_ONCE(y, 1); | ||
671 | 8 rcu_read_unlock(); | ||
672 | 9 } | ||
673 | 10 | ||
674 | 11 void thread1(void) | ||
675 | 12 { | ||
676 | 13 rcu_read_lock(); | ||
677 | 14 r1 = READ_ONCE(y); | ||
678 | 15 rcu_read_unlock(); | ||
679 | 16 rcu_read_lock(); | ||
680 | 17 r2 = READ_ONCE(x); | ||
681 | 18 rcu_read_unlock(); | ||
682 | 19 } | ||
683 | </pre> | ||
684 | </blockquote> | ||
685 | |||
686 | <p> | ||
687 | After <tt>thread0()</tt> and <tt>thread1()</tt> execute | ||
688 | concurrently, it is quite possible to have | ||
689 | |||
690 | <blockquote> | ||
691 | <pre> | ||
692 | (r1 == 1 && r2 == 0) | ||
693 | </pre> | ||
694 | </blockquote> | ||
695 | |||
696 | (that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), | ||
697 | which would not be possible if <tt>rcu_read_lock()</tt> and | ||
698 | <tt>rcu_read_unlock()</tt> had much in the way of ordering | ||
699 | properties. | ||
700 | But they do not, so the CPU is within its rights | ||
701 | to do significant reordering. | ||
702 | This is by design: Any significant ordering constraints would slow down | ||
703 | these fast-path APIs. | ||
704 | |||
705 | <p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> | ||
706 | Can't the compiler also reorder this code? | ||
707 | <br><a href="#qq8answer">Answer</a> | ||
708 | |||
709 | <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> | ||
710 | |||
711 | <p> | ||
712 | Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> | ||
713 | exclude updates. | ||
714 | All they do is to prevent grace periods from ending. | ||
715 | The following example illustrates this: | ||
716 | |||
717 | <blockquote> | ||
718 | <pre> | ||
719 | 1 void thread0(void) | ||
720 | 2 { | ||
721 | 3 rcu_read_lock(); | ||
722 | 4 r1 = READ_ONCE(y); | ||
723 | 5 if (r1) { | ||
724 | 6 do_something_with_nonzero_x(); | ||
725 | 7 r2 = READ_ONCE(x); | ||
726 | 8 WARN_ON(!r2); /* BUG!!! */ | ||
727 | 9 } | ||
728 | 10 rcu_read_unlock(); | ||
729 | 11 } | ||
730 | 12 | ||
731 | 13 void thread1(void) | ||
732 | 14 { | ||
733 | 15 spin_lock(&my_lock); | ||
734 | 16 WRITE_ONCE(x, 1); | ||
735 | 17 WRITE_ONCE(y, 1); | ||
736 | 18 spin_unlock(&my_lock); | ||
737 | 19 } | ||
738 | </pre> | ||
739 | </blockquote> | ||
740 | |||
741 | <p> | ||
742 | If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> | ||
743 | excluded the <tt>thread1()</tt> function's update, | ||
744 | the <tt>WARN_ON()</tt> could never fire. | ||
745 | But the fact is that <tt>rcu_read_lock()</tt> does not exclude | ||
746 | much of anything aside from subsequent grace periods, of which | ||
747 | <tt>thread1()</tt> has none, so the | ||
748 | <tt>WARN_ON()</tt> can and does fire. | ||
749 | |||
750 | <h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> | ||
751 | |||
752 | <p> | ||
753 | It might be tempting to assume that after <tt>synchronize_rcu()</tt> | ||
754 | completes, there are no readers executing. | ||
755 | This temptation must be avoided because | ||
756 | new readers can start immediately after <tt>synchronize_rcu()</tt> | ||
757 | starts, and <tt>synchronize_rcu()</tt> is under no | ||
758 | obligation to wait for these new readers. | ||
759 | |||
760 | <p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> | ||
761 | Suppose that synchronize_rcu() did wait until all readers had completed. | ||
762 | Would the updater be able to rely on this? | ||
763 | <br><a href="#qq9answer">Answer</a> | ||
764 | |||
765 | <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> | ||
766 | Grace Periods Don't Partition Read-Side Critical Sections</a></h3> | ||
767 | |||
768 | <p> | ||
769 | It is tempting to assume that if any part of one RCU read-side critical | ||
770 | section precedes a given grace period, and if any part of another RCU | ||
771 | read-side critical section follows that same grace period, then all of | ||
772 | the first RCU read-side critical section must precede all of the second. | ||
773 | However, this just isn't the case: A single grace period does not | ||
774 | partition the set of RCU read-side critical sections. | ||
775 | An example of this situation can be illustrated as follows, where | ||
776 | <tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: | ||
777 | |||
778 | <blockquote> | ||
779 | <pre> | ||
780 | 1 void thread0(void) | ||
781 | 2 { | ||
782 | 3 rcu_read_lock(); | ||
783 | 4 WRITE_ONCE(a, 1); | ||
784 | 5 WRITE_ONCE(b, 1); | ||
785 | 6 rcu_read_unlock(); | ||
786 | 7 } | ||
787 | 8 | ||
788 | 9 void thread1(void) | ||
789 | 10 { | ||
790 | 11 r1 = READ_ONCE(a); | ||
791 | 12 synchronize_rcu(); | ||
792 | 13 WRITE_ONCE(c, 1); | ||
793 | 14 } | ||
794 | 15 | ||
795 | 16 void thread2(void) | ||
796 | 17 { | ||
797 | 18 rcu_read_lock(); | ||
798 | 19 r2 = READ_ONCE(b); | ||
799 | 20 r3 = READ_ONCE(c); | ||
800 | 21 rcu_read_unlock(); | ||
801 | 22 } | ||
802 | </pre> | ||
803 | </blockquote> | ||
804 | |||
805 | <p> | ||
806 | It turns out that the outcome: | ||
807 | |||
808 | <blockquote> | ||
809 | <pre> | ||
810 | (r1 == 1 && r2 == 0 && r3 == 1) | ||
811 | </pre> | ||
812 | </blockquote> | ||
813 | |||
814 | is entirely possible. | ||
815 | The following figure shows how this can happen, with each circled | ||
816 | <tt>QS</tt> indicating the point at which RCU recorded a | ||
817 | <i>quiescent state</i> for each thread, that is, a state in which | ||
818 | RCU knows that the thread cannot be in the midst of an RCU read-side | ||
819 | critical section that started before the current grace period: | ||
820 | |||
821 | <p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> | ||
822 | |||
823 | <p> | ||
824 | If it is necessary to partition RCU read-side critical sections in this | ||
825 | manner, it is necessary to use two grace periods, where the first | ||
826 | grace period is known to end before the second grace period starts: | ||
827 | |||
828 | <blockquote> | ||
829 | <pre> | ||
830 | 1 void thread0(void) | ||
831 | 2 { | ||
832 | 3 rcu_read_lock(); | ||
833 | 4 WRITE_ONCE(a, 1); | ||
834 | 5 WRITE_ONCE(b, 1); | ||
835 | 6 rcu_read_unlock(); | ||
836 | 7 } | ||
837 | 8 | ||
838 | 9 void thread1(void) | ||
839 | 10 { | ||
840 | 11 r1 = READ_ONCE(a); | ||
841 | 12 synchronize_rcu(); | ||
842 | 13 WRITE_ONCE(c, 1); | ||
843 | 14 } | ||
844 | 15 | ||
845 | 16 void thread2(void) | ||
846 | 17 { | ||
847 | 18 r2 = READ_ONCE(c); | ||
848 | 19 synchronize_rcu(); | ||
849 | 20 WRITE_ONCE(d, 1); | ||
850 | 21 } | ||
851 | 22 | ||
852 | 23 void thread3(void) | ||
853 | 24 { | ||
854 | 25 rcu_read_lock(); | ||
855 | 26 r3 = READ_ONCE(b); | ||
856 | 27 r4 = READ_ONCE(d); | ||
857 | 28 rcu_read_unlock(); | ||
858 | 29 } | ||
859 | </pre> | ||
860 | </blockquote> | ||
861 | |||
862 | <p> | ||
863 | Here, if <tt>(r1 == 1)</tt>, then | ||
864 | <tt>thread0()</tt>'s write to <tt>b</tt> must happen | ||
865 | before the end of <tt>thread1()</tt>'s grace period. | ||
866 | If in addition <tt>(r4 == 1)</tt>, then | ||
867 | <tt>thread3()</tt>'s read from <tt>b</tt> must happen | ||
868 | after the beginning of <tt>thread2()</tt>'s grace period. | ||
869 | If it is also the case that <tt>(r2 == 1)</tt>, then the | ||
870 | end of <tt>thread1()</tt>'s grace period must precede the | ||
871 | beginning of <tt>thread2()</tt>'s grace period. | ||
872 | This means that the two RCU read-side critical sections cannot overlap, | ||
873 | guaranteeing that <tt>(r3 == 1)</tt>. | ||
874 | As a result, the outcome: | ||
875 | |||
876 | <blockquote> | ||
877 | <pre> | ||
878 | (r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) | ||
879 | </pre> | ||
880 | </blockquote> | ||
881 | |||
882 | cannot happen. | ||
883 | |||
884 | <p> | ||
885 | This non-requirement was also non-premeditated, but became apparent | ||
886 | when studying RCU's interaction with memory ordering. | ||
887 | |||
888 | <h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> | ||
889 | Read-Side Critical Sections Don't Partition Grace Periods</a></h3> | ||
890 | |||
891 | <p> | ||
892 | It is also tempting to assume that if an RCU read-side critical section | ||
893 | happens between a pair of grace periods, then those grace periods cannot | ||
894 | overlap. | ||
895 | However, this temptation leads nowhere good, as can be illustrated by | ||
896 | the following, with all variables initially zero: | ||
897 | |||
898 | <blockquote> | ||
899 | <pre> | ||
900 | 1 void thread0(void) | ||
901 | 2 { | ||
902 | 3 rcu_read_lock(); | ||
903 | 4 WRITE_ONCE(a, 1); | ||
904 | 5 WRITE_ONCE(b, 1); | ||
905 | 6 rcu_read_unlock(); | ||
906 | 7 } | ||
907 | 8 | ||
908 | 9 void thread1(void) | ||
909 | 10 { | ||
910 | 11 r1 = READ_ONCE(a); | ||
911 | 12 synchronize_rcu(); | ||
912 | 13 WRITE_ONCE(c, 1); | ||
913 | 14 } | ||
914 | 15 | ||
915 | 16 void thread2(void) | ||
916 | 17 { | ||
917 | 18 rcu_read_lock(); | ||
918 | 19 WRITE_ONCE(d, 1); | ||
919 | 20 r2 = READ_ONCE(c); | ||
920 | 21 rcu_read_unlock(); | ||
921 | 22 } | ||
922 | 23 | ||
923 | 24 void thread3(void) | ||
924 | 25 { | ||
925 | 26 r3 = READ_ONCE(d); | ||
926 | 27 synchronize_rcu(); | ||
927 | 28 WRITE_ONCE(e, 1); | ||
928 | 29 } | ||
929 | 30 | ||
930 | 31 void thread4(void) | ||
931 | 32 { | ||
932 | 33 rcu_read_lock(); | ||
933 | 34 r4 = READ_ONCE(b); | ||
934 | 35 r5 = READ_ONCE(e); | ||
935 | 36 rcu_read_unlock(); | ||
936 | 37 } | ||
937 | </pre> | ||
938 | </blockquote> | ||
939 | |||
940 | <p> | ||
941 | In this case, the outcome: | ||
942 | |||
943 | <blockquote> | ||
944 | <pre> | ||
945 | (r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) | ||
946 | </pre> | ||
947 | </blockquote> | ||
948 | |||
949 | is entirely possible, as illustrated below: | ||
950 | |||
951 | <p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> | ||
952 | |||
953 | <p> | ||
954 | Again, an RCU read-side critical section can overlap almost all of a | ||
955 | given grace period, just so long as it does not overlap the entire | ||
956 | grace period. | ||
957 | As a result, an RCU read-side critical section cannot partition a pair | ||
958 | of RCU grace periods. | ||
959 | |||
960 | <p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> | ||
961 | How long a sequence of grace periods, each separated by an RCU read-side | ||
962 | critical section, would be required to partition the RCU read-side | ||
963 | critical sections at the beginning and end of the chain? | ||
964 | <br><a href="#qq10answer">Answer</a> | ||
965 | |||
966 | <h3><a name="Disabling Preemption Does Not Block Grace Periods"> | ||
967 | Disabling Preemption Does Not Block Grace Periods</a></h3> | ||
968 | |||
969 | <p> | ||
970 | There was a time when disabling preemption on any given CPU would block | ||
971 | subsequent grace periods. | ||
972 | However, this was an accident of implementation and is not a requirement. | ||
973 | And in the current Linux-kernel implementation, disabling preemption | ||
974 | on a given CPU in fact does not block grace periods, as Oleg Nesterov | ||
975 | <a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. | ||
976 | |||
977 | <p> | ||
978 | If you need a preempt-disable region to block grace periods, you need to add | ||
979 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example | ||
980 | as follows: | ||
981 | |||
982 | <blockquote> | ||
983 | <pre> | ||
984 | 1 preempt_disable(); | ||
985 | 2 rcu_read_lock(); | ||
986 | 3 do_something(); | ||
987 | 4 rcu_read_unlock(); | ||
988 | 5 preempt_enable(); | ||
989 | 6 | ||
990 | 7 /* Spinlocks implicitly disable preemption. */ | ||
991 | 8 spin_lock(&mylock); | ||
992 | 9 rcu_read_lock(); | ||
993 | 10 do_something(); | ||
994 | 11 rcu_read_unlock(); | ||
995 | 12 spin_unlock(&mylock); | ||
996 | </pre> | ||
997 | </blockquote> | ||
998 | |||
999 | <p> | ||
1000 | In theory, you could enter the RCU read-side critical section first, | ||
1001 | but it is more efficient to keep the entire RCU read-side critical | ||
1002 | section contained in the preempt-disable region as shown above. | ||
1003 | Of course, RCU read-side critical sections that extend outside of | ||
1004 | preempt-disable regions will work correctly, but such critical sections | ||
1005 | can be preempted, which forces <tt>rcu_read_unlock()</tt> to do | ||
1006 | more work. | ||
1007 | And no, this is <i>not</i> an invitation to enclose all of your RCU | ||
1008 | read-side critical sections within preempt-disable regions, because | ||
1009 | doing so would degrade real-time response. | ||
1010 | |||
1011 | <p> | ||
1012 | This non-requirement appeared with preemptible RCU. | ||
1013 | If you need a grace period that waits on non-preemptible code regions, use | ||
1014 | <a href="#Sched Flavor">RCU-sched</a>. | ||
1015 | |||
1016 | <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> | ||
1017 | |||
1018 | <p> | ||
1019 | These parallelism facts of life are by no means specific to RCU, but | ||
1020 | the RCU implementation must abide by them. | ||
1021 | They therefore bear repeating: | ||
1022 | |||
1023 | <ol> | ||
1024 | <li> Any CPU or task may be delayed at any time, | ||
1025 | and any attempts to avoid these delays by disabling | ||
1026 | preemption, interrupts, or whatever are completely futile. | ||
1027 | This is most obvious in preemptible user-level | ||
1028 | environments and in virtualized environments (where | ||
1029 | a given guest OS's VCPUs can be preempted at any time by | ||
1030 | the underlying hypervisor), but can also happen in bare-metal | ||
1031 | environments due to ECC errors, NMIs, and other hardware | ||
1032 | events. | ||
1033 | Although a delay of more than about 20 seconds can result | ||
1034 | in splats, the RCU implementation is obligated to use | ||
1035 | algorithms that can tolerate extremely long delays, but where | ||
1036 | “extremely long” is not long enough to allow | ||
1037 | wrap-around when incrementing a 64-bit counter. | ||
1038 | <li> Both the compiler and the CPU can reorder memory accesses. | ||
1039 | Where it matters, RCU must use compiler directives and | ||
1040 | memory-barrier instructions to preserve ordering. | ||
1041 | <li> Conflicting writes to memory locations in any given cache line | ||
1042 | will result in expensive cache misses. | ||
1043 | Greater numbers of concurrent writes and more-frequent | ||
1044 | concurrent writes will result in more dramatic slowdowns. | ||
1045 | RCU is therefore obligated to use algorithms that have | ||
1046 | sufficient locality to avoid significant performance and | ||
1047 | scalability problems. | ||
1048 | <li> As a rough rule of thumb, only one CPU's worth of processing | ||
1049 | may be carried out under the protection of any given exclusive | ||
1050 | lock. | ||
1051 | RCU must therefore use scalable locking designs. | ||
1052 | <li> Counters are finite, especially on 32-bit systems. | ||
1053 | RCU's use of counters must therefore tolerate counter wrap, | ||
1054 | or be designed such that counter wrap would take way more | ||
1055 | time than a single system is likely to run. | ||
1056 | An uptime of ten years is quite possible, a runtime | ||
1057 | of a century much less so. | ||
1058 | As an example of the latter, RCU's dyntick-idle nesting counter | ||
1059 | allows 54 bits for interrupt nesting level (this counter | ||
1060 | is 64 bits even on a 32-bit system). | ||
1061 | Overflowing this counter requires 2<sup>54</sup> | ||
1062 | half-interrupts on a given CPU without that CPU ever going idle. | ||
1063 | If a half-interrupt happened every microsecond, it would take | ||
1064 | 570 years of runtime to overflow this counter, which is currently | ||
1065 | believed to be an acceptably long time. | ||
1066 | <li> Linux systems can have thousands of CPUs running a single | ||
1067 | Linux kernel in a single shared-memory environment. | ||
1068 | RCU must therefore pay close attention to high-end scalability. | ||
1069 | </ol> | ||
1070 | |||
1071 | <p> | ||
1072 | This last parallelism fact of life means that RCU must pay special | ||
1073 | attention to the preceding facts of life. | ||
1074 | The idea that Linux might scale to systems with thousands of CPUs would | ||
1075 | have been met with some skepticism in the 1990s, but these requirements | ||
1076 | would otherwise have been unsurprising, even in the early 1990s. | ||
1077 | |||
1078 | <h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> | ||
1079 | |||
1080 | <p> | ||
1081 | These sections list quality-of-implementation requirements. | ||
1082 | Although an RCU implementation that ignores these requirements could | ||
1083 | still be used, it would likely be subject to limitations that would | ||
1084 | make it inappropriate for industrial-strength production use. | ||
1085 | Classes of quality-of-implementation requirements are as follows: | ||
1086 | |||
1087 | <ol> | ||
1088 | <li> <a href="#Specialization">Specialization</a> | ||
1089 | <li> <a href="#Performance and Scalability">Performance and Scalability</a> | ||
1090 | <li> <a href="#Composability">Composability</a> | ||
1091 | <li> <a href="#Corner Cases">Corner Cases</a> | ||
1092 | </ol> | ||
1093 | |||
1094 | <p> | ||
1095 | These classes are covered in the following sections. | ||
1096 | |||
1097 | <h3><a name="Specialization">Specialization</a></h3> | ||
1098 | |||
1099 | <p> | ||
1100 | RCU is and always has been intended primarily for read-mostly situations, as | ||
1101 | illustrated by the following figure. | ||
1102 | This means that RCU's read-side primitives are optimized, often at the | ||
1103 | expense of its update-side primitives. | ||
1104 | |||
1105 | <p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> | ||
1106 | |||
1107 | <p> | ||
1108 | This focus on read-mostly situations means that RCU must interoperate | ||
1109 | with other synchronization primitives. | ||
1110 | For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> | ||
1111 | examples discussed earlier use RCU to protect readers and locking to | ||
1112 | coordinate updaters. | ||
1113 | However, the need extends much farther, requiring that a variety of | ||
1114 | synchronization primitives be legal within RCU read-side critical sections, | ||
1115 | including spinlocks, sequence locks, atomic operations, reference | ||
1116 | counters, and memory barriers. | ||
1117 | |||
1118 | <p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> | ||
1119 | What about sleeping locks? | ||
1120 | <br><a href="#qq11answer">Answer</a> | ||
1121 | |||
1122 | <p> | ||
1123 | It often comes as a surprise that many algorithms do not require a | ||
1124 | consistent view of data, but many can function in that mode, | ||
1125 | with network routing being the poster child. | ||
1126 | Internet routing algorithms take significant time to propagate | ||
1127 | updates, so that by the time an update arrives at a given system, | ||
1128 | that system has been sending network traffic the wrong way for | ||
1129 | a considerable length of time. | ||
1130 | Having a few threads continue to send traffic the wrong way for a | ||
1131 | few more milliseconds is clearly not a problem: In the worst case, | ||
1132 | TCP retransmissions will eventually get the data where it needs to go. | ||
1133 | In general, when tracking the state of the universe outside of the | ||
1134 | computer, some level of inconsistency must be tolerated due to | ||
1135 | speed-of-light delays if nothing else. | ||
1136 | |||
1137 | <p> | ||
1138 | Furthermore, uncertainty about external state is inherent in many cases. | ||
1139 | For example, a pair of veterinarians might use heartbeat to determine | ||
1140 | whether or not a given cat was alive. | ||
1141 | But how long should they wait after the last heartbeat to decide that | ||
1142 | the cat is in fact dead? | ||
1143 | Waiting less than 400 milliseconds makes no sense because this would | ||
1144 | mean that a relaxed cat would be considered to cycle between death | ||
1145 | and life more than 100 times per minute. | ||
1146 | Moreover, just as with human beings, a cat's heart might stop for | ||
1147 | some period of time, so the exact wait period is a judgment call. | ||
1148 | One of our pair of veterinarians might wait 30 seconds before pronouncing | ||
1149 | the cat dead, while the other might insist on waiting a full minute. | ||
1150 | The two veterinarians would then disagree on the state of the cat during | ||
1151 | the final 30 seconds of the minute following the last heartbeat, as | ||
1152 | fancifully illustrated below: | ||
1153 | |||
1154 | <p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> | ||
1155 | |||
1156 | <p> | ||
1157 | Interestingly enough, this same situation applies to hardware. | ||
1158 | When push comes to shove, how do we tell whether or not some | ||
1159 | external server has failed? | ||
1160 | We send messages to it periodically, and declare it failed if we | ||
1161 | don't receive a response within a given period of time. | ||
1162 | Policy decisions can usually tolerate short | ||
1163 | periods of inconsistency. | ||
1164 | The policy was decided some time ago, and is only now being put into | ||
1165 | effect, so a few milliseconds of delay is normally inconsequential. | ||
1166 | |||
1167 | <p> | ||
1168 | However, there are algorithms that absolutely must see consistent data. | ||
1169 | For example, the translation between a user-level SystemV semaphore | ||
1170 | ID to the corresponding in-kernel data structure is protected by RCU, | ||
1171 | but it is absolutely forbidden to update a semaphore that has just been | ||
1172 | removed. | ||
1173 | In the Linux kernel, this need for consistency is accommodated by acquiring | ||
1174 | spinlocks located in the in-kernel data structure from within | ||
1175 | the RCU read-side critical section, and this is indicated by the | ||
1176 | green box in the figure above. | ||
1177 | Many other techniques may be used, and are in fact used within the | ||
1178 | Linux kernel. | ||
1179 | |||
1180 | <p> | ||
1181 | In short, RCU is not required to maintain consistency, and other | ||
1182 | mechanisms may be used in concert with RCU when consistency is required. | ||
1183 | RCU's specialization allows it to do its job extremely well, and its | ||
1184 | ability to interoperate with other synchronization mechanisms allows | ||
1185 | the right mix of synchronization tools to be used for a given job. | ||
1186 | |||
1187 | <h3><a name="Performance and Scalability">Performance and Scalability</a></h3> | ||
1188 | |||
1189 | <p> | ||
1190 | Energy efficiency is a critical component of performance today, | ||
1191 | and Linux-kernel RCU implementations must therefore avoid unnecessarily | ||
1192 | awakening idle CPUs. | ||
1193 | I cannot claim that this requirement was premeditated. | ||
1194 | In fact, I learned of it during a telephone conversation in which I | ||
1195 | was given “frank and open” feedback on the importance | ||
1196 | of energy efficiency in battery-powered systems and on specific | ||
1197 | energy-efficiency shortcomings of the Linux-kernel RCU implementation. | ||
1198 | In my experience, the battery-powered embedded community will consider | ||
1199 | any unnecessary wakeups to be extremely unfriendly acts. | ||
1200 | So much so that mere Linux-kernel-mailing-list posts are | ||
1201 | insufficient to vent their ire. | ||
1202 | |||
1203 | <p> | ||
1204 | Memory consumption is not particularly important in most | ||
1205 | situations, and has become decreasingly | ||
1206 | so as memory sizes have expanded and memory | ||
1207 | costs have plummeted. | ||
1208 | However, as I learned from Matt Mackall's | ||
1209 | <a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> | ||
1210 | efforts, memory footprint is critically important on single-CPU systems with | ||
1211 | non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus | ||
1212 | <a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> | ||
1213 | was born. | ||
1214 | Josh Triplett has since taken over the small-memory banner with his | ||
1215 | <a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> | ||
1216 | project, which resulted in | ||
1217 | <a href="#Sleepable RCU">SRCU</a> | ||
1218 | becoming optional for those kernels not needing it. | ||
1219 | |||
1220 | <p> | ||
1221 | The remaining performance requirements are, for the most part, | ||
1222 | unsurprising. | ||
1223 | For example, in keeping with RCU's read-side specialization, | ||
1224 | <tt>rcu_dereference()</tt> should have negligible overhead (for | ||
1225 | example, suppression of a few minor compiler optimizations). | ||
1226 | Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and | ||
1227 | <tt>rcu_read_unlock()</tt> should have exactly zero overhead. | ||
1228 | |||
1229 | <p> | ||
1230 | In preemptible environments, in the case where the RCU read-side | ||
1231 | critical section was not preempted (as will be the case for the | ||
1232 | highest-priority real-time process), <tt>rcu_read_lock()</tt> and | ||
1233 | <tt>rcu_read_unlock()</tt> should have minimal overhead. | ||
1234 | In particular, they should not contain atomic read-modify-write | ||
1235 | operations, memory-barrier instructions, preemption disabling, | ||
1236 | interrupt disabling, or backwards branches. | ||
1237 | However, in the case where the RCU read-side critical section was preempted, | ||
1238 | <tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. | ||
1239 | This is why it is better to nest an RCU read-side critical section | ||
1240 | within a preempt-disable region than vice versa, at least in cases | ||
1241 | where that critical section is short enough to avoid unduly degrading | ||
1242 | real-time latencies. | ||
1243 | |||
1244 | <p> | ||
1245 | The <tt>synchronize_rcu()</tt> grace-period-wait primitive is | ||
1246 | optimized for throughput. | ||
1247 | It may therefore incur several milliseconds of latency in addition to | ||
1248 | the duration of the longest RCU read-side critical section. | ||
1249 | On the other hand, multiple concurrent invocations of | ||
1250 | <tt>synchronize_rcu()</tt> are required to use batching optimizations | ||
1251 | so that they can be satisfied by a single underlying grace-period-wait | ||
1252 | operation. | ||
1253 | For example, in the Linux kernel, it is not unusual for a single | ||
1254 | grace-period-wait operation to serve more than | ||
1255 | <a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> | ||
1256 | of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation | ||
1257 | overhead down to nearly zero. | ||
1258 | However, the grace-period optimization is also required to avoid | ||
1259 | measurable degradation of real-time scheduling and interrupt latencies. | ||
1260 | |||
1261 | <p> | ||
1262 | In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> | ||
1263 | latencies are unacceptable. | ||
1264 | In these cases, <tt>synchronize_rcu_expedited()</tt> may be used | ||
1265 | instead, reducing the grace-period latency down to a few tens of | ||
1266 | microseconds on small systems, at least in cases where the RCU read-side | ||
1267 | critical sections are short. | ||
1268 | There are currently no special latency requirements for | ||
1269 | <tt>synchronize_rcu_expedited()</tt> on large systems, but, | ||
1270 | consistent with the empirical nature of the RCU specification, | ||
1271 | that is subject to change. | ||
1272 | However, there most definitely are scalability requirements: | ||
1273 | A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 | ||
1274 | CPUs should at least make reasonable forward progress. | ||
1275 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> | ||
1276 | is permitted to impose modest degradation of real-time latency | ||
1277 | on non-idle online CPUs. | ||
1278 | That said, it will likely be necessary to take further steps to reduce this | ||
1279 | degradation, hopefully to roughly that of a scheduling-clock interrupt. | ||
1280 | |||
1281 | <p> | ||
1282 | There are a number of situations where even | ||
1283 | <tt>synchronize_rcu_expedited()</tt>'s reduced grace-period | ||
1284 | latency is unacceptable. | ||
1285 | In these situations, the asynchronous <tt>call_rcu()</tt> can be | ||
1286 | used in place of <tt>synchronize_rcu()</tt> as follows: | ||
1287 | |||
1288 | <blockquote> | ||
1289 | <pre> | ||
1290 | 1 struct foo { | ||
1291 | 2 int a; | ||
1292 | 3 int b; | ||
1293 | 4 struct rcu_head rh; | ||
1294 | 5 }; | ||
1295 | 6 | ||
1296 | 7 static void remove_gp_cb(struct rcu_head *rhp) | ||
1297 | 8 { | ||
1298 | 9 struct foo *p = container_of(rhp, struct foo, rh); | ||
1299 | 10 | ||
1300 | 11 kfree(p); | ||
1301 | 12 } | ||
1302 | 13 | ||
1303 | 14 bool remove_gp_asynchronous(void) | ||
1304 | 15 { | ||
1305 | 16 struct foo *p; | ||
1306 | 17 | ||
1307 | 18 spin_lock(&gp_lock); | ||
1308 | 19 p = rcu_access_pointer(gp); | ||
1309 | 20 if (!p) { | ||
1310 | 21 spin_unlock(&gp_lock); | ||
1311 | 22 return false; | ||
1312 | 23 } | ||
1313 | 24 rcu_assign_pointer(gp, NULL); | ||
1314 | 25 call_rcu(&p->rh, remove_gp_cb); | ||
1315 | 26 spin_unlock(&gp_lock); | ||
1316 | 27 return true; | ||
1317 | 28 } | ||
1318 | </pre> | ||
1319 | </blockquote> | ||
1320 | |||
1321 | <p> | ||
1322 | A definition of <tt>struct foo</tt> is finally needed, and appears | ||
1323 | on lines 1-5. | ||
1324 | The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> | ||
1325 | on line 25, and will be invoked after the end of a subsequent | ||
1326 | grace period. | ||
1327 | This gets the same effect as <tt>remove_gp_synchronous()</tt>, | ||
1328 | but without forcing the updater to wait for a grace period to elapse. | ||
1329 | The <tt>call_rcu()</tt> function may be used in a number of | ||
1330 | situations where neither <tt>synchronize_rcu()</tt> nor | ||
1331 | <tt>synchronize_rcu_expedited()</tt> would be legal, | ||
1332 | including within preempt-disable code, <tt>local_bh_disable()</tt> code, | ||
1333 | interrupt-disable code, and interrupt handlers. | ||
1334 | However, even <tt>call_rcu()</tt> is illegal within NMI handlers. | ||
1335 | The callback function (<tt>remove_gp_cb()</tt> in this case) will be | ||
1336 | executed within a softirq (software interrupt) environment within the | ||
1337 | Linux kernel, | ||
1338 | either within a real softirq handler or under the protection | ||
1339 | of <tt>local_bh_disable()</tt>. | ||
1340 | In both the Linux kernel and in userspace, it is bad practice to | ||
1341 | write an RCU callback function that takes too long. | ||
1342 | Long-running operations should be relegated to separate threads or | ||
1343 | (in the Linux kernel) workqueues. | ||
1344 | |||
1345 | <p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> | ||
1346 | Why does line 19 use <tt>rcu_access_pointer()</tt>? | ||
1347 | After all, <tt>call_rcu()</tt> on line 25 stores into the | ||
1348 | structure, which would interact badly with concurrent insertions. | ||
1349 | Doesn't this mean that <tt>rcu_dereference()</tt> is required? | ||
1350 | <br><a href="#qq12answer">Answer</a> | ||
1351 | |||
1352 | <p> | ||
1353 | However, all that <tt>remove_gp_cb()</tt> is doing is | ||
1354 | invoking <tt>kfree()</tt> on the data element. | ||
1355 | This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, | ||
1356 | which allows “fire and forget” operation as shown below: | ||
1357 | |||
1358 | <blockquote> | ||
1359 | <pre> | ||
1360 | 1 struct foo { | ||
1361 | 2 int a; | ||
1362 | 3 int b; | ||
1363 | 4 struct rcu_head rh; | ||
1364 | 5 }; | ||
1365 | 6 | ||
1366 | 7 bool remove_gp_faf(void) | ||
1367 | 8 { | ||
1368 | 9 struct foo *p; | ||
1369 | 10 | ||
1370 | 11 spin_lock(&gp_lock); | ||
1371 | 12 p = rcu_dereference(gp); | ||
1372 | 13 if (!p) { | ||
1373 | 14 spin_unlock(&gp_lock); | ||
1374 | 15 return false; | ||
1375 | 16 } | ||
1376 | 17 rcu_assign_pointer(gp, NULL); | ||
1377 | 18 kfree_rcu(p, rh); | ||
1378 | 19 spin_unlock(&gp_lock); | ||
1379 | 20 return true; | ||
1380 | 21 } | ||
1381 | </pre> | ||
1382 | </blockquote> | ||
1383 | |||
1384 | <p> | ||
1385 | Note that <tt>remove_gp_faf()</tt> simply invokes | ||
1386 | <tt>kfree_rcu()</tt> and proceeds, without any need to pay any | ||
1387 | further attention to the subsequent grace period and <tt>kfree()</tt>. | ||
1388 | It is permissible to invoke <tt>kfree_rcu()</tt> from the same | ||
1389 | environments as for <tt>call_rcu()</tt>. | ||
1390 | Interestingly enough, DYNIX/ptx had the equivalents of | ||
1391 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not | ||
1392 | <tt>synchronize_rcu()</tt>. | ||
1393 | This was due to the fact that RCU was not heavily used within DYNIX/ptx, | ||
1394 | so the very few places that needed something like | ||
1395 | <tt>synchronize_rcu()</tt> simply open-coded it. | ||
1396 | |||
1397 | <p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> | ||
1398 | Earlier it was claimed that <tt>call_rcu()</tt> and | ||
1399 | <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked | ||
1400 | by readers. | ||
1401 | But how can that be correct, given that the invocation of the callback | ||
1402 | and the freeing of the memory (respectively) must still wait for | ||
1403 | a grace period to elapse? | ||
1404 | <br><a href="#qq13answer">Answer</a> | ||
1405 | |||
1406 | <p> | ||
1407 | But what if the updater must wait for the completion of code to be | ||
1408 | executed after the end of the grace period, but has other tasks | ||
1409 | that can be carried out in the meantime? | ||
1410 | The polling-style <tt>get_state_synchronize_rcu()</tt> and | ||
1411 | <tt>cond_synchronize_rcu()</tt> functions may be used for this | ||
1412 | purpose, as shown below: | ||
1413 | |||
1414 | <blockquote> | ||
1415 | <pre> | ||
1416 | 1 bool remove_gp_poll(void) | ||
1417 | 2 { | ||
1418 | 3 struct foo *p; | ||
1419 | 4 unsigned long s; | ||
1420 | 5 | ||
1421 | 6 spin_lock(&gp_lock); | ||
1422 | 7 p = rcu_access_pointer(gp); | ||
1423 | 8 if (!p) { | ||
1424 | 9 spin_unlock(&gp_lock); | ||
1425 | 10 return false; | ||
1426 | 11 } | ||
1427 | 12 rcu_assign_pointer(gp, NULL); | ||
1428 | 13 spin_unlock(&gp_lock); | ||
1429 | 14 s = get_state_synchronize_rcu(); | ||
1430 | 15 do_something_while_waiting(); | ||
1431 | 16 cond_synchronize_rcu(s); | ||
1432 | 17 kfree(p); | ||
1433 | 18 return true; | ||
1434 | 19 } | ||
1435 | </pre> | ||
1436 | </blockquote> | ||
1437 | |||
1438 | <p> | ||
1439 | On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a | ||
1440 | “cookie” from RCU, | ||
1441 | then line 15 carries out other tasks, | ||
1442 | and finally, line 16 returns immediately if a grace period has | ||
1443 | elapsed in the meantime, but otherwise waits as required. | ||
1444 | The need for <tt>get_state_synchronize_rcu()</tt> and | ||
1445 | <tt>cond_synchronize_rcu()</tt> has appeared quite recently, | ||
1446 | so it is too early to tell whether they will stand the test of time. | ||
1447 | |||
1448 | <p> | ||
1449 | RCU thus provides a range of tools to allow updaters to strike the | ||
1450 | required tradeoff between latency, flexibility and CPU overhead. | ||
1451 | |||
1452 | <h3><a name="Composability">Composability</a></h3> | ||
1453 | |||
1454 | <p> | ||
1455 | Composability has received much attention in recent years, perhaps in part | ||
1456 | due to the collision of multicore hardware with object-oriented techniques | ||
1457 | designed in single-threaded environments for single-threaded use. | ||
1458 | And in theory, RCU read-side critical sections may be composed, and in | ||
1459 | fact may be nested arbitrarily deeply. | ||
1460 | In practice, as with all real-world implementations of composable | ||
1461 | constructs, there are limitations. | ||
1462 | |||
1463 | <p> | ||
1464 | Implementations of RCU for which <tt>rcu_read_lock()</tt> | ||
1465 | and <tt>rcu_read_unlock()</tt> generate no code, such as | ||
1466 | Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be | ||
1467 | nested arbitrarily deeply. | ||
1468 | After all, there is no overhead. | ||
1469 | Except that if all these instances of <tt>rcu_read_lock()</tt> | ||
1470 | and <tt>rcu_read_unlock()</tt> are visible to the compiler, | ||
1471 | compilation will eventually fail due to exhausting memory, | ||
1472 | mass storage, or user patience, whichever comes first. | ||
1473 | If the nesting is not visible to the compiler, as is the case with | ||
1474 | mutually recursive functions each in its own translation unit, | ||
1475 | stack overflow will result. | ||
1476 | If the nesting takes the form of loops, either the control variable | ||
1477 | will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. | ||
1478 | Nevertheless, this class of RCU implementations is one | ||
1479 | of the most composable constructs in existence. | ||
1480 | |||
1481 | <p> | ||
1482 | RCU implementations that explicitly track nesting depth | ||
1483 | are limited by the nesting-depth counter. | ||
1484 | For example, the Linux kernel's preemptible RCU limits nesting to | ||
1485 | <tt>INT_MAX</tt>. | ||
1486 | This should suffice for almost all practical purposes. | ||
1487 | That said, a consecutive pair of RCU read-side critical sections | ||
1488 | between which there is an operation that waits for a grace period | ||
1489 | cannot be enclosed in another RCU read-side critical section. | ||
1490 | This is because it is not legal to wait for a grace period within | ||
1491 | an RCU read-side critical section: To do so would result either | ||
1492 | in deadlock or | ||
1493 | in RCU implicitly splitting the enclosing RCU read-side critical | ||
1494 | section, neither of which is conducive to a long-lived and prosperous | ||
1495 | kernel. | ||
1496 | |||
1497 | <p> | ||
1498 | In short, although RCU read-side critical sections are highly composable, | ||
1499 | care is required in some situations, just as is the case for any other | ||
1500 | composable synchronization mechanism. | ||
1501 | |||
1502 | <h3><a name="Corner Cases">Corner Cases</a></h3> | ||
1503 | |||
1504 | <p> | ||
1505 | A given RCU workload might have an endless and intense stream of | ||
1506 | RCU read-side critical sections, perhaps even so intense that there | ||
1507 | was never a point in time during which there was not at least one | ||
1508 | RCU read-side critical section in flight. | ||
1509 | RCU cannot allow this situation to block grace periods: As long as | ||
1510 | all the RCU read-side critical sections are finite, grace periods | ||
1511 | must also be finite. | ||
1512 | |||
1513 | <p> | ||
1514 | That said, preemptible RCU implementations could potentially result | ||
1515 | in RCU read-side critical sections being preempted for long durations, | ||
1516 | which has the effect of creating a long-duration RCU read-side | ||
1517 | critical section. | ||
1518 | This situation can arise only in heavily loaded systems, but systems using | ||
1519 | real-time priorities are of course more vulnerable. | ||
1520 | Therefore, RCU priority boosting is provided to help deal with this | ||
1521 | case. | ||
1522 | That said, the exact requirements on RCU priority boosting will likely | ||
1523 | evolve as more experience accumulates. | ||
1524 | |||
1525 | <p> | ||
1526 | Other workloads might have very high update rates. | ||
1527 | Although one can argue that such workloads should instead use | ||
1528 | something other than RCU, the fact remains that RCU must | ||
1529 | handle such workloads gracefully. | ||
1530 | This requirement is another factor driving batching of grace periods, | ||
1531 | but it is also the driving force behind the checks for large numbers | ||
1532 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. | ||
1533 | Finally, high update rates should not delay RCU read-side critical | ||
1534 | sections, although some read-side delays can occur when using | ||
1535 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use | ||
1536 | of <tt>try_stop_cpus()</tt>. | ||
1537 | (In the future, <tt>synchronize_rcu_expedited()</tt> will be | ||
1538 | converted to use lighter-weight inter-processor interrupts (IPIs), | ||
1539 | but this will still disturb readers, though to a much smaller degree.) | ||
1540 | |||
1541 | <p> | ||
1542 | Although all three of these corner cases were understood in the early | ||
1543 | 1990s, a simple user-level test consisting of <tt>close(open(path))</tt> | ||
1544 | in a tight loop | ||
1545 | in the early 2000s suddenly provided a much deeper appreciation of the | ||
1546 | high-update-rate corner case. | ||
1547 | This test also motivated addition of some RCU code to react to high update | ||
1548 | rates, for example, if a given CPU finds itself with more than 10,000 | ||
1549 | RCU callbacks queued, it will cause RCU to take evasive action by | ||
1550 | more aggressively starting grace periods and more aggressively forcing | ||
1551 | completion of grace-period processing. | ||
1552 | This evasive action causes the grace period to complete more quickly, | ||
1553 | but at the cost of restricting RCU's batching optimizations, thus | ||
1554 | increasing the CPU overhead incurred by that grace period. | ||
1555 | |||
1556 | <h2><a name="Software-Engineering Requirements"> | ||
1557 | Software-Engineering Requirements</a></h2> | ||
1558 | |||
1559 | <p> | ||
1560 | Between Murphy's Law and “To err is human”, it is necessary to | ||
1561 | guard against mishaps and misuse: | ||
1562 | |||
1563 | <ol> | ||
1564 | <li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> | ||
1565 | everywhere that it is needed, so kernels built with | ||
1566 | <tt>CONFIG_PROVE_RCU=y</tt> will splat if | ||
1567 | <tt>rcu_dereference()</tt> is used outside of an | ||
1568 | RCU read-side critical section. | ||
1569 | Update-side code can use <tt>rcu_dereference_protected()</tt>, | ||
1570 | which takes a | ||
1571 | <a href="https://lwn.net/Articles/371986/">lockdep expression</a> | ||
1572 | to indicate what is providing the protection. | ||
1573 | If the indicated protection is not provided, a lockdep splat | ||
1574 | is emitted. | ||
1575 | |||
1576 | <p> | ||
1577 | Code shared between readers and updaters can use | ||
1578 | <tt>rcu_dereference_check()</tt>, which also takes a | ||
1579 | lockdep expression, and emits a lockdep splat if neither | ||
1580 | <tt>rcu_read_lock()</tt> nor the indicated protection | ||
1581 | is in place. | ||
1582 | In addition, <tt>rcu_dereference_raw()</tt> is used in those | ||
1583 | (hopefully rare) cases where the required protection cannot | ||
1584 | be easily described. | ||
1585 | Finally, <tt>rcu_read_lock_held()</tt> is provided to | ||
1586 | allow a function to verify that it has been invoked within | ||
1587 | an RCU read-side critical section. | ||
1588 | I was made aware of this set of requirements shortly after Thomas | ||
1589 | Gleixner audited a number of RCU uses. | ||
1590 | <li> A given function might wish to check for RCU-related preconditions | ||
1591 | upon entry, before using any other RCU API. | ||
1592 | The <tt>rcu_lockdep_assert()</tt> macro does this job, | ||
1593 | asserting the expression in kernels having lockdep enabled | ||
1594 | and doing nothing otherwise. | ||
1595 | <li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> | ||
1596 | and <tt>rcu_dereference()</tt>, perhaps (incorrectly) | ||
1597 | substituting a simple assignment. | ||
1598 | To catch this sort of error, a given RCU-protected pointer may be | ||
1599 | tagged with <tt>__rcu</tt>, after which running sparse | ||
1600 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain | ||
1601 | about simple-assignment accesses to that pointer. | ||
1602 | Arnd Bergmann made me aware of this requirement, and also | ||
1603 | supplied the needed | ||
1604 | <a href="https://lwn.net/Articles/376011/">patch series</a>. | ||
1605 | <li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> | ||
1606 | will splat if a data element is passed to <tt>call_rcu()</tt> | ||
1607 | twice in a row, without a grace period in between. | ||
1608 | (This error is similar to a double free.) | ||
1609 | The corresponding <tt>rcu_head</tt> structures that are | ||
1610 | dynamically allocated are automatically tracked, but | ||
1611 | <tt>rcu_head</tt> structures allocated on the stack | ||
1612 | must be initialized with <tt>init_rcu_head_on_stack()</tt> | ||
1613 | and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. | ||
1614 | Similarly, statically allocated non-stack <tt>rcu_head</tt> | ||
1615 | structures must be initialized with <tt>init_rcu_head()</tt> | ||
1616 | and cleaned up with <tt>destroy_rcu_head()</tt>. | ||
1617 | Mathieu Desnoyers made me aware of this requirement, and also | ||
1618 | supplied the needed | ||
1619 | <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. | ||
1620 | <li> An infinite loop in an RCU read-side critical section will | ||
1621 | eventually trigger an RCU CPU stall warning splat. | ||
1622 | However, RCU is not obligated to produce this splat | ||
1623 | unless there is a grace period waiting on that particular | ||
1624 | RCU read-side critical section. | ||
1625 | This requirement made itself known in the early 1990s, pretty | ||
1626 | much the first time that it was necessary to debug a CPU stall. | ||
1627 | <li> Although it would be very good to detect pointers leaking out | ||
1628 | of RCU read-side critical sections, there is currently no | ||
1629 | good way of doing this. | ||
1630 | One complication is the need to distinguish between pointers | ||
1631 | leaking and pointers that have been handed off from RCU to | ||
1632 | some other synchronization mechanism, for example, reference | ||
1633 | counting. | ||
1634 | <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related | ||
1635 | information is provided via both debugfs and event tracing. | ||
1636 | <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and | ||
1637 | <tt>rcu_dereference()</tt> to create typical linked | ||
1638 | data structures can be surprisingly error-prone. | ||
1639 | Therefore, RCU-protected | ||
1640 | <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> | ||
1641 | and, more recently, RCU-protected | ||
1642 | <a href="https://lwn.net/Articles/612100/">hash tables</a> | ||
1643 | are available. | ||
1644 | Many other special-purpose RCU-protected data structures are | ||
1645 | available in the Linux kernel and the userspace RCU library. | ||
1646 | <li> Some linked structures are created at compile time, but still | ||
1647 | require <tt>__rcu</tt> checking. | ||
1648 | The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this | ||
1649 | purpose. | ||
1650 | <li> It is not necessary to use <tt>rcu_assign_pointer()</tt> | ||
1651 | when creating linked structures that are to be published via | ||
1652 | a single external pointer. | ||
1653 | The <tt>RCU_INIT_POINTER()</tt> macro is provided for | ||
1654 | this task and also for assigning <tt>NULL</tt> pointers | ||
1655 | at runtime. | ||
1656 | </ol> | ||
1657 | |||
1658 | <p> | ||
1659 | This is not a hard-and-fast list: RCU's diagnostic capabilities will | ||
1660 | continue to be guided by the number and type of usage bugs found | ||
1661 | in real-world RCU usage. | ||
1662 | |||
1663 | <h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> | ||
1664 | |||
1665 | <p> | ||
1666 | The Linux kernel provides an interesting environment for all kinds of | ||
1667 | software, including RCU. | ||
1668 | Some of the relevant points of interest are as follows: | ||
1669 | |||
1670 | <ol> | ||
1671 | <li> <a href="#Configuration">Configuration</a>. | ||
1672 | <li> <a href="#Firmware Interface">Firmware Interface</a>. | ||
1673 | <li> <a href="#Early Boot">Early Boot</a>. | ||
1674 | <li> <a href="#Interrupts and NMIs"> | ||
1675 | Interrupts and non-maskable interrupts (NMIs)</a>. | ||
1676 | <li> <a href="#Loadable Modules">Loadable Modules</a>. | ||
1677 | <li> <a href="#Hotplug CPU">Hotplug CPU</a>. | ||
1678 | <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. | ||
1679 | <li> <a href="#Tracing and RCU">Tracing and RCU</a>. | ||
1680 | <li> <a href="#Energy Efficiency">Energy Efficiency</a>. | ||
1681 | <li> <a href="#Performance, Scalability, Response Time, and Reliability"> | ||
1682 | Performance, Scalability, Response Time, and Reliability</a>. | ||
1683 | </ol> | ||
1684 | |||
1685 | <p> | ||
1686 | This list is probably incomplete, but it does give a feel for the | ||
1687 | most notable Linux-kernel complications. | ||
1688 | Each of the following sections covers one of the above topics. | ||
1689 | |||
1690 | <h3><a name="Configuration">Configuration</a></h3> | ||
1691 | |||
1692 | <p> | ||
1693 | RCU's goal is automatic configuration, so that almost nobody | ||
1694 | needs to worry about RCU's <tt>Kconfig</tt> options. | ||
1695 | And for almost all users, RCU does in fact work well | ||
1696 | “out of the box.” | ||
1697 | |||
1698 | <p> | ||
1699 | However, there are specialized use cases that are handled by | ||
1700 | kernel boot parameters and <tt>Kconfig</tt> options. | ||
1701 | Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users | ||
1702 | about new <tt>Kconfig</tt> options, which requires almost all of them | ||
1703 | be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. | ||
1704 | |||
1705 | <p> | ||
1706 | This all should be quite obvious, but the fact remains that | ||
1707 | Linus Torvalds recently had to | ||
1708 | <a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> | ||
1709 | me of this requirement. | ||
1710 | |||
1711 | <h3><a name="Firmware Interface">Firmware Interface</a></h3> | ||
1712 | |||
1713 | <p> | ||
1714 | In many cases, the kernel obtains information about the system from the | ||
1715 | firmware, and sometimes things are lost in translation. | ||
1716 | Or the translation is accurate, but the original message is bogus. | ||
1717 | |||
1718 | <p> | ||
1719 | For example, some systems' firmware overreports the number of CPUs, | ||
1720 | sometimes by a large factor. | ||
1721 | If RCU naively believed the firmware, as it used to do, | ||
1722 | it would create too many per-CPU kthreads. | ||
1723 | Although the resulting system will still run correctly, the extra | ||
1724 | kthreads needlessly consume memory and can cause confusion | ||
1725 | when they show up in <tt>ps</tt> listings. | ||
1726 | |||
1727 | <p> | ||
1728 | RCU must therefore wait for a given CPU to actually come online before | ||
1729 | it can allow itself to believe that the CPU actually exists. | ||
1730 | The resulting “ghost CPUs” (which are never going to | ||
1731 | come online) cause a number of | ||
1732 | <a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. | ||
1733 | |||
1734 | <h3><a name="Early Boot">Early Boot</a></h3> | ||
1735 | |||
1736 | <p> | ||
1737 | The Linux kernel's boot sequence is an interesting process, | ||
1738 | and RCU is used early, even before <tt>rcu_init()</tt> | ||
1739 | is invoked. | ||
1740 | In fact, a number of RCU's primitives can be used as soon as the | ||
1741 | initial task's <tt>task_struct</tt> is available and the | ||
1742 | boot CPU's per-CPU variables are set up. | ||
1743 | The read-side primitives (<tt>rcu_read_lock()</tt>, | ||
1744 | <tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, | ||
1745 | and <tt>rcu_access_pointer()</tt>) will operate normally very early on, | ||
1746 | as will <tt>rcu_assign_pointer()</tt>. | ||
1747 | |||
1748 | <p> | ||
1749 | Although <tt>call_rcu()</tt> may be invoked at any | ||
1750 | time during boot, callbacks are not guaranteed to be invoked until after | ||
1751 | the scheduler is fully up and running. | ||
1752 | This delay in callback invocation is due to the fact that RCU does not | ||
1753 | invoke callbacks until it is fully initialized, and this full initialization | ||
1754 | cannot occur until after the scheduler has initialized itself to the | ||
1755 | point where RCU can spawn and run its kthreads. | ||
1756 | In theory, it would be possible to invoke callbacks earlier, | ||
1757 | however, this is not a panacea because there would be severe restrictions | ||
1758 | on what operations those callbacks could invoke. | ||
1759 | |||
1760 | <p> | ||
1761 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, | ||
1762 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> | ||
1763 | (<a href="#Bottom-Half Flavor">discussed below</a>), | ||
1764 | and | ||
1765 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> | ||
1766 | will all operate normally | ||
1767 | during very early boot, the reason being that there is only one CPU | ||
1768 | and preemption is disabled. | ||
1769 | This means that the call <tt>synchronize_rcu()</tt> (or friends) | ||
1770 | itself is a quiescent | ||
1771 | state and thus a grace period, so the early-boot implementation can | ||
1772 | be a no-op. | ||
1773 | |||
1774 | <p> | ||
1775 | Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> | ||
1776 | continue to operate normally through the remainder of boot, courtesy | ||
1777 | of the fact that preemption is disabled across their RCU read-side | ||
1778 | critical sections and also courtesy of the fact that there is still | ||
1779 | only one CPU. | ||
1780 | However, once the scheduler starts initializing, preemption is enabled. | ||
1781 | There is still only a single CPU, but the fact that preemption is enabled | ||
1782 | means that the no-op implementation of <tt>synchronize_rcu()</tt> no | ||
1783 | longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. | ||
1784 | Therefore, as soon as the scheduler starts initializing, the early-boot | ||
1785 | fastpath is disabled. | ||
1786 | This means that <tt>synchronize_rcu()</tt> switches to its runtime | ||
1787 | mode of operation where it posts callbacks, which in turn means that | ||
1788 | any call to <tt>synchronize_rcu()</tt> will block until the corresponding | ||
1789 | callback is invoked. | ||
1790 | Unfortunately, the callback cannot be invoked until RCU's runtime | ||
1791 | grace-period machinery is up and running, which cannot happen until | ||
1792 | the scheduler has initialized itself sufficiently to allow RCU's | ||
1793 | kthreads to be spawned. | ||
1794 | Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler | ||
1795 | initialization can result in deadlock. | ||
1796 | |||
1797 | <p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> | ||
1798 | So what happens with <tt>synchronize_rcu()</tt> during | ||
1799 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | ||
1800 | kernels? | ||
1801 | <br><a href="#qq14answer">Answer</a> | ||
1802 | |||
1803 | <p> | ||
1804 | I learned of these boot-time requirements as a result of a series of | ||
1805 | system hangs. | ||
1806 | |||
1807 | <h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> | ||
1808 | |||
1809 | <p> | ||
1810 | The Linux kernel has interrupts, and RCU read-side critical sections are | ||
1811 | legal within interrupt handlers and within interrupt-disabled regions | ||
1812 | of code, as are invocations of <tt>call_rcu()</tt>. | ||
1813 | |||
1814 | <p> | ||
1815 | Some Linux-kernel architectures can enter an interrupt handler from | ||
1816 | non-idle process context, and then just never leave it, instead stealthily | ||
1817 | transitioning back to process context. | ||
1818 | This trick is sometimes used to invoke system calls from inside the kernel. | ||
1819 | These “half-interrupts” mean that RCU has to be very careful | ||
1820 | about how it counts interrupt nesting levels. | ||
1821 | I learned of this requirement the hard way during a rewrite | ||
1822 | of RCU's dyntick-idle code. | ||
1823 | |||
1824 | <p> | ||
1825 | The Linux kernel has non-maskable interrupts (NMIs), and | ||
1826 | RCU read-side critical sections are legal within NMI handlers. | ||
1827 | Thankfully, RCU update-side primitives, including | ||
1828 | <tt>call_rcu()</tt>, are prohibited within NMI handlers. | ||
1829 | |||
1830 | <p> | ||
1831 | The name notwithstanding, some Linux-kernel architectures | ||
1832 | can have nested NMIs, which RCU must handle correctly. | ||
1833 | Andy Lutomirski | ||
1834 | <a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> | ||
1835 | with this requirement; | ||
1836 | he also kindly surprised me with | ||
1837 | <a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> | ||
1838 | that meets this requirement. | ||
1839 | |||
1840 | <h3><a name="Loadable Modules">Loadable Modules</a></h3> | ||
1841 | |||
1842 | <p> | ||
1843 | The Linux kernel has loadable modules, and these modules can | ||
1844 | also be unloaded. | ||
1845 | After a given module has been unloaded, any attempt to call | ||
1846 | one of its functions results in a segmentation fault. | ||
1847 | The module-unload functions must therefore cancel any | ||
1848 | delayed calls to loadable-module functions, for example, | ||
1849 | any outstanding <tt>mod_timer()</tt> must be dealt with | ||
1850 | via <tt>del_timer_sync()</tt> or similar. | ||
1851 | |||
1852 | <p> | ||
1853 | Unfortunately, there is no way to cancel an RCU callback; | ||
1854 | once you invoke <tt>call_rcu()</tt>, the callback function is | ||
1855 | going to eventually be invoked, unless the system goes down first. | ||
1856 | Because it is normally considered socially irresponsible to crash the system | ||
1857 | in response to a module unload request, we need some other way | ||
1858 | to deal with in-flight RCU callbacks. | ||
1859 | |||
1860 | <p> | ||
1861 | RCU therefore provides | ||
1862 | <tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, | ||
1863 | which waits until all in-flight RCU callbacks have been invoked. | ||
1864 | If a module uses <tt>call_rcu()</tt>, its exit function should therefore | ||
1865 | prevent any future invocation of <tt>call_rcu()</tt>, then invoke | ||
1866 | <tt>rcu_barrier()</tt>. | ||
1867 | In theory, the underlying module-unload code could invoke | ||
1868 | <tt>rcu_barrier()</tt> unconditionally, but in practice this would | ||
1869 | incur unacceptable latencies. | ||
1870 | |||
1871 | <p> | ||
1872 | Nikita Danilov noted this requirement for an analogous filesystem-unmount | ||
1873 | situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. | ||
1874 | The need for <tt>rcu_barrier()</tt> for module unloading became | ||
1875 | apparent later. | ||
1876 | |||
1877 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> | ||
1878 | |||
1879 | <p> | ||
1880 | The Linux kernel supports CPU hotplug, which means that CPUs | ||
1881 | can come and go. | ||
1882 | It is of course illegal to use any RCU API member from an offline CPU. | ||
1883 | This requirement was present from day one in DYNIX/ptx, but | ||
1884 | on the other hand, the Linux kernel's CPU-hotplug implementation | ||
1885 | is “interesting.” | ||
1886 | |||
1887 | <p> | ||
1888 | The Linux-kernel CPU-hotplug implementation has notifiers that | ||
1889 | are used to allow the various kernel subsystems (including RCU) | ||
1890 | to respond appropriately to a given CPU-hotplug operation. | ||
1891 | Most RCU operations may be invoked from CPU-hotplug notifiers, | ||
1892 | including even normal synchronous grace-period operations | ||
1893 | such as <tt>synchronize_rcu()</tt>. | ||
1894 | However, expedited grace-period operations such as | ||
1895 | <tt>synchronize_rcu_expedited()</tt> are not supported, | ||
1896 | due to the fact that current implementations block CPU-hotplug | ||
1897 | operations, which could result in deadlock. | ||
1898 | |||
1899 | <p> | ||
1900 | In addition, all-callback-wait operations such as | ||
1901 | <tt>rcu_barrier()</tt> are also not supported, due to the | ||
1902 | fact that there are phases of CPU-hotplug operations where | ||
1903 | the outgoing CPU's callbacks will not be invoked until after | ||
1904 | the CPU-hotplug operation ends, which could also result in deadlock. | ||
1905 | |||
1906 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> | ||
1907 | |||
1908 | <p> | ||
1909 | RCU depends on the scheduler, and the scheduler uses RCU to | ||
1910 | protect some of its data structures. | ||
1911 | This means the scheduler is forbidden from acquiring | ||
1912 | the runqueue locks and the priority-inheritance locks | ||
1913 | in the middle of an outermost RCU read-side critical section unless | ||
1914 | it also releases them before exiting that same | ||
1915 | RCU read-side critical section. | ||
1916 | This same prohibition also applies to any lock that is acquired | ||
1917 | while holding any lock to which this prohibition applies. | ||
1918 | Violating this rule results in deadlock. | ||
1919 | |||
1920 | <p> | ||
1921 | For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> | ||
1922 | implementation must be written carefully to avoid similar deadlocks. | ||
1923 | In particular, <tt>rcu_read_unlock()</tt> must tolerate an | ||
1924 | interrupt where the interrupt handler invokes both | ||
1925 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
1926 | This possibility requires <tt>rcu_read_unlock()</tt> to use | ||
1927 | negative nesting levels to avoid destructive recursion via | ||
1928 | interrupt handler's use of RCU. | ||
1929 | |||
1930 | <p> | ||
1931 | This pair of mutual scheduler-RCU requirements came as a | ||
1932 | <a href="https://lwn.net/Articles/453002/">complete surprise</a>. | ||
1933 | |||
1934 | <p> | ||
1935 | As noted above, RCU makes use of kthreads, and it is necessary to | ||
1936 | avoid excessive CPU-time accumulation by these kthreads. | ||
1937 | This requirement was no surprise, but RCU's violation of it | ||
1938 | when running context-switch-heavy workloads when built with | ||
1939 | <tt>CONFIG_NO_HZ_FULL=y</tt> | ||
1940 | <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. | ||
1941 | RCU has made good progress towards meeting this requirement, even | ||
1942 | for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, | ||
1943 | but there is room for further improvement. | ||
1944 | |||
1945 | <h3><a name="Tracing and RCU">Tracing and RCU</a></h3> | ||
1946 | |||
1947 | <p> | ||
1948 | It is possible to use tracing on RCU code, but tracing itself | ||
1949 | uses RCU. | ||
1950 | For this reason, <tt>rcu_dereference_raw_notrace()</tt> | ||
1951 | is provided for use by tracing, which avoids the destructive | ||
1952 | recursion that could otherwise ensue. | ||
1953 | This API is also used by virtualization in some architectures, | ||
1954 | where RCU readers execute in environments in which tracing | ||
1955 | cannot be used. | ||
1956 | The tracing folks both located the requirement and provided the | ||
1957 | needed fix, so this surprise requirement was relatively painless. | ||
1958 | |||
1959 | <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> | ||
1960 | |||
1961 | <p> | ||
1962 | Interrupting idle CPUs is considered socially unacceptable, | ||
1963 | especially by people with battery-powered embedded systems. | ||
1964 | RCU therefore conserves energy by detecting which CPUs are | ||
1965 | idle, including tracking CPUs that have been interrupted from idle. | ||
1966 | This is a large part of the energy-efficiency requirement, | ||
1967 | so I learned of this via an irate phone call. | ||
1968 | |||
1969 | <p> | ||
1970 | Because RCU avoids interrupting idle CPUs, it is illegal to | ||
1971 | execute an RCU read-side critical section on an idle CPU. | ||
1972 | (Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat | ||
1973 | if you try it.) | ||
1974 | The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> | ||
1975 | event tracing is provided to work around this restriction. | ||
1976 | In addition, <tt>rcu_is_watching()</tt> may be used to | ||
1977 | test whether or not it is currently legal to run RCU read-side | ||
1978 | critical sections on this CPU. | ||
1979 | I learned of the need for diagnostics on the one hand | ||
1980 | and <tt>RCU_NONIDLE()</tt> on the other while inspecting | ||
1981 | idle-loop code. | ||
1982 | Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, | ||
1983 | which is used quite heavily in the idle loop. | ||
1984 | |||
1985 | <p> | ||
1986 | It is similarly socially unacceptable to interrupt an | ||
1987 | <tt>nohz_full</tt> CPU running in userspace. | ||
1988 | RCU must therefore track <tt>nohz_full</tt> userspace | ||
1989 | execution. | ||
1990 | And in | ||
1991 | <a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> | ||
1992 | kernels, RCU must separately track idle CPUs on the one hand and | ||
1993 | CPUs that are either idle or executing in userspace on the other. | ||
1994 | In both cases, RCU must be able to sample state at two points in | ||
1995 | time, and be able to determine whether or not some other CPU spent | ||
1996 | any time idle and/or executing in userspace. | ||
1997 | |||
1998 | <p> | ||
1999 | These energy-efficiency requirements have proven quite difficult to | ||
2000 | understand and to meet; for example, there have been more than five | ||
2001 | clean-sheet rewrites of RCU's energy-efficiency code, the last of | ||
2002 | which was finally able to demonstrate | ||
2003 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. | ||
2004 | As noted earlier, | ||
2005 | I learned of many of these requirements via angry phone calls: | ||
2006 | Flaming me on the Linux-kernel mailing list was apparently not | ||
2007 | sufficient to fully vent their ire at RCU's energy-efficiency bugs! | ||
2008 | |||
2009 | <h3><a name="Performance, Scalability, Response Time, and Reliability"> | ||
2010 | Performance, Scalability, Response Time, and Reliability</a></h3> | ||
2011 | |||
2012 | <p> | ||
2013 | Expanding on the | ||
2014 | <a href="#Performance and Scalability">earlier discussion</a>, | ||
2015 | RCU is used heavily by hot code paths in performance-critical | ||
2016 | portions of the Linux kernel's networking, security, virtualization, | ||
2017 | and scheduling code paths. | ||
2018 | RCU must therefore use efficient implementations, especially in its | ||
2019 | read-side primitives. | ||
2020 | To that end, it would be good if preemptible RCU's implementation | ||
2021 | of <tt>rcu_read_lock()</tt> could be inlined, however, doing | ||
2022 | this requires resolving <tt>#include</tt> issues with the | ||
2023 | <tt>task_struct</tt> structure. | ||
2024 | |||
2025 | <p> | ||
2026 | The Linux kernel supports hardware configurations with up to | ||
2027 | 4096 CPUs, which means that RCU must be extremely scalable. | ||
2028 | Algorithms that involve frequent acquisitions of global locks or | ||
2029 | frequent atomic operations on global variables simply cannot be | ||
2030 | tolerated within the RCU implementation. | ||
2031 | RCU therefore makes heavy use of a combining tree based on the | ||
2032 | <tt>rcu_node</tt> structure. | ||
2033 | RCU is required to tolerate all CPUs continuously invoking any | ||
2034 | combination of RCU's runtime primitives with minimal per-operation | ||
2035 | overhead. | ||
2036 | In fact, in many cases, increasing load must <i>decrease</i> the | ||
2037 | per-operation overhead, witness the batching optimizations for | ||
2038 | <tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, | ||
2039 | <tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. | ||
2040 | As a general rule, RCU must cheerfully accept whatever the | ||
2041 | rest of the Linux kernel decides to throw at it. | ||
2042 | |||
2043 | <p> | ||
2044 | The Linux kernel is used for real-time workloads, especially | ||
2045 | in conjunction with the | ||
2046 | <a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. | ||
2047 | The real-time-latency response requirements are such that the | ||
2048 | traditional approach of disabling preemption across RCU | ||
2049 | read-side critical sections is inappropriate. | ||
2050 | Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore | ||
2051 | use an RCU implementation that allows RCU read-side critical | ||
2052 | sections to be preempted. | ||
2053 | This requirement made its presence known after users made it | ||
2054 | clear that an earlier | ||
2055 | <a href="https://lwn.net/Articles/107930/">real-time patch</a> | ||
2056 | did not meet their needs, in conjunction with some | ||
2057 | <a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> | ||
2058 | encountered by a very early version of the -rt patchset. | ||
2059 | |||
2060 | <p> | ||
2061 | In addition, RCU must make do with a sub-100-microsecond real-time latency | ||
2062 | budget. | ||
2063 | In fact, on smaller systems with the -rt patchset, the Linux kernel | ||
2064 | provides sub-20-microsecond real-time latencies for the whole kernel, | ||
2065 | including RCU. | ||
2066 | RCU's scalability and latency must therefore be sufficient for | ||
2067 | these sorts of configurations. | ||
2068 | To my surprise, the sub-100-microsecond real-time latency budget | ||
2069 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> | ||
2070 | applies to even the largest systems [PDF]</a>, | ||
2071 | up to and including systems with 4096 CPUs. | ||
2072 | This real-time requirement motivated the grace-period kthread, which | ||
2073 | also simplified handling of a number of race conditions. | ||
2074 | |||
2075 | <p> | ||
2076 | Finally, RCU's status as a synchronization primitive means that | ||
2077 | any RCU failure can result in arbitrary memory corruption that can be | ||
2078 | extremely difficult to debug. | ||
2079 | This means that RCU must be extremely reliable, which in | ||
2080 | practice also means that RCU must have an aggressive stress-test | ||
2081 | suite. | ||
2082 | This stress-test suite is called <tt>rcutorture</tt>. | ||
2083 | |||
2084 | <p> | ||
2085 | Although the need for <tt>rcutorture</tt> was no surprise, | ||
2086 | the current immense popularity of the Linux kernel is posing | ||
2087 | interesting—and perhaps unprecedented—validation | ||
2088 | challenges. | ||
2089 | To see this, keep in mind that there are well over one billion | ||
2090 | instances of the Linux kernel running today, given Android | ||
2091 | smartphones, Linux-powered televisions, and servers. | ||
2092 | This number can be expected to increase sharply with the advent of | ||
2093 | the celebrated Internet of Things. | ||
2094 | |||
2095 | <p> | ||
2096 | Suppose that RCU contains a race condition that manifests on average | ||
2097 | once per million years of runtime. | ||
2098 | This bug will be occurring about three times per <i>day</i> across | ||
2099 | the installed base. | ||
2100 | RCU could simply hide behind hardware error rates, given that no one | ||
2101 | should really expect their smartphone to last for a million years. | ||
2102 | However, anyone taking too much comfort from this thought should | ||
2103 | consider the fact that in most jurisdictions, a successful multi-year | ||
2104 | test of a given mechanism, which might include a Linux kernel, | ||
2105 | suffices for a number of types of safety-critical certifications. | ||
2106 | In fact, rumor has it that the Linux kernel is already being used | ||
2107 | in production for safety-critical applications. | ||
2108 | I don't know about you, but I would feel quite bad if a bug in RCU | ||
2109 | killed someone. | ||
2110 | Which might explain my recent focus on validation and verification. | ||
2111 | |||
2112 | <h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> | ||
2113 | |||
2114 | <p> | ||
2115 | One of the more surprising things about RCU is that there are now | ||
2116 | no fewer than five <i>flavors</i>, or API families. | ||
2117 | In addition, the primary flavor that has been the sole focus up to | ||
2118 | this point has two different implementations, non-preemptible and | ||
2119 | preemptible. | ||
2120 | The other four flavors are listed below, with requirements for each | ||
2121 | described in a separate section. | ||
2122 | |||
2123 | <ol> | ||
2124 | <li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> | ||
2125 | <li> <a href="#Sched Flavor">Sched Flavor</a> | ||
2126 | <li> <a href="#Sleepable RCU">Sleepable RCU</a> | ||
2127 | <li> <a href="#Tasks RCU">Tasks RCU</a> | ||
2128 | </ol> | ||
2129 | |||
2130 | <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> | ||
2131 | |||
2132 | <p> | ||
2133 | The softirq-disable (AKA “bottom-half”, | ||
2134 | hence the “_bh” abbreviations) | ||
2135 | flavor of RCU, or <i>RCU-bh</i>, was developed by | ||
2136 | Dipankar Sarma to provide a flavor of RCU that could withstand the | ||
2137 | network-based denial-of-service attacks researched by Robert | ||
2138 | Olsson. | ||
2139 | These attacks placed so much networking load on the system | ||
2140 | that some of the CPUs never exited softirq execution, | ||
2141 | which in turn prevented those CPUs from ever executing a context switch, | ||
2142 | which, in the RCU implementation of that time, prevented grace periods | ||
2143 | from ever ending. | ||
2144 | The result was an out-of-memory condition and a system hang. | ||
2145 | |||
2146 | <p> | ||
2147 | The solution was the creation of RCU-bh, which does | ||
2148 | <tt>local_bh_disable()</tt> | ||
2149 | across its read-side critical sections, and which uses the transition | ||
2150 | from one type of softirq processing to another as a quiescent state | ||
2151 | in addition to context switch, idle, user mode, and offline. | ||
2152 | This means that RCU-bh grace periods can complete even when some of | ||
2153 | the CPUs execute in softirq indefinitely, thus allowing algorithms | ||
2154 | based on RCU-bh to withstand network-based denial-of-service attacks. | ||
2155 | |||
2156 | <p> | ||
2157 | Because | ||
2158 | <tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> | ||
2159 | disable and re-enable softirq handlers, any attempt to start a softirq | ||
2160 | handler during the | ||
2161 | RCU-bh read-side critical section will be deferred. | ||
2162 | In this case, <tt>rcu_read_unlock_bh()</tt> | ||
2163 | will invoke softirq processing, which can take considerable time. | ||
2164 | One can of course argue that this softirq overhead should be associated | ||
2165 | with the code following the RCU-bh read-side critical section rather | ||
2166 | than <tt>rcu_read_unlock_bh()</tt>, but the fact | ||
2167 | is that most profiling tools cannot be expected to make this sort | ||
2168 | of fine distinction. | ||
2169 | For example, suppose that a three-millisecond-long RCU-bh read-side | ||
2170 | critical section executes during a time of heavy networking load. | ||
2171 | There will very likely be an attempt to invoke at least one softirq | ||
2172 | handler during that three milliseconds, but any such invocation will | ||
2173 | be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. | ||
2174 | This can of course make it appear at first glance as if | ||
2175 | <tt>rcu_read_unlock_bh()</tt> was executing very slowly. | ||
2176 | |||
2177 | <p> | ||
2178 | The | ||
2179 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> | ||
2180 | includes | ||
2181 | <tt>rcu_read_lock_bh()</tt>, | ||
2182 | <tt>rcu_read_unlock_bh()</tt>, | ||
2183 | <tt>rcu_dereference_bh()</tt>, | ||
2184 | <tt>rcu_dereference_bh_check()</tt>, | ||
2185 | <tt>synchronize_rcu_bh()</tt>, | ||
2186 | <tt>synchronize_rcu_bh_expedited()</tt>, | ||
2187 | <tt>call_rcu_bh()</tt>, | ||
2188 | <tt>rcu_barrier_bh()</tt>, and | ||
2189 | <tt>rcu_read_lock_bh_held()</tt>. | ||
2190 | |||
2191 | <h3><a name="Sched Flavor">Sched Flavor</a></h3> | ||
2192 | |||
2193 | <p> | ||
2194 | Before preemptible RCU, waiting for an RCU grace period had the | ||
2195 | side effect of also waiting for all pre-existing interrupt | ||
2196 | and NMI handlers. | ||
2197 | However, there are legitimate preemptible-RCU implementations that | ||
2198 | do not have this property, given that any point in the code outside | ||
2199 | of an RCU read-side critical section can be a quiescent state. | ||
2200 | Therefore, <i>RCU-sched</i> was created, which follows “classic” | ||
2201 | RCU in that an RCU-sched grace period waits for pre-existing | ||
2202 | interrupt and NMI handlers. | ||
2203 | In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched | ||
2204 | APIs have identical implementations, while kernels built with | ||
2205 | <tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. | ||
2206 | |||
2207 | <p> | ||
2208 | Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
2209 | <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> | ||
2210 | disable and re-enable preemption, respectively. | ||
2211 | This means that if there was a preemption attempt during the | ||
2212 | RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> | ||
2213 | will enter the scheduler, with all the latency and overhead entailed. | ||
2214 | Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look | ||
2215 | as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. | ||
2216 | However, the highest-priority task won't be preempted, so that task | ||
2217 | will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. | ||
2218 | |||
2219 | <p> | ||
2220 | The | ||
2221 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> | ||
2222 | includes | ||
2223 | <tt>rcu_read_lock_sched()</tt>, | ||
2224 | <tt>rcu_read_unlock_sched()</tt>, | ||
2225 | <tt>rcu_read_lock_sched_notrace()</tt>, | ||
2226 | <tt>rcu_read_unlock_sched_notrace()</tt>, | ||
2227 | <tt>rcu_dereference_sched()</tt>, | ||
2228 | <tt>rcu_dereference_sched_check()</tt>, | ||
2229 | <tt>synchronize_sched()</tt>, | ||
2230 | <tt>synchronize_rcu_sched_expedited()</tt>, | ||
2231 | <tt>call_rcu_sched()</tt>, | ||
2232 | <tt>rcu_barrier_sched()</tt>, and | ||
2233 | <tt>rcu_read_lock_sched_held()</tt>. | ||
2234 | However, anything that disables preemption also marks an RCU-sched | ||
2235 | read-side critical section, including | ||
2236 | <tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, | ||
2237 | <tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, | ||
2238 | and so on. | ||
2239 | |||
2240 | <h3><a name="Sleepable RCU">Sleepable RCU</a></h3> | ||
2241 | |||
2242 | <p> | ||
2243 | For well over a decade, someone saying “I need to block within | ||
2244 | an RCU read-side critical section” was a reliable indication | ||
2245 | that this someone did not understand RCU. | ||
2246 | After all, if you are always blocking in an RCU read-side critical | ||
2247 | section, you can probably afford to use a higher-overhead synchronization | ||
2248 | mechanism. | ||
2249 | However, that changed with the advent of the Linux kernel's notifiers, | ||
2250 | whose RCU read-side critical | ||
2251 | sections almost never sleep, but sometimes need to. | ||
2252 | This resulted in the introduction of | ||
2253 | <a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, | ||
2254 | or <i>SRCU</i>. | ||
2255 | |||
2256 | <p> | ||
2257 | SRCU allows different domains to be defined, with each such domain | ||
2258 | defined by an instance of an <tt>srcu_struct</tt> structure. | ||
2259 | A pointer to this structure must be passed in to each SRCU function, | ||
2260 | for example, <tt>synchronize_srcu(&ss)</tt>, where | ||
2261 | <tt>ss</tt> is the <tt>srcu_struct</tt> structure. | ||
2262 | The key benefit of these domains is that a slow SRCU reader in one | ||
2263 | domain does not delay an SRCU grace period in some other domain. | ||
2264 | That said, one consequence of these domains is that read-side code | ||
2265 | must pass a “cookie” from <tt>srcu_read_lock()</tt> | ||
2266 | to <tt>srcu_read_unlock()</tt>, for example, as follows: | ||
2267 | |||
2268 | <blockquote> | ||
2269 | <pre> | ||
2270 | 1 int idx; | ||
2271 | 2 | ||
2272 | 3 idx = srcu_read_lock(&ss); | ||
2273 | 4 do_something(); | ||
2274 | 5 srcu_read_unlock(&ss, idx); | ||
2275 | </pre> | ||
2276 | </blockquote> | ||
2277 | |||
2278 | <p> | ||
2279 | As noted above, it is legal to block within SRCU read-side critical sections, | ||
2280 | however, with great power comes great responsibility. | ||
2281 | If you block forever in one of a given domain's SRCU read-side critical | ||
2282 | sections, then that domain's grace periods will also be blocked forever. | ||
2283 | Of course, one good way to block forever is to deadlock, which can | ||
2284 | happen if any operation in a given domain's SRCU read-side critical | ||
2285 | section can block waiting, either directly or indirectly, for that domain's | ||
2286 | grace period to elapse. | ||
2287 | For example, this results in a self-deadlock: | ||
2288 | |||
2289 | <blockquote> | ||
2290 | <pre> | ||
2291 | 1 int idx; | ||
2292 | 2 | ||
2293 | 3 idx = srcu_read_lock(&ss); | ||
2294 | 4 do_something(); | ||
2295 | 5 synchronize_srcu(&ss); | ||
2296 | 6 srcu_read_unlock(&ss, idx); | ||
2297 | </pre> | ||
2298 | </blockquote> | ||
2299 | |||
2300 | <p> | ||
2301 | However, if line 5 acquired a mutex that was held across | ||
2302 | a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, | ||
2303 | deadlock would still be possible. | ||
2304 | Furthermore, if line 5 acquired a mutex that was held across | ||
2305 | a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, | ||
2306 | and if an <tt>ss1</tt>-domain SRCU read-side critical section | ||
2307 | acquired another mutex that was held across an <tt>ss</tt>-domain | ||
2308 | <tt>synchronize_srcu()</tt>, | ||
2309 | deadlock would again be possible. | ||
2310 | Such a deadlock cycle could extend across an arbitrarily large number | ||
2311 | of different SRCU domains. | ||
2312 | Again, with great power comes great responsibility. | ||
2313 | |||
2314 | <p> | ||
2315 | Unlike the other RCU flavors, SRCU read-side critical sections can | ||
2316 | run on idle and even offline CPUs. | ||
2317 | This ability requires that <tt>srcu_read_lock()</tt> and | ||
2318 | <tt>srcu_read_unlock()</tt> contain memory barriers, which means | ||
2319 | that SRCU readers will run a bit slower than would RCU readers. | ||
2320 | It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> | ||
2321 | API, which, in combination with <tt>srcu_read_unlock()</tt>, | ||
2322 | guarantees a full memory barrier. | ||
2323 | |||
2324 | <p> | ||
2325 | The | ||
2326 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> | ||
2327 | includes | ||
2328 | <tt>srcu_read_lock()</tt>, | ||
2329 | <tt>srcu_read_unlock()</tt>, | ||
2330 | <tt>srcu_dereference()</tt>, | ||
2331 | <tt>srcu_dereference_check()</tt>, | ||
2332 | <tt>synchronize_srcu()</tt>, | ||
2333 | <tt>synchronize_srcu_expedited()</tt>, | ||
2334 | <tt>call_srcu()</tt>, | ||
2335 | <tt>srcu_barrier()</tt>, and | ||
2336 | <tt>srcu_read_lock_held()</tt>. | ||
2337 | It also includes | ||
2338 | <tt>DEFINE_SRCU()</tt>, | ||
2339 | <tt>DEFINE_STATIC_SRCU()</tt>, and | ||
2340 | <tt>init_srcu_struct()</tt> | ||
2341 | APIs for defining and initializing <tt>srcu_struct</tt> structures. | ||
2342 | |||
2343 | <h3><a name="Tasks RCU">Tasks RCU</a></h3> | ||
2344 | |||
2345 | <p> | ||
2346 | Some forms of tracing use “trampolines” to handle the | ||
2347 | binary rewriting required to install different types of probes. | ||
2348 | It would be good to be able to free old trampolines, which sounds | ||
2349 | like a job for some form of RCU. | ||
2350 | However, because it is necessary to be able to install a trace | ||
2351 | anywhere in the code, it is not possible to use read-side markers | ||
2352 | such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
2353 | In addition, it does not work to have these markers in the trampoline | ||
2354 | itself, because there would need to be instructions following | ||
2355 | <tt>rcu_read_unlock()</tt>. | ||
2356 | Although <tt>synchronize_rcu()</tt> would guarantee that execution | ||
2357 | reached the <tt>rcu_read_unlock()</tt>, it would not be able to | ||
2358 | guarantee that execution had completely left the trampoline. | ||
2359 | |||
2360 | <p> | ||
2361 | The solution, in the form of | ||
2362 | <a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, | ||
2363 | is to have implicit | ||
2364 | read-side critical sections that are delimited by voluntary context | ||
2365 | switches, that is, calls to <tt>schedule()</tt>, | ||
2366 | <tt>cond_resched_rcu_qs()</tt>, and | ||
2367 | <tt>synchronize_rcu_tasks()</tt>. | ||
2368 | In addition, transitions to and from userspace execution also delimit | ||
2369 | tasks-RCU read-side critical sections. | ||
2370 | |||
2371 | <p> | ||
2372 | The tasks-RCU API is quite compact, consisting only of | ||
2373 | <tt>call_rcu_tasks()</tt>, | ||
2374 | <tt>synchronize_rcu_tasks()</tt>, and | ||
2375 | <tt>rcu_barrier_tasks()</tt>. | ||
2376 | |||
2377 | <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> | ||
2378 | |||
2379 | <p> | ||
2380 | One of the tricks that RCU uses to attain update-side scalability is | ||
2381 | to increase grace-period latency with increasing numbers of CPUs. | ||
2382 | If this becomes a serious problem, it will be necessary to rework the | ||
2383 | grace-period state machine so as to avoid the need for the additional | ||
2384 | latency. | ||
2385 | |||
2386 | <p> | ||
2387 | Expedited grace periods scan the CPUs, so their latency and overhead | ||
2388 | increases with increasing numbers of CPUs. | ||
2389 | If this becomes a serious problem on large systems, it will be necessary | ||
2390 | to do some redesign to avoid this scalability problem. | ||
2391 | |||
2392 | <p> | ||
2393 | RCU disables CPU hotplug in a few places, perhaps most notably in the | ||
2394 | expedited grace-period and <tt>rcu_barrier()</tt> operations. | ||
2395 | If there is a strong reason to use expedited grace periods in CPU-hotplug | ||
2396 | notifiers, it will be necessary to avoid disabling CPU hotplug. | ||
2397 | This would introduce some complexity, so there had better be a <i>very</i> | ||
2398 | good reason. | ||
2399 | |||
2400 | <p> | ||
2401 | The tradeoff between grace-period latency on the one hand and interruptions | ||
2402 | of other CPUs on the other hand may need to be re-examined. | ||
2403 | The desire is of course for zero grace-period latency as well as zero | ||
2404 | interprocessor interrupts undertaken during an expedited grace period | ||
2405 | operation. | ||
2406 | While this ideal is unlikely to be achievable, it is quite possible that | ||
2407 | further improvements can be made. | ||
2408 | |||
2409 | <p> | ||
2410 | The multiprocessor implementations of RCU use a combining tree that | ||
2411 | groups CPUs so as to reduce lock contention and increase cache locality. | ||
2412 | However, this combining tree does not spread its memory across NUMA | ||
2413 | nodes nor does it align the CPU groups with hardware features such | ||
2414 | as sockets or cores. | ||
2415 | Such spreading and alignment is currently believed to be unnecessary | ||
2416 | because the hotpath read-side primitives do not access the combining | ||
2417 | tree, nor does <tt>call_rcu()</tt> in the common case. | ||
2418 | If you believe that your architecture needs such spreading and alignment, | ||
2419 | then your architecture should also benefit from the | ||
2420 | <tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set | ||
2421 | to the number of CPUs in a socket, NUMA node, or whatever. | ||
2422 | If the number of CPUs is too large, use a fraction of the number of | ||
2423 | CPUs. | ||
2424 | If the number of CPUs is a large prime number, well, that certainly | ||
2425 | is an “interesting” architectural choice! | ||
2426 | More flexible arrangements might be considered, but only if | ||
2427 | <tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only | ||
2428 | if the inadequacy has been demonstrated by a carefully run and | ||
2429 | realistic system-level workload. | ||
2430 | |||
2431 | <p> | ||
2432 | Please note that arrangements that require RCU to remap CPU numbers will | ||
2433 | require extremely good demonstration of need and full exploration of | ||
2434 | alternatives. | ||
2435 | |||
2436 | <p> | ||
2437 | There is an embarrassingly large number of flavors of RCU, and this | ||
2438 | number has been increasing over time. | ||
2439 | Perhaps it will be possible to combine some at some future date. | ||
2440 | |||
2441 | <p> | ||
2442 | RCU's various kthreads are reasonably recent additions. | ||
2443 | It is quite likely that adjustments will be required to more gracefully | ||
2444 | handle extreme loads. | ||
2445 | It might also be necessary to be able to relate CPU utilization by | ||
2446 | RCU's kthreads and softirq handlers to the code that instigated this | ||
2447 | CPU utilization. | ||
2448 | For example, RCU callback overhead might be charged back to the | ||
2449 | originating <tt>call_rcu()</tt> instance, though probably not | ||
2450 | in production kernels. | ||
2451 | |||
2452 | <h2><a name="Summary">Summary</a></h2> | ||
2453 | |||
2454 | <p> | ||
2455 | This document has presented more than two decades' worth of RCU | ||
2456 | requirements. | ||
2457 | Given that the requirements keep changing, this will not be the last | ||
2458 | word on this subject, but at least it serves to get an important | ||
2459 | subset of the requirements set forth. | ||
2460 | |||
2461 | <h2><a name="Acknowledgments">Acknowledgments</a></h2> | ||
2462 | |||
2463 | I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, | ||
2464 | Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and | ||
2465 | Andy Lutomirski for their help in rendering | ||
2466 | this article human readable, and to Michelle Rankin for her support | ||
2467 | of this effort. | ||
2468 | Other contributions are acknowledged in the Linux kernel's git archive. | ||
2469 | The cartoon is copyright (c) 2013 by Melissa Broussard, | ||
2470 | and is provided | ||
2471 | under the terms of the Creative Commons Attribution-Share Alike 3.0 | ||
2472 | United States license. | ||
2473 | |||
2474 | <h3><a name="Answers to Quick Quizzes"> | ||
2475 | Answers to Quick Quizzes</a></h3> | ||
2476 | |||
2477 | <a name="qq1answer"></a> | ||
2478 | <p><b>Quick Quiz 1</b>: | ||
2479 | Wait a minute! | ||
2480 | You said that updaters can make useful forward progress concurrently | ||
2481 | with readers, but pre-existing readers will block | ||
2482 | <tt>synchronize_rcu()</tt>!!! | ||
2483 | Just who are you trying to fool??? | ||
2484 | |||
2485 | |||
2486 | </p><p><b>Answer</b>: | ||
2487 | First, if updaters do not wish to be blocked by readers, they can use | ||
2488 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will | ||
2489 | be discussed later. | ||
2490 | Second, even when using <tt>synchronize_rcu()</tt>, the other | ||
2491 | update-side code does run concurrently with readers, whether pre-existing | ||
2492 | or not. | ||
2493 | |||
2494 | |||
2495 | </p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> | ||
2496 | |||
2497 | <a name="qq2answer"></a> | ||
2498 | <p><b>Quick Quiz 2</b>: | ||
2499 | Why is the <tt>synchronize_rcu()</tt> on line 28 needed? | ||
2500 | |||
2501 | |||
2502 | </p><p><b>Answer</b>: | ||
2503 | Without that extra grace period, memory reordering could result in | ||
2504 | <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> | ||
2505 | concurrently with the last bits of <tt>recovery()</tt>. | ||
2506 | |||
2507 | |||
2508 | </p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> | ||
2509 | |||
2510 | <a name="qq3answer"></a> | ||
2511 | <p><b>Quick Quiz 3</b>: | ||
2512 | But <tt>rcu_assign_pointer()</tt> does nothing to prevent the | ||
2513 | two assignments to <tt>p->a</tt> and <tt>p->b</tt> | ||
2514 | from being reordered. | ||
2515 | Can't that also cause problems? | ||
2516 | |||
2517 | |||
2518 | </p><p><b>Answer</b>: | ||
2519 | No, it cannot. | ||
2520 | The readers cannot see either of these two fields until | ||
2521 | the assignment to <tt>gp</tt>, by which time both fields are | ||
2522 | fully initialized. | ||
2523 | So reordering the assignments | ||
2524 | to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly | ||
2525 | cause any problems. | ||
2526 | |||
2527 | |||
2528 | </p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> | ||
2529 | |||
2530 | <a name="qq4answer"></a> | ||
2531 | <p><b>Quick Quiz 4</b>: | ||
2532 | Without the <tt>rcu_dereference()</tt> or the | ||
2533 | <tt>rcu_access_pointer()</tt>, what destructive optimizations | ||
2534 | might the compiler make use of? | ||
2535 | |||
2536 | |||
2537 | </p><p><b>Answer</b>: | ||
2538 | Let's start with what happens to <tt>do_something_gp()</tt> | ||
2539 | if it fails to use <tt>rcu_dereference()</tt>. | ||
2540 | It could reuse a value formerly fetched from this same pointer. | ||
2541 | It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time | ||
2542 | manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise | ||
2543 | mash-up of two distinct pointer values. | ||
2544 | It might even use value-speculation optimizations, where it makes a wrong | ||
2545 | guess, but by the time it gets around to checking the value, an update | ||
2546 | has changed the pointer to match the wrong guess. | ||
2547 | Too bad about any dereferences that returned pre-initialization garbage | ||
2548 | in the meantime! | ||
2549 | |||
2550 | <p> | ||
2551 | For <tt>remove_gp_synchronous()</tt>, as long as all modifications | ||
2552 | to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, | ||
2553 | the above optimizations are harmless. | ||
2554 | However, | ||
2555 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, | ||
2556 | <tt>sparse</tt> will complain if you | ||
2557 | define <tt>gp</tt> with <tt>__rcu</tt> and then | ||
2558 | access it without using | ||
2559 | either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. | ||
2560 | |||
2561 | |||
2562 | </p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> | ||
2563 | |||
2564 | <a name="qq5answer"></a> | ||
2565 | <p><b>Quick Quiz 5</b>: | ||
2566 | Given that multiple CPUs can start RCU read-side critical sections | ||
2567 | at any time without any ordering whatsoever, how can RCU possibly tell whether | ||
2568 | or not a given RCU read-side critical section starts before a | ||
2569 | given instance of <tt>synchronize_rcu()</tt>? | ||
2570 | |||
2571 | |||
2572 | </p><p><b>Answer</b>: | ||
2573 | If RCU cannot tell whether or not a given | ||
2574 | RCU read-side critical section starts before a | ||
2575 | given instance of <tt>synchronize_rcu()</tt>, | ||
2576 | then it must assume that the RCU read-side critical section | ||
2577 | started first. | ||
2578 | In other words, a given instance of <tt>synchronize_rcu()</tt> | ||
2579 | can avoid waiting on a given RCU read-side critical section only | ||
2580 | if it can prove that <tt>synchronize_rcu()</tt> started first. | ||
2581 | |||
2582 | |||
2583 | </p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> | ||
2584 | |||
2585 | <a name="qq6answer"></a> | ||
2586 | <p><b>Quick Quiz 6</b>: | ||
2587 | The first and second guarantees require unbelievably strict ordering! | ||
2588 | Are all these memory barriers <i> really</i> required? | ||
2589 | |||
2590 | |||
2591 | </p><p><b>Answer</b>: | ||
2592 | Yes, they really are required. | ||
2593 | To see why the first guarantee is required, consider the following | ||
2594 | sequence of events: | ||
2595 | |||
2596 | <ol> | ||
2597 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
2598 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
2599 | /* Very likely to return p. */</tt> | ||
2600 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
2601 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
2602 | <li> CPU 1: <tt>do_something_with(q->a); | ||
2603 | /* No smp_mb(), so might happen after kfree(). */</tt> | ||
2604 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
2605 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
2606 | <li> CPU 0: <tt>kfree(p);</tt> | ||
2607 | </ol> | ||
2608 | |||
2609 | <p> | ||
2610 | Therefore, there absolutely must be a full memory barrier between the | ||
2611 | end of the RCU read-side critical section and the end of the | ||
2612 | grace period. | ||
2613 | |||
2614 | <p> | ||
2615 | The sequence of events demonstrating the necessity of the second rule | ||
2616 | is roughly similar: | ||
2617 | |||
2618 | <ol> | ||
2619 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
2620 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
2621 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
2622 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
2623 | /* Might return p if no memory barrier. */</tt> | ||
2624 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
2625 | <li> CPU 0: <tt>kfree(p);</tt> | ||
2626 | <li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> | ||
2627 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
2628 | </ol> | ||
2629 | |||
2630 | <p> | ||
2631 | And similarly, without a memory barrier between the beginning of the | ||
2632 | grace period and the beginning of the RCU read-side critical section, | ||
2633 | CPU 1 might end up accessing the freelist. | ||
2634 | |||
2635 | <p> | ||
2636 | The “as if” rule of course applies, so that any implementation | ||
2637 | that acts as if the appropriate memory barriers were in place is a | ||
2638 | correct implementation. | ||
2639 | That said, it is much easier to fool yourself into believing that you have | ||
2640 | adhered to the as-if rule than it is to actually adhere to it! | ||
2641 | |||
2642 | |||
2643 | </p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> | ||
2644 | |||
2645 | <a name="qq7answer"></a> | ||
2646 | <p><b>Quick Quiz 7</b>: | ||
2647 | But how does the upgrade-to-write operation exclude other readers? | ||
2648 | |||
2649 | |||
2650 | </p><p><b>Answer</b>: | ||
2651 | It doesn't, just like normal RCU updates, which also do not exclude | ||
2652 | RCU readers. | ||
2653 | |||
2654 | |||
2655 | </p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> | ||
2656 | |||
2657 | <a name="qq8answer"></a> | ||
2658 | <p><b>Quick Quiz 8</b>: | ||
2659 | Can't the compiler also reorder this code? | ||
2660 | |||
2661 | |||
2662 | </p><p><b>Answer</b>: | ||
2663 | No, the volatile casts in <tt>READ_ONCE()</tt> and | ||
2664 | <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in | ||
2665 | this particular case. | ||
2666 | |||
2667 | |||
2668 | </p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> | ||
2669 | |||
2670 | <a name="qq9answer"></a> | ||
2671 | <p><b>Quick Quiz 9</b>: | ||
2672 | Suppose that synchronize_rcu() did wait until all readers had completed. | ||
2673 | Would the updater be able to rely on this? | ||
2674 | |||
2675 | |||
2676 | </p><p><b>Answer</b>: | ||
2677 | No. | ||
2678 | Even if <tt>synchronize_rcu()</tt> were to wait until | ||
2679 | all readers had completed, a new reader might start immediately after | ||
2680 | <tt>synchronize_rcu()</tt> completed. | ||
2681 | Therefore, the code following | ||
2682 | <tt>synchronize_rcu()</tt> cannot rely on there being no readers | ||
2683 | in any case. | ||
2684 | |||
2685 | |||
2686 | </p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> | ||
2687 | |||
2688 | <a name="qq10answer"></a> | ||
2689 | <p><b>Quick Quiz 10</b>: | ||
2690 | How long a sequence of grace periods, each separated by an RCU read-side | ||
2691 | critical section, would be required to partition the RCU read-side | ||
2692 | critical sections at the beginning and end of the chain? | ||
2693 | |||
2694 | |||
2695 | </p><p><b>Answer</b>: | ||
2696 | In theory, an infinite number. | ||
2697 | In practice, an unknown number that is sensitive to both implementation | ||
2698 | details and timing considerations. | ||
2699 | Therefore, even in practice, RCU users must abide by the theoretical rather | ||
2700 | than the practical answer. | ||
2701 | |||
2702 | |||
2703 | </p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> | ||
2704 | |||
2705 | <a name="qq11answer"></a> | ||
2706 | <p><b>Quick Quiz 11</b>: | ||
2707 | What about sleeping locks? | ||
2708 | |||
2709 | |||
2710 | </p><p><b>Answer</b>: | ||
2711 | These are forbidden within Linux-kernel RCU read-side critical sections | ||
2712 | because it is not legal to place a quiescent state (in this case, | ||
2713 | voluntary context switch) within an RCU read-side critical section. | ||
2714 | However, sleeping locks may be used within userspace RCU read-side critical | ||
2715 | sections, and also within Linux-kernel sleepable RCU | ||
2716 | <a href="#Sleepable RCU">(SRCU)</a> | ||
2717 | read-side critical sections. | ||
2718 | In addition, the -rt patchset turns spinlocks into sleeping locks so | ||
2719 | that the corresponding critical sections can be preempted, which | ||
2720 | also means that these sleeplockified spinlocks (but not other sleeping locks!) | ||
2721 | may be acquired within -rt-Linux-kernel RCU read-side critical sections. | ||
2722 | |||
2723 | <p> | ||
2724 | Note that it <i>is</i> legal for a normal RCU read-side critical section | ||
2725 | to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), | ||
2726 | but only as long as it does not loop indefinitely attempting to | ||
2727 | conditionally acquire that sleeping lock. | ||
2728 | The key point is that things like <tt>mutex_trylock()</tt> | ||
2729 | either return with the mutex held, or return an error indication if | ||
2730 | the mutex was not immediately available. | ||
2731 | Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. | ||
2732 | |||
2733 | |||
2734 | </p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> | ||
2735 | |||
2736 | <a name="qq12answer"></a> | ||
2737 | <p><b>Quick Quiz 12</b>: | ||
2738 | Why does line 19 use <tt>rcu_access_pointer()</tt>? | ||
2739 | After all, <tt>call_rcu()</tt> on line 25 stores into the | ||
2740 | structure, which would interact badly with concurrent insertions. | ||
2741 | Doesn't this mean that <tt>rcu_dereference()</tt> is required? | ||
2742 | |||
2743 | |||
2744 | </p><p><b>Answer</b>: | ||
2745 | Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes | ||
2746 | any changes, including any insertions that <tt>rcu_dereference()</tt> | ||
2747 | would protect against. | ||
2748 | Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> | ||
2749 | is released on line 25, which in turn means that | ||
2750 | <tt>rcu_access_pointer()</tt> suffices. | ||
2751 | |||
2752 | |||
2753 | </p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> | ||
2754 | |||
2755 | <a name="qq13answer"></a> | ||
2756 | <p><b>Quick Quiz 13</b>: | ||
2757 | Earlier it was claimed that <tt>call_rcu()</tt> and | ||
2758 | <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked | ||
2759 | by readers. | ||
2760 | But how can that be correct, given that the invocation of the callback | ||
2761 | and the freeing of the memory (respectively) must still wait for | ||
2762 | a grace period to elapse? | ||
2763 | |||
2764 | |||
2765 | </p><p><b>Answer</b>: | ||
2766 | We could define things this way, but keep in mind that this sort of | ||
2767 | definition would say that updates in garbage-collected languages | ||
2768 | cannot complete until the next time the garbage collector runs, | ||
2769 | which does not seem at all reasonable. | ||
2770 | The key point is that in most cases, an updater using either | ||
2771 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the | ||
2772 | next update as soon as it has invoked <tt>call_rcu()</tt> or | ||
2773 | <tt>kfree_rcu()</tt>, without having to wait for a subsequent | ||
2774 | grace period. | ||
2775 | |||
2776 | |||
2777 | </p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> | ||
2778 | |||
2779 | <a name="qq14answer"></a> | ||
2780 | <p><b>Quick Quiz 14</b>: | ||
2781 | So what happens with <tt>synchronize_rcu()</tt> during | ||
2782 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | ||
2783 | kernels? | ||
2784 | |||
2785 | |||
2786 | </p><p><b>Answer</b>: | ||
2787 | In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> | ||
2788 | maps directly to <tt>synchronize_sched()</tt>. | ||
2789 | Therefore, <tt>synchronize_rcu()</tt> works normally throughout | ||
2790 | boot in <tt>CONFIG_PREEMPT=n</tt> kernels. | ||
2791 | However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
2792 | so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> | ||
2793 | during scheduler initialization. | ||
2794 | |||
2795 | |||
2796 | </p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> | ||
2797 | |||
2798 | |||
2799 | </body></html> | ||
diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx new file mode 100644 index 000000000000..1168010c39fe --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx | |||
@@ -0,0 +1,2643 @@ | |||
1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" | ||
2 | "http://www.w3.org/TR/html4/loose.dtd"> | ||
3 | <html> | ||
4 | <head><title>A Tour Through RCU's Requirements [LWN.net]</title> | ||
5 | <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> | ||
6 | |||
7 | <h1>A Tour Through RCU's Requirements</h1> | ||
8 | |||
9 | <p>Copyright IBM Corporation, 2015</p> | ||
10 | <p>Author: Paul E. McKenney</p> | ||
11 | <p><i>The initial version of this document appeared in the | ||
12 | <a href="https://lwn.net/">LWN</a> articles | ||
13 | <a href="https://lwn.net/Articles/652156/">here</a>, | ||
14 | <a href="https://lwn.net/Articles/652677/">here</a>, and | ||
15 | <a href="https://lwn.net/Articles/653326/">here</a>.</i></p> | ||
16 | |||
17 | <h2>Introduction</h2> | ||
18 | |||
19 | <p> | ||
20 | Read-copy update (RCU) is a synchronization mechanism that is often | ||
21 | used as a replacement for reader-writer locking. | ||
22 | RCU is unusual in that updaters do not block readers, | ||
23 | which means that RCU's read-side primitives can be exceedingly fast | ||
24 | and scalable. | ||
25 | In addition, updaters can make useful forward progress concurrently | ||
26 | with readers. | ||
27 | However, all this concurrency between RCU readers and updaters does raise | ||
28 | the question of exactly what RCU readers are doing, which in turn | ||
29 | raises the question of exactly what RCU's requirements are. | ||
30 | |||
31 | <p> | ||
32 | This document therefore summarizes RCU's requirements, and can be thought | ||
33 | of as an informal, high-level specification for RCU. | ||
34 | It is important to understand that RCU's specification is primarily | ||
35 | empirical in nature; | ||
36 | in fact, I learned about many of these requirements the hard way. | ||
37 | This situation might cause some consternation. However, not only | ||
38 | has this learning process been a lot of fun, but it has also been | ||
39 | a great privilege to work with so many people willing to apply | ||
40 | technologies in interesting new ways. | ||
41 | |||
42 | <p> | ||
43 | All that aside, here are the categories of currently known RCU requirements: | ||
44 | </p> | ||
45 | |||
46 | <ol> | ||
47 | <li> <a href="#Fundamental Requirements"> | ||
48 | Fundamental Requirements</a> | ||
49 | <li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> | ||
50 | <li> <a href="#Parallelism Facts of Life"> | ||
51 | Parallelism Facts of Life</a> | ||
52 | <li> <a href="#Quality-of-Implementation Requirements"> | ||
53 | Quality-of-Implementation Requirements</a> | ||
54 | <li> <a href="#Linux Kernel Complications"> | ||
55 | Linux Kernel Complications</a> | ||
56 | <li> <a href="#Software-Engineering Requirements"> | ||
57 | Software-Engineering Requirements</a> | ||
58 | <li> <a href="#Other RCU Flavors"> | ||
59 | Other RCU Flavors</a> | ||
60 | <li> <a href="#Possible Future Changes"> | ||
61 | Possible Future Changes</a> | ||
62 | </ol> | ||
63 | |||
64 | <p> | ||
65 | This is followed by a <a href="#Summary">summary</a>, | ||
66 | which is in turn followed by the inevitable | ||
67 | <a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. | ||
68 | |||
69 | <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> | ||
70 | |||
71 | <p> | ||
72 | RCU's fundamental requirements are the closest thing RCU has to hard | ||
73 | mathematical requirements. | ||
74 | These are: | ||
75 | |||
76 | <ol> | ||
77 | <li> <a href="#Grace-Period Guarantee"> | ||
78 | Grace-Period Guarantee</a> | ||
79 | <li> <a href="#Publish-Subscribe Guarantee"> | ||
80 | Publish-Subscribe Guarantee</a> | ||
81 | <li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> | ||
82 | RCU Primitives Guaranteed to Execute Unconditionally</a> | ||
83 | <li> <a href="#Guaranteed Read-to-Write Upgrade"> | ||
84 | Guaranteed Read-to-Write Upgrade</a> | ||
85 | </ol> | ||
86 | |||
87 | <h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> | ||
88 | |||
89 | <p> | ||
90 | RCU's grace-period guarantee is unusual in being premeditated: | ||
91 | Jack Slingwine and I had this guarantee firmly in mind when we started | ||
92 | work on RCU (then called “rclock”) in the early 1990s. | ||
93 | That said, the past two decades of experience with RCU have produced | ||
94 | a much more detailed understanding of this guarantee. | ||
95 | |||
96 | <p> | ||
97 | RCU's grace-period guarantee allows updaters to wait for the completion | ||
98 | of all pre-existing RCU read-side critical sections. | ||
99 | An RCU read-side critical section | ||
100 | begins with the marker <tt>rcu_read_lock()</tt> and ends with | ||
101 | the marker <tt>rcu_read_unlock()</tt>. | ||
102 | These markers may be nested, and RCU treats a nested set as one | ||
103 | big RCU read-side critical section. | ||
104 | Production-quality implementations of <tt>rcu_read_lock()</tt> and | ||
105 | <tt>rcu_read_unlock()</tt> are extremely lightweight, and in | ||
106 | fact have exactly zero overhead in Linux kernels built for production | ||
107 | use with <tt>CONFIG_PREEMPT=n</tt>. | ||
108 | |||
109 | <p> | ||
110 | This guarantee allows ordering to be enforced with extremely low | ||
111 | overhead to readers, for example: | ||
112 | |||
113 | <blockquote> | ||
114 | <pre> | ||
115 | 1 int x, y; | ||
116 | 2 | ||
117 | 3 void thread0(void) | ||
118 | 4 { | ||
119 | 5 rcu_read_lock(); | ||
120 | 6 r1 = READ_ONCE(x); | ||
121 | 7 r2 = READ_ONCE(y); | ||
122 | 8 rcu_read_unlock(); | ||
123 | 9 } | ||
124 | 10 | ||
125 | 11 void thread1(void) | ||
126 | 12 { | ||
127 | 13 WRITE_ONCE(x, 1); | ||
128 | 14 synchronize_rcu(); | ||
129 | 15 WRITE_ONCE(y, 1); | ||
130 | 16 } | ||
131 | </pre> | ||
132 | </blockquote> | ||
133 | |||
134 | <p> | ||
135 | Because the <tt>synchronize_rcu()</tt> on line 14 waits for | ||
136 | all pre-existing readers, any instance of <tt>thread0()</tt> that | ||
137 | loads a value of zero from <tt>x</tt> must complete before | ||
138 | <tt>thread1()</tt> stores to <tt>y</tt>, so that instance must | ||
139 | also load a value of zero from <tt>y</tt>. | ||
140 | Similarly, any instance of <tt>thread0()</tt> that loads a value of | ||
141 | one from <tt>y</tt> must have started after the | ||
142 | <tt>synchronize_rcu()</tt> started, and must therefore also load | ||
143 | a value of one from <tt>x</tt>. | ||
144 | Therefore, the outcome: | ||
145 | <blockquote> | ||
146 | <pre> | ||
147 | (r1 == 0 && r2 == 1) | ||
148 | </pre> | ||
149 | </blockquote> | ||
150 | cannot happen. | ||
151 | |||
152 | <p>@@QQ@@ | ||
153 | Wait a minute! | ||
154 | You said that updaters can make useful forward progress concurrently | ||
155 | with readers, but pre-existing readers will block | ||
156 | <tt>synchronize_rcu()</tt>!!! | ||
157 | Just who are you trying to fool??? | ||
158 | <p>@@QQA@@ | ||
159 | First, if updaters do not wish to be blocked by readers, they can use | ||
160 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will | ||
161 | be discussed later. | ||
162 | Second, even when using <tt>synchronize_rcu()</tt>, the other | ||
163 | update-side code does run concurrently with readers, whether pre-existing | ||
164 | or not. | ||
165 | <p>@@QQE@@ | ||
166 | |||
167 | <p> | ||
168 | This scenario resembles one of the first uses of RCU in | ||
169 | <a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, | ||
170 | which managed a distributed lock manager's transition into | ||
171 | a state suitable for handling recovery from node failure, | ||
172 | more or less as follows: | ||
173 | |||
174 | <blockquote> | ||
175 | <pre> | ||
176 | 1 #define STATE_NORMAL 0 | ||
177 | 2 #define STATE_WANT_RECOVERY 1 | ||
178 | 3 #define STATE_RECOVERING 2 | ||
179 | 4 #define STATE_WANT_NORMAL 3 | ||
180 | 5 | ||
181 | 6 int state = STATE_NORMAL; | ||
182 | 7 | ||
183 | 8 void do_something_dlm(void) | ||
184 | 9 { | ||
185 | 10 int state_snap; | ||
186 | 11 | ||
187 | 12 rcu_read_lock(); | ||
188 | 13 state_snap = READ_ONCE(state); | ||
189 | 14 if (state_snap == STATE_NORMAL) | ||
190 | 15 do_something(); | ||
191 | 16 else | ||
192 | 17 do_something_carefully(); | ||
193 | 18 rcu_read_unlock(); | ||
194 | 19 } | ||
195 | 20 | ||
196 | 21 void start_recovery(void) | ||
197 | 22 { | ||
198 | 23 WRITE_ONCE(state, STATE_WANT_RECOVERY); | ||
199 | 24 synchronize_rcu(); | ||
200 | 25 WRITE_ONCE(state, STATE_RECOVERING); | ||
201 | 26 recovery(); | ||
202 | 27 WRITE_ONCE(state, STATE_WANT_NORMAL); | ||
203 | 28 synchronize_rcu(); | ||
204 | 29 WRITE_ONCE(state, STATE_NORMAL); | ||
205 | 30 } | ||
206 | </pre> | ||
207 | </blockquote> | ||
208 | |||
209 | <p> | ||
210 | The RCU read-side critical section in <tt>do_something_dlm()</tt> | ||
211 | works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> | ||
212 | to guarantee that <tt>do_something()</tt> never runs concurrently | ||
213 | with <tt>recovery()</tt>, but with little or no synchronization | ||
214 | overhead in <tt>do_something_dlm()</tt>. | ||
215 | |||
216 | <p>@@QQ@@ | ||
217 | Why is the <tt>synchronize_rcu()</tt> on line 28 needed? | ||
218 | <p>@@QQA@@ | ||
219 | Without that extra grace period, memory reordering could result in | ||
220 | <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> | ||
221 | concurrently with the last bits of <tt>recovery()</tt>. | ||
222 | <p>@@QQE@@ | ||
223 | |||
224 | <p> | ||
225 | In order to avoid fatal problems such as deadlocks, | ||
226 | an RCU read-side critical section must not contain calls to | ||
227 | <tt>synchronize_rcu()</tt>. | ||
228 | Similarly, an RCU read-side critical section must not | ||
229 | contain anything that waits, directly or indirectly, on completion of | ||
230 | an invocation of <tt>synchronize_rcu()</tt>. | ||
231 | |||
232 | <p> | ||
233 | Although RCU's grace-period guarantee is useful in and of itself, with | ||
234 | <a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, | ||
235 | it would be good to be able to use RCU to coordinate read-side | ||
236 | access to linked data structures. | ||
237 | For this, the grace-period guarantee is not sufficient, as can | ||
238 | be seen in function <tt>add_gp_buggy()</tt> below. | ||
239 | We will look at the reader's code later, but in the meantime, just think of | ||
240 | the reader as locklessly picking up the <tt>gp</tt> pointer, | ||
241 | and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the | ||
242 | <tt>->a</tt> and <tt>->b</tt> fields. | ||
243 | |||
244 | <blockquote> | ||
245 | <pre> | ||
246 | 1 bool add_gp_buggy(int a, int b) | ||
247 | 2 { | ||
248 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
249 | 4 if (!p) | ||
250 | 5 return -ENOMEM; | ||
251 | 6 spin_lock(&gp_lock); | ||
252 | 7 if (rcu_access_pointer(gp)) { | ||
253 | 8 spin_unlock(&gp_lock); | ||
254 | 9 return false; | ||
255 | 10 } | ||
256 | 11 p->a = a; | ||
257 | 12 p->b = b; | ||
258 | 13 gp = p; /* ORDERING BUG */ | ||
259 | 14 spin_unlock(&gp_lock); | ||
260 | 15 return true; | ||
261 | 16 } | ||
262 | </pre> | ||
263 | </blockquote> | ||
264 | |||
265 | <p> | ||
266 | The problem is that both the compiler and weakly ordered CPUs are within | ||
267 | their rights to reorder this code as follows: | ||
268 | |||
269 | <blockquote> | ||
270 | <pre> | ||
271 | 1 bool add_gp_buggy_optimized(int a, int b) | ||
272 | 2 { | ||
273 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
274 | 4 if (!p) | ||
275 | 5 return -ENOMEM; | ||
276 | 6 spin_lock(&gp_lock); | ||
277 | 7 if (rcu_access_pointer(gp)) { | ||
278 | 8 spin_unlock(&gp_lock); | ||
279 | 9 return false; | ||
280 | 10 } | ||
281 | <b>11 gp = p; /* ORDERING BUG */ | ||
282 | 12 p->a = a; | ||
283 | 13 p->b = b;</b> | ||
284 | 14 spin_unlock(&gp_lock); | ||
285 | 15 return true; | ||
286 | 16 } | ||
287 | </pre> | ||
288 | </blockquote> | ||
289 | |||
290 | <p> | ||
291 | If an RCU reader fetches <tt>gp</tt> just after | ||
292 | <tt>add_gp_buggy_optimized</tt> executes line 11, | ||
293 | it will see garbage in the <tt>->a</tt> and <tt>->b</tt> | ||
294 | fields. | ||
295 | And this is but one of many ways in which compiler and hardware optimizations | ||
296 | could cause trouble. | ||
297 | Therefore, we clearly need some way to prevent the compiler and the CPU from | ||
298 | reordering in this manner, which brings us to the publish-subscribe | ||
299 | guarantee discussed in the next section. | ||
300 | |||
301 | <h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> | ||
302 | |||
303 | <p> | ||
304 | RCU's publish-subscribe guarantee allows data to be inserted | ||
305 | into a linked data structure without disrupting RCU readers. | ||
306 | The updater uses <tt>rcu_assign_pointer()</tt> to insert the | ||
307 | new data, and readers use <tt>rcu_dereference()</tt> to | ||
308 | access data, whether new or old. | ||
309 | The following shows an example of insertion: | ||
310 | |||
311 | <blockquote> | ||
312 | <pre> | ||
313 | 1 bool add_gp(int a, int b) | ||
314 | 2 { | ||
315 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
316 | 4 if (!p) | ||
317 | 5 return -ENOMEM; | ||
318 | 6 spin_lock(&gp_lock); | ||
319 | 7 if (rcu_access_pointer(gp)) { | ||
320 | 8 spin_unlock(&gp_lock); | ||
321 | 9 return false; | ||
322 | 10 } | ||
323 | 11 p->a = a; | ||
324 | 12 p->b = b; | ||
325 | 13 rcu_assign_pointer(gp, p); | ||
326 | 14 spin_unlock(&gp_lock); | ||
327 | 15 return true; | ||
328 | 16 } | ||
329 | </pre> | ||
330 | </blockquote> | ||
331 | |||
332 | <p> | ||
333 | The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually | ||
334 | equivalent to a simple assignment statement, but also guarantees | ||
335 | that its assignment will | ||
336 | happen after the two assignments in lines 11 and 12, | ||
337 | similar to the C11 <tt>memory_order_release</tt> store operation. | ||
338 | It also prevents any number of “interesting” compiler | ||
339 | optimizations, for example, the use of <tt>gp</tt> as a scratch | ||
340 | location immediately preceding the assignment. | ||
341 | |||
342 | <p>@@QQ@@ | ||
343 | But <tt>rcu_assign_pointer()</tt> does nothing to prevent the | ||
344 | two assignments to <tt>p->a</tt> and <tt>p->b</tt> | ||
345 | from being reordered. | ||
346 | Can't that also cause problems? | ||
347 | <p>@@QQA@@ | ||
348 | No, it cannot. | ||
349 | The readers cannot see either of these two fields until | ||
350 | the assignment to <tt>gp</tt>, by which time both fields are | ||
351 | fully initialized. | ||
352 | So reordering the assignments | ||
353 | to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly | ||
354 | cause any problems. | ||
355 | <p>@@QQE@@ | ||
356 | |||
357 | <p> | ||
358 | It is tempting to assume that the reader need not do anything special | ||
359 | to control its accesses to the RCU-protected data, | ||
360 | as shown in <tt>do_something_gp_buggy()</tt> below: | ||
361 | |||
362 | <blockquote> | ||
363 | <pre> | ||
364 | 1 bool do_something_gp_buggy(void) | ||
365 | 2 { | ||
366 | 3 rcu_read_lock(); | ||
367 | 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ | ||
368 | 5 if (p) { | ||
369 | 6 do_something(p->a, p->b); | ||
370 | 7 rcu_read_unlock(); | ||
371 | 8 return true; | ||
372 | 9 } | ||
373 | 10 rcu_read_unlock(); | ||
374 | 11 return false; | ||
375 | 12 } | ||
376 | </pre> | ||
377 | </blockquote> | ||
378 | |||
379 | <p> | ||
380 | However, this temptation must be resisted because there are a | ||
381 | surprisingly large number of ways that the compiler | ||
382 | (to say nothing of | ||
383 | <a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) | ||
384 | can trip this code up. | ||
385 | For but one example, if the compiler were short of registers, it | ||
386 | might choose to refetch from <tt>gp</tt> rather than keeping | ||
387 | a separate copy in <tt>p</tt> as follows: | ||
388 | |||
389 | <blockquote> | ||
390 | <pre> | ||
391 | 1 bool do_something_gp_buggy_optimized(void) | ||
392 | 2 { | ||
393 | 3 rcu_read_lock(); | ||
394 | 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ | ||
395 | <b> 5 do_something(gp->a, gp->b);</b> | ||
396 | 6 rcu_read_unlock(); | ||
397 | 7 return true; | ||
398 | 8 } | ||
399 | 9 rcu_read_unlock(); | ||
400 | 10 return false; | ||
401 | 11 } | ||
402 | </pre> | ||
403 | </blockquote> | ||
404 | |||
405 | <p> | ||
406 | If this function ran concurrently with a series of updates that | ||
407 | replaced the current structure with a new one, | ||
408 | the fetches of <tt>gp->a</tt> | ||
409 | and <tt>gp->b</tt> might well come from two different structures, | ||
410 | which could cause serious confusion. | ||
411 | To prevent this (and much else besides), <tt>do_something_gp()</tt> uses | ||
412 | <tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: | ||
413 | |||
414 | <blockquote> | ||
415 | <pre> | ||
416 | 1 bool do_something_gp(void) | ||
417 | 2 { | ||
418 | 3 rcu_read_lock(); | ||
419 | 4 p = rcu_dereference(gp); | ||
420 | 5 if (p) { | ||
421 | 6 do_something(p->a, p->b); | ||
422 | 7 rcu_read_unlock(); | ||
423 | 8 return true; | ||
424 | 9 } | ||
425 | 10 rcu_read_unlock(); | ||
426 | 11 return false; | ||
427 | 12 } | ||
428 | </pre> | ||
429 | </blockquote> | ||
430 | |||
431 | <p> | ||
432 | The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) | ||
433 | memory barriers in the Linux kernel. | ||
434 | Should a | ||
435 | <a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> | ||
436 | ever appear, then <tt>rcu_dereference()</tt> could be implemented | ||
437 | as a <tt>memory_order_consume</tt> load. | ||
438 | Regardless of the exact implementation, a pointer fetched by | ||
439 | <tt>rcu_dereference()</tt> may not be used outside of the | ||
440 | outermost RCU read-side critical section containing that | ||
441 | <tt>rcu_dereference()</tt>, unless protection of | ||
442 | the corresponding data element has been passed from RCU to some | ||
443 | other synchronization mechanism, most commonly locking or | ||
444 | <a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. | ||
445 | |||
446 | <p> | ||
447 | In short, updaters use <tt>rcu_assign_pointer()</tt> and readers | ||
448 | use <tt>rcu_dereference()</tt>, and these two RCU API elements | ||
449 | work together to ensure that readers have a consistent view of | ||
450 | newly added data elements. | ||
451 | |||
452 | <p> | ||
453 | Of course, it is also necessary to remove elements from RCU-protected | ||
454 | data structures, for example, using the following process: | ||
455 | |||
456 | <ol> | ||
457 | <li> Remove the data element from the enclosing structure. | ||
458 | <li> Wait for all pre-existing RCU read-side critical sections | ||
459 | to complete (because only pre-existing readers can possibly have | ||
460 | a reference to the newly removed data element). | ||
461 | <li> At this point, only the updater has a reference to the | ||
462 | newly removed data element, so it can safely reclaim | ||
463 | the data element, for example, by passing it to <tt>kfree()</tt>. | ||
464 | </ol> | ||
465 | |||
466 | This process is implemented by <tt>remove_gp_synchronous()</tt>: | ||
467 | |||
468 | <blockquote> | ||
469 | <pre> | ||
470 | 1 bool remove_gp_synchronous(void) | ||
471 | 2 { | ||
472 | 3 struct foo *p; | ||
473 | 4 | ||
474 | 5 spin_lock(&gp_lock); | ||
475 | 6 p = rcu_access_pointer(gp); | ||
476 | 7 if (!p) { | ||
477 | 8 spin_unlock(&gp_lock); | ||
478 | 9 return false; | ||
479 | 10 } | ||
480 | 11 rcu_assign_pointer(gp, NULL); | ||
481 | 12 spin_unlock(&gp_lock); | ||
482 | 13 synchronize_rcu(); | ||
483 | 14 kfree(p); | ||
484 | 15 return true; | ||
485 | 16 } | ||
486 | </pre> | ||
487 | </blockquote> | ||
488 | |||
489 | <p> | ||
490 | This function is straightforward, with line 13 waiting for a grace | ||
491 | period before line 14 frees the old data element. | ||
492 | This waiting ensures that readers will reach line 7 of | ||
493 | <tt>do_something_gp()</tt> before the data element referenced by | ||
494 | <tt>p</tt> is freed. | ||
495 | The <tt>rcu_access_pointer()</tt> on line 6 is similar to | ||
496 | <tt>rcu_dereference()</tt>, except that: | ||
497 | |||
498 | <ol> | ||
499 | <li> The value returned by <tt>rcu_access_pointer()</tt> | ||
500 | cannot be dereferenced. | ||
501 | If you want to access the value pointed to as well as | ||
502 | the pointer itself, use <tt>rcu_dereference()</tt> | ||
503 | instead of <tt>rcu_access_pointer()</tt>. | ||
504 | <li> The call to <tt>rcu_access_pointer()</tt> need not be | ||
505 | protected. | ||
506 | In contrast, <tt>rcu_dereference()</tt> must either be | ||
507 | within an RCU read-side critical section or in a code | ||
508 | segment where the pointer cannot change, for example, in | ||
509 | code protected by the corresponding update-side lock. | ||
510 | </ol> | ||
511 | |||
512 | <p>@@QQ@@ | ||
513 | Without the <tt>rcu_dereference()</tt> or the | ||
514 | <tt>rcu_access_pointer()</tt>, what destructive optimizations | ||
515 | might the compiler make use of? | ||
516 | <p>@@QQA@@ | ||
517 | Let's start with what happens to <tt>do_something_gp()</tt> | ||
518 | if it fails to use <tt>rcu_dereference()</tt>. | ||
519 | It could reuse a value formerly fetched from this same pointer. | ||
520 | It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time | ||
521 | manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise | ||
522 | mash-up of two distinct pointer values. | ||
523 | It might even use value-speculation optimizations, where it makes a wrong | ||
524 | guess, but by the time it gets around to checking the value, an update | ||
525 | has changed the pointer to match the wrong guess. | ||
526 | Too bad about any dereferences that returned pre-initialization garbage | ||
527 | in the meantime! | ||
528 | |||
529 | <p> | ||
530 | For <tt>remove_gp_synchronous()</tt>, as long as all modifications | ||
531 | to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, | ||
532 | the above optimizations are harmless. | ||
533 | However, | ||
534 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, | ||
535 | <tt>sparse</tt> will complain if you | ||
536 | define <tt>gp</tt> with <tt>__rcu</tt> and then | ||
537 | access it without using | ||
538 | either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. | ||
539 | <p>@@QQE@@ | ||
540 | |||
541 | <p> | ||
542 | This simple linked-data-structure scenario clearly demonstrates the need | ||
543 | for RCU's stringent memory-ordering guarantees on systems with more than | ||
544 | one CPU: | ||
545 | |||
546 | <ol> | ||
547 | <li> Each CPU that has an RCU read-side critical section that | ||
548 | begins before <tt>synchronize_rcu()</tt> starts is | ||
549 | guaranteed to execute a full memory barrier between the time | ||
550 | that the RCU read-side critical section ends and the time that | ||
551 | <tt>synchronize_rcu()</tt> returns. | ||
552 | Without this guarantee, a pre-existing RCU read-side critical section | ||
553 | might hold a reference to the newly removed <tt>struct foo</tt> | ||
554 | after the <tt>kfree()</tt> on line 14 of | ||
555 | <tt>remove_gp_synchronous()</tt>. | ||
556 | <li> Each CPU that has an RCU read-side critical section that ends | ||
557 | after <tt>synchronize_rcu()</tt> returns is guaranteed | ||
558 | to execute a full memory barrier between the time that | ||
559 | <tt>synchronize_rcu()</tt> begins and the time that the RCU | ||
560 | read-side critical section begins. | ||
561 | Without this guarantee, a later RCU read-side critical section | ||
562 | running after the <tt>kfree()</tt> on line 14 of | ||
563 | <tt>remove_gp_synchronous()</tt> might | ||
564 | later run <tt>do_something_gp()</tt> and find the | ||
565 | newly deleted <tt>struct foo</tt>. | ||
566 | <li> If the task invoking <tt>synchronize_rcu()</tt> remains | ||
567 | on a given CPU, then that CPU is guaranteed to execute a full | ||
568 | memory barrier sometime during the execution of | ||
569 | <tt>synchronize_rcu()</tt>. | ||
570 | This guarantee ensures that the <tt>kfree()</tt> on | ||
571 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
572 | execute after the removal on line 11. | ||
573 | <li> If the task invoking <tt>synchronize_rcu()</tt> migrates | ||
574 | among a group of CPUs during that invocation, then each of the | ||
575 | CPUs in that group is guaranteed to execute a full memory barrier | ||
576 | sometime during the execution of <tt>synchronize_rcu()</tt>. | ||
577 | This guarantee also ensures that the <tt>kfree()</tt> on | ||
578 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
579 | execute after the removal on | ||
580 | line 11, but also in the case where the thread executing the | ||
581 | <tt>synchronize_rcu()</tt> migrates in the meantime. | ||
582 | </ol> | ||
583 | |||
584 | <p>@@QQ@@ | ||
585 | Given that multiple CPUs can start RCU read-side critical sections | ||
586 | at any time without any ordering whatsoever, how can RCU possibly tell whether | ||
587 | or not a given RCU read-side critical section starts before a | ||
588 | given instance of <tt>synchronize_rcu()</tt>? | ||
589 | <p>@@QQA@@ | ||
590 | If RCU cannot tell whether or not a given | ||
591 | RCU read-side critical section starts before a | ||
592 | given instance of <tt>synchronize_rcu()</tt>, | ||
593 | then it must assume that the RCU read-side critical section | ||
594 | started first. | ||
595 | In other words, a given instance of <tt>synchronize_rcu()</tt> | ||
596 | can avoid waiting on a given RCU read-side critical section only | ||
597 | if it can prove that <tt>synchronize_rcu()</tt> started first. | ||
598 | <p>@@QQE@@ | ||
599 | |||
600 | <p>@@QQ@@ | ||
601 | The first and second guarantees require unbelievably strict ordering! | ||
602 | Are all these memory barriers <i>really</i> required? | ||
603 | <p>@@QQA@@ | ||
604 | Yes, they really are required. | ||
605 | To see why the first guarantee is required, consider the following | ||
606 | sequence of events: | ||
607 | |||
608 | <ol> | ||
609 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
610 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
611 | /* Very likely to return p. */</tt> | ||
612 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
613 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
614 | <li> CPU 1: <tt>do_something_with(q->a); | ||
615 | /* No smp_mb(), so might happen after kfree(). */</tt> | ||
616 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
617 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
618 | <li> CPU 0: <tt>kfree(p);</tt> | ||
619 | </ol> | ||
620 | |||
621 | <p> | ||
622 | Therefore, there absolutely must be a full memory barrier between the | ||
623 | end of the RCU read-side critical section and the end of the | ||
624 | grace period. | ||
625 | |||
626 | <p> | ||
627 | The sequence of events demonstrating the necessity of the second rule | ||
628 | is roughly similar: | ||
629 | |||
630 | <ol> | ||
631 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
632 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
633 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
634 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
635 | /* Might return p if no memory barrier. */</tt> | ||
636 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
637 | <li> CPU 0: <tt>kfree(p);</tt> | ||
638 | <li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> | ||
639 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
640 | </ol> | ||
641 | |||
642 | <p> | ||
643 | And similarly, without a memory barrier between the beginning of the | ||
644 | grace period and the beginning of the RCU read-side critical section, | ||
645 | CPU 1 might end up accessing the freelist. | ||
646 | |||
647 | <p> | ||
648 | The “as if” rule of course applies, so that any implementation | ||
649 | that acts as if the appropriate memory barriers were in place is a | ||
650 | correct implementation. | ||
651 | That said, it is much easier to fool yourself into believing that you have | ||
652 | adhered to the as-if rule than it is to actually adhere to it! | ||
653 | <p>@@QQE@@ | ||
654 | |||
655 | <p> | ||
656 | In short, RCU's publish-subscribe guarantee is provided by the combination | ||
657 | of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. | ||
658 | This guarantee allows data elements to be safely added to RCU-protected | ||
659 | linked data structures without disrupting RCU readers. | ||
660 | This guarantee can be used in combination with the grace-period | ||
661 | guarantee to also allow data elements to be removed from RCU-protected | ||
662 | linked data structures, again without disrupting RCU readers. | ||
663 | |||
664 | <p> | ||
665 | This guarantee was only partially premeditated. | ||
666 | DYNIX/ptx used an explicit memory barrier for publication, but had nothing | ||
667 | resembling <tt>rcu_dereference()</tt> for subscription, nor did it | ||
668 | have anything resembling the <tt>smp_read_barrier_depends()</tt> | ||
669 | that was later subsumed into <tt>rcu_dereference()</tt>. | ||
670 | The need for these operations made itself known quite suddenly at a | ||
671 | late-1990s meeting with the DEC Alpha architects, back in the days when | ||
672 | DEC was still a free-standing company. | ||
673 | It took the Alpha architects a good hour to convince me that any sort | ||
674 | of barrier would ever be needed, and it then took me a good <i>two</i> hours | ||
675 | to convince them that their documentation did not make this point clear. | ||
676 | More recent work with the C and C++ standards committees has provided | ||
677 | much education on tricks and traps from the compiler. | ||
678 | In short, compilers were much less tricky in the early 1990s, but in | ||
679 | 2015, don't even think about omitting <tt>rcu_dereference()</tt>! | ||
680 | |||
681 | <h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> | ||
682 | |||
683 | <p> | ||
684 | The common-case RCU primitives are unconditional. | ||
685 | They are invoked, they do their job, and they return, with no possibility | ||
686 | of error, and no need to retry. | ||
687 | This is a key RCU design philosophy. | ||
688 | |||
689 | <p> | ||
690 | However, this philosophy is pragmatic rather than pigheaded. | ||
691 | If someone comes up with a good justification for a particular conditional | ||
692 | RCU primitive, it might well be implemented and added. | ||
693 | After all, this guarantee was reverse-engineered, not premeditated. | ||
694 | The unconditional nature of the RCU primitives was initially an | ||
695 | accident of implementation, and later experience with synchronization | ||
696 | primitives with conditional primitives caused me to elevate this | ||
697 | accident to a guarantee. | ||
698 | Therefore, the justification for adding a conditional primitive to | ||
699 | RCU would need to be based on detailed and compelling use cases. | ||
700 | |||
701 | <h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> | ||
702 | |||
703 | <p> | ||
704 | As far as RCU is concerned, it is always possible to carry out an | ||
705 | update within an RCU read-side critical section. | ||
706 | For example, that RCU read-side critical section might search for | ||
707 | a given data element, and then might acquire the update-side | ||
708 | spinlock in order to update that element, all while remaining | ||
709 | in that RCU read-side critical section. | ||
710 | Of course, it is necessary to exit the RCU read-side critical section | ||
711 | before invoking <tt>synchronize_rcu()</tt>, however, this | ||
712 | inconvenience can be avoided through use of the | ||
713 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members | ||
714 | described later in this document. | ||
715 | |||
716 | <p>@@QQ@@ | ||
717 | But how does the upgrade-to-write operation exclude other readers? | ||
718 | <p>@@QQA@@ | ||
719 | It doesn't, just like normal RCU updates, which also do not exclude | ||
720 | RCU readers. | ||
721 | <p>@@QQE@@ | ||
722 | |||
723 | <p> | ||
724 | This guarantee allows lookup code to be shared between read-side | ||
725 | and update-side code, and was premeditated, appearing in the earliest | ||
726 | DYNIX/ptx RCU documentation. | ||
727 | |||
728 | <h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> | ||
729 | |||
730 | <p> | ||
731 | RCU provides extremely lightweight readers, and its read-side guarantees, | ||
732 | though quite useful, are correspondingly lightweight. | ||
733 | It is therefore all too easy to assume that RCU is guaranteeing more | ||
734 | than it really is. | ||
735 | Of course, the list of things that RCU does not guarantee is infinitely | ||
736 | long, however, the following sections list a few non-guarantees that | ||
737 | have caused confusion. | ||
738 | Except where otherwise noted, these non-guarantees were premeditated. | ||
739 | |||
740 | <ol> | ||
741 | <li> <a href="#Readers Impose Minimal Ordering"> | ||
742 | Readers Impose Minimal Ordering</a> | ||
743 | <li> <a href="#Readers Do Not Exclude Updaters"> | ||
744 | Readers Do Not Exclude Updaters</a> | ||
745 | <li> <a href="#Updaters Only Wait For Old Readers"> | ||
746 | Updaters Only Wait For Old Readers</a> | ||
747 | <li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> | ||
748 | Grace Periods Don't Partition Read-Side Critical Sections</a> | ||
749 | <li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> | ||
750 | Read-Side Critical Sections Don't Partition Grace Periods</a> | ||
751 | <li> <a href="#Disabling Preemption Does Not Block Grace Periods"> | ||
752 | Disabling Preemption Does Not Block Grace Periods</a> | ||
753 | </ol> | ||
754 | |||
755 | <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> | ||
756 | |||
757 | <p> | ||
758 | Reader-side markers such as <tt>rcu_read_lock()</tt> and | ||
759 | <tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees | ||
760 | except through their interaction with the grace-period APIs such as | ||
761 | <tt>synchronize_rcu()</tt>. | ||
762 | To see this, consider the following pair of threads: | ||
763 | |||
764 | <blockquote> | ||
765 | <pre> | ||
766 | 1 void thread0(void) | ||
767 | 2 { | ||
768 | 3 rcu_read_lock(); | ||
769 | 4 WRITE_ONCE(x, 1); | ||
770 | 5 rcu_read_unlock(); | ||
771 | 6 rcu_read_lock(); | ||
772 | 7 WRITE_ONCE(y, 1); | ||
773 | 8 rcu_read_unlock(); | ||
774 | 9 } | ||
775 | 10 | ||
776 | 11 void thread1(void) | ||
777 | 12 { | ||
778 | 13 rcu_read_lock(); | ||
779 | 14 r1 = READ_ONCE(y); | ||
780 | 15 rcu_read_unlock(); | ||
781 | 16 rcu_read_lock(); | ||
782 | 17 r2 = READ_ONCE(x); | ||
783 | 18 rcu_read_unlock(); | ||
784 | 19 } | ||
785 | </pre> | ||
786 | </blockquote> | ||
787 | |||
788 | <p> | ||
789 | After <tt>thread0()</tt> and <tt>thread1()</tt> execute | ||
790 | concurrently, it is quite possible to have | ||
791 | |||
792 | <blockquote> | ||
793 | <pre> | ||
794 | (r1 == 1 && r2 == 0) | ||
795 | </pre> | ||
796 | </blockquote> | ||
797 | |||
798 | (that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), | ||
799 | which would not be possible if <tt>rcu_read_lock()</tt> and | ||
800 | <tt>rcu_read_unlock()</tt> had much in the way of ordering | ||
801 | properties. | ||
802 | But they do not, so the CPU is within its rights | ||
803 | to do significant reordering. | ||
804 | This is by design: Any significant ordering constraints would slow down | ||
805 | these fast-path APIs. | ||
806 | |||
807 | <p>@@QQ@@ | ||
808 | Can't the compiler also reorder this code? | ||
809 | <p>@@QQA@@ | ||
810 | No, the volatile casts in <tt>READ_ONCE()</tt> and | ||
811 | <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in | ||
812 | this particular case. | ||
813 | <p>@@QQE@@ | ||
814 | |||
815 | <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> | ||
816 | |||
817 | <p> | ||
818 | Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> | ||
819 | exclude updates. | ||
820 | All they do is to prevent grace periods from ending. | ||
821 | The following example illustrates this: | ||
822 | |||
823 | <blockquote> | ||
824 | <pre> | ||
825 | 1 void thread0(void) | ||
826 | 2 { | ||
827 | 3 rcu_read_lock(); | ||
828 | 4 r1 = READ_ONCE(y); | ||
829 | 5 if (r1) { | ||
830 | 6 do_something_with_nonzero_x(); | ||
831 | 7 r2 = READ_ONCE(x); | ||
832 | 8 WARN_ON(!r2); /* BUG!!! */ | ||
833 | 9 } | ||
834 | 10 rcu_read_unlock(); | ||
835 | 11 } | ||
836 | 12 | ||
837 | 13 void thread1(void) | ||
838 | 14 { | ||
839 | 15 spin_lock(&my_lock); | ||
840 | 16 WRITE_ONCE(x, 1); | ||
841 | 17 WRITE_ONCE(y, 1); | ||
842 | 18 spin_unlock(&my_lock); | ||
843 | 19 } | ||
844 | </pre> | ||
845 | </blockquote> | ||
846 | |||
847 | <p> | ||
848 | If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> | ||
849 | excluded the <tt>thread1()</tt> function's update, | ||
850 | the <tt>WARN_ON()</tt> could never fire. | ||
851 | But the fact is that <tt>rcu_read_lock()</tt> does not exclude | ||
852 | much of anything aside from subsequent grace periods, of which | ||
853 | <tt>thread1()</tt> has none, so the | ||
854 | <tt>WARN_ON()</tt> can and does fire. | ||
855 | |||
856 | <h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> | ||
857 | |||
858 | <p> | ||
859 | It might be tempting to assume that after <tt>synchronize_rcu()</tt> | ||
860 | completes, there are no readers executing. | ||
861 | This temptation must be avoided because | ||
862 | new readers can start immediately after <tt>synchronize_rcu()</tt> | ||
863 | starts, and <tt>synchronize_rcu()</tt> is under no | ||
864 | obligation to wait for these new readers. | ||
865 | |||
866 | <p>@@QQ@@ | ||
867 | Suppose that synchronize_rcu() did wait until all readers had completed. | ||
868 | Would the updater be able to rely on this? | ||
869 | <p>@@QQA@@ | ||
870 | No. | ||
871 | Even if <tt>synchronize_rcu()</tt> were to wait until | ||
872 | all readers had completed, a new reader might start immediately after | ||
873 | <tt>synchronize_rcu()</tt> completed. | ||
874 | Therefore, the code following | ||
875 | <tt>synchronize_rcu()</tt> cannot rely on there being no readers | ||
876 | in any case. | ||
877 | <p>@@QQE@@ | ||
878 | |||
879 | <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> | ||
880 | Grace Periods Don't Partition Read-Side Critical Sections</a></h3> | ||
881 | |||
882 | <p> | ||
883 | It is tempting to assume that if any part of one RCU read-side critical | ||
884 | section precedes a given grace period, and if any part of another RCU | ||
885 | read-side critical section follows that same grace period, then all of | ||
886 | the first RCU read-side critical section must precede all of the second. | ||
887 | However, this just isn't the case: A single grace period does not | ||
888 | partition the set of RCU read-side critical sections. | ||
889 | An example of this situation can be illustrated as follows, where | ||
890 | <tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: | ||
891 | |||
892 | <blockquote> | ||
893 | <pre> | ||
894 | 1 void thread0(void) | ||
895 | 2 { | ||
896 | 3 rcu_read_lock(); | ||
897 | 4 WRITE_ONCE(a, 1); | ||
898 | 5 WRITE_ONCE(b, 1); | ||
899 | 6 rcu_read_unlock(); | ||
900 | 7 } | ||
901 | 8 | ||
902 | 9 void thread1(void) | ||
903 | 10 { | ||
904 | 11 r1 = READ_ONCE(a); | ||
905 | 12 synchronize_rcu(); | ||
906 | 13 WRITE_ONCE(c, 1); | ||
907 | 14 } | ||
908 | 15 | ||
909 | 16 void thread2(void) | ||
910 | 17 { | ||
911 | 18 rcu_read_lock(); | ||
912 | 19 r2 = READ_ONCE(b); | ||
913 | 20 r3 = READ_ONCE(c); | ||
914 | 21 rcu_read_unlock(); | ||
915 | 22 } | ||
916 | </pre> | ||
917 | </blockquote> | ||
918 | |||
919 | <p> | ||
920 | It turns out that the outcome: | ||
921 | |||
922 | <blockquote> | ||
923 | <pre> | ||
924 | (r1 == 1 && r2 == 0 && r3 == 1) | ||
925 | </pre> | ||
926 | </blockquote> | ||
927 | |||
928 | is entirely possible. | ||
929 | The following figure shows how this can happen, with each circled | ||
930 | <tt>QS</tt> indicating the point at which RCU recorded a | ||
931 | <i>quiescent state</i> for each thread, that is, a state in which | ||
932 | RCU knows that the thread cannot be in the midst of an RCU read-side | ||
933 | critical section that started before the current grace period: | ||
934 | |||
935 | <p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> | ||
936 | |||
937 | <p> | ||
938 | If it is necessary to partition RCU read-side critical sections in this | ||
939 | manner, it is necessary to use two grace periods, where the first | ||
940 | grace period is known to end before the second grace period starts: | ||
941 | |||
942 | <blockquote> | ||
943 | <pre> | ||
944 | 1 void thread0(void) | ||
945 | 2 { | ||
946 | 3 rcu_read_lock(); | ||
947 | 4 WRITE_ONCE(a, 1); | ||
948 | 5 WRITE_ONCE(b, 1); | ||
949 | 6 rcu_read_unlock(); | ||
950 | 7 } | ||
951 | 8 | ||
952 | 9 void thread1(void) | ||
953 | 10 { | ||
954 | 11 r1 = READ_ONCE(a); | ||
955 | 12 synchronize_rcu(); | ||
956 | 13 WRITE_ONCE(c, 1); | ||
957 | 14 } | ||
958 | 15 | ||
959 | 16 void thread2(void) | ||
960 | 17 { | ||
961 | 18 r2 = READ_ONCE(c); | ||
962 | 19 synchronize_rcu(); | ||
963 | 20 WRITE_ONCE(d, 1); | ||
964 | 21 } | ||
965 | 22 | ||
966 | 23 void thread3(void) | ||
967 | 24 { | ||
968 | 25 rcu_read_lock(); | ||
969 | 26 r3 = READ_ONCE(b); | ||
970 | 27 r4 = READ_ONCE(d); | ||
971 | 28 rcu_read_unlock(); | ||
972 | 29 } | ||
973 | </pre> | ||
974 | </blockquote> | ||
975 | |||
976 | <p> | ||
977 | Here, if <tt>(r1 == 1)</tt>, then | ||
978 | <tt>thread0()</tt>'s write to <tt>b</tt> must happen | ||
979 | before the end of <tt>thread1()</tt>'s grace period. | ||
980 | If in addition <tt>(r4 == 1)</tt>, then | ||
981 | <tt>thread3()</tt>'s read from <tt>b</tt> must happen | ||
982 | after the beginning of <tt>thread2()</tt>'s grace period. | ||
983 | If it is also the case that <tt>(r2 == 1)</tt>, then the | ||
984 | end of <tt>thread1()</tt>'s grace period must precede the | ||
985 | beginning of <tt>thread2()</tt>'s grace period. | ||
986 | This means that the two RCU read-side critical sections cannot overlap, | ||
987 | guaranteeing that <tt>(r3 == 1)</tt>. | ||
988 | As a result, the outcome: | ||
989 | |||
990 | <blockquote> | ||
991 | <pre> | ||
992 | (r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) | ||
993 | </pre> | ||
994 | </blockquote> | ||
995 | |||
996 | cannot happen. | ||
997 | |||
998 | <p> | ||
999 | This non-requirement was also non-premeditated, but became apparent | ||
1000 | when studying RCU's interaction with memory ordering. | ||
1001 | |||
1002 | <h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> | ||
1003 | Read-Side Critical Sections Don't Partition Grace Periods</a></h3> | ||
1004 | |||
1005 | <p> | ||
1006 | It is also tempting to assume that if an RCU read-side critical section | ||
1007 | happens between a pair of grace periods, then those grace periods cannot | ||
1008 | overlap. | ||
1009 | However, this temptation leads nowhere good, as can be illustrated by | ||
1010 | the following, with all variables initially zero: | ||
1011 | |||
1012 | <blockquote> | ||
1013 | <pre> | ||
1014 | 1 void thread0(void) | ||
1015 | 2 { | ||
1016 | 3 rcu_read_lock(); | ||
1017 | 4 WRITE_ONCE(a, 1); | ||
1018 | 5 WRITE_ONCE(b, 1); | ||
1019 | 6 rcu_read_unlock(); | ||
1020 | 7 } | ||
1021 | 8 | ||
1022 | 9 void thread1(void) | ||
1023 | 10 { | ||
1024 | 11 r1 = READ_ONCE(a); | ||
1025 | 12 synchronize_rcu(); | ||
1026 | 13 WRITE_ONCE(c, 1); | ||
1027 | 14 } | ||
1028 | 15 | ||
1029 | 16 void thread2(void) | ||
1030 | 17 { | ||
1031 | 18 rcu_read_lock(); | ||
1032 | 19 WRITE_ONCE(d, 1); | ||
1033 | 20 r2 = READ_ONCE(c); | ||
1034 | 21 rcu_read_unlock(); | ||
1035 | 22 } | ||
1036 | 23 | ||
1037 | 24 void thread3(void) | ||
1038 | 25 { | ||
1039 | 26 r3 = READ_ONCE(d); | ||
1040 | 27 synchronize_rcu(); | ||
1041 | 28 WRITE_ONCE(e, 1); | ||
1042 | 29 } | ||
1043 | 30 | ||
1044 | 31 void thread4(void) | ||
1045 | 32 { | ||
1046 | 33 rcu_read_lock(); | ||
1047 | 34 r4 = READ_ONCE(b); | ||
1048 | 35 r5 = READ_ONCE(e); | ||
1049 | 36 rcu_read_unlock(); | ||
1050 | 37 } | ||
1051 | </pre> | ||
1052 | </blockquote> | ||
1053 | |||
1054 | <p> | ||
1055 | In this case, the outcome: | ||
1056 | |||
1057 | <blockquote> | ||
1058 | <pre> | ||
1059 | (r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) | ||
1060 | </pre> | ||
1061 | </blockquote> | ||
1062 | |||
1063 | is entirely possible, as illustrated below: | ||
1064 | |||
1065 | <p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> | ||
1066 | |||
1067 | <p> | ||
1068 | Again, an RCU read-side critical section can overlap almost all of a | ||
1069 | given grace period, just so long as it does not overlap the entire | ||
1070 | grace period. | ||
1071 | As a result, an RCU read-side critical section cannot partition a pair | ||
1072 | of RCU grace periods. | ||
1073 | |||
1074 | <p>@@QQ@@ | ||
1075 | How long a sequence of grace periods, each separated by an RCU read-side | ||
1076 | critical section, would be required to partition the RCU read-side | ||
1077 | critical sections at the beginning and end of the chain? | ||
1078 | <p>@@QQA@@ | ||
1079 | In theory, an infinite number. | ||
1080 | In practice, an unknown number that is sensitive to both implementation | ||
1081 | details and timing considerations. | ||
1082 | Therefore, even in practice, RCU users must abide by the theoretical rather | ||
1083 | than the practical answer. | ||
1084 | <p>@@QQE@@ | ||
1085 | |||
1086 | <h3><a name="Disabling Preemption Does Not Block Grace Periods"> | ||
1087 | Disabling Preemption Does Not Block Grace Periods</a></h3> | ||
1088 | |||
1089 | <p> | ||
1090 | There was a time when disabling preemption on any given CPU would block | ||
1091 | subsequent grace periods. | ||
1092 | However, this was an accident of implementation and is not a requirement. | ||
1093 | And in the current Linux-kernel implementation, disabling preemption | ||
1094 | on a given CPU in fact does not block grace periods, as Oleg Nesterov | ||
1095 | <a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. | ||
1096 | |||
1097 | <p> | ||
1098 | If you need a preempt-disable region to block grace periods, you need to add | ||
1099 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example | ||
1100 | as follows: | ||
1101 | |||
1102 | <blockquote> | ||
1103 | <pre> | ||
1104 | 1 preempt_disable(); | ||
1105 | 2 rcu_read_lock(); | ||
1106 | 3 do_something(); | ||
1107 | 4 rcu_read_unlock(); | ||
1108 | 5 preempt_enable(); | ||
1109 | 6 | ||
1110 | 7 /* Spinlocks implicitly disable preemption. */ | ||
1111 | 8 spin_lock(&mylock); | ||
1112 | 9 rcu_read_lock(); | ||
1113 | 10 do_something(); | ||
1114 | 11 rcu_read_unlock(); | ||
1115 | 12 spin_unlock(&mylock); | ||
1116 | </pre> | ||
1117 | </blockquote> | ||
1118 | |||
1119 | <p> | ||
1120 | In theory, you could enter the RCU read-side critical section first, | ||
1121 | but it is more efficient to keep the entire RCU read-side critical | ||
1122 | section contained in the preempt-disable region as shown above. | ||
1123 | Of course, RCU read-side critical sections that extend outside of | ||
1124 | preempt-disable regions will work correctly, but such critical sections | ||
1125 | can be preempted, which forces <tt>rcu_read_unlock()</tt> to do | ||
1126 | more work. | ||
1127 | And no, this is <i>not</i> an invitation to enclose all of your RCU | ||
1128 | read-side critical sections within preempt-disable regions, because | ||
1129 | doing so would degrade real-time response. | ||
1130 | |||
1131 | <p> | ||
1132 | This non-requirement appeared with preemptible RCU. | ||
1133 | If you need a grace period that waits on non-preemptible code regions, use | ||
1134 | <a href="#Sched Flavor">RCU-sched</a>. | ||
1135 | |||
1136 | <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> | ||
1137 | |||
1138 | <p> | ||
1139 | These parallelism facts of life are by no means specific to RCU, but | ||
1140 | the RCU implementation must abide by them. | ||
1141 | They therefore bear repeating: | ||
1142 | |||
1143 | <ol> | ||
1144 | <li> Any CPU or task may be delayed at any time, | ||
1145 | and any attempts to avoid these delays by disabling | ||
1146 | preemption, interrupts, or whatever are completely futile. | ||
1147 | This is most obvious in preemptible user-level | ||
1148 | environments and in virtualized environments (where | ||
1149 | a given guest OS's VCPUs can be preempted at any time by | ||
1150 | the underlying hypervisor), but can also happen in bare-metal | ||
1151 | environments due to ECC errors, NMIs, and other hardware | ||
1152 | events. | ||
1153 | Although a delay of more than about 20 seconds can result | ||
1154 | in splats, the RCU implementation is obligated to use | ||
1155 | algorithms that can tolerate extremely long delays, but where | ||
1156 | “extremely long” is not long enough to allow | ||
1157 | wrap-around when incrementing a 64-bit counter. | ||
1158 | <li> Both the compiler and the CPU can reorder memory accesses. | ||
1159 | Where it matters, RCU must use compiler directives and | ||
1160 | memory-barrier instructions to preserve ordering. | ||
1161 | <li> Conflicting writes to memory locations in any given cache line | ||
1162 | will result in expensive cache misses. | ||
1163 | Greater numbers of concurrent writes and more-frequent | ||
1164 | concurrent writes will result in more dramatic slowdowns. | ||
1165 | RCU is therefore obligated to use algorithms that have | ||
1166 | sufficient locality to avoid significant performance and | ||
1167 | scalability problems. | ||
1168 | <li> As a rough rule of thumb, only one CPU's worth of processing | ||
1169 | may be carried out under the protection of any given exclusive | ||
1170 | lock. | ||
1171 | RCU must therefore use scalable locking designs. | ||
1172 | <li> Counters are finite, especially on 32-bit systems. | ||
1173 | RCU's use of counters must therefore tolerate counter wrap, | ||
1174 | or be designed such that counter wrap would take way more | ||
1175 | time than a single system is likely to run. | ||
1176 | An uptime of ten years is quite possible, a runtime | ||
1177 | of a century much less so. | ||
1178 | As an example of the latter, RCU's dyntick-idle nesting counter | ||
1179 | allows 54 bits for interrupt nesting level (this counter | ||
1180 | is 64 bits even on a 32-bit system). | ||
1181 | Overflowing this counter requires 2<sup>54</sup> | ||
1182 | half-interrupts on a given CPU without that CPU ever going idle. | ||
1183 | If a half-interrupt happened every microsecond, it would take | ||
1184 | 570 years of runtime to overflow this counter, which is currently | ||
1185 | believed to be an acceptably long time. | ||
1186 | <li> Linux systems can have thousands of CPUs running a single | ||
1187 | Linux kernel in a single shared-memory environment. | ||
1188 | RCU must therefore pay close attention to high-end scalability. | ||
1189 | </ol> | ||
1190 | |||
1191 | <p> | ||
1192 | This last parallelism fact of life means that RCU must pay special | ||
1193 | attention to the preceding facts of life. | ||
1194 | The idea that Linux might scale to systems with thousands of CPUs would | ||
1195 | have been met with some skepticism in the 1990s, but these requirements | ||
1196 | would otherwise have been unsurprising, even in the early 1990s. | ||
1197 | |||
1198 | <h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> | ||
1199 | |||
1200 | <p> | ||
1201 | These sections list quality-of-implementation requirements. | ||
1202 | Although an RCU implementation that ignores these requirements could | ||
1203 | still be used, it would likely be subject to limitations that would | ||
1204 | make it inappropriate for industrial-strength production use. | ||
1205 | Classes of quality-of-implementation requirements are as follows: | ||
1206 | |||
1207 | <ol> | ||
1208 | <li> <a href="#Specialization">Specialization</a> | ||
1209 | <li> <a href="#Performance and Scalability">Performance and Scalability</a> | ||
1210 | <li> <a href="#Composability">Composability</a> | ||
1211 | <li> <a href="#Corner Cases">Corner Cases</a> | ||
1212 | </ol> | ||
1213 | |||
1214 | <p> | ||
1215 | These classes are covered in the following sections. | ||
1216 | |||
1217 | <h3><a name="Specialization">Specialization</a></h3> | ||
1218 | |||
1219 | <p> | ||
1220 | RCU is and always has been intended primarily for read-mostly situations, as | ||
1221 | illustrated by the following figure. | ||
1222 | This means that RCU's read-side primitives are optimized, often at the | ||
1223 | expense of its update-side primitives. | ||
1224 | |||
1225 | <p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> | ||
1226 | |||
1227 | <p> | ||
1228 | This focus on read-mostly situations means that RCU must interoperate | ||
1229 | with other synchronization primitives. | ||
1230 | For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> | ||
1231 | examples discussed earlier use RCU to protect readers and locking to | ||
1232 | coordinate updaters. | ||
1233 | However, the need extends much farther, requiring that a variety of | ||
1234 | synchronization primitives be legal within RCU read-side critical sections, | ||
1235 | including spinlocks, sequence locks, atomic operations, reference | ||
1236 | counters, and memory barriers. | ||
1237 | |||
1238 | <p>@@QQ@@ | ||
1239 | What about sleeping locks? | ||
1240 | <p>@@QQA@@ | ||
1241 | These are forbidden within Linux-kernel RCU read-side critical sections | ||
1242 | because it is not legal to place a quiescent state (in this case, | ||
1243 | voluntary context switch) within an RCU read-side critical section. | ||
1244 | However, sleeping locks may be used within userspace RCU read-side critical | ||
1245 | sections, and also within Linux-kernel sleepable RCU | ||
1246 | <a href="#Sleepable RCU">(SRCU)</a> | ||
1247 | read-side critical sections. | ||
1248 | In addition, the -rt patchset turns spinlocks into sleeping locks so | ||
1249 | that the corresponding critical sections can be preempted, which | ||
1250 | also means that these sleeplockified spinlocks (but not other sleeping locks!) | ||
1251 | may be acquired within -rt-Linux-kernel RCU read-side critical sections. | ||
1252 | |||
1253 | <p> | ||
1254 | Note that it <i>is</i> legal for a normal RCU read-side critical section | ||
1255 | to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), | ||
1256 | but only as long as it does not loop indefinitely attempting to | ||
1257 | conditionally acquire that sleeping lock. | ||
1258 | The key point is that things like <tt>mutex_trylock()</tt> | ||
1259 | either return with the mutex held, or return an error indication if | ||
1260 | the mutex was not immediately available. | ||
1261 | Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. | ||
1262 | <p>@@QQE@@ | ||
1263 | |||
1264 | <p> | ||
1265 | It often comes as a surprise that many algorithms do not require a | ||
1266 | consistent view of data: many can instead function correctly with an | ||
1267 | inconsistent view, with network routing being the poster child. | ||
1268 | Internet routing algorithms take significant time to propagate | ||
1269 | updates, so that by the time an update arrives at a given system, | ||
1270 | that system has been sending network traffic the wrong way for | ||
1271 | a considerable length of time. | ||
1272 | Having a few threads continue to send traffic the wrong way for a | ||
1273 | few more milliseconds is clearly not a problem: In the worst case, | ||
1274 | TCP retransmissions will eventually get the data where it needs to go. | ||
1275 | In general, when tracking the state of the universe outside of the | ||
1276 | computer, some level of inconsistency must be tolerated due to | ||
1277 | speed-of-light delays if nothing else. | ||
1278 | |||
1279 | <p> | ||
1280 | Furthermore, uncertainty about external state is inherent in many cases. | ||
1281 | For example, a pair of veterinarians might use heartbeat to determine | ||
1282 | whether or not a given cat was alive. | ||
1283 | But how long should they wait after the last heartbeat to decide that | ||
1284 | the cat is in fact dead? | ||
1285 | Waiting less than 400 milliseconds makes no sense because this would | ||
1286 | mean that a relaxed cat would be considered to cycle between death | ||
1287 | and life more than 100 times per minute. | ||
1288 | Moreover, just as with human beings, a cat's heart might stop for | ||
1289 | some period of time, so the exact wait period is a judgment call. | ||
1290 | One of our pair of veterinarians might wait 30 seconds before pronouncing | ||
1291 | the cat dead, while the other might insist on waiting a full minute. | ||
1292 | The two veterinarians would then disagree on the state of the cat during | ||
1293 | the final 30 seconds of the minute following the last heartbeat, as | ||
1294 | fancifully illustrated below: | ||
1295 | |||
1296 | <p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> | ||
1297 | |||
1298 | <p> | ||
1299 | Interestingly enough, this same situation applies to hardware. | ||
1300 | When push comes to shove, how do we tell whether or not some | ||
1301 | external server has failed? | ||
1302 | We send messages to it periodically, and declare it failed if we | ||
1303 | don't receive a response within a given period of time. | ||
1304 | Policy decisions can usually tolerate short | ||
1305 | periods of inconsistency. | ||
1306 | The policy was decided some time ago, and is only now being put into | ||
1307 | effect, so a few milliseconds of delay is normally inconsequential. | ||
1308 | |||
1309 | <p> | ||
1310 | However, there are algorithms that absolutely must see consistent data. | ||
1311 | For example, the translation between a user-level SystemV semaphore | ||
1312 | ID to the corresponding in-kernel data structure is protected by RCU, | ||
1313 | but it is absolutely forbidden to update a semaphore that has just been | ||
1314 | removed. | ||
1315 | In the Linux kernel, this need for consistency is accommodated by acquiring | ||
1316 | spinlocks located in the in-kernel data structure from within | ||
1317 | the RCU read-side critical section, and this is indicated by the | ||
1318 | green box in the figure above. | ||
1319 | Many other techniques may be used, and are in fact used within the | ||
1320 | Linux kernel. | ||
1321 | |||
1322 | <p> | ||
1323 | In short, RCU is not required to maintain consistency, and other | ||
1324 | mechanisms may be used in concert with RCU when consistency is required. | ||
1325 | RCU's specialization allows it to do its job extremely well, and its | ||
1326 | ability to interoperate with other synchronization mechanisms allows | ||
1327 | the right mix of synchronization tools to be used for a given job. | ||
1328 | |||
1329 | <h3><a name="Performance and Scalability">Performance and Scalability</a></h3> | ||
1330 | |||
1331 | <p> | ||
1332 | Energy efficiency is a critical component of performance today, | ||
1333 | and Linux-kernel RCU implementations must therefore avoid unnecessarily | ||
1334 | awakening idle CPUs. | ||
1335 | I cannot claim that this requirement was premeditated. | ||
1336 | In fact, I learned of it during a telephone conversation in which I | ||
1337 | was given “frank and open” feedback on the importance | ||
1338 | of energy efficiency in battery-powered systems and on specific | ||
1339 | energy-efficiency shortcomings of the Linux-kernel RCU implementation. | ||
1340 | In my experience, the battery-powered embedded community will consider | ||
1341 | any unnecessary wakeups to be extremely unfriendly acts. | ||
1342 | So much so that mere Linux-kernel-mailing-list posts are | ||
1343 | insufficient to vent their ire. | ||
1344 | |||
1345 | <p> | ||
1346 | Memory consumption is not particularly important in most | ||
1347 | situations, and has become decreasingly | ||
1348 | so as memory sizes have expanded and memory | ||
1349 | costs have plummeted. | ||
1350 | However, as I learned from Matt Mackall's | ||
1351 | <a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> | ||
1352 | efforts, memory footprint is critically important on single-CPU systems with | ||
1353 | non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus | ||
1354 | <a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> | ||
1355 | was born. | ||
1356 | Josh Triplett has since taken over the small-memory banner with his | ||
1357 | <a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> | ||
1358 | project, which resulted in | ||
1359 | <a href="#Sleepable RCU">SRCU</a> | ||
1360 | becoming optional for those kernels not needing it. | ||
1361 | |||
1362 | <p> | ||
1363 | The remaining performance requirements are, for the most part, | ||
1364 | unsurprising. | ||
1365 | For example, in keeping with RCU's read-side specialization, | ||
1366 | <tt>rcu_dereference()</tt> should have negligible overhead (for | ||
1367 | example, suppression of a few minor compiler optimizations). | ||
1368 | Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and | ||
1369 | <tt>rcu_read_unlock()</tt> should have exactly zero overhead. | ||
1370 | |||
1371 | <p> | ||
1372 | In preemptible environments, in the case where the RCU read-side | ||
1373 | critical section was not preempted (as will be the case for the | ||
1374 | highest-priority real-time process), <tt>rcu_read_lock()</tt> and | ||
1375 | <tt>rcu_read_unlock()</tt> should have minimal overhead. | ||
1376 | In particular, they should not contain atomic read-modify-write | ||
1377 | operations, memory-barrier instructions, preemption disabling, | ||
1378 | interrupt disabling, or backwards branches. | ||
1379 | However, in the case where the RCU read-side critical section was preempted, | ||
1380 | <tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. | ||
1381 | This is why it is better to nest an RCU read-side critical section | ||
1382 | within a preempt-disable region than vice versa, at least in cases | ||
1383 | where that critical section is short enough to avoid unduly degrading | ||
1384 | real-time latencies. | ||
1385 | |||
1386 | <p> | ||
1387 | The <tt>synchronize_rcu()</tt> grace-period-wait primitive is | ||
1388 | optimized for throughput. | ||
1389 | It may therefore incur several milliseconds of latency in addition to | ||
1390 | the duration of the longest RCU read-side critical section. | ||
1391 | On the other hand, multiple concurrent invocations of | ||
1392 | <tt>synchronize_rcu()</tt> are required to use batching optimizations | ||
1393 | so that they can be satisfied by a single underlying grace-period-wait | ||
1394 | operation. | ||
1395 | For example, in the Linux kernel, it is not unusual for a single | ||
1396 | grace-period-wait operation to serve more than | ||
1397 | <a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> | ||
1398 | of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation | ||
1399 | overhead down to nearly zero. | ||
1400 | However, the grace-period optimization is also required to avoid | ||
1401 | measurable degradation of real-time scheduling and interrupt latencies. | ||
1402 | |||
1403 | <p> | ||
1404 | In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> | ||
1405 | latencies are unacceptable. | ||
1406 | In these cases, <tt>synchronize_rcu_expedited()</tt> may be used | ||
1407 | instead, reducing the grace-period latency down to a few tens of | ||
1408 | microseconds on small systems, at least in cases where the RCU read-side | ||
1409 | critical sections are short. | ||
1410 | There are currently no special latency requirements for | ||
1411 | <tt>synchronize_rcu_expedited()</tt> on large systems, but, | ||
1412 | consistent with the empirical nature of the RCU specification, | ||
1413 | that is subject to change. | ||
1414 | However, there most definitely are scalability requirements: | ||
1415 | A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 | ||
1416 | CPUs should at least make reasonable forward progress. | ||
1417 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> | ||
1418 | is permitted to impose modest degradation of real-time latency | ||
1419 | on non-idle online CPUs. | ||
1420 | That said, it will likely be necessary to take further steps to reduce this | ||
1421 | degradation, hopefully to roughly that of a scheduling-clock interrupt. | ||
1422 | |||
1423 | <p> | ||
1424 | There are a number of situations where even | ||
1425 | <tt>synchronize_rcu_expedited()</tt>'s reduced grace-period | ||
1426 | latency is unacceptable. | ||
1427 | In these situations, the asynchronous <tt>call_rcu()</tt> can be | ||
1428 | used in place of <tt>synchronize_rcu()</tt> as follows: | ||
1429 | |||
1430 | <blockquote> | ||
1431 | <pre> | ||
1432 | 1 struct foo { | ||
1433 | 2 int a; | ||
1434 | 3 int b; | ||
1435 | 4 struct rcu_head rh; | ||
1436 | 5 }; | ||
1437 | 6 | ||
1438 | 7 static void remove_gp_cb(struct rcu_head *rhp) | ||
1439 | 8 { | ||
1440 | 9 struct foo *p = container_of(rhp, struct foo, rh); | ||
1441 | 10 | ||
1442 | 11 kfree(p); | ||
1443 | 12 } | ||
1444 | 13 | ||
1445 | 14 bool remove_gp_asynchronous(void) | ||
1446 | 15 { | ||
1447 | 16 struct foo *p; | ||
1448 | 17 | ||
1449 | 18 spin_lock(&gp_lock); | ||
1450 | 19 p = rcu_access_pointer(gp); | ||
1451 | 20 if (!p) { | ||
1452 | 21 spin_unlock(&gp_lock); | ||
1453 | 22 return false; | ||
1454 | 23 } | ||
1455 | 24 rcu_assign_pointer(gp, NULL); | ||
1456 | 25 call_rcu(&p->rh, remove_gp_cb); | ||
1457 | 26 spin_unlock(&gp_lock); | ||
1458 | 27 return true; | ||
1459 | 28 } | ||
1460 | </pre> | ||
1461 | </blockquote> | ||
1462 | |||
1463 | <p> | ||
1464 | A definition of <tt>struct foo</tt> is finally needed, and appears | ||
1465 | on lines 1-5. | ||
1466 | The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> | ||
1467 | on line 25, and will be invoked after the end of a subsequent | ||
1468 | grace period. | ||
1469 | This gets the same effect as <tt>remove_gp_synchronous()</tt>, | ||
1470 | but without forcing the updater to wait for a grace period to elapse. | ||
1471 | The <tt>call_rcu()</tt> function may be used in a number of | ||
1472 | situations where neither <tt>synchronize_rcu()</tt> nor | ||
1473 | <tt>synchronize_rcu_expedited()</tt> would be legal, | ||
1474 | including within preempt-disable code, <tt>local_bh_disable()</tt> code, | ||
1475 | interrupt-disable code, and interrupt handlers. | ||
1476 | However, even <tt>call_rcu()</tt> is illegal within NMI handlers. | ||
1477 | The callback function (<tt>remove_gp_cb()</tt> in this case) will be | ||
1478 | executed within softirq (software interrupt) environment within the | ||
1479 | Linux kernel, | ||
1480 | either within a real softirq handler or under the protection | ||
1481 | of <tt>local_bh_disable()</tt>. | ||
1482 | In both the Linux kernel and in userspace, it is bad practice to | ||
1483 | write an RCU callback function that takes too long. | ||
1484 | Long-running operations should be relegated to separate threads or | ||
1485 | (in the Linux kernel) workqueues. | ||
1486 | |||
1487 | <p>@@QQ@@ | ||
1488 | Why does line 19 use <tt>rcu_access_pointer()</tt>? | ||
1489 | After all, <tt>call_rcu()</tt> on line 25 stores into the | ||
1490 | structure, which would interact badly with concurrent insertions. | ||
1491 | Doesn't this mean that <tt>rcu_dereference()</tt> is required? | ||
1492 | <p>@@QQA@@ | ||
1493 | Presumably the <tt>gp_lock</tt> acquired on line 18 excludes | ||
1494 | any changes, including any insertions that <tt>rcu_dereference()</tt> | ||
1495 | would protect against. | ||
1496 | Therefore, any insertions will be delayed until after <tt>gp_lock</tt> | ||
1497 | is released on line 26, which in turn means that | ||
1498 | <tt>rcu_access_pointer()</tt> suffices. | ||
1499 | <p>@@QQE@@ | ||
1500 | |||
1501 | <p> | ||
1502 | However, all that <tt>remove_gp_cb()</tt> is doing is | ||
1503 | invoking <tt>kfree()</tt> on the data element. | ||
1504 | This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, | ||
1505 | which allows “fire and forget” operation as shown below: | ||
1506 | |||
1507 | <blockquote> | ||
1508 | <pre> | ||
1509 | 1 struct foo { | ||
1510 | 2 int a; | ||
1511 | 3 int b; | ||
1512 | 4 struct rcu_head rh; | ||
1513 | 5 }; | ||
1514 | 6 | ||
1515 | 7 bool remove_gp_faf(void) | ||
1516 | 8 { | ||
1517 | 9 struct foo *p; | ||
1518 | 10 | ||
1519 | 11 spin_lock(&gp_lock); | ||
1520 | 12 p = rcu_access_pointer(gp); | ||
1521 | 13 if (!p) { | ||
1522 | 14 spin_unlock(&gp_lock); | ||
1523 | 15 return false; | ||
1524 | 16 } | ||
1525 | 17 rcu_assign_pointer(gp, NULL); | ||
1526 | 18 kfree_rcu(p, rh); | ||
1527 | 19 spin_unlock(&gp_lock); | ||
1528 | 20 return true; | ||
1529 | 21 } | ||
1530 | </pre> | ||
1531 | </blockquote> | ||
1532 | |||
1533 | <p> | ||
1534 | Note that <tt>remove_gp_faf()</tt> simply invokes | ||
1535 | <tt>kfree_rcu()</tt> and proceeds, without any need to pay any | ||
1536 | further attention to the subsequent grace period and <tt>kfree()</tt>. | ||
1537 | It is permissible to invoke <tt>kfree_rcu()</tt> from the same | ||
1538 | environments as for <tt>call_rcu()</tt>. | ||
1539 | Interestingly enough, DYNIX/ptx had the equivalents of | ||
1540 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not | ||
1541 | <tt>synchronize_rcu()</tt>. | ||
1542 | This was due to the fact that RCU was not heavily used within DYNIX/ptx, | ||
1543 | so the very few places that needed something like | ||
1544 | <tt>synchronize_rcu()</tt> simply open-coded it. | ||
1545 | |||
1546 | <p>@@QQ@@ | ||
1547 | Earlier it was claimed that <tt>call_rcu()</tt> and | ||
1548 | <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked | ||
1549 | by readers. | ||
1550 | But how can that be correct, given that the invocation of the callback | ||
1551 | and the freeing of the memory (respectively) must still wait for | ||
1552 | a grace period to elapse? | ||
1553 | <p>@@QQA@@ | ||
1554 | We could define things this way, but keep in mind that this sort of | ||
1555 | definition would say that updates in garbage-collected languages | ||
1556 | cannot complete until the next time the garbage collector runs, | ||
1557 | which does not seem at all reasonable. | ||
1558 | The key point is that in most cases, an updater using either | ||
1559 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the | ||
1560 | next update as soon as it has invoked <tt>call_rcu()</tt> or | ||
1561 | <tt>kfree_rcu()</tt>, without having to wait for a subsequent | ||
1562 | grace period. | ||
1563 | <p>@@QQE@@ | ||
1564 | |||
1565 | <p> | ||
1566 | But what if the updater must wait for the completion of code to be | ||
1567 | executed after the end of the grace period, but has other tasks | ||
1568 | that can be carried out in the meantime? | ||
1569 | The polling-style <tt>get_state_synchronize_rcu()</tt> and | ||
1570 | <tt>cond_synchronize_rcu()</tt> functions may be used for this | ||
1571 | purpose, as shown below: | ||
1572 | |||
1573 | <blockquote> | ||
1574 | <pre> | ||
1575 | 1 bool remove_gp_poll(void) | ||
1576 | 2 { | ||
1577 | 3 struct foo *p; | ||
1578 | 4 unsigned long s; | ||
1579 | 5 | ||
1580 | 6 spin_lock(&gp_lock); | ||
1581 | 7 p = rcu_access_pointer(gp); | ||
1582 | 8 if (!p) { | ||
1583 | 9 spin_unlock(&gp_lock); | ||
1584 | 10 return false; | ||
1585 | 11 } | ||
1586 | 12 rcu_assign_pointer(gp, NULL); | ||
1587 | 13 spin_unlock(&gp_lock); | ||
1588 | 14 s = get_state_synchronize_rcu(); | ||
1589 | 15 do_something_while_waiting(); | ||
1590 | 16 cond_synchronize_rcu(s); | ||
1591 | 17 kfree(p); | ||
1592 | 18 return true; | ||
1593 | 19 } | ||
1594 | </pre> | ||
1595 | </blockquote> | ||
1596 | |||
1597 | <p> | ||
1598 | On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a | ||
1599 | “cookie” from RCU, | ||
1600 | then line 15 carries out other tasks, | ||
1601 | and finally, line 16 returns immediately if a grace period has | ||
1602 | elapsed in the meantime, but otherwise waits as required. | ||
1603 | The need for <tt>get_state_synchronize_rcu()</tt> and | ||
1604 | <tt>cond_synchronize_rcu()</tt> has appeared quite recently, | ||
1605 | so it is too early to tell whether they will stand the test of time. | ||
1606 | |||
1607 | <p> | ||
1608 | RCU thus provides a range of tools to allow updaters to strike the | ||
1609 | required tradeoff between latency, flexibility and CPU overhead. | ||
1610 | |||
1611 | <h3><a name="Composability">Composability</a></h3> | ||
1612 | |||
1613 | <p> | ||
1614 | Composability has received much attention in recent years, perhaps in part | ||
1615 | due to the collision of multicore hardware with object-oriented techniques | ||
1616 | designed in single-threaded environments for single-threaded use. | ||
1617 | And in theory, RCU read-side critical sections may be composed, and in | ||
1618 | fact may be nested arbitrarily deeply. | ||
1619 | In practice, as with all real-world implementations of composable | ||
1620 | constructs, there are limitations. | ||
1621 | |||
1622 | <p> | ||
1623 | Implementations of RCU for which <tt>rcu_read_lock()</tt> | ||
1624 | and <tt>rcu_read_unlock()</tt> generate no code, such as | ||
1625 | Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be | ||
1626 | nested arbitrarily deeply. | ||
1627 | After all, there is no overhead. | ||
1628 | Except that if all these instances of <tt>rcu_read_lock()</tt> | ||
1629 | and <tt>rcu_read_unlock()</tt> are visible to the compiler, | ||
1630 | compilation will eventually fail due to exhausting memory, | ||
1631 | mass storage, or user patience, whichever comes first. | ||
1632 | If the nesting is not visible to the compiler, as is the case with | ||
1633 | mutually recursive functions each in its own translation unit, | ||
1634 | stack overflow will result. | ||
1635 | If the nesting takes the form of loops, either the control variable | ||
1636 | will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. | ||
1637 | Nevertheless, this class of RCU implementations is one | ||
1638 | of the most composable constructs in existence. | ||
1639 | |||
1640 | <p> | ||
1641 | RCU implementations that explicitly track nesting depth | ||
1642 | are limited by the nesting-depth counter. | ||
1643 | For example, the Linux kernel's preemptible RCU limits nesting to | ||
1644 | <tt>INT_MAX</tt>. | ||
1645 | This should suffice for almost all practical purposes. | ||
1646 | That said, a consecutive pair of RCU read-side critical sections | ||
1647 | between which there is an operation that waits for a grace period | ||
1648 | cannot be enclosed in another RCU read-side critical section. | ||
1649 | This is because it is not legal to wait for a grace period within | ||
1650 | an RCU read-side critical section: To do so would result either | ||
1651 | in deadlock or | ||
1652 | in RCU implicitly splitting the enclosing RCU read-side critical | ||
1653 | section, neither of which is conducive to a long-lived and prosperous | ||
1654 | kernel. | ||
1655 | |||
1656 | <p> | ||
1657 | In short, although RCU read-side critical sections are highly composable, | ||
1658 | care is required in some situations, just as is the case for any other | ||
1659 | composable synchronization mechanism. | ||
1660 | |||
1661 | <h3><a name="Corner Cases">Corner Cases</a></h3> | ||
1662 | |||
1663 | <p> | ||
1664 | A given RCU workload might have an endless and intense stream of | ||
1665 | RCU read-side critical sections, perhaps even so intense that there | ||
1666 | was never a point in time during which there was not at least one | ||
1667 | RCU read-side critical section in flight. | ||
1668 | RCU cannot allow this situation to block grace periods: As long as | ||
1669 | all the RCU read-side critical sections are finite, grace periods | ||
1670 | must also be finite. | ||
1671 | |||
1672 | <p> | ||
1673 | That said, preemptible RCU implementations could potentially result | ||
1674 | in RCU read-side critical sections being preempted for long durations, | ||
1675 | which has the effect of creating a long-duration RCU read-side | ||
1676 | critical section. | ||
1677 | This situation can arise only in heavily loaded systems, but systems using | ||
1678 | real-time priorities are of course more vulnerable. | ||
1679 | Therefore, RCU priority boosting is provided to help deal with this | ||
1680 | case. | ||
1681 | That said, the exact requirements on RCU priority boosting will likely | ||
1682 | evolve as more experience accumulates. | ||
1683 | |||
1684 | <p> | ||
1685 | Other workloads might have very high update rates. | ||
1686 | Although one can argue that such workloads should instead use | ||
1687 | something other than RCU, the fact remains that RCU must | ||
1688 | handle such workloads gracefully. | ||
1689 | This requirement is another factor driving batching of grace periods, | ||
1690 | but it is also the driving force behind the checks for large numbers | ||
1691 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. | ||
1692 | Finally, high update rates should not delay RCU read-side critical | ||
1693 | sections, although some read-side delays can occur when using | ||
1694 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use | ||
1695 | of <tt>try_stop_cpus()</tt>. | ||
1696 | (In the future, <tt>synchronize_rcu_expedited()</tt> will be | ||
1697 | converted to use lighter-weight inter-processor interrupts (IPIs), | ||
1698 | but this will still disturb readers, though to a much smaller degree.) | ||
1699 | |||
1700 | <p> | ||
1701 | Although all three of these corner cases were understood in the early | ||
1702 | 1990s, a simple user-level test consisting of <tt>close(open(path))</tt> | ||
1703 | in a tight loop | ||
1704 | in the early 2000s suddenly provided a much deeper appreciation of the | ||
1705 | high-update-rate corner case. | ||
1706 | This test also motivated addition of some RCU code to react to high update | ||
1707 | rates, for example, if a given CPU finds itself with more than 10,000 | ||
1708 | RCU callbacks queued, it will cause RCU to take evasive action by | ||
1709 | more aggressively starting grace periods and more aggressively forcing | ||
1710 | completion of grace-period processing. | ||
1711 | This evasive action causes the grace period to complete more quickly, | ||
1712 | but at the cost of restricting RCU's batching optimizations, thus | ||
1713 | increasing the CPU overhead incurred by that grace period. | ||
1714 | |||
1715 | <h2><a name="Software-Engineering Requirements"> | ||
1716 | Software-Engineering Requirements</a></h2> | ||
1717 | |||
1718 | <p> | ||
1719 | Between Murphy's Law and “To err is human”, it is necessary to | ||
1720 | guard against mishaps and misuse: | ||
1721 | |||
1722 | <ol> | ||
1723 | <li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> | ||
1724 | everywhere that it is needed, so kernels built with | ||
1725 | <tt>CONFIG_PROVE_RCU=y</tt> will splat if | ||
1726 | <tt>rcu_dereference()</tt> is used outside of an | ||
1727 | RCU read-side critical section. | ||
1728 | Update-side code can use <tt>rcu_dereference_protected()</tt>, | ||
1729 | which takes a | ||
1730 | <a href="https://lwn.net/Articles/371986/">lockdep expression</a> | ||
1731 | to indicate what is providing the protection. | ||
1732 | If the indicated protection is not provided, a lockdep splat | ||
1733 | is emitted. | ||
1734 | |||
1735 | <p> | ||
1736 | Code shared between readers and updaters can use | ||
1737 | <tt>rcu_dereference_check()</tt>, which also takes a | ||
1738 | lockdep expression, and emits a lockdep splat if neither | ||
1739 | <tt>rcu_read_lock()</tt> nor the indicated protection | ||
1740 | is in place. | ||
1741 | In addition, <tt>rcu_dereference_raw()</tt> is used in those | ||
1742 | (hopefully rare) cases where the required protection cannot | ||
1743 | be easily described. | ||
1744 | Finally, <tt>rcu_read_lock_held()</tt> is provided to | ||
1745 | allow a function to verify that it has been invoked within | ||
1746 | an RCU read-side critical section. | ||
1747 | I was made aware of this set of requirements shortly after Thomas | ||
1748 | Gleixner audited a number of RCU uses. | ||
1749 | <li> A given function might wish to check for RCU-related preconditions | ||
1750 | upon entry, before using any other RCU API. | ||
1751 | The <tt>rcu_lockdep_assert()</tt> macro does this job, | ||
1752 | asserting the expression in kernels having lockdep enabled | ||
1753 | and doing nothing otherwise. | ||
1754 | <li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> | ||
1755 | and <tt>rcu_dereference()</tt>, perhaps (incorrectly) | ||
1756 | substituting a simple assignment. | ||
1757 | To catch this sort of error, a given RCU-protected pointer may be | ||
1758 | tagged with <tt>__rcu</tt>, after which running sparse | ||
1759 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain | ||
1760 | about simple-assignment accesses to that pointer. | ||
1761 | Arnd Bergmann made me aware of this requirement, and also | ||
1762 | supplied the needed | ||
1763 | <a href="https://lwn.net/Articles/376011/">patch series</a>. | ||
1764 | <li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> | ||
1765 | will splat if a data element is passed to <tt>call_rcu()</tt> | ||
1766 | twice in a row, without a grace period in between. | ||
1767 | (This error is similar to a double free.) | ||
1768 | The corresponding <tt>rcu_head</tt> structures that are | ||
1769 | dynamically allocated are automatically tracked, but | ||
1770 | <tt>rcu_head</tt> structures allocated on the stack | ||
1771 | must be initialized with <tt>init_rcu_head_on_stack()</tt> | ||
1772 | and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. | ||
1773 | Similarly, statically allocated non-stack <tt>rcu_head</tt> | ||
1774 | structures must be initialized with <tt>init_rcu_head()</tt> | ||
1775 | and cleaned up with <tt>destroy_rcu_head()</tt>. | ||
1776 | Mathieu Desnoyers made me aware of this requirement, and also | ||
1777 | supplied the needed | ||
1778 | <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. | ||
1779 | <li> An infinite loop in an RCU read-side critical section will | ||
1780 | eventually trigger an RCU CPU stall warning splat. | ||
1781 | However, RCU is not obligated to produce this splat | ||
1782 | unless there is a grace period waiting on that particular | ||
1783 | RCU read-side critical section. | ||
1784 | This requirement made itself known in the early 1990s, pretty | ||
1785 | much the first time that it was necessary to debug a CPU stall. | ||
1786 | <li> Although it would be very good to detect pointers leaking out | ||
1787 | of RCU read-side critical sections, there is currently no | ||
1788 | good way of doing this. | ||
1789 | One complication is the need to distinguish between pointers | ||
1790 | leaking and pointers that have been handed off from RCU to | ||
1791 | some other synchronization mechanism, for example, reference | ||
1792 | counting. | ||
1793 | <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related | ||
1794 | information is provided via both debugfs and event tracing. | ||
1795 | <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and | ||
1796 | <tt>rcu_dereference()</tt> to create typical linked | ||
1797 | data structures can be surprisingly error-prone. | ||
1798 | Therefore, RCU-protected | ||
1799 | <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> | ||
1800 | and, more recently, RCU-protected | ||
1801 | <a href="https://lwn.net/Articles/612100/">hash tables</a> | ||
1802 | are available. | ||
1803 | Many other special-purpose RCU-protected data structures are | ||
1804 | available in the Linux kernel and the userspace RCU library. | ||
1805 | <li> Some linked structures are created at compile time, but still | ||
1806 | require <tt>__rcu</tt> checking. | ||
1807 | The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this | ||
1808 | purpose. | ||
1809 | <li> It is not necessary to use <tt>rcu_assign_pointer()</tt> | ||
1810 | when creating linked structures that are to be published via | ||
1811 | a single external pointer. | ||
1812 | The <tt>RCU_INIT_POINTER()</tt> macro is provided for | ||
1813 | this task and also for assigning <tt>NULL</tt> pointers | ||
1814 | at runtime. | ||
1815 | </ol> | ||
1816 | |||
1817 | <p> | ||
1818 | This is not a hard-and-fast list: RCU's diagnostic capabilities will | ||
1819 | continue to be guided by the number and type of usage bugs found | ||
1820 | in real-world RCU usage. | ||
1821 | |||
1822 | <h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> | ||
1823 | |||
1824 | <p> | ||
1825 | The Linux kernel provides an interesting environment for all kinds of | ||
1826 | software, including RCU. | ||
1827 | Some of the relevant points of interest are as follows: | ||
1828 | |||
1829 | <ol> | ||
1830 | <li> <a href="#Configuration">Configuration</a>. | ||
1831 | <li> <a href="#Firmware Interface">Firmware Interface</a>. | ||
1832 | <li> <a href="#Early Boot">Early Boot</a>. | ||
1833 | <li> <a href="#Interrupts and NMIs"> | ||
1834 | Interrupts and non-maskable interrupts (NMIs)</a>. | ||
1835 | <li> <a href="#Loadable Modules">Loadable Modules</a>. | ||
1836 | <li> <a href="#Hotplug CPU">Hotplug CPU</a>. | ||
1837 | <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. | ||
1838 | <li> <a href="#Tracing and RCU">Tracing and RCU</a>. | ||
1839 | <li> <a href="#Energy Efficiency">Energy Efficiency</a>. | ||
1840 | <li> <a href="#Performance, Scalability, Response Time, and Reliability"> | ||
1841 | Performance, Scalability, Response Time, and Reliability</a>. | ||
1842 | </ol> | ||
1843 | |||
1844 | <p> | ||
1845 | This list is probably incomplete, but it does give a feel for the | ||
1846 | most notable Linux-kernel complications. | ||
1847 | Each of the following sections covers one of the above topics. | ||
1848 | |||
1849 | <h3><a name="Configuration">Configuration</a></h3> | ||
1850 | |||
1851 | <p> | ||
1852 | RCU's goal is automatic configuration, so that almost nobody | ||
1853 | needs to worry about RCU's <tt>Kconfig</tt> options. | ||
1854 | And for almost all users, RCU does in fact work well | ||
1855 | “out of the box.” | ||
1856 | |||
1857 | <p> | ||
1858 | However, there are specialized use cases that are handled by | ||
1859 | kernel boot parameters and <tt>Kconfig</tt> options. | ||
1860 | Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users | ||
1861 | about new <tt>Kconfig</tt> options, which requires almost all of them | ||
1862 | be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. | ||
1863 | |||
1864 | <p> | ||
1865 | This all should be quite obvious, but the fact remains that | ||
1866 | Linus Torvalds recently had to | ||
1867 | <a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> | ||
1868 | me of this requirement. | ||
1869 | |||
1870 | <h3><a name="Firmware Interface">Firmware Interface</a></h3> | ||
1871 | |||
1872 | <p> | ||
1873 | In many cases, the kernel obtains information about the system from the | ||
1874 | firmware, and sometimes things are lost in translation. | ||
1875 | Or the translation is accurate, but the original message is bogus. | ||
1876 | |||
1877 | <p> | ||
1878 | For example, some systems' firmware overreports the number of CPUs, | ||
1879 | sometimes by a large factor. | ||
1880 | If RCU naively believed the firmware, as it used to do, | ||
1881 | it would create too many per-CPU kthreads. | ||
1882 | Although the resulting system will still run correctly, the extra | ||
1883 | kthreads needlessly consume memory and can cause confusion | ||
1884 | when they show up in <tt>ps</tt> listings. | ||
1885 | |||
1886 | <p> | ||
1887 | RCU must therefore wait for a given CPU to actually come online before | ||
1888 | it can allow itself to believe that the CPU actually exists. | ||
1889 | The resulting “ghost CPUs” (which are never going to | ||
1890 | come online) cause a number of | ||
1891 | <a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. | ||
1892 | |||
1893 | <h3><a name="Early Boot">Early Boot</a></h3> | ||
1894 | |||
1895 | <p> | ||
1896 | The Linux kernel's boot sequence is an interesting process, | ||
1897 | and RCU is used early, even before <tt>rcu_init()</tt> | ||
1898 | is invoked. | ||
1899 | In fact, a number of RCU's primitives can be used as soon as the | ||
1900 | initial task's <tt>task_struct</tt> is available and the | ||
1901 | boot CPU's per-CPU variables are set up. | ||
1902 | The read-side primitives (<tt>rcu_read_lock()</tt>, | ||
1903 | <tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, | ||
1904 | and <tt>rcu_access_pointer()</tt>) will operate normally very early on, | ||
1905 | as will <tt>rcu_assign_pointer()</tt>. | ||
1906 | |||
1907 | <p> | ||
1908 | Although <tt>call_rcu()</tt> may be invoked at any | ||
1909 | time during boot, callbacks are not guaranteed to be invoked until after | ||
1910 | the scheduler is fully up and running. | ||
1911 | This delay in callback invocation is due to the fact that RCU does not | ||
1912 | invoke callbacks until it is fully initialized, and this full initialization | ||
1913 | cannot occur until after the scheduler has initialized itself to the | ||
1914 | point where RCU can spawn and run its kthreads. | ||
1915 | In theory, it would be possible to invoke callbacks earlier, | ||
1916 | however, this is not a panacea because there would be severe restrictions | ||
1917 | on what operations those callbacks could invoke. | ||
1918 | |||
1919 | <p> | ||
1920 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, | ||
1921 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> | ||
1922 | (<a href="#Bottom-Half Flavor">discussed below</a>), | ||
1923 | and | ||
1924 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> | ||
1925 | will all operate normally | ||
1926 | during very early boot, the reason being that there is only one CPU | ||
1927 | and preemption is disabled. | ||
1928 | This means that the call <tt>synchronize_rcu()</tt> (or friends) | ||
1929 | itself is a quiescent | ||
1930 | state and thus a grace period, so the early-boot implementation can | ||
1931 | be a no-op. | ||
1932 | |||
1933 | <p> | ||
1934 | Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> | ||
1935 | continue to operate normally through the remainder of boot, courtesy | ||
1936 | of the fact that preemption is disabled across their RCU read-side | ||
1937 | critical sections and also courtesy of the fact that there is still | ||
1938 | only one CPU. | ||
1939 | However, once the scheduler starts initializing, preemption is enabled. | ||
1940 | There is still only a single CPU, but the fact that preemption is enabled | ||
1941 | means that the no-op implementation of <tt>synchronize_rcu()</tt> no | ||
1942 | longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. | ||
1943 | Therefore, as soon as the scheduler starts initializing, the early-boot | ||
1944 | fastpath is disabled. | ||
1945 | This means that <tt>synchronize_rcu()</tt> switches to its runtime | ||
1946 | mode of operation where it posts callbacks, which in turn means that | ||
1947 | any call to <tt>synchronize_rcu()</tt> will block until the corresponding | ||
1948 | callback is invoked. | ||
1949 | Unfortunately, the callback cannot be invoked until RCU's runtime | ||
1950 | grace-period machinery is up and running, which cannot happen until | ||
1951 | the scheduler has initialized itself sufficiently to allow RCU's | ||
1952 | kthreads to be spawned. | ||
1953 | Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler | ||
1954 | initialization can result in deadlock. | ||
1955 | |||
1956 | <p>@@QQ@@ | ||
1957 | So what happens with <tt>synchronize_rcu()</tt> during | ||
1958 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | ||
1959 | kernels? | ||
1960 | <p>@@QQA@@ | ||
1961 | In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> | ||
1962 | maps directly to <tt>synchronize_sched()</tt>. | ||
1963 | Therefore, <tt>synchronize_rcu()</tt> works normally throughout | ||
1964 | boot in <tt>CONFIG_PREEMPT=n</tt> kernels. | ||
1965 | However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
1966 | so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> | ||
1967 | during scheduler initialization. | ||
1968 | <p>@@QQE@@ | ||
1969 | |||
1970 | <p> | ||
1971 | I learned of these boot-time requirements as a result of a series of | ||
1972 | system hangs. | ||
1973 | |||
1974 | <h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> | ||
1975 | |||
1976 | <p> | ||
1977 | The Linux kernel has interrupts, and RCU read-side critical sections are | ||
1978 | legal within interrupt handlers and within interrupt-disabled regions | ||
1979 | of code, as are invocations of <tt>call_rcu()</tt>. | ||
1980 | |||
1981 | <p> | ||
1982 | Some Linux-kernel architectures can enter an interrupt handler from | ||
1983 | non-idle process context, and then just never leave it, instead stealthily | ||
1984 | transitioning back to process context. | ||
1985 | This trick is sometimes used to invoke system calls from inside the kernel. | ||
1986 | These “half-interrupts” mean that RCU has to be very careful | ||
1987 | about how it counts interrupt nesting levels. | ||
1988 | I learned of this requirement the hard way during a rewrite | ||
1989 | of RCU's dyntick-idle code. | ||
1990 | |||
1991 | <p> | ||
1992 | The Linux kernel has non-maskable interrupts (NMIs), and | ||
1993 | RCU read-side critical sections are legal within NMI handlers. | ||
1994 | Thankfully, RCU update-side primitives, including | ||
1995 | <tt>call_rcu()</tt>, are prohibited within NMI handlers. | ||
1996 | |||
1997 | <p> | ||
1998 | The name notwithstanding, some Linux-kernel architectures | ||
1999 | can have nested NMIs, which RCU must handle correctly. | ||
2000 | Andy Lutomirski | ||
2001 | <a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> | ||
2002 | with this requirement; | ||
2003 | he also kindly surprised me with | ||
2004 | <a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> | ||
2005 | that meets this requirement. | ||
2006 | |||
2007 | <h3><a name="Loadable Modules">Loadable Modules</a></h3> | ||
2008 | |||
2009 | <p> | ||
2010 | The Linux kernel has loadable modules, and these modules can | ||
2011 | also be unloaded. | ||
2012 | After a given module has been unloaded, any attempt to call | ||
2013 | one of its functions results in a segmentation fault. | ||
2014 | The module-unload functions must therefore cancel any | ||
2015 | delayed calls to loadable-module functions, for example, | ||
2016 | any outstanding <tt>mod_timer()</tt> must be dealt with | ||
2017 | via <tt>del_timer_sync()</tt> or similar. | ||
2018 | |||
2019 | <p> | ||
2020 | Unfortunately, there is no way to cancel an RCU callback; | ||
2021 | once you invoke <tt>call_rcu()</tt>, the callback function is | ||
2022 | going to eventually be invoked, unless the system goes down first. | ||
2023 | Because it is normally considered socially irresponsible to crash the system | ||
2024 | in response to a module unload request, we need some other way | ||
2025 | to deal with in-flight RCU callbacks. | ||
2026 | |||
2027 | <p> | ||
2028 | RCU therefore provides | ||
2029 | <tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, | ||
2030 | which waits until all in-flight RCU callbacks have been invoked. | ||
2031 | If a module uses <tt>call_rcu()</tt>, its exit function should therefore | ||
2032 | prevent any future invocation of <tt>call_rcu()</tt>, then invoke | ||
2033 | <tt>rcu_barrier()</tt>. | ||
2034 | In theory, the underlying module-unload code could invoke | ||
2035 | <tt>rcu_barrier()</tt> unconditionally, but in practice this would | ||
2036 | incur unacceptable latencies. | ||
2037 | |||
2038 | <p> | ||
2039 | Nikita Danilov noted this requirement for an analogous filesystem-unmount | ||
2040 | situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. | ||
2041 | The need for <tt>rcu_barrier()</tt> for module unloading became | ||
2042 | apparent later. | ||
2043 | |||
2044 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> | ||
2045 | |||
2046 | <p> | ||
2047 | The Linux kernel supports CPU hotplug, which means that CPUs | ||
2048 | can come and go. | ||
2049 | It is of course illegal to use any RCU API member from an offline CPU. | ||
2050 | This requirement was present from day one in DYNIX/ptx, but | ||
2051 | on the other hand, the Linux kernel's CPU-hotplug implementation | ||
2052 | is “interesting.” | ||
2053 | |||
2054 | <p> | ||
2055 | The Linux-kernel CPU-hotplug implementation has notifiers that | ||
2056 | are used to allow the various kernel subsystems (including RCU) | ||
2057 | to respond appropriately to a given CPU-hotplug operation. | ||
2058 | Most RCU operations may be invoked from CPU-hotplug notifiers, | ||
2059 | including even normal synchronous grace-period operations | ||
2060 | such as <tt>synchronize_rcu()</tt>. | ||
2061 | However, expedited grace-period operations such as | ||
2062 | <tt>synchronize_rcu_expedited()</tt> are not supported, | ||
2063 | due to the fact that current implementations block CPU-hotplug | ||
2064 | operations, which could result in deadlock. | ||
2065 | |||
2066 | <p> | ||
2067 | In addition, all-callback-wait operations such as | ||
2068 | <tt>rcu_barrier()</tt> are also not supported, due to the | ||
2069 | fact that there are phases of CPU-hotplug operations where | ||
2070 | the outgoing CPU's callbacks will not be invoked until after | ||
2071 | the CPU-hotplug operation ends, which could also result in deadlock. | ||
2072 | |||
2073 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> | ||
2074 | |||
2075 | <p> | ||
2076 | RCU depends on the scheduler, and the scheduler uses RCU to | ||
2077 | protect some of its data structures. | ||
2078 | This means the scheduler is forbidden from acquiring | ||
2079 | the runqueue locks and the priority-inheritance locks | ||
2080 | in the middle of an outermost RCU read-side critical section unless | ||
2081 | it also releases them before exiting that same | ||
2082 | RCU read-side critical section. | ||
2083 | This same prohibition also applies to any lock that is acquired | ||
2084 | while holding any lock to which this prohibition applies. | ||
2085 | Violating this rule results in deadlock. | ||
2086 | |||
2087 | <p> | ||
2088 | For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> | ||
2089 | implementation must be written carefully to avoid similar deadlocks. | ||
2090 | In particular, <tt>rcu_read_unlock()</tt> must tolerate an | ||
2091 | interrupt where the interrupt handler invokes both | ||
2092 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
2093 | This possibility requires <tt>rcu_read_unlock()</tt> to use | ||
2094 | negative nesting levels to avoid destructive recursion via | ||
2095 | interrupt handler's use of RCU. | ||
2096 | |||
2097 | <p> | ||
2098 | This pair of mutual scheduler-RCU requirements came as a | ||
2099 | <a href="https://lwn.net/Articles/453002/">complete surprise</a>. | ||
2100 | |||
2101 | <p> | ||
2102 | As noted above, RCU makes use of kthreads, and it is necessary to | ||
2103 | avoid excessive CPU-time accumulation by these kthreads. | ||
2104 | This requirement was no surprise, but RCU's violation of it | ||
2105 | when running context-switch-heavy workloads when built with | ||
2106 | <tt>CONFIG_NO_HZ_FULL=y</tt> | ||
2107 | <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. | ||
2108 | RCU has made good progress towards meeting this requirement, even | ||
2109 | for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, | ||
2110 | but there is room for further improvement. | ||
2111 | |||
2112 | <h3><a name="Tracing and RCU">Tracing and RCU</a></h3> | ||
2113 | |||
2114 | <p> | ||
2115 | It is possible to use tracing on RCU code, but tracing itself | ||
2116 | uses RCU. | ||
2117 | For this reason, <tt>rcu_dereference_raw_notrace()</tt> | ||
2118 | is provided for use by tracing, which avoids the destructive | ||
2119 | recursion that could otherwise ensue. | ||
2120 | This API is also used by virtualization in some architectures, | ||
2121 | where RCU readers execute in environments in which tracing | ||
2122 | cannot be used. | ||
2123 | The tracing folks both located the requirement and provided the | ||
2124 | needed fix, so this surprise requirement was relatively painless. | ||
2125 | |||
2126 | <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> | ||
2127 | |||
2128 | <p> | ||
2129 | Interrupting idle CPUs is considered socially unacceptable, | ||
2130 | especially by people with battery-powered embedded systems. | ||
2131 | RCU therefore conserves energy by detecting which CPUs are | ||
2132 | idle, including tracking CPUs that have been interrupted from idle. | ||
2133 | This is a large part of the energy-efficiency requirement, | ||
2134 | so I learned of this via an irate phone call. | ||
2135 | |||
2136 | <p> | ||
2137 | Because RCU avoids interrupting idle CPUs, it is illegal to | ||
2138 | execute an RCU read-side critical section on an idle CPU. | ||
2139 | (Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat | ||
2140 | if you try it.) | ||
2141 | The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> | ||
2142 | event tracing are provided to work around this restriction. | ||
2143 | In addition, <tt>rcu_is_watching()</tt> may be used to | ||
2144 | test whether or not it is currently legal to run RCU read-side | ||
2145 | critical sections on this CPU. | ||
2146 | I learned of the need for diagnostics on the one hand | ||
2147 | and <tt>RCU_NONIDLE()</tt> on the other while inspecting | ||
2148 | idle-loop code. | ||
2149 | Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, | ||
2150 | which is used quite heavily in the idle loop. | ||
2151 | |||
2152 | <p> | ||
2153 | It is similarly socially unacceptable to interrupt an | ||
2154 | <tt>nohz_full</tt> CPU running in userspace. | ||
2155 | RCU must therefore track <tt>nohz_full</tt> userspace | ||
2156 | execution. | ||
2157 | And in | ||
2158 | <a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> | ||
2159 | kernels, RCU must separately track idle CPUs on the one hand and | ||
2160 | CPUs that are either idle or executing in userspace on the other. | ||
2161 | In both cases, RCU must be able to sample state at two points in | ||
2162 | time, and be able to determine whether or not some other CPU spent | ||
2163 | any time idle and/or executing in userspace. | ||
2164 | |||
2165 | <p> | ||
2166 | These energy-efficiency requirements have proven quite difficult to | ||
2167 | understand and to meet, for example, there have been more than five | ||
2168 | clean-sheet rewrites of RCU's energy-efficiency code, the last of | ||
2169 | which was finally able to demonstrate | ||
2170 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. | ||
2171 | As noted earlier, | ||
2172 | I learned of many of these requirements via angry phone calls: | ||
2173 | Flaming me on the Linux-kernel mailing list was apparently not | ||
2174 | sufficient to fully vent their ire at RCU's energy-efficiency bugs! | ||
2175 | |||
2176 | <h3><a name="Performance, Scalability, Response Time, and Reliability"> | ||
2177 | Performance, Scalability, Response Time, and Reliability</a></h3> | ||
2178 | |||
2179 | <p> | ||
2180 | Expanding on the | ||
2181 | <a href="#Performance and Scalability">earlier discussion</a>, | ||
2182 | RCU is used heavily by hot code paths in performance-critical | ||
2183 | portions of the Linux kernel's networking, security, virtualization, | ||
2184 | and scheduling code paths. | ||
2185 | RCU must therefore use efficient implementations, especially in its | ||
2186 | read-side primitives. | ||
2187 | To that end, it would be good if preemptible RCU's implementation | ||
2188 | of <tt>rcu_read_lock()</tt> could be inlined, however, doing | ||
2189 | this requires resolving <tt>#include</tt> issues with the | ||
2190 | <tt>task_struct</tt> structure. | ||
2191 | |||
2192 | <p> | ||
2193 | The Linux kernel supports hardware configurations with up to | ||
2194 | 4096 CPUs, which means that RCU must be extremely scalable. | ||
2195 | Algorithms that involve frequent acquisitions of global locks or | ||
2196 | frequent atomic operations on global variables simply cannot be | ||
2197 | tolerated within the RCU implementation. | ||
2198 | RCU therefore makes heavy use of a combining tree based on the | ||
2199 | <tt>rcu_node</tt> structure. | ||
2200 | RCU is required to tolerate all CPUs continuously invoking any | ||
2201 | combination of RCU's runtime primitives with minimal per-operation | ||
2202 | overhead. | ||
2203 | In fact, in many cases, increasing load must <i>decrease</i> the | ||
2204 | per-operation overhead, witness the batching optimizations for | ||
2205 | <tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, | ||
2206 | <tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. | ||
2207 | As a general rule, RCU must cheerfully accept whatever the | ||
2208 | rest of the Linux kernel decides to throw at it. | ||
2209 | |||
2210 | <p> | ||
2211 | The Linux kernel is used for real-time workloads, especially | ||
2212 | in conjunction with the | ||
2213 | <a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. | ||
2214 | The real-time-latency response requirements are such that the | ||
2215 | traditional approach of disabling preemption across RCU | ||
2216 | read-side critical sections is inappropriate. | ||
2217 | Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore | ||
2218 | use an RCU implementation that allows RCU read-side critical | ||
2219 | sections to be preempted. | ||
2220 | This requirement made its presence known after users made it | ||
2221 | clear that an earlier | ||
2222 | <a href="https://lwn.net/Articles/107930/">real-time patch</a> | ||
2223 | did not meet their needs, in conjunction with some | ||
2224 | <a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> | ||
2225 | encountered by a very early version of the -rt patchset. | ||
2226 | |||
2227 | <p> | ||
2228 | In addition, RCU must make do with a sub-100-microsecond real-time latency | ||
2229 | budget. | ||
2230 | In fact, on smaller systems with the -rt patchset, the Linux kernel | ||
2231 | provides sub-20-microsecond real-time latencies for the whole kernel, | ||
2232 | including RCU. | ||
2233 | RCU's scalability and latency must therefore be sufficient for | ||
2234 | these sorts of configurations. | ||
2235 | To my surprise, the sub-100-microsecond real-time latency budget | ||
2236 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> | ||
2237 | applies to even the largest systems [PDF]</a>, | ||
2238 | up to and including systems with 4096 CPUs. | ||
2239 | This real-time requirement motivated the grace-period kthread, which | ||
2240 | also simplified handling of a number of race conditions. | ||
2241 | |||
2242 | <p> | ||
2243 | Finally, RCU's status as a synchronization primitive means that | ||
2244 | any RCU failure can result in arbitrary memory corruption that can be | ||
2245 | extremely difficult to debug. | ||
2246 | This means that RCU must be extremely reliable, which in | ||
2247 | practice also means that RCU must have an aggressive stress-test | ||
2248 | suite. | ||
2249 | This stress-test suite is called <tt>rcutorture</tt>. | ||
2250 | |||
2251 | <p> | ||
2252 | Although the need for <tt>rcutorture</tt> was no surprise, | ||
2253 | the current immense popularity of the Linux kernel is posing | ||
2254 | interesting—and perhaps unprecedented—validation | ||
2255 | challenges. | ||
2256 | To see this, keep in mind that there are well over one billion | ||
2257 | instances of the Linux kernel running today, given Android | ||
2258 | smartphones, Linux-powered televisions, and servers. | ||
2259 | This number can be expected to increase sharply with the advent of | ||
2260 | the celebrated Internet of Things. | ||
2261 | |||
2262 | <p> | ||
2263 | Suppose that RCU contains a race condition that manifests on average | ||
2264 | once per million years of runtime. | ||
2265 | This bug will be occurring about three times per <i>day</i> across | ||
2266 | the installed base. | ||
2267 | RCU could simply hide behind hardware error rates, given that no one | ||
2268 | should really expect their smartphone to last for a million years. | ||
2269 | However, anyone taking too much comfort from this thought should | ||
2270 | consider the fact that in most jurisdictions, a successful multi-year | ||
2271 | test of a given mechanism, which might include a Linux kernel, | ||
2272 | suffices for a number of types of safety-critical certifications. | ||
2273 | In fact, rumor has it that the Linux kernel is already being used | ||
2274 | in production for safety-critical applications. | ||
2275 | I don't know about you, but I would feel quite bad if a bug in RCU | ||
2276 | killed someone. | ||
2277 | Which might explain my recent focus on validation and verification. | ||
2278 | |||
2279 | <h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> | ||
2280 | |||
2281 | <p> | ||
2282 | One of the more surprising things about RCU is that there are now | ||
2283 | no fewer than five <i>flavors</i>, or API families. | ||
2284 | In addition, the primary flavor that has been the sole focus up to | ||
2285 | this point has two different implementations, non-preemptible and | ||
2286 | preemptible. | ||
2287 | The other four flavors are listed below, with requirements for each | ||
2288 | described in a separate section. | ||
2289 | |||
2290 | <ol> | ||
2291 | <li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> | ||
2292 | <li> <a href="#Sched Flavor">Sched Flavor</a> | ||
2293 | <li> <a href="#Sleepable RCU">Sleepable RCU</a> | ||
2294 | <li> <a href="#Tasks RCU">Tasks RCU</a> | ||
2295 | </ol> | ||
2296 | |||
2297 | <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> | ||
2298 | |||
2299 | <p> | ||
2300 | The softirq-disable (AKA “bottom-half”, | ||
2301 | hence the “_bh” abbreviations) | ||
2302 | flavor of RCU, or <i>RCU-bh</i>, was developed by | ||
2303 | Dipankar Sarma to provide a flavor of RCU that could withstand the | ||
2304 | network-based denial-of-service attacks researched by Robert | ||
2305 | Olsson. | ||
2306 | These attacks placed so much networking load on the system | ||
2307 | that some of the CPUs never exited softirq execution, | ||
2308 | which in turn prevented those CPUs from ever executing a context switch, | ||
2309 | which, in the RCU implementation of that time, prevented grace periods | ||
2310 | from ever ending. | ||
2311 | The result was an out-of-memory condition and a system hang. | ||
2312 | |||
2313 | <p> | ||
2314 | The solution was the creation of RCU-bh, which does | ||
2315 | <tt>local_bh_disable()</tt> | ||
2316 | across its read-side critical sections, and which uses the transition | ||
2317 | from one type of softirq processing to another as a quiescent state | ||
2318 | in addition to context switch, idle, user mode, and offline. | ||
2319 | This means that RCU-bh grace periods can complete even when some of | ||
2320 | the CPUs execute in softirq indefinitely, thus allowing algorithms | ||
2321 | based on RCU-bh to withstand network-based denial-of-service attacks. | ||
2322 | |||
2323 | <p> | ||
2324 | Because | ||
2325 | <tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> | ||
2326 | disable and re-enable softirq handlers, any attempt to start a softirq | ||
2327 | handler during the | ||
2328 | RCU-bh read-side critical section will be deferred. | ||
2329 | In this case, <tt>rcu_read_unlock_bh()</tt> | ||
2330 | will invoke softirq processing, which can take considerable time. | ||
2331 | One can of course argue that this softirq overhead should be associated | ||
2332 | with the code following the RCU-bh read-side critical section rather | ||
2333 | than <tt>rcu_read_unlock_bh()</tt>, but the fact | ||
2334 | is that most profiling tools cannot be expected to make this sort | ||
2335 | of fine distinction. | ||
2336 | For example, suppose that a three-millisecond-long RCU-bh read-side | ||
2337 | critical section executes during a time of heavy networking load. | ||
2338 | There will very likely be an attempt to invoke at least one softirq | ||
2339 | handler during that three milliseconds, but any such invocation will | ||
2340 | be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. | ||
2341 | This can of course make it appear at first glance as if | ||
2342 | <tt>rcu_read_unlock_bh()</tt> was executing very slowly. | ||
2343 | |||
2344 | <p> | ||
2345 | The | ||
2346 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> | ||
2347 | includes | ||
2348 | <tt>rcu_read_lock_bh()</tt>, | ||
2349 | <tt>rcu_read_unlock_bh()</tt>, | ||
2350 | <tt>rcu_dereference_bh()</tt>, | ||
2351 | <tt>rcu_dereference_bh_check()</tt>, | ||
2352 | <tt>synchronize_rcu_bh()</tt>, | ||
2353 | <tt>synchronize_rcu_bh_expedited()</tt>, | ||
2354 | <tt>call_rcu_bh()</tt>, | ||
2355 | <tt>rcu_barrier_bh()</tt>, and | ||
2356 | <tt>rcu_read_lock_bh_held()</tt>. | ||
2357 | |||
2358 | <h3><a name="Sched Flavor">Sched Flavor</a></h3> | ||
2359 | |||
2360 | <p> | ||
2361 | Before preemptible RCU, waiting for an RCU grace period had the | ||
2362 | side effect of also waiting for all pre-existing interrupt | ||
2363 | and NMI handlers. | ||
2364 | However, there are legitimate preemptible-RCU implementations that | ||
2365 | do not have this property, given that any point in the code outside | ||
2366 | of an RCU read-side critical section can be a quiescent state. | ||
2367 | Therefore, <i>RCU-sched</i> was created, which follows “classic” | ||
2368 | RCU in that an RCU-sched grace period waits for pre-existing | ||
2369 | interrupt and NMI handlers. | ||
2370 | In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched | ||
2371 | APIs have identical implementations, while kernels built with | ||
2372 | <tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. | ||
2373 | |||
2374 | <p> | ||
2375 | Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
2376 | <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> | ||
2377 | disable and re-enable preemption, respectively. | ||
2378 | This means that if there was a preemption attempt during the | ||
2379 | RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> | ||
2380 | will enter the scheduler, with all the latency and overhead entailed. | ||
2381 | Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look | ||
2382 | as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. | ||
2383 | However, the highest-priority task won't be preempted, so that task | ||
2384 | will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. | ||
2385 | |||
2386 | <p> | ||
2387 | The | ||
2388 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> | ||
2389 | includes | ||
2390 | <tt>rcu_read_lock_sched()</tt>, | ||
2391 | <tt>rcu_read_unlock_sched()</tt>, | ||
2392 | <tt>rcu_read_lock_sched_notrace()</tt>, | ||
2393 | <tt>rcu_read_unlock_sched_notrace()</tt>, | ||
2394 | <tt>rcu_dereference_sched()</tt>, | ||
2395 | <tt>rcu_dereference_sched_check()</tt>, | ||
2396 | <tt>synchronize_sched()</tt>, | ||
2397 | <tt>synchronize_rcu_sched_expedited()</tt>, | ||
2398 | <tt>call_rcu_sched()</tt>, | ||
2399 | <tt>rcu_barrier_sched()</tt>, and | ||
2400 | <tt>rcu_read_lock_sched_held()</tt>. | ||
2401 | However, anything that disables preemption also marks an RCU-sched | ||
2402 | read-side critical section, including | ||
2403 | <tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, | ||
2404 | <tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, | ||
2405 | and so on. | ||
2406 | |||
2407 | <h3><a name="Sleepable RCU">Sleepable RCU</a></h3> | ||
2408 | |||
2409 | <p> | ||
2410 | For well over a decade, someone saying “I need to block within | ||
2411 | an RCU read-side critical section” was a reliable indication | ||
2412 | that this someone did not understand RCU. | ||
2413 | After all, if you are always blocking in an RCU read-side critical | ||
2414 | section, you can probably afford to use a higher-overhead synchronization | ||
2415 | mechanism. | ||
2416 | However, that changed with the advent of the Linux kernel's notifiers, | ||
2417 | whose RCU read-side critical | ||
2418 | sections almost never sleep, but sometimes need to. | ||
2419 | This resulted in the introduction of | ||
2420 | <a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, | ||
2421 | or <i>SRCU</i>. | ||
2422 | |||
2423 | <p> | ||
2424 | SRCU allows different domains to be defined, with each such domain | ||
2425 | defined by an instance of an <tt>srcu_struct</tt> structure. | ||
2426 | A pointer to this structure must be passed in to each SRCU function, | ||
2427 | for example, <tt>synchronize_srcu(&ss)</tt>, where | ||
2428 | <tt>ss</tt> is the <tt>srcu_struct</tt> structure. | ||
2429 | The key benefit of these domains is that a slow SRCU reader in one | ||
2430 | domain does not delay an SRCU grace period in some other domain. | ||
2431 | That said, one consequence of these domains is that read-side code | ||
2432 | must pass a “cookie” from <tt>srcu_read_lock()</tt> | ||
2433 | to <tt>srcu_read_unlock()</tt>, for example, as follows: | ||
2434 | |||
2435 | <blockquote> | ||
2436 | <pre> | ||
2437 | 1 int idx; | ||
2438 | 2 | ||
2439 | 3 idx = srcu_read_lock(&ss); | ||
2440 | 4 do_something(); | ||
2441 | 5 srcu_read_unlock(&ss, idx); | ||
2442 | </pre> | ||
2443 | </blockquote> | ||
2444 | |||
2445 | <p> | ||
2446 | As noted above, it is legal to block within SRCU read-side critical sections; | ||
2447 | however, with great power comes great responsibility. | ||
2448 | If you block forever in one of a given domain's SRCU read-side critical | ||
2449 | sections, then that domain's grace periods will also be blocked forever. | ||
2450 | Of course, one good way to block forever is to deadlock, which can | ||
2451 | happen if any operation in a given domain's SRCU read-side critical | ||
2452 | section can block waiting, either directly or indirectly, for that domain's | ||
2453 | grace period to elapse. | ||
2454 | For example, this results in a self-deadlock: | ||
2455 | |||
2456 | <blockquote> | ||
2457 | <pre> | ||
2458 | 1 int idx; | ||
2459 | 2 | ||
2460 | 3 idx = srcu_read_lock(&ss); | ||
2461 | 4 do_something(); | ||
2462 | 5 synchronize_srcu(&ss); | ||
2463 | 6 srcu_read_unlock(&ss, idx); | ||
2464 | </pre> | ||
2465 | </blockquote> | ||
2466 | |||
2467 | <p> | ||
2468 | However, if line 5 acquired a mutex that was held across | ||
2469 | a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, | ||
2470 | deadlock would still be possible. | ||
2471 | Furthermore, if line 5 acquired a mutex that was held across | ||
2472 | a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, | ||
2473 | and if an <tt>ss1</tt>-domain SRCU read-side critical section | ||
2474 | acquired another mutex that was held across an <tt>ss</tt>-domain | ||
2475 | <tt>synchronize_srcu()</tt>, | ||
2476 | deadlock would again be possible. | ||
2477 | Such a deadlock cycle could extend across an arbitrarily large number | ||
2478 | of different SRCU domains. | ||
2479 | Again, with great power comes great responsibility. | ||
2480 | |||
2481 | <p> | ||
2482 | Unlike the other RCU flavors, SRCU read-side critical sections can | ||
2483 | run on idle and even offline CPUs. | ||
2484 | This ability requires that <tt>srcu_read_lock()</tt> and | ||
2485 | <tt>srcu_read_unlock()</tt> contain memory barriers, which means | ||
2486 | that SRCU readers will run a bit slower than would RCU readers. | ||
2487 | It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> | ||
2488 | API, which, in combination with <tt>srcu_read_unlock()</tt>, | ||
2489 | guarantees a full memory barrier. | ||
2490 | |||
2491 | <p> | ||
2492 | The | ||
2493 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> | ||
2494 | includes | ||
2495 | <tt>srcu_read_lock()</tt>, | ||
2496 | <tt>srcu_read_unlock()</tt>, | ||
2497 | <tt>srcu_dereference()</tt>, | ||
2498 | <tt>srcu_dereference_check()</tt>, | ||
2499 | <tt>synchronize_srcu()</tt>, | ||
2500 | <tt>synchronize_srcu_expedited()</tt>, | ||
2501 | <tt>call_srcu()</tt>, | ||
2502 | <tt>srcu_barrier()</tt>, and | ||
2503 | <tt>srcu_read_lock_held()</tt>. | ||
2504 | It also includes | ||
2505 | <tt>DEFINE_SRCU()</tt>, | ||
2506 | <tt>DEFINE_STATIC_SRCU()</tt>, and | ||
2507 | <tt>init_srcu_struct()</tt> | ||
2508 | APIs for defining and initializing <tt>srcu_struct</tt> structures. | ||
2509 | |||
2510 | <h3><a name="Tasks RCU">Tasks RCU</a></h3> | ||
2511 | |||
2512 | <p> | ||
2513 | Some forms of tracing use &#8220;trampolines&#8221; to handle the | ||
2514 | binary rewriting required to install different types of probes. | ||
2515 | It would be good to be able to free old trampolines, which sounds | ||
2516 | like a job for some form of RCU. | ||
2517 | However, because it is necessary to be able to install a trace | ||
2518 | anywhere in the code, it is not possible to use read-side markers | ||
2519 | such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
2520 | In addition, it does not work to have these markers in the trampoline | ||
2521 | itself, because there would need to be instructions following | ||
2522 | <tt>rcu_read_unlock()</tt>. | ||
2523 | Although <tt>synchronize_rcu()</tt> would guarantee that execution | ||
2524 | reached the <tt>rcu_read_unlock()</tt>, it would not be able to | ||
2525 | guarantee that execution had completely left the trampoline. | ||
2526 | |||
2527 | <p> | ||
2528 | The solution, in the form of | ||
2529 | <a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, | ||
2530 | is to have implicit | ||
2531 | read-side critical sections that are delimited by voluntary context | ||
2532 | switches, that is, calls to <tt>schedule()</tt>, | ||
2533 | <tt>cond_resched_rcu_qs()</tt>, and | ||
2534 | <tt>synchronize_rcu_tasks()</tt>. | ||
2535 | In addition, transitions to and from userspace execution also delimit | ||
2536 | tasks-RCU read-side critical sections. | ||
2537 | |||
2538 | <p> | ||
2539 | The tasks-RCU API is quite compact, consisting only of | ||
2540 | <tt>call_rcu_tasks()</tt>, | ||
2541 | <tt>synchronize_rcu_tasks()</tt>, and | ||
2542 | <tt>rcu_barrier_tasks()</tt>. | ||
2543 | |||
2544 | <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> | ||
2545 | |||
2546 | <p> | ||
2547 | One of the tricks that RCU uses to attain update-side scalability is | ||
2548 | to increase grace-period latency with increasing numbers of CPUs. | ||
2549 | If this becomes a serious problem, it will be necessary to rework the | ||
2550 | grace-period state machine so as to avoid the need for the additional | ||
2551 | latency. | ||
2552 | |||
2553 | <p> | ||
2554 | Expedited grace periods scan the CPUs, so their latency and overhead | ||
2555 | increases with increasing numbers of CPUs. | ||
2556 | If this becomes a serious problem on large systems, it will be necessary | ||
2557 | to do some redesign to avoid this scalability problem. | ||
2558 | |||
2559 | <p> | ||
2560 | RCU disables CPU hotplug in a few places, perhaps most notably in the | ||
2561 | expedited grace-period and <tt>rcu_barrier()</tt> operations. | ||
2562 | If there is a strong reason to use expedited grace periods in CPU-hotplug | ||
2563 | notifiers, it will be necessary to avoid disabling CPU hotplug. | ||
2564 | This would introduce some complexity, so there had better be a <i>very</i> | ||
2565 | good reason. | ||
2566 | |||
2567 | <p> | ||
2568 | The tradeoff between grace-period latency on the one hand and interruptions | ||
2569 | of other CPUs on the other hand may need to be re-examined. | ||
2570 | The desire is of course for zero grace-period latency as well as zero | ||
2571 | interprocessor interrupts undertaken during an expedited grace period | ||
2572 | operation. | ||
2573 | While this ideal is unlikely to be achievable, it is quite possible that | ||
2574 | further improvements can be made. | ||
2575 | |||
2576 | <p> | ||
2577 | The multiprocessor implementations of RCU use a combining tree that | ||
2578 | groups CPUs so as to reduce lock contention and increase cache locality. | ||
2579 | However, this combining tree does not spread its memory across NUMA | ||
2580 | nodes nor does it align the CPU groups with hardware features such | ||
2581 | as sockets or cores. | ||
2582 | Such spreading and alignment is currently believed to be unnecessary | ||
2583 | because the hotpath read-side primitives do not access the combining | ||
2584 | tree, nor does <tt>call_rcu()</tt> in the common case. | ||
2585 | If you believe that your architecture needs such spreading and alignment, | ||
2586 | then your architecture should also benefit from the | ||
2587 | <tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set | ||
2588 | to the number of CPUs in a socket, NUMA node, or whatever. | ||
2589 | If the number of CPUs is too large, use a fraction of the number of | ||
2590 | CPUs. | ||
2591 | If the number of CPUs is a large prime number, well, that certainly | ||
2592 | is an “interesting” architectural choice! | ||
2593 | More flexible arrangements might be considered, but only if | ||
2594 | <tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only | ||
2595 | if the inadequacy has been demonstrated by a carefully run and | ||
2596 | realistic system-level workload. | ||
2597 | |||
2598 | <p> | ||
2599 | Please note that arrangements that require RCU to remap CPU numbers will | ||
2600 | require extremely good demonstration of need and full exploration of | ||
2601 | alternatives. | ||
2602 | |||
2603 | <p> | ||
2604 | There is an embarrassingly large number of flavors of RCU, and this | ||
2605 | number has been increasing over time. | ||
2606 | Perhaps it will be possible to combine some at some future date. | ||
2607 | |||
2608 | <p> | ||
2609 | RCU's various kthreads are reasonably recent additions. | ||
2610 | It is quite likely that adjustments will be required to more gracefully | ||
2611 | handle extreme loads. | ||
2612 | It might also be necessary to be able to relate CPU utilization by | ||
2613 | RCU's kthreads and softirq handlers to the code that instigated this | ||
2614 | CPU utilization. | ||
2615 | For example, RCU callback overhead might be charged back to the | ||
2616 | originating <tt>call_rcu()</tt> instance, though probably not | ||
2617 | in production kernels. | ||
2618 | |||
2619 | <h2><a name="Summary">Summary</a></h2> | ||
2620 | |||
2621 | <p> | ||
2621 | This document has presented more than two decades' worth of RCU | ||
2623 | requirements. | ||
2624 | Given that the requirements keep changing, this will not be the last | ||
2625 | word on this subject, but at least it serves to get an important | ||
2626 | subset of the requirements set forth. | ||
2627 | |||
2628 | <h2><a name="Acknowledgments">Acknowledgments</a></h2> | ||
2629 | |||
2630 | I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, | ||
2631 | Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and | ||
2632 | Andy Lutomirski for their help in rendering | ||
2633 | this article human readable, and to Michelle Rankin for her support | ||
2634 | of this effort. | ||
2635 | Other contributions are acknowledged in the Linux kernel's git archive. | ||
2636 | The cartoon is copyright (c) 2013 by Melissa Broussard, | ||
2637 | and is provided | ||
2638 | under the terms of the Creative Commons Attribution-Share Alike 3.0 | ||
2639 | United States license. | ||
2640 | |||
2641 | <p>@@QQAL@@ | ||
2642 | |||
2643 | </body></html> | ||
diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh new file mode 100755 index 000000000000..d354f069559b --- /dev/null +++ b/Documentation/RCU/Design/htmlqqz.sh | |||
@@ -0,0 +1,108 @@ | |||
#!/bin/sh
#
# Usage: sh htmlqqz.sh file
#
# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
# Commands, all of which must be on a line by themselves:
#
# "<p>@@QQ@@": Start of a quick quiz.
# "<p>@@QQA@@": Start of a quick-quiz answer.
# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
# "<p>@@QQAL@@": Place to put quick-quiz answer list.
#
# Places the result in file.html.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, you can access it online at
# http://www.gnu.org/licenses/gpl-2.0.html.
#
# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.

# Base name of the document: input is $fn.htmlx, output is $fn.html.
# Quote all expansions so names with spaces or glob characters work.
fn="$1"
if test ! -r "$fn.htmlx"
then
	# Diagnostics go to stderr, matching the awk script below.
	echo "Error: $fn.htmlx unreadable." 1>&2
	exit 1
fi

echo "<!-- DO NOT HAND EDIT. -->" > "$fn.html"
echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> "$fn.html"

# Three-state machine keyed on the first field of each line:
#   ""    - ordinary document text (outside any quick quiz)
#   "qq"  - inside a quick-quiz question (printed inline AND saved)
#   "qqa" - inside a quick-quiz answer (saved only, emitted at @@QQAL@@)
awk < "$fn.htmlx" >> "$fn.html" '

# Ordinary text: copy through; warn about misplaced @@QQ commands.
state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
	print $0;
	if ($0 ~ /^<p>@@QQ/)
		print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
	next;
}

# Start of a quick quiz: number it and emit its anchor.
state == "" && $1 == "<p>@@QQ@@" {
	qqn++;
	qqlineno = NR;
	haveqq = 1;
	state = "qq";
	print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
	next;
}

# Question text: print inline and accumulate for the answer list.
state == "qq" && $1 != "<p>@@QQA@@" {
	qq[qqn] = qq[qqn] $0 "\n";
	print $0
	if ($0 ~ /^<p>@@QQ/)
		print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
	next;
}

# Start of the answer: emit the forward link to the answer list.
state == "qq" && $1 == "<p>@@QQA@@" {
	state = "qqa";
	print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
	next;
}

# Answer text: accumulate only; it is deferred to the @@QQAL@@ list.
state == "qqa" && $1 != "<p>@@QQE@@" {
	qqa[qqn] = qqa[qqn] $0 "\n";
	if ($0 ~ /^<p>@@QQ/)
		print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
	next;
}

# End of the answer (and of the quiz): back to ordinary text.
state == "qqa" && $1 == "<p>@@QQE@@" {
	state = "";
	next;
}

# Answer list: emit every saved question/answer pair with back links.
state == "" && $1 == "<p>@@QQAL@@" {
	haveqq = "";
	print "<h3><a name=\"Answers to Quick Quizzes\">"
	print "Answers to Quick Quizzes</a></h3>"
	print "";
	for (i = 1; i <= qqn; i++) {
		print "<a name=\"qq" i "answer\"></a>"
		print "<p><b>Quick Quiz " i "</b>:"
		print qq[i];
		print "";
		print "</p><p><b>Answer</b>:"
		print qqa[i];
		print "";
		print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>"
		print "";
	}
	next;
}

# Final sanity checks: unterminated quiz, or quizzes with no answer list.
END {
	if (state != "")
		print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
	else if (haveqq)
		print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
}'