34 files changed, 7552 insertions, 287 deletions
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
new file mode 100644
index 000000000000..7496a55e4e7b
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
Binary files differ
diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
new file mode 100644
index 000000000000..4b4014fda770
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
@@ -0,0 +1,374 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->

<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="447.99197"
height="428.19299"
id="svg2"
version="1.1"
inkscape:version="0.48.3.1 r9886"
sodipodi:docname="GPpartitionReaders1.svg">
<defs
id="defs4">
<marker
inkscape:stockid="Arrow2Lend"
orient="auto"
refY="0"
refX="0"
id="Arrow2Lend"
style="overflow:visible">
<path
id="path3792"
style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
inkscape:connector-curvature="0" />
</marker>
<marker
inkscape:stockid="Arrow2Lstart"
orient="auto"
refY="0"
refX="0"
id="Arrow2Lstart"
style="overflow:visible">
<path
id="path3789"
style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
transform="matrix(1.1,0,0,1.1,1.1,0)"
inkscape:connector-curvature="0" />
</marker>
</defs>
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="1.6184291"
inkscape:cx="223.99599"
inkscape:cy="214.0965"
inkscape:document-units="px"
inkscape:current-layer="layer1"
showgrid="false"
inkscape:window-width="979"
inkscape:window-height="836"
inkscape:window-x="571"
inkscape:window-y="335"
inkscape:window-maximized="0"
fit-margin-top="5"
fit-margin-left="5"
fit-margin-right="5"
fit-margin-bottom="5" />
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-28.441125,-185.60612)">
<flowRoot
xml:space="preserve"
id="flowRoot2985"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
height="11.428572"
x="240"
y="492.36218" /></flowRegion><flowPara
id="flowPara2991"></flowPara></flowRoot> <g
id="g4433"
transform="translate(2,0)">
<text
sodipodi:linespacing="125%"
id="text2993"
y="-261.66608"
x="412.12299"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
xml:space="preserve"
transform="matrix(0,1,-1,0,0,0)"><tspan
y="-261.66608"
x="412.12299"
id="tspan2995"
sodipodi:role="line">synchronize_rcu()</tspan></text>
<g
id="g4417"
transform="matrix(0,1,-1,0,730.90257,222.4928)">
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
d="m 97.580736,477.4048 183.140664,0"
id="path2997"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 96.752718,465.38398 0,22.62742"
id="path4397"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 281.54942,465.38397 0,22.62742"
id="path4397-5"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
</g>
</g>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.04738"
y="268.18076"
id="text4429"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431"
x="112.04738"
y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.04738"
y="439.13766"
id="text4441"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4443"
x="112.04738"
y="439.13766">WRITE_ONCE(b, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="255.60869"
y="309.29346"
id="text4445"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4447"
x="255.60869"
y="309.29346">r1 = READ_ONCE(a);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="255.14423"
y="520.61786"
id="text4449"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4451"
x="255.14423"
y="520.61786">WRITE_ONCE(c, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="384.71124"
id="text4453"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4455"
x="396.10254"
y="384.71124">r2 = READ_ONCE(b);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="582.13617"
id="text4457"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4459"
x="396.10254"
y="582.13617">r3 = READ_ONCE(c);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.08231"
y="213.91006"
id="text4461"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4463"
x="112.08231"
y="213.91006">thread0()</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="252.34512"
y="213.91006"
id="text4461-6"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4463-0"
x="252.34512"
y="213.91006">thread1()</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.42557"
y="213.91006"
id="text4461-2"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4463-2"
x="396.42557"
y="213.91006">thread2()</tspan></text>
<rect
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect4495"
width="436.28488"
height="416.4859"
x="34.648232"
y="191.10612" />
<path
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
d="m 183.14066,191.10612 0,417.193 -0.70711,0"
id="path4497"
inkscape:connector-curvature="0" />
<path
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
d="m 325.13867,191.10612 0,417.193 -0.70711,0"
id="path4497-5"
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="111.75929"
y="251.53981"
id="text4429-8"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9"
x="111.75929"
y="251.53981">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="367.91556"
id="text4429-8-9"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9-4"
x="396.10254"
y="367.91556">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="597.40289"
id="text4429-8-9-3"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9-4-4"
x="396.10254"
y="597.40289">rcu_read_unlock();</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="111.75929"
y="453.15311"
id="text4429-8-9-3-1"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9-4-4-6"
x="111.75929"
y="453.15311">rcu_read_unlock();</tspan></text>
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 33.941125,227.87568 436.284885,0 0,0.7071"
id="path4608"
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="394.94427"
y="345.66351"
id="text4648"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650"
x="394.94427"
y="345.66351">QS</tspan></text>
<path
sodipodi:type="arc"
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(36.441125,199.60612)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.11968"
y="475.77856"
id="text4648-4"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650-4"
x="112.11968"
y="475.77856">QS</tspan></text>
<path
sodipodi:type="arc"
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652-7"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(-246.38346,329.72117)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<path
sodipodi:type="arc"
style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652-7-7"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(-103.65246,202.90878)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="254.85066"
y="348.96619"
id="text4648-4-3"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650-4-5"
x="254.85066"
y="348.96619">QS</tspan></text>
</g>
</svg>
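
The figure above is easier to check when the operations in its text elements are collected into code. The following three-thread litmus test is a minimal sketch assembled from the figure's labels; the variable declarations and function scaffolding are assumptions, since only the individual statements appear in the figure itself:

	/*
	 * Sketch (not part of the patch) of the litmus test that
	 * GPpartitionReaders1.svg depicts, assembled from the figure's
	 * text elements.  Declarations are assumptions.
	 */
	int a, b, c;
	int r1, r2, r3;

	void thread0(void)
	{
		rcu_read_lock();
		WRITE_ONCE(a, 1);
		WRITE_ONCE(b, 1);	/* Both stores in one read-side critical section. */
		rcu_read_unlock();
	}

	void thread1(void)
	{
		r1 = READ_ONCE(a);
		synchronize_rcu();	/* The grace period drawn as the vertical arrow. */
		WRITE_ONCE(c, 1);
	}

	void thread2(void)
	{
		rcu_read_lock();
		r2 = READ_ONCE(b);
		r3 = READ_ONCE(c);
		rcu_read_unlock();
	}

The QS circles mark quiescent states. Assuming the usual RCU grace-period guarantee, the grace period partitions the two read-side critical sections, so the outcome r1 == 1 && r2 == 0 && r3 == 1 should be impossible.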
diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
new file mode 100644
index 000000000000..ebcbeee391ed
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
@@ -0,0 +1,237 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Creator: fig2dev Version 3.2 Patchlevel 5d -->

<!-- CreationDate: Tue Mar 4 18:34:25 2014 -->

<!-- Magnification: 3.000 -->

<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="1089.1382"
height="668.21368"
viewBox="-2121 -36 14554.634 8876.4061"
id="svg2"
version="1.1"
inkscape:version="0.48.3.1 r9886"
sodipodi:docname="RCUApplicability.svg">
<metadata
id="metadata40">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
<defs
id="defs38" />
<sodipodi:namedview
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1"
objecttolerance="10"
gridtolerance="10"
guidetolerance="10"
inkscape:pageopacity="0"
inkscape:pageshadow="2"
inkscape:window-width="849"
inkscape:window-height="639"
id="namedview36"
showgrid="false"
inkscape:zoom="0.51326165"
inkscape:cx="544.56912"
inkscape:cy="334.10686"
inkscape:window-x="149"
inkscape:window-y="448"
inkscape:window-maximized="0"
inkscape:current-layer="g4"
fit-margin-top="5"
fit-margin-left="5"
fit-margin-right="5"
fit-margin-bottom="5" />
<g
style="fill:none;stroke-width:0.025in"
id="g4"
transform="translate(-2043.6828,14.791398)">
<!-- Line: box -->
<rect
x="0"
y="0"
width="14400"
height="8775"
rx="0"
style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
id="rect6" />
<!-- Line: box -->
<rect
x="1350"
y="0"
width="11700"
height="6075"
rx="0"
style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
id="rect8" />
<!-- Line: box -->
<rect
x="2700"
y="0"
width="9000"
height="4275"
rx="0"
style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
id="rect10" />
<!-- Line: box -->
<rect
x="4050"
y="0"
width="6300"
height="2475"
rx="0"
style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
id="rect12" />
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="900"
font-style="normal"
font-weight="normal"
font-size="324"
id="text14"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3017">Read-Mostly, Stale &amp;</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="1350"
font-style="normal"
font-weight="normal"
font-size="324"
id="text16"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3019">Inconsistent Data OK</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="1800"
font-style="normal"
font-weight="normal"
font-size="324"
id="text18"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3021">(RCU Works Great!!!)</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="3825"
font-style="normal"
font-weight="normal"
font-size="324"
id="text20"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3023">(RCU Works Well)</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="3375"
font-style="normal"
font-weight="normal"
font-size="324"
id="text22"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="5175"
font-style="normal"
font-weight="normal"
font-size="324"
id="text24"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3027">Read-Write, Need Consistent Data</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="6975"
font-style="normal"
font-weight="normal"
font-size="324"
id="text26"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="5625"
font-style="normal"
font-weight="normal"
font-size="324"
id="text28"
sodipodi:linespacing="125%"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
id="tspan3029">(RCU Might Be OK...)</tspan></text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="7875"
font-style="normal"
font-weight="normal"
font-size="324"
id="text30"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="8325"
font-style="normal"
font-weight="normal"
font-size="324"
id="text32"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text>
<!-- Text -->
<text
xml:space="preserve"
x="7200"
y="7425"
font-style="normal"
font-weight="normal"
font-size="324"
id="text34"
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text>
</g>
</svg>
diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
new file mode 100644
index 000000000000..48cd1623d4d4
--- /dev/null
+++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
@@ -0,0 +1,639 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->

<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="735.25"
height="516.21875"
id="svg2"
version="1.1"
inkscape:version="0.48.3.1 r9886"
sodipodi:docname="ReadersPartitionGP1.svg">
<defs
id="defs4">
<marker
inkscape:stockid="Arrow2Lend"
orient="auto"
refY="0"
refX="0"
id="Arrow2Lend"
style="overflow:visible">
<path
id="path3792"
style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
inkscape:connector-curvature="0" />
</marker>
<marker
inkscape:stockid="Arrow2Lstart"
orient="auto"
refY="0"
refX="0"
id="Arrow2Lstart"
style="overflow:visible">
<path
id="path3789"
style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
transform="matrix(1.1,0,0,1.1,1.1,0)"
inkscape:connector-curvature="0" />
</marker>
<marker
inkscape:stockid="Arrow2Lstart"
orient="auto"
refY="0"
refX="0"
id="Arrow2Lstart-4"
style="overflow:visible">
<path
id="path3789-9"
style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
transform="matrix(1.1,0,0,1.1,1.1,0)"
inkscape:connector-curvature="0" />
</marker>
<marker
inkscape:stockid="Arrow2Lend"
orient="auto"
refY="0"
refX="0"
id="Arrow2Lend-4"
style="overflow:visible">
<path
id="path3792-4"
style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
inkscape:connector-curvature="0" />
</marker>
</defs>
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="1.3670394"
inkscape:cx="367.26465"
inkscape:cy="258.46182"
inkscape:document-units="px"
inkscape:current-layer="g4433-6"
showgrid="false"
inkscape:window-width="1351"
inkscape:window-height="836"
inkscape:window-x="438"
inkscape:window-y="335"
inkscape:window-maximized="0"
fit-margin-top="5"
fit-margin-left="5"
fit-margin-right="5"
fit-margin-bottom="5" />
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-29.15625,-185.59375)">
<flowRoot
xml:space="preserve"
id="flowRoot2985"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
height="11.428572"
x="240"
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <g
id="g4433"
transform="translate(2,-12)">
<text
sodipodi:linespacing="125%"
id="text2993"
y="-261.66608"
x="436.12299"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
xml:space="preserve"
transform="matrix(0,1,-1,0,0,0)"><tspan
y="-261.66608"
x="436.12299"
id="tspan2995"
sodipodi:role="line">synchronize_rcu()</tspan></text>
<g
id="g4417"
transform="matrix(0,1,-1,0,730.90257,222.4928)">
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
d="M 97.580736,477.4048 327.57913,476.09759"
id="path2997"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 96.752718,465.38398 0,22.62742"
id="path4397"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 328.40703,465.38397 0,22.62742"
id="path4397-5"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
</g>
</g>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.04738"
y="268.18076"
id="text4429"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431"
x="112.04738"
y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.04738"
y="487.13766"
id="text4441"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4443"
x="112.04738"
y="487.13766">WRITE_ONCE(b, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="255.60869"
y="297.29346"
id="text4445"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4447"
x="255.60869"
y="297.29346">r1 = READ_ONCE(a);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="255.14423"
y="554.61786"
id="text4449"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4451"
x="255.14423"
y="554.61786">WRITE_ONCE(c, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="370.71124"
id="text4453"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4455"
x="396.10254"
y="370.71124">WRITE_ONCE(d, 1);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="572.13617"
id="text4457"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4459"
x="396.10254"
y="572.13617">r2 = READ_ONCE(c);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.08231"
y="213.91006"
id="text4461"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4463"
x="112.08231"
y="213.91006">thread0()</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="252.34512"
y="213.91006"
id="text4461-6"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4463-0"
x="252.34512"
y="213.91006">thread1()</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.42557"
y="213.91006"
id="text4461-2"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4463-2"
x="396.42557"
y="213.91006">thread2()</tspan></text>
<rect
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect4495"
width="724.25244"
height="505.21201"
x="34.648232"
y="191.10612" />
<path
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
d="m 183.14066,191.10612 0,504.24243"
id="path4497"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
d="m 325.13867,191.10612 0,504.24243"
id="path4497-5"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="111.75929"
y="251.53981"
id="text4429-8"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9"
x="111.75929"
y="251.53981">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="353.91556"
id="text4429-8-9"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9-4"
x="396.10254"
y="353.91556">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="396.10254"
y="587.40289"
id="text4429-8-9-3"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9-4-4"
x="396.10254"
y="587.40289">rcu_read_unlock();</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="111.75929"
y="501.15311"
id="text4429-8-9-3-1"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4431-9-4-4-6"
x="111.75929"
y="501.15311">rcu_read_unlock();</tspan></text>
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 33.941125,227.87568 724.941765,0"
id="path4608"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="394.94427"
y="331.66351"
id="text4648"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650"
x="394.94427"
y="331.66351">QS</tspan></text>
<path
sodipodi:type="arc"
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(36.441125,185.60612)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="112.11968"
y="523.77856"
id="text4648-4"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650-4"
x="112.11968"
y="523.77856">QS</tspan></text>
<path
sodipodi:type="arc"
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652-7"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(-246.38346,377.72117)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<path
sodipodi:type="arc"
style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652-7-7"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(-103.65246,190.90878)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="254.85066"
y="336.96619"
id="text4648-4-3"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650-4-5"
x="254.85066"
y="336.96619">QS</tspan></text>
<path
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
d="m 470.93311,190.39903 0,504.24243"
id="path4497-5-6"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
d="m 616.22755,190.38323 0,504.24243"
id="path4497-5-2"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<g
id="g4433-6"
transform="translate(288.0964,78.32827)">
<text
sodipodi:linespacing="125%"
id="text2993-7"
y="-261.66608"
x="440.12299"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
xml:space="preserve"
transform="matrix(0,1,-1,0,0,0)"><tspan
y="-261.66608"
x="440.12299"
id="tspan2995-1"
sodipodi:role="line">synchronize_rcu()</tspan></text>
<g
id="g4417-1"
transform="matrix(0,1,-1,0,730.90257,222.4928)">
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
d="M 97.580736,477.4048 328.5624,477.07246"
id="path2997-2"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 96.752718,465.38398 0,22.62742"
id="path4397-3"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 329.39039,465.38397 0,22.62742"
id="path4397-5-4"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
</g>
</g>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="541.70508"
y="387.6217"
id="text4445-0"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4447-5"
x="541.70508"
y="387.6217">r3 = READ_ONCE(d);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="541.2406"
y="646.94611"
id="text4449-6"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4451-6"
x="541.2406"
y="646.94611">WRITE_ONCE(e, 1);</tspan></text>
<path
sodipodi:type="arc"
style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="path4652-7-7-5"
sodipodi:cx="358.85669"
sodipodi:cy="142.87541"
sodipodi:rx="10.960155"
sodipodi:ry="10.253048"
d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
transform="translate(182.44393,281.23704)"
sodipodi:start="4.7135481"
sodipodi:end="10.994651"
sodipodi:open="true" />
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="540.94702"
y="427.29443"
id="text4648-4-3-1"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4650-4-5-7"
x="540.94702"
y="427.29443">QS</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="686.27747"
y="461.83929"
id="text4453-7"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4455-1"
x="686.27747"
y="461.83929">r4 = READ_ONCE(b);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
x="686.27747"
y="669.26422"
id="text4457-9"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4459-2"
x="686.27747"
y="669.26422">r5 = READ_ONCE(e);</tspan></text>
<text
xml:space="preserve"
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
| 525 | x="686.27747" | ||
| 526 | y="445.04358" | ||
| 527 | id="text4429-8-9-33" | ||
| 528 | sodipodi:linespacing="125%"><tspan | ||
| 529 | sodipodi:role="line" | ||
| 530 | id="tspan4431-9-4-2" | ||
| 531 | x="686.27747" | ||
| 532 | y="445.04358">rcu_read_lock();</tspan></text> | ||
| 533 | <text | ||
| 534 | xml:space="preserve" | ||
| 535 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
| 536 | x="686.27747" | ||
| 537 | y="684.53094" | ||
| 538 | id="text4429-8-9-3-8" | ||
| 539 | sodipodi:linespacing="125%"><tspan | ||
| 540 | sodipodi:role="line" | ||
| 541 | id="tspan4431-9-4-4-5" | ||
| 542 | x="686.27747" | ||
| 543 | y="684.53094">rcu_read_unlock();</tspan></text> | ||
| 544 | <text | ||
| 545 | xml:space="preserve" | ||
| 546 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
| 547 | x="685.11914" | ||
| 548 | y="422.79153" | ||
| 549 | id="text4648-9" | ||
| 550 | sodipodi:linespacing="125%"><tspan | ||
| 551 | sodipodi:role="line" | ||
| 552 | id="tspan4650-7" | ||
| 553 | x="685.11914" | ||
| 554 | y="422.79153">QS</tspan></text> | ||
| 555 | <path | ||
| 556 | sodipodi:type="arc" | ||
| 557 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
| 558 | id="path4652-8" | ||
| 559 | sodipodi:cx="358.85669" | ||
| 560 | sodipodi:cy="142.87541" | ||
| 561 | sodipodi:rx="10.960155" | ||
| 562 | sodipodi:ry="10.253048" | ||
| 563 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
| 564 | transform="translate(326.61602,276.73415)" | ||
| 565 | sodipodi:start="4.7135481" | ||
| 566 | sodipodi:end="10.994651" | ||
| 567 | sodipodi:open="true" /> | ||
| 568 | <text | ||
| 569 | xml:space="preserve" | ||
| 570 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
| 571 | x="397.85934" | ||
| 572 | y="609.59003" | ||
| 573 | id="text4648-5" | ||
| 574 | sodipodi:linespacing="125%"><tspan | ||
| 575 | sodipodi:role="line" | ||
| 576 | id="tspan4650-77" | ||
| 577 | x="397.85934" | ||
| 578 | y="609.59003">QS</tspan></text> | ||
| 579 | <path | ||
| 580 | sodipodi:type="arc" | ||
| 581 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
| 582 | id="path4652-80" | ||
| 583 | sodipodi:cx="358.85669" | ||
| 584 | sodipodi:cy="142.87541" | ||
| 585 | sodipodi:rx="10.960155" | ||
| 586 | sodipodi:ry="10.253048" | ||
| 587 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
| 588 | transform="translate(39.356201,463.53264)" | ||
| 589 | sodipodi:start="4.7135481" | ||
| 590 | sodipodi:end="10.994651" | ||
| 591 | sodipodi:open="true" /> | ||
| 592 | <text | ||
| 593 | xml:space="preserve" | ||
| 594 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
| 595 | x="256.75986" | ||
| 596 | y="586.99133" | ||
| 597 | id="text4648-5-2" | ||
| 598 | sodipodi:linespacing="125%"><tspan | ||
| 599 | sodipodi:role="line" | ||
| 600 | id="tspan4650-77-7" | ||
| 601 | x="256.75986" | ||
| 602 | y="586.99133">QS</tspan></text> | ||
| 603 | <path | ||
| 604 | sodipodi:type="arc" | ||
| 605 | style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" | ||
| 606 | id="path4652-80-5" | ||
| 607 | sodipodi:cx="358.85669" | ||
| 608 | sodipodi:cy="142.87541" | ||
| 609 | sodipodi:rx="10.960155" | ||
| 610 | sodipodi:ry="10.253048" | ||
| 611 | d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" | ||
| 612 | transform="translate(-101.74328,440.93395)" | ||
| 613 | sodipodi:start="4.7135481" | ||
| 614 | sodipodi:end="10.994651" | ||
| 615 | sodipodi:open="true" /> | ||
| 616 | <text | ||
| 617 | xml:space="preserve" | ||
| 618 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
| 619 | x="546.22791" | ||
| 620 | y="213.91006" | ||
| 621 | id="text4461-2-5" | ||
| 622 | sodipodi:linespacing="125%"><tspan | ||
| 623 | sodipodi:role="line" | ||
| 624 | id="tspan4463-2-6" | ||
| 625 | x="546.22791" | ||
| 626 | y="213.91006">thread3()</tspan></text> | ||
| 627 | <text | ||
| 628 | xml:space="preserve" | ||
| 629 | style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" | ||
| 630 | x="684.00067" | ||
| 631 | y="213.91006" | ||
| 632 | id="text4461-2-1" | ||
| 633 | sodipodi:linespacing="125%"><tspan | ||
| 634 | sodipodi:role="line" | ||
| 635 | id="tspan4463-2-0" | ||
| 636 | x="684.00067" | ||
| 637 | y="213.91006">thread4()</tspan></text> | ||
| 638 | </g> | ||
| 639 | </svg> | ||
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html new file mode 100644 index 000000000000..a725f9900ec8 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.html | |||
| @@ -0,0 +1,2897 @@ | |||
| 1 | <!-- DO NOT HAND EDIT. --> | ||
| 2 | <!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' --> | ||
| 3 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" | ||
| 4 | "http://www.w3.org/TR/html4/loose.dtd"> | ||
| 5 | <html> | ||
| 6 | <head><title>A Tour Through RCU's Requirements [LWN.net]</title> | ||
| 7 | <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> | ||
| 8 | |||
| 9 | <h1>A Tour Through RCU's Requirements</h1> | ||
| 10 | |||
| 11 | <p>Copyright IBM Corporation, 2015</p> | ||
| 12 | <p>Author: Paul E. McKenney</p> | ||
| 13 | <p><i>The initial version of this document appeared in the | ||
| 14 | <a href="https://lwn.net/">LWN</a> articles | ||
| 15 | <a href="https://lwn.net/Articles/652156/">here</a>, | ||
| 16 | <a href="https://lwn.net/Articles/652677/">here</a>, and | ||
| 17 | <a href="https://lwn.net/Articles/653326/">here</a>.</i></p> | ||
| 18 | |||
| 19 | <h2>Introduction</h2> | ||
| 20 | |||
| 21 | <p> | ||
| 22 | Read-copy update (RCU) is a synchronization mechanism that is often | ||
| 23 | used as a replacement for reader-writer locking. | ||
| 24 | RCU is unusual in that updaters do not block readers, | ||
| 25 | which means that RCU's read-side primitives can be exceedingly fast | ||
| 26 | and scalable. | ||
| 27 | In addition, updaters can make useful forward progress concurrently | ||
| 28 | with readers. | ||
| 29 | However, all this concurrency between RCU readers and updaters does raise | ||
| 30 | the question of exactly what RCU readers are doing, which in turn | ||
| 31 | raises the question of exactly what RCU's requirements are. | ||
| 32 | |||
| 33 | <p> | ||
| 34 | This document therefore summarizes RCU's requirements, and can be thought | ||
| 35 | of as an informal, high-level specification for RCU. | ||
| 36 | It is important to understand that RCU's specification is primarily | ||
| 37 | empirical in nature; | ||
| 38 | in fact, I learned about many of these requirements the hard way. | ||
| 39 | This situation might cause some consternation. However, not only | ||
| 40 | has this learning process been a lot of fun, but it has also been | ||
| 41 | a great privilege to work with so many people willing to apply | ||
| 42 | technologies in interesting new ways. | ||
| 43 | |||
| 44 | <p> | ||
| 45 | All that aside, here are the categories of currently known RCU requirements: | ||
| 46 | </p> | ||
| 47 | |||
| 48 | <ol> | ||
| 49 | <li> <a href="#Fundamental Requirements"> | ||
| 50 | Fundamental Requirements</a> | ||
| 51 | <li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> | ||
| 52 | <li> <a href="#Parallelism Facts of Life"> | ||
| 53 | Parallelism Facts of Life</a> | ||
| 54 | <li> <a href="#Quality-of-Implementation Requirements"> | ||
| 55 | Quality-of-Implementation Requirements</a> | ||
| 56 | <li> <a href="#Linux Kernel Complications"> | ||
| 57 | Linux Kernel Complications</a> | ||
| 58 | <li> <a href="#Software-Engineering Requirements"> | ||
| 59 | Software-Engineering Requirements</a> | ||
| 60 | <li> <a href="#Other RCU Flavors"> | ||
| 61 | Other RCU Flavors</a> | ||
| 62 | <li> <a href="#Possible Future Changes"> | ||
| 63 | Possible Future Changes</a> | ||
| 64 | </ol> | ||
| 65 | |||
| 66 | <p> | ||
| 67 | This is followed by a <a href="#Summary">summary</a>, | ||
| 68 | which is in turn followed by the inevitable | ||
| 69 | <a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. | ||
| 70 | |||
| 71 | <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> | ||
| 72 | |||
| 73 | <p> | ||
| 74 | RCU's fundamental requirements are the closest thing RCU has to hard | ||
| 75 | mathematical requirements. | ||
| 76 | These are: | ||
| 77 | |||
| 78 | <ol> | ||
| 79 | <li> <a href="#Grace-Period Guarantee"> | ||
| 80 | Grace-Period Guarantee</a> | ||
| 81 | <li> <a href="#Publish-Subscribe Guarantee"> | ||
| 82 | Publish-Subscribe Guarantee</a> | ||
| 83 | <li> <a href="#Memory-Barrier Guarantees"> | ||
| 84 | Memory-Barrier Guarantees</a> | ||
| 85 | <li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> | ||
| 86 | RCU Primitives Guaranteed to Execute Unconditionally</a> | ||
| 87 | <li> <a href="#Guaranteed Read-to-Write Upgrade"> | ||
| 88 | Guaranteed Read-to-Write Upgrade</a> | ||
| 89 | </ol> | ||
| 90 | |||
| 91 | <h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> | ||
| 92 | |||
| 93 | <p> | ||
| 94 | RCU's grace-period guarantee is unusual in being premeditated: | ||
| 95 | Jack Slingwine and I had this guarantee firmly in mind when we started | ||
| 96 | work on RCU (then called “rclock”) in the early 1990s. | ||
| 97 | That said, the past two decades of experience with RCU have produced | ||
| 98 | a much more detailed understanding of this guarantee. | ||
| 99 | |||
| 100 | <p> | ||
| 101 | RCU's grace-period guarantee allows updaters to wait for the completion | ||
| 102 | of all pre-existing RCU read-side critical sections. | ||
| 103 | An RCU read-side critical section | ||
| 104 | begins with the marker <tt>rcu_read_lock()</tt> and ends with | ||
| 105 | the marker <tt>rcu_read_unlock()</tt>. | ||
| 106 | These markers may be nested, and RCU treats a nested set as one | ||
| 107 | big RCU read-side critical section. | ||
| 108 | Production-quality implementations of <tt>rcu_read_lock()</tt> and | ||
| 109 | <tt>rcu_read_unlock()</tt> are extremely lightweight, and in | ||
| 110 | fact have exactly zero overhead in Linux kernels built for production | ||
| 111 | use with <tt>CONFIG_PREEMPT=n</tt>. | ||
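| | |||
| | <p> | ||
| | To see why the zero-overhead claim is plausible, consider the | ||
| | following conceptual sketch, which is an approximation rather than | ||
| | the actual source: the production implementations add lockdep | ||
| | checks and, for preemptible kernels, considerably more machinery: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 static inline void rcu_read_lock(void) | ||
| |  2 { | ||
| |  3   preempt_disable(); /* Generates no code if CONFIG_PREEMPT=n. */ | ||
| |  4 } | ||
| |  5 | ||
| |  6 static inline void rcu_read_unlock(void) | ||
| |  7 { | ||
| |  8   preempt_enable(); /* Likewise. */ | ||
| |  9 } | ||
| | </pre> | ||
| | </blockquote> | ||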
| 112 | |||
| 113 | <p> | ||
| 114 | This guarantee allows ordering to be enforced with extremely low | ||
| 115 | overhead to readers, for example: | ||
| 116 | |||
| 117 | <blockquote> | ||
| 118 | <pre> | ||
| 119 | 1 int x, y; | ||
| 120 | 2 | ||
| 121 | 3 void thread0(void) | ||
| 122 | 4 { | ||
| 123 | 5 rcu_read_lock(); | ||
| 124 | 6 r1 = READ_ONCE(x); | ||
| 125 | 7 r2 = READ_ONCE(y); | ||
| 126 | 8 rcu_read_unlock(); | ||
| 127 | 9 } | ||
| 128 | 10 | ||
| 129 | 11 void thread1(void) | ||
| 130 | 12 { | ||
| 131 | 13 WRITE_ONCE(x, 1); | ||
| 132 | 14 synchronize_rcu(); | ||
| 133 | 15 WRITE_ONCE(y, 1); | ||
| 134 | 16 } | ||
| 135 | </pre> | ||
| 136 | </blockquote> | ||
| 137 | |||
| 138 | <p> | ||
| 139 | Because the <tt>synchronize_rcu()</tt> on line 14 waits for | ||
| 140 | all pre-existing readers, any instance of <tt>thread0()</tt> that | ||
| 141 | loads a value of zero from <tt>x</tt> must complete before | ||
| 142 | <tt>thread1()</tt> stores to <tt>y</tt>, so that instance must | ||
| 143 | also load a value of zero from <tt>y</tt>. | ||
| 144 | Similarly, any instance of <tt>thread0()</tt> that loads a value of | ||
| 145 | one from <tt>y</tt> must have started after the | ||
| 146 | <tt>synchronize_rcu()</tt> started, and must therefore also load | ||
| 147 | a value of one from <tt>x</tt>. | ||
| 148 | Therefore, the outcome: | ||
| 149 | <blockquote> | ||
| 150 | <pre> | ||
| 151 | (r1 == 0 && r2 == 1) | ||
| 152 | </pre> | ||
| 153 | </blockquote> | ||
| 154 | cannot happen. | ||
| 155 | |||
| 156 | <p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> | ||
| 157 | Wait a minute! | ||
| 158 | You said that updaters can make useful forward progress concurrently | ||
| 159 | with readers, but pre-existing readers will block | ||
| 160 | <tt>synchronize_rcu()</tt>!!! | ||
| 161 | Just who are you trying to fool??? | ||
| 162 | <br><a href="#qq1answer">Answer</a> | ||
| 163 | |||
| 164 | <p> | ||
| 165 | This scenario resembles one of the first uses of RCU in | ||
| 166 | <a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, | ||
| 167 | which managed a distributed lock manager's transition into | ||
| 168 | a state suitable for handling recovery from node failure, | ||
| 169 | more or less as follows: | ||
| 170 | |||
| 171 | <blockquote> | ||
| 172 | <pre> | ||
| 173 | 1 #define STATE_NORMAL 0 | ||
| 174 | 2 #define STATE_WANT_RECOVERY 1 | ||
| 175 | 3 #define STATE_RECOVERING 2 | ||
| 176 | 4 #define STATE_WANT_NORMAL 3 | ||
| 177 | 5 | ||
| 178 | 6 int state = STATE_NORMAL; | ||
| 179 | 7 | ||
| 180 | 8 void do_something_dlm(void) | ||
| 181 | 9 { | ||
| 182 | 10 int state_snap; | ||
| 183 | 11 | ||
| 184 | 12 rcu_read_lock(); | ||
| 185 | 13 state_snap = READ_ONCE(state); | ||
| 186 | 14 if (state_snap == STATE_NORMAL) | ||
| 187 | 15 do_something(); | ||
| 188 | 16 else | ||
| 189 | 17 do_something_carefully(); | ||
| 190 | 18 rcu_read_unlock(); | ||
| 191 | 19 } | ||
| 192 | 20 | ||
| 193 | 21 void start_recovery(void) | ||
| 194 | 22 { | ||
| 195 | 23 WRITE_ONCE(state, STATE_WANT_RECOVERY); | ||
| 196 | 24 synchronize_rcu(); | ||
| 197 | 25 WRITE_ONCE(state, STATE_RECOVERING); | ||
| 198 | 26 recovery(); | ||
| 199 | 27 WRITE_ONCE(state, STATE_WANT_NORMAL); | ||
| 200 | 28 synchronize_rcu(); | ||
| 201 | 29 WRITE_ONCE(state, STATE_NORMAL); | ||
| 202 | 30 } | ||
| 203 | </pre> | ||
| 204 | </blockquote> | ||
| 205 | |||
| 206 | <p> | ||
| 207 | The RCU read-side critical section in <tt>do_something_dlm()</tt> | ||
| 208 | works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> | ||
| 209 | to guarantee that <tt>do_something()</tt> never runs concurrently | ||
| 210 | with <tt>recovery()</tt>, but with little or no synchronization | ||
| 211 | overhead in <tt>do_something_dlm()</tt>. | ||
| 212 | |||
| 213 | <p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> | ||
| 214 | Why is the <tt>synchronize_rcu()</tt> on line 28 needed? | ||
| 215 | <br><a href="#qq2answer">Answer</a> | ||
| 216 | |||
| 217 | <p> | ||
| 218 | In order to avoid fatal problems such as deadlocks, | ||
| 219 | an RCU read-side critical section must not contain calls to | ||
| 220 | <tt>synchronize_rcu()</tt>. | ||
| 221 | Similarly, an RCU read-side critical section must not | ||
| 222 | contain anything that waits, directly or indirectly, on completion of | ||
| 223 | an invocation of <tt>synchronize_rcu()</tt>. | ||
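| | |||
| | <p> | ||
| | For example, the following sketch (the function name is made up for | ||
| | illustration) is forbidden because its <tt>synchronize_rcu()</tt> | ||
| | would wait on the very RCU read-side critical section enclosing it: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 void buggy_self_deadlock(void) | ||
| |  2 { | ||
| |  3   rcu_read_lock(); | ||
| |  4   synchronize_rcu(); /* BUG: waits on the enclosing critical section. */ | ||
| |  5   rcu_read_unlock(); | ||
| |  6 } | ||
| | </pre> | ||
| | </blockquote> | ||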
| 224 | |||
| 225 | <p> | ||
| 226 | Although RCU's grace-period guarantee is useful in and of itself, with | ||
| 227 | <a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, | ||
| 228 | it would be good to be able to use RCU to coordinate read-side | ||
| 229 | access to linked data structures. | ||
| 230 | For this, the grace-period guarantee is not sufficient, as can | ||
| 231 | be seen in function <tt>add_gp_buggy()</tt> below. | ||
| 232 | We will look at the reader's code later, but in the meantime, just think of | ||
| 233 | the reader as locklessly picking up the <tt>gp</tt> pointer, | ||
| 234 | and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the | ||
| 235 | <tt>->a</tt> and <tt>->b</tt> fields. | ||
| 236 | |||
| 237 | <blockquote> | ||
| 238 | <pre> | ||
| 239 | 1 bool add_gp_buggy(int a, int b) | ||
| 240 | 2 { | ||
| 241 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
| 242 | 4 if (!p) | ||
| 243 | 5 return false; | ||
| 244 | 6 spin_lock(&gp_lock); | ||
| 245 | 7 if (rcu_access_pointer(gp)) { | ||
| 246 | 8 spin_unlock(&gp_lock); | ||
| 247 | 9 return false; | ||
| 248 | 10 } | ||
| 249 | 11 p->a = a; | ||
| 250 | 12 p->b = b; | ||
| 251 | 13 gp = p; /* ORDERING BUG */ | ||
| 252 | 14 spin_unlock(&gp_lock); | ||
| 253 | 15 return true; | ||
| 254 | 16 } | ||
| 255 | </pre> | ||
| 256 | </blockquote> | ||
| 257 | |||
| 258 | <p> | ||
| 259 | The problem is that both the compiler and weakly ordered CPUs are within | ||
| 260 | their rights to reorder this code as follows: | ||
| 261 | |||
| 262 | <blockquote> | ||
| 263 | <pre> | ||
| 264 | 1 bool add_gp_buggy_optimized(int a, int b) | ||
| 265 | 2 { | ||
| 266 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
| 267 | 4 if (!p) | ||
| 268 | 5 return false; | ||
| 269 | 6 spin_lock(&gp_lock); | ||
| 270 | 7 if (rcu_access_pointer(gp)) { | ||
| 271 | 8 spin_unlock(&gp_lock); | ||
| 272 | 9 return false; | ||
| 273 | 10 } | ||
| 274 | <b>11 gp = p; /* ORDERING BUG */ | ||
| 275 | 12 p->a = a; | ||
| 276 | 13 p->b = b;</b> | ||
| 277 | 14 spin_unlock(&gp_lock); | ||
| 278 | 15 return true; | ||
| 279 | 16 } | ||
| 280 | </pre> | ||
| 281 | </blockquote> | ||
| 282 | |||
| 283 | <p> | ||
| 284 | If an RCU reader fetches <tt>gp</tt> just after | ||
| 285 | <tt>add_gp_buggy_optimized</tt> executes line 11, | ||
| 286 | it will see garbage in the <tt>->a</tt> and <tt>->b</tt> | ||
| 287 | fields. | ||
| 288 | And this is but one of many ways in which compiler and hardware optimizations | ||
| 289 | could cause trouble. | ||
| 290 | Therefore, we clearly need some way to prevent the compiler and the CPU from | ||
| 291 | reordering in this manner, which brings us to the publish-subscribe | ||
| 292 | guarantee discussed in the next section. | ||
| 293 | |||
| 294 | <h3><a name="Publish-Subscribe Guarantee">Publish-Subscribe Guarantee</a></h3> | ||
| 295 | |||
| 296 | <p> | ||
| 297 | RCU's publish-subscribe guarantee allows data to be inserted | ||
| 298 | into a linked data structure without disrupting RCU readers. | ||
| 299 | The updater uses <tt>rcu_assign_pointer()</tt> to insert the | ||
| 300 | new data, and readers use <tt>rcu_dereference()</tt> to | ||
| 301 | access data, whether new or old. | ||
| 302 | The following shows an example of insertion: | ||
| 303 | |||
| 304 | <blockquote> | ||
| 305 | <pre> | ||
| 306 | 1 bool add_gp(int a, int b) | ||
| 307 | 2 { | ||
| 308 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
| 309 | 4 if (!p) | ||
| 310 | 5 return false; | ||
| 311 | 6 spin_lock(&gp_lock); | ||
| 312 | 7 if (rcu_access_pointer(gp)) { | ||
| 313 | 8 spin_unlock(&gp_lock); | ||
| 314 | 9 return false; | ||
| 315 | 10 } | ||
| 316 | 11 p->a = a; | ||
| 317 | 12 p->b = b; | ||
| 318 | 13 rcu_assign_pointer(gp, p); | ||
| 319 | 14 spin_unlock(&gp_lock); | ||
| 320 | 15 return true; | ||
| 321 | 16 } | ||
| 322 | </pre> | ||
| 323 | </blockquote> | ||
| 324 | |||
| 325 | <p> | ||
| 326 | The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually | ||
| 327 | equivalent to a simple assignment statement, but also guarantees | ||
| 328 | that its assignment will | ||
| 329 | happen after the two assignments in lines 11 and 12, | ||
| 330 | similar to the C11 <tt>memory_order_release</tt> store operation. | ||
| 331 | It also prevents any number of “interesting” compiler | ||
| 332 | optimizations, for example, the use of <tt>gp</tt> as a scratch | ||
| 333 | location immediately preceding the assignment. | ||
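| | |||
| | <p> | ||
| | For example, given a plain assignment, the compiler would be within | ||
| | its rights to emit something along the following (hypothetical) | ||
| | lines, pressing <tt>gp</tt> into service as temporary storage and | ||
| | thereby exposing readers to an uninitialized structure: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 gp = kmalloc(sizeof(*p), GFP_KERNEL); /* gp used as scratch for p. */ | ||
| |  2 gp->a = a; /* Concurrent readers can already see gp!!! */ | ||
| |  3 gp->b = b; | ||
| | </pre> | ||
| | </blockquote> | ||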
| 334 | |||
| 335 | <p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> | ||
| 336 | But <tt>rcu_assign_pointer()</tt> does nothing to prevent the | ||
| 337 | two assignments to <tt>p->a</tt> and <tt>p->b</tt> | ||
| 338 | from being reordered. | ||
| 339 | Can't that also cause problems? | ||
| 340 | <br><a href="#qq3answer">Answer</a> | ||
| 341 | |||
| 342 | <p> | ||
| 343 | It is tempting to assume that the reader need not do anything special | ||
| 344 | to control its accesses to the RCU-protected data, | ||
| 345 | as shown in <tt>do_something_gp_buggy()</tt> below: | ||
| 346 | |||
| 347 | <blockquote> | ||
| 348 | <pre> | ||
| 349 | 1 bool do_something_gp_buggy(void) | ||
| 350 | 2 { | ||
| 351 | 3 rcu_read_lock(); | ||
| 352 | 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ | ||
| 353 | 5 if (p) { | ||
| 354 | 6 do_something(p->a, p->b); | ||
| 355 | 7 rcu_read_unlock(); | ||
| 356 | 8 return true; | ||
| 357 | 9 } | ||
| 358 | 10 rcu_read_unlock(); | ||
| 359 | 11 return false; | ||
| 360 | 12 } | ||
| 361 | </pre> | ||
| 362 | </blockquote> | ||
| 363 | |||
| 364 | <p> | ||
| 365 | However, this temptation must be resisted because there are a | ||
| 366 | surprisingly large number of ways that the compiler | ||
| 367 | (to say nothing of | ||
| 368 | <a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) | ||
| 369 | can trip this code up. | ||
| 370 | For but one example, if the compiler were short of registers, it | ||
| 371 | might choose to refetch from <tt>gp</tt> rather than keeping | ||
| 372 | a separate copy in <tt>p</tt> as follows: | ||
| 373 | |||
| 374 | <blockquote> | ||
| 375 | <pre> | ||
| 376 | 1 bool do_something_gp_buggy_optimized(void) | ||
| 377 | 2 { | ||
| 378 | 3 rcu_read_lock(); | ||
| 379 | 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ | ||
| 380 | <b> 5 do_something(gp->a, gp->b);</b> | ||
| 381 | 6 rcu_read_unlock(); | ||
| 382 | 7 return true; | ||
| 383 | 8 } | ||
| 384 | 9 rcu_read_unlock(); | ||
| 385 | 10 return false; | ||
| 386 | 11 } | ||
| 387 | </pre> | ||
| 388 | </blockquote> | ||
| 389 | |||
| 390 | <p> | ||
| 391 | If this function ran concurrently with a series of updates that | ||
| 392 | replaced the current structure with a new one, | ||
| 393 | the fetches of <tt>gp->a</tt> | ||
| 394 | and <tt>gp->b</tt> might well come from two different structures, | ||
| 395 | which could cause serious confusion. | ||
| 396 | To prevent this (and much else besides), <tt>do_something_gp()</tt> uses | ||
| 397 | <tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: | ||
| 398 | |||
| 399 | <blockquote> | ||
| 400 | <pre> | ||
| 401 | 1 bool do_something_gp(void) | ||
| 402 | 2 { | ||
| 403 | 3 rcu_read_lock(); | ||
| 404 | 4 p = rcu_dereference(gp); | ||
| 405 | 5 if (p) { | ||
| 406 | 6 do_something(p->a, p->b); | ||
| 407 | 7 rcu_read_unlock(); | ||
| 408 | 8 return true; | ||
| 409 | 9 } | ||
| 410 | 10 rcu_read_unlock(); | ||
| 411 | 11 return false; | ||
| 412 | 12 } | ||
| 413 | </pre> | ||
| 414 | </blockquote> | ||
| 415 | |||
| 416 | <p> | ||
| 417 | The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) | ||
| 418 | memory barriers in the Linux kernel. | ||
| 419 | Should a | ||
| 420 | <a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> | ||
| 421 | ever appear, then <tt>rcu_dereference()</tt> could be implemented | ||
| 422 | as a <tt>memory_order_consume</tt> load. | ||
| 423 | Regardless of the exact implementation, a pointer fetched by | ||
| 424 | <tt>rcu_dereference()</tt> may not be used outside of the | ||
| 425 | outermost RCU read-side critical section containing that | ||
| 426 | <tt>rcu_dereference()</tt>, unless protection of | ||
| 427 | the corresponding data element has been passed from RCU to some | ||
| 428 | other synchronization mechanism, most commonly locking or | ||
| 429 | <a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. | ||
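| | |||
| | <p> | ||
| | For example, the following sketch lets the pointer escape the | ||
| | critical section without any such handoff, and is therefore buggy: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 struct foo *p; | ||
| |  2 | ||
| |  3 rcu_read_lock(); | ||
| |  4 p = rcu_dereference(gp); | ||
| |  5 rcu_read_unlock(); | ||
| |  6 if (p) | ||
| |  7   do_something(p->a, p->b); /* BUG: p might already be freed. */ | ||
| | </pre> | ||
| | </blockquote> | ||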
| 430 | |||
| 431 | <p> | ||
| 432 | In short, updaters use <tt>rcu_assign_pointer()</tt> and readers | ||
| 433 | use <tt>rcu_dereference()</tt>, and these two RCU API elements | ||
| 434 | work together to ensure that readers have a consistent view of | ||
| 435 | newly added data elements. | ||
| 436 | |||
| 437 | <p> | ||
| 438 | Of course, it is also necessary to remove elements from RCU-protected | ||
| 439 | data structures, for example, using the following process: | ||
| 440 | |||
| 441 | <ol> | ||
| 442 | <li> Remove the data element from the enclosing structure. | ||
| 443 | <li> Wait for all pre-existing RCU read-side critical sections | ||
| 444 | to complete (because only pre-existing readers can possibly have | ||
| 445 | a reference to the newly removed data element). | ||
| 446 | <li> At this point, only the updater has a reference to the | ||
| 447 | newly removed data element, so it can safely reclaim | ||
| 448 | the data element, for example, by passing it to <tt>kfree()</tt>. | ||
| 449 | </ol> | ||
| 450 | |||
| 451 | This process is implemented by <tt>remove_gp_synchronous()</tt>: | ||
| 452 | |||
| 453 | <blockquote> | ||
| 454 | <pre> | ||
| 455 | 1 bool remove_gp_synchronous(void) | ||
| 456 | 2 { | ||
| 457 | 3 struct foo *p; | ||
| 458 | 4 | ||
| 459 | 5 spin_lock(&gp_lock); | ||
| 460 | 6 p = rcu_access_pointer(gp); | ||
| 461 | 7 if (!p) { | ||
| 462 | 8 spin_unlock(&gp_lock); | ||
| 463 | 9 return false; | ||
| 464 | 10 } | ||
| 465 | 11 rcu_assign_pointer(gp, NULL); | ||
| 466 | 12 spin_unlock(&gp_lock); | ||
| 467 | 13 synchronize_rcu(); | ||
| 468 | 14 kfree(p); | ||
| 469 | 15 return true; | ||
| 470 | 16 } | ||
| 471 | </pre> | ||
| 472 | </blockquote> | ||
| 473 | |||
| 474 | <p> | ||
| 475 | This function is straightforward, with line 13 waiting for a grace | ||
| 476 | period before line 14 frees the old data element. | ||
| 477 | This waiting ensures that readers will reach line 7 of | ||
| 478 | <tt>do_something_gp()</tt> before the data element referenced by | ||
| 479 | <tt>p</tt> is freed. | ||
| 480 | The <tt>rcu_access_pointer()</tt> on line 6 is similar to | ||
| 481 | <tt>rcu_dereference()</tt>, except that: | ||
| 482 | |||
| 483 | <ol> | ||
| 484 | <li> The value returned by <tt>rcu_access_pointer()</tt> | ||
| 485 | cannot be dereferenced. | ||
| 486 | If you want to access the value pointed to as well as | ||
| 487 | the pointer itself, use <tt>rcu_dereference()</tt> | ||
| 488 | instead of <tt>rcu_access_pointer()</tt>. | ||
| 489 | <li> The call to <tt>rcu_access_pointer()</tt> need not be | ||
| 490 | protected. | ||
| 491 | In contrast, <tt>rcu_dereference()</tt> must either be | ||
| 492 | within an RCU read-side critical section or in a code | ||
| 493 | segment where the pointer cannot change, for example, in | ||
| 494 | code protected by the corresponding update-side lock (see the sketch below). | ||
| 495 | </ol> | ||
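| | |||
| | <p> | ||
| | For the lock-protected case, the Linux kernel also provides | ||
| | <tt>rcu_dereference_protected()</tt>, which takes a lockdep | ||
| | expression stating why the access is safe. | ||
| | A minimal sketch, reusing <tt>gp</tt> and <tt>gp_lock</tt> from the | ||
| | earlier examples: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 spin_lock(&gp_lock); | ||
| |  2 p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock)); | ||
| |  3 if (p) | ||
| |  4   p->b = 42; /* gp cannot change while gp_lock is held. */ | ||
| |  5 spin_unlock(&gp_lock); | ||
| | </pre> | ||
| | </blockquote> | ||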
| 496 | |||
| 497 | <p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> | ||
| 498 | Without the <tt>rcu_dereference()</tt> or the | ||
| 499 | <tt>rcu_access_pointer()</tt>, what destructive optimizations | ||
| 500 | might the compiler make use of? | ||
| 501 | <br><a href="#qq4answer">Answer</a> | ||
| 502 | |||
| 503 | <p> | ||
| 504 | In short, RCU's publish-subscribe guarantee is provided by the combination | ||
| 505 | of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. | ||
| 506 | This guarantee allows data elements to be safely added to RCU-protected | ||
| 507 | linked data structures without disrupting RCU readers. | ||
| 508 | This guarantee can be used in combination with the grace-period | ||
| 509 | guarantee to also allow data elements to be removed from RCU-protected | ||
| 510 | linked data structures, again without disrupting RCU readers. | ||
| 511 | |||
| 512 | <p> | ||
| 513 | This guarantee was only partially premeditated. | ||
| 514 | DYNIX/ptx used an explicit memory barrier for publication, but had nothing | ||
| 515 | resembling <tt>rcu_dereference()</tt> for subscription, nor did it | ||
| 516 | have anything resembling the <tt>smp_read_barrier_depends()</tt> | ||
| 517 | that was later subsumed into <tt>rcu_dereference()</tt>. | ||
| 518 | The need for these operations made itself known quite suddenly at a | ||
| 519 | late-1990s meeting with the DEC Alpha architects, back in the days when | ||
| 520 | DEC was still a free-standing company. | ||
| 521 | It took the Alpha architects a good hour to convince me that any sort | ||
| 522 | of barrier would ever be needed, and it then took me a good <i>two</i> hours | ||
| 523 | to convince them that their documentation did not make this point clear. | ||
| 524 | More recent work with the C and C++ standards committees has provided | ||
| 525 | much education on tricks and traps from the compiler. | ||
| 526 | In short, compilers were much less tricky in the early 1990s, but in | ||
| 527 | 2015, don't even think about omitting <tt>rcu_dereference()</tt>! | ||
| 528 | |||
| 529 | <h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> | ||
| 530 | |||
| 531 | <p> | ||
| 532 | The previous section's simple linked-data-structure scenario clearly | ||
| 533 | demonstrates the need for RCU's stringent memory-ordering guarantees on | ||
| 534 | systems with more than one CPU: | ||
| 535 | |||
| 536 | <ol> | ||
| 537 | <li> Each CPU that has an RCU read-side critical section that | ||
| 538 | begins before <tt>synchronize_rcu()</tt> starts is | ||
| 539 | guaranteed to execute a full memory barrier between the time | ||
| 540 | that the RCU read-side critical section ends and the time that | ||
| 541 | <tt>synchronize_rcu()</tt> returns. | ||
| 542 | Without this guarantee, a pre-existing RCU read-side critical section | ||
| 543 | might hold a reference to the newly removed <tt>struct foo</tt> | ||
| 544 | after the <tt>kfree()</tt> on line 14 of | ||
| 545 | <tt>remove_gp_synchronous()</tt>. | ||
| 546 | <li> Each CPU that has an RCU read-side critical section that ends | ||
| 547 | after <tt>synchronize_rcu()</tt> returns is guaranteed | ||
| 548 | to execute a full memory barrier between the time that | ||
| 549 | <tt>synchronize_rcu()</tt> begins and the time that the RCU | ||
| 550 | read-side critical section begins. | ||
| 551 | Without this guarantee, a later RCU read-side critical section | ||
| 552 | running after the <tt>kfree()</tt> on line 14 of | ||
| 553 | <tt>remove_gp_synchronous()</tt> might | ||
| 554 | later run <tt>do_something_gp()</tt> and find the | ||
| 555 | newly deleted <tt>struct foo</tt>. | ||
| 556 | <li> If the task invoking <tt>synchronize_rcu()</tt> remains | ||
| 557 | on a given CPU, then that CPU is guaranteed to execute a full | ||
| 558 | memory barrier sometime during the execution of | ||
| 559 | <tt>synchronize_rcu()</tt>. | ||
| 560 | This guarantee ensures that the <tt>kfree()</tt> on | ||
| 561 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
| 562 | execute after the removal on line 11. | ||
| 563 | <li> If the task invoking <tt>synchronize_rcu()</tt> migrates | ||
| 564 | among a group of CPUs during that invocation, then each of the | ||
| 565 | CPUs in that group is guaranteed to execute a full memory barrier | ||
| 566 | sometime during the execution of <tt>synchronize_rcu()</tt>. | ||
| 567 | This guarantee also ensures that the <tt>kfree()</tt> on | ||
| 568 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
| 569 | execute after the removal on | ||
| 570 | line 11, even in the case where the thread executing the | ||
| 571 | <tt>synchronize_rcu()</tt> migrates in the meantime. | ||
| 572 | </ol> | ||
| 573 | |||
| 574 | <p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> | ||
| 575 | Given that multiple CPUs can start RCU read-side critical sections | ||
| 576 | at any time without any ordering whatsoever, how can RCU possibly tell whether | ||
| 577 | or not a given RCU read-side critical section starts before a | ||
| 578 | given instance of <tt>synchronize_rcu()</tt>? | ||
| 579 | <br><a href="#qq5answer">Answer</a> | ||
| 580 | |||
| 581 | <p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> | ||
| 582 | The first and second guarantees require unbelievably strict ordering! | ||
| 583 | Are all these memory barriers <i>really</i> required? | ||
| 584 | <br><a href="#qq6answer">Answer</a> | ||
| 585 | |||
| 586 | <p> | ||
| 587 | Note that these memory-barrier requirements do not replace the fundamental | ||
| 588 | RCU requirement that a grace period wait for all pre-existing readers. | ||
| 589 | On the contrary, the memory barriers called out in this section must operate in | ||
| 590 | such a way as to <i>enforce</i> this fundamental requirement. | ||
| 591 | Of course, different implementations enforce this requirement in different | ||
| 592 | ways, but enforce it they must. | ||
| 593 | |||
| 594 | <h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> | ||
| 595 | |||
| 596 | <p> | ||
| 597 | The common-case RCU primitives are unconditional. | ||
| 598 | They are invoked, they do their job, and they return, with no possibility | ||
| 599 | of error, and no need to retry. | ||
| 600 | This is a key RCU design philosophy. | ||
| 601 | |||
| 602 | <p> | ||
| 603 | However, this philosophy is pragmatic rather than pigheaded. | ||
| 604 | If someone comes up with a good justification for a particular conditional | ||
| 605 | RCU primitive, it might well be implemented and added. | ||
| 606 | After all, this guarantee was reverse-engineered, not premeditated. | ||
| 607 | The unconditional nature of the RCU primitives was initially an | ||
| 608 | accident of implementation, and later experience with synchronization | ||
| 609 | primitives that do provide conditional variants caused me to elevate this | ||
| 610 | accident to a guarantee. | ||
| 611 | Therefore, the justification for adding a conditional primitive to | ||
| 612 | RCU would need to be based on detailed and compelling use cases. | ||
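| | |||
| | <p> | ||
| | The contrast with a conditional primitive such as | ||
| | <tt>spin_trylock()</tt> illustrates the point (a sketch, not taken | ||
| | from any particular subsystem): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 synchronize_rcu(); /* Unconditional: no return value to check. */ | ||
| |  2 kfree(p); | ||
| |  3 | ||
| |  4 if (!spin_trylock(&mylock)) | ||
| |  5   return -EBUSY; /* Conditional: every caller must handle failure. */ | ||
| | </pre> | ||
| | </blockquote> | ||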
| 613 | |||
| 614 | <h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> | ||
| 615 | |||
| 616 | <p> | ||
| 617 | As far as RCU is concerned, it is always possible to carry out an | ||
| 618 | update within an RCU read-side critical section. | ||
| 619 | For example, that RCU read-side critical section might search for | ||
| 620 | a given data element, and then might acquire the update-side | ||
| 621 | spinlock in order to update that element, all while remaining | ||
| 622 | in that RCU read-side critical section. | ||
| 623 | Of course, it is necessary to exit the RCU read-side critical section | ||
| 624 | before invoking <tt>synchronize_rcu()</tt>; however, this | ||
| 625 | inconvenience can be avoided through use of the | ||
| 626 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members | ||
| 627 | described later in this document. | ||
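| | |||
| | <p> | ||
| | A sketch of such a read-to-write upgrade, reusing <tt>gp</tt> and | ||
| | <tt>gp_lock</tt> from the earlier examples (the re-check on line 5 | ||
| | guards against a concurrent removal): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 rcu_read_lock(); | ||
| |  2 p = rcu_dereference(gp); | ||
| |  3 if (p) { | ||
| |  4   spin_lock(&gp_lock); | ||
| |  5   if (p == rcu_access_pointer(gp)) | ||
| |  6     p->b = 42; /* Update while still in the critical section. */ | ||
| |  7   spin_unlock(&gp_lock); | ||
| |  8 } | ||
| |  9 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||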
| 628 | |||
| 629 | <p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> | ||
| 630 | But how does the upgrade-to-write operation exclude other readers? | ||
| 631 | <br><a href="#qq7answer">Answer</a> | ||
| 632 | |||
| 633 | <p> | ||
| 634 | This guarantee allows lookup code to be shared between read-side | ||
| 635 | and update-side code, and was premeditated, appearing in the earliest | ||
| 636 | DYNIX/ptx RCU documentation. | ||
| 637 | |||
| 638 | <h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> | ||
| 639 | |||
| 640 | <p> | ||
| 641 | RCU provides extremely lightweight readers, and its read-side guarantees, | ||
| 642 | though quite useful, are correspondingly lightweight. | ||
| 643 | It is therefore all too easy to assume that RCU is guaranteeing more | ||
| 644 | than it really is. | ||
| 645 | Of course, the list of things that RCU does not guarantee is infinitely | ||
| 646 | long; however, the following sections list a few non-guarantees that | ||
| 647 | have caused confusion. | ||
| 648 | Except where otherwise noted, these non-guarantees were premeditated. | ||
| 649 | |||
| 650 | <ol> | ||
| 651 | <li> <a href="#Readers Impose Minimal Ordering"> | ||
| 652 | Readers Impose Minimal Ordering</a> | ||
| 653 | <li> <a href="#Readers Do Not Exclude Updaters"> | ||
| 654 | Readers Do Not Exclude Updaters</a> | ||
| 655 | <li> <a href="#Updaters Only Wait For Old Readers"> | ||
| 656 | Updaters Only Wait For Old Readers</a> | ||
| 657 | <li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> | ||
| 658 | Grace Periods Don't Partition Read-Side Critical Sections</a> | ||
| 659 | <li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> | ||
| 660 | Read-Side Critical Sections Don't Partition Grace Periods</a> | ||
| 661 | <li> <a href="#Disabling Preemption Does Not Block Grace Periods"> | ||
| 662 | Disabling Preemption Does Not Block Grace Periods</a> | ||
| 663 | </ol> | ||
| 664 | |||
| 665 | <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> | ||
| 666 | |||
| 667 | <p> | ||
| 668 | Reader-side markers such as <tt>rcu_read_lock()</tt> and | ||
| 669 | <tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees | ||
| 670 | except through their interaction with the grace-period APIs such as | ||
| 671 | <tt>synchronize_rcu()</tt>. | ||
| 672 | To see this, consider the following pair of threads: | ||
| 673 | |||
| 674 | <blockquote> | ||
| 675 | <pre> | ||
| 676 | 1 void thread0(void) | ||
| 677 | 2 { | ||
| 678 | 3 rcu_read_lock(); | ||
| 679 | 4 WRITE_ONCE(x, 1); | ||
| 680 | 5 rcu_read_unlock(); | ||
| 681 | 6 rcu_read_lock(); | ||
| 682 | 7 WRITE_ONCE(y, 1); | ||
| 683 | 8 rcu_read_unlock(); | ||
| 684 | 9 } | ||
| 685 | 10 | ||
| 686 | 11 void thread1(void) | ||
| 687 | 12 { | ||
| 688 | 13 rcu_read_lock(); | ||
| 689 | 14 r1 = READ_ONCE(y); | ||
| 690 | 15 rcu_read_unlock(); | ||
| 691 | 16 rcu_read_lock(); | ||
| 692 | 17 r2 = READ_ONCE(x); | ||
| 693 | 18 rcu_read_unlock(); | ||
| 694 | 19 } | ||
| 695 | </pre> | ||
| 696 | </blockquote> | ||
| 697 | |||
| 698 | <p> | ||
| 699 | After <tt>thread0()</tt> and <tt>thread1()</tt> execute | ||
| 700 | concurrently, it is quite possible to have | ||
| 701 | |||
| 702 | <blockquote> | ||
| 703 | <pre> | ||
| 704 | (r1 == 1 && r2 == 0) | ||
| 705 | </pre> | ||
| 706 | </blockquote> | ||
| 707 | |||
| 708 | (that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), | ||
| 709 | which would not be possible if <tt>rcu_read_lock()</tt> and | ||
| 710 | <tt>rcu_read_unlock()</tt> had much in the way of ordering | ||
| 711 | properties. | ||
| 712 | But they do not, so the CPU is within its rights | ||
| 713 | to do significant reordering. | ||
| 714 | This is by design: Any significant ordering constraints would slow down | ||
| 715 | these fast-path APIs. | ||
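| | |||
| | <p> | ||
| | If ordering between the two stores really is needed, it must be | ||
| | supplied separately, for example, by an explicit memory barrier | ||
| | between the two critical sections: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 rcu_read_lock(); | ||
| |  2 WRITE_ONCE(x, 1); | ||
| |  3 rcu_read_unlock(); | ||
| |  4 smp_mb(); /* The RCU markers provide no such ordering. */ | ||
| |  5 rcu_read_lock(); | ||
| |  6 WRITE_ONCE(y, 1); | ||
| |  7 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||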
| 716 | |||
| 717 | <p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> | ||
| 718 | Can't the compiler also reorder this code? | ||
| 719 | <br><a href="#qq8answer">Answer</a> | ||
| 720 | |||
| 721 | <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> | ||
| 722 | |||
| 723 | <p> | ||
| 724 | Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> | ||
| 725 | exclude updates. | ||
| 726 | All they do is to prevent grace periods from ending. | ||
| 727 | The following example illustrates this: | ||
| 728 | |||
| 729 | <blockquote> | ||
| 730 | <pre> | ||
| 731 | 1 void thread0(void) | ||
| 732 | 2 { | ||
| 733 | 3 rcu_read_lock(); | ||
| 734 | 4 r1 = READ_ONCE(y); | ||
| 735 | 5 if (r1) { | ||
| 736 | 6 do_something_with_nonzero_x(); | ||
| 737 | 7 r2 = READ_ONCE(x); | ||
| 738 | 8 WARN_ON(!r2); /* BUG!!! */ | ||
| 739 | 9 } | ||
| 740 | 10 rcu_read_unlock(); | ||
| 741 | 11 } | ||
| 742 | 12 | ||
| 743 | 13 void thread1(void) | ||
| 744 | 14 { | ||
| 745 | 15 spin_lock(&my_lock); | ||
| 746 | 16 WRITE_ONCE(x, 1); | ||
| 747 | 17 WRITE_ONCE(y, 1); | ||
| 748 | 18 spin_unlock(&my_lock); | ||
| 749 | 19 } | ||
| 750 | </pre> | ||
| 751 | </blockquote> | ||
| 752 | |||
| 753 | <p> | ||
| 754 | If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> | ||
| 755 | excluded the <tt>thread1()</tt> function's update, | ||
| 756 | the <tt>WARN_ON()</tt> could never fire. | ||
| 757 | But the fact is that <tt>rcu_read_lock()</tt> does not exclude | ||
| 758 | much of anything aside from subsequent grace periods, of which | ||
| 759 | <tt>thread1()</tt> has none, so the | ||
| 760 | <tt>WARN_ON()</tt> can and does fire. | ||
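| | |||
| | <p> | ||
| | If the <tt>WARN_ON()</tt> were not supposed to fire, the required | ||
| | ordering would have to come from something other than RCU's | ||
| | read-side markers, for example (one possibility among several), | ||
| | release-acquire ordering, as in the following sketch: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 /* In thread0(): */ | ||
| |  2 r1 = smp_load_acquire(&y); | ||
| |  3 | ||
| |  4 /* In thread1(): */ | ||
| |  5 WRITE_ONCE(x, 1); | ||
| |  6 smp_store_release(&y, 1); | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||
| | <p> | ||
| | With these substitutions, <tt>r1 == 1</tt> would guarantee | ||
| | <tt>r2 == 1</tt>. | ||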
| 761 | |||
| 762 | <h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> | ||
| 763 | |||
| 764 | <p> | ||
| 765 | It might be tempting to assume that after <tt>synchronize_rcu()</tt> | ||
| 766 | completes, there are no readers executing. | ||
| 767 | This temptation must be avoided because | ||
| 768 | new readers can start immediately after <tt>synchronize_rcu()</tt> | ||
| 769 | starts, and <tt>synchronize_rcu()</tt> is under no | ||
| 770 | obligation to wait for these new readers. | ||
| 771 | |||
| 772 | <p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> | ||
| 773 | Suppose that synchronize_rcu() did wait until all readers had completed. | ||
| 774 | Would the updater be able to rely on this? | ||
| 775 | <br><a href="#qq9answer">Answer</a> | ||
| 776 | |||
| 777 | <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> | ||
| 778 | Grace Periods Don't Partition Read-Side Critical Sections</a></h3> | ||
| 779 | |||
| 780 | <p> | ||
| 781 | It is tempting to assume that if any part of one RCU read-side critical | ||
| 782 | section precedes a given grace period, and if any part of another RCU | ||
| 783 | read-side critical section follows that same grace period, then all of | ||
| 784 | the first RCU read-side critical section must precede all of the second. | ||
| 785 | However, this just isn't the case: A single grace period does not | ||
| 786 | partition the set of RCU read-side critical sections. | ||
| 787 | An example of this situation can be illustrated as follows, where | ||
| 788 | <tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: | ||
| 789 | |||
| 790 | <blockquote> | ||
| 791 | <pre> | ||
| 792 | 1 void thread0(void) | ||
| 793 | 2 { | ||
| 794 | 3 rcu_read_lock(); | ||
| 795 | 4 WRITE_ONCE(a, 1); | ||
| 796 | 5 WRITE_ONCE(b, 1); | ||
| 797 | 6 rcu_read_unlock(); | ||
| 798 | 7 } | ||
| 799 | 8 | ||
| 800 | 9 void thread1(void) | ||
| 801 | 10 { | ||
| 802 | 11 r1 = READ_ONCE(a); | ||
| 803 | 12 synchronize_rcu(); | ||
| 804 | 13 WRITE_ONCE(c, 1); | ||
| 805 | 14 } | ||
| 806 | 15 | ||
| 807 | 16 void thread2(void) | ||
| 808 | 17 { | ||
| 809 | 18 rcu_read_lock(); | ||
| 810 | 19 r2 = READ_ONCE(b); | ||
| 811 | 20 r3 = READ_ONCE(c); | ||
| 812 | 21 rcu_read_unlock(); | ||
| 813 | 22 } | ||
| 814 | </pre> | ||
| 815 | </blockquote> | ||
| 816 | |||
| 817 | <p> | ||
| 818 | It turns out that the outcome: | ||
| 819 | |||
| 820 | <blockquote> | ||
| 821 | <pre> | ||
| 822 | (r1 == 1 && r2 == 0 && r3 == 1) | ||
| 823 | </pre> | ||
| 824 | </blockquote> | ||
| 825 | |||
| 826 | is entirely possible. | ||
| 827 | The following figure shows how this can happen, with each circled | ||
| 828 | <tt>QS</tt> indicating the point at which RCU recorded a | ||
| 829 | <i>quiescent state</i> for each thread, that is, a state in which | ||
| 830 | RCU knows that the thread cannot be in the midst of an RCU read-side | ||
| 831 | critical section that started before the current grace period: | ||
| 832 | |||
| 833 | <p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> | ||
| 834 | |||
| 835 | <p> | ||
| 836 | If it is necessary to partition RCU read-side critical sections in this | ||
| 837 | manner, it is necessary to use two grace periods, where the first | ||
| 838 | grace period is known to end before the second grace period starts: | ||
| 839 | |||
| 840 | <blockquote> | ||
| 841 | <pre> | ||
| 842 | 1 void thread0(void) | ||
| 843 | 2 { | ||
| 844 | 3 rcu_read_lock(); | ||
| 845 | 4 WRITE_ONCE(a, 1); | ||
| 846 | 5 WRITE_ONCE(b, 1); | ||
| 847 | 6 rcu_read_unlock(); | ||
| 848 | 7 } | ||
| 849 | 8 | ||
| 850 | 9 void thread1(void) | ||
| 851 | 10 { | ||
| 852 | 11 r1 = READ_ONCE(a); | ||
| 853 | 12 synchronize_rcu(); | ||
| 854 | 13 WRITE_ONCE(c, 1); | ||
| 855 | 14 } | ||
| 856 | 15 | ||
| 857 | 16 void thread2(void) | ||
| 858 | 17 { | ||
| 859 | 18 r2 = READ_ONCE(c); | ||
| 860 | 19 synchronize_rcu(); | ||
| 861 | 20 WRITE_ONCE(d, 1); | ||
| 862 | 21 } | ||
| 863 | 22 | ||
| 864 | 23 void thread3(void) | ||
| 865 | 24 { | ||
| 866 | 25 rcu_read_lock(); | ||
| 867 | 26 r3 = READ_ONCE(b); | ||
| 868 | 27 r4 = READ_ONCE(d); | ||
| 869 | 28 rcu_read_unlock(); | ||
| 870 | 29 } | ||
| 871 | </pre> | ||
| 872 | </blockquote> | ||
| 873 | |||
| 874 | <p> | ||
| 875 | Here, if <tt>(r1 == 1)</tt>, then | ||
| 876 | <tt>thread0()</tt>'s write to <tt>b</tt> must happen | ||
| 877 | before the end of <tt>thread1()</tt>'s grace period. | ||
| 878 | If in addition <tt>(r4 == 1)</tt>, then | ||
| 879 | <tt>thread3()</tt>'s read from <tt>b</tt> must happen | ||
| 880 | after the beginning of <tt>thread2()</tt>'s grace period. | ||
| 881 | If it is also the case that <tt>(r2 == 1)</tt>, then the | ||
| 882 | end of <tt>thread1()</tt>'s grace period must precede the | ||
| 883 | beginning of <tt>thread2()</tt>'s grace period. | ||
| 884 | This means that the two RCU read-side critical sections cannot overlap, | ||
| 885 | guaranteeing that <tt>(r3 == 1)</tt>. | ||
| 886 | As a result, the outcome: | ||
| 887 | |||
| 888 | <blockquote> | ||
| 889 | <pre> | ||
| 890 | (r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) | ||
| 891 | </pre> | ||
| 892 | </blockquote> | ||
| 893 | |||
| 894 | cannot happen. | ||
| 895 | |||
| 896 | <p> | ||
| 897 | This non-requirement was also non-premeditated, but became apparent | ||
| 898 | when studying RCU's interaction with memory ordering. | ||
| 899 | |||
| 900 | <h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> | ||
| 901 | Read-Side Critical Sections Don't Partition Grace Periods</a></h3> | ||
| 902 | |||
| 903 | <p> | ||
| 904 | It is also tempting to assume that if an RCU read-side critical section | ||
| 905 | happens between a pair of grace periods, then those grace periods cannot | ||
| 906 | overlap. | ||
| 907 | However, this temptation leads nowhere good, as can be illustrated by | ||
| 908 | the following, with all variables initially zero: | ||
| 909 | |||
| 910 | <blockquote> | ||
| 911 | <pre> | ||
| 912 | 1 void thread0(void) | ||
| 913 | 2 { | ||
| 914 | 3 rcu_read_lock(); | ||
| 915 | 4 WRITE_ONCE(a, 1); | ||
| 916 | 5 WRITE_ONCE(b, 1); | ||
| 917 | 6 rcu_read_unlock(); | ||
| 918 | 7 } | ||
| 919 | 8 | ||
| 920 | 9 void thread1(void) | ||
| 921 | 10 { | ||
| 922 | 11 r1 = READ_ONCE(a); | ||
| 923 | 12 synchronize_rcu(); | ||
| 924 | 13 WRITE_ONCE(c, 1); | ||
| 925 | 14 } | ||
| 926 | 15 | ||
| 927 | 16 void thread2(void) | ||
| 928 | 17 { | ||
| 929 | 18 rcu_read_lock(); | ||
| 930 | 19 WRITE_ONCE(d, 1); | ||
| 931 | 20 r2 = READ_ONCE(c); | ||
| 932 | 21 rcu_read_unlock(); | ||
| 933 | 22 } | ||
| 934 | 23 | ||
| 935 | 24 void thread3(void) | ||
| 936 | 25 { | ||
| 937 | 26 r3 = READ_ONCE(d); | ||
| 938 | 27 synchronize_rcu(); | ||
| 939 | 28 WRITE_ONCE(e, 1); | ||
| 940 | 29 } | ||
| 941 | 30 | ||
| 942 | 31 void thread4(void) | ||
| 943 | 32 { | ||
| 944 | 33 rcu_read_lock(); | ||
| 945 | 34 r4 = READ_ONCE(b); | ||
| 946 | 35 r5 = READ_ONCE(e); | ||
| 947 | 36 rcu_read_unlock(); | ||
| 948 | 37 } | ||
| 949 | </pre> | ||
| 950 | </blockquote> | ||
| 951 | |||
| 952 | <p> | ||
| 953 | In this case, the outcome: | ||
| 954 | |||
| 955 | <blockquote> | ||
| 956 | <pre> | ||
| 957 | (r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) | ||
| 958 | </pre> | ||
| 959 | </blockquote> | ||
| 960 | |||
| 961 | is entirely possible, as illustrated below: | ||
| 962 | |||
| 963 | <p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> | ||
| 964 | |||
| 965 | <p> | ||
| 966 | Again, an RCU read-side critical section can overlap almost all of a | ||
| 967 | given grace period, just so long as it does not overlap the entire | ||
| 968 | grace period. | ||
| 969 | As a result, an RCU read-side critical section cannot partition a pair | ||
| 970 | of RCU grace periods. | ||
| 971 | |||
| 972 | <p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> | ||
| 973 | How long would a sequence of grace periods, each separated by an RCU read-side | ||
| 974 | critical section, need to be in order to partition the RCU read-side | ||
| 975 | critical sections at the beginning and end of the chain? | ||
| 976 | <br><a href="#qq10answer">Answer</a> | ||
| 977 | |||
| 978 | <h3><a name="Disabling Preemption Does Not Block Grace Periods"> | ||
| 979 | Disabling Preemption Does Not Block Grace Periods</a></h3> | ||
| 980 | |||
| 981 | <p> | ||
| 982 | There was a time when disabling preemption on any given CPU would block | ||
| 983 | subsequent grace periods. | ||
| 984 | However, this was an accident of implementation and is not a requirement. | ||
| 985 | And in the current Linux-kernel implementation, disabling preemption | ||
| 986 | on a given CPU in fact does not block grace periods, as Oleg Nesterov | ||
| 987 | <a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. | ||
| 988 | |||
| 989 | <p> | ||
| 990 | If you need a preempt-disable region to block grace periods, you need to add | ||
| 991 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example | ||
| 992 | as follows: | ||
| 993 | |||
| 994 | <blockquote> | ||
| 995 | <pre> | ||
| 996 | 1 preempt_disable(); | ||
| 997 | 2 rcu_read_lock(); | ||
| 998 | 3 do_something(); | ||
| 999 | 4 rcu_read_unlock(); | ||
| 1000 | 5 preempt_enable(); | ||
| 1001 | 6 | ||
| 1002 | 7 /* Spinlocks implicitly disable preemption. */ | ||
| 1003 | 8 spin_lock(&mylock); | ||
| 1004 | 9 rcu_read_lock(); | ||
| 1005 | 10 do_something(); | ||
| 1006 | 11 rcu_read_unlock(); | ||
| 1007 | 12 spin_unlock(&mylock); | ||
| 1008 | </pre> | ||
| 1009 | </blockquote> | ||
| 1010 | |||
| 1011 | <p> | ||
| 1012 | In theory, you could enter the RCU read-side critical section first, | ||
| 1013 | but it is more efficient to keep the entire RCU read-side critical | ||
| 1014 | section contained in the preempt-disable region as shown above. | ||
| 1015 | Of course, RCU read-side critical sections that extend outside of | ||
| 1016 | preempt-disable regions will work correctly, but such critical sections | ||
| 1017 | can be preempted, which forces <tt>rcu_read_unlock()</tt> to do | ||
| 1018 | more work. | ||
| 1019 | And no, this is <i>not</i> an invitation to enclose all of your RCU | ||
| 1020 | read-side critical sections within preempt-disable regions, because | ||
| 1021 | doing so would degrade real-time response. | ||
| 1022 | |||
| 1023 | <p> | ||
| 1024 | This non-requirement appeared with preemptible RCU. | ||
| 1025 | If you need a grace period that waits on non-preemptible code regions, use | ||
| 1026 | <a href="#Sched Flavor">RCU-sched</a>. | ||
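| | |||
| | <p> | ||
| | A sketch of the resulting usage, written in terms of the RCU-sched | ||
| | API as it existed when this document was written: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 /* Reader: non-preemptible regions block RCU-sched grace periods. */ | ||
| |  2 rcu_read_lock_sched(); | ||
| |  3 do_something(); | ||
| |  4 rcu_read_unlock_sched(); | ||
| |  5 | ||
| |  6 /* Updater: waits for all pre-existing non-preemptible regions. */ | ||
| |  7 synchronize_sched(); | ||
| | </pre> | ||
| | </blockquote> | ||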
| 1027 | |||
| 1028 | <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> | ||
| 1029 | |||
| 1030 | <p> | ||
| 1031 | These parallelism facts of life are by no means specific to RCU, but | ||
| 1032 | the RCU implementation must abide by them. | ||
| 1033 | They therefore bear repeating: | ||
| 1034 | |||
| 1035 | <ol> | ||
| 1036 | <li> Any CPU or task may be delayed at any time, | ||
| 1037 | and any attempts to avoid these delays by disabling | ||
| 1038 | preemption, interrupts, or whatever are completely futile. | ||
| 1039 | This is most obvious in preemptible user-level | ||
| 1040 | environments and in virtualized environments (where | ||
| 1041 | a given guest OS's VCPUs can be preempted at any time by | ||
| 1042 | the underlying hypervisor), but can also happen in bare-metal | ||
| 1043 | environments due to ECC errors, NMIs, and other hardware | ||
| 1044 | events. | ||
| 1045 | Although a delay of more than about 20 seconds can result | ||
| 1046 | in splats, the RCU implementation is obligated to use | ||
| 1047 | algorithms that can tolerate extremely long delays, but where | ||
| 1048 | “extremely long” is not long enough to allow | ||
| 1049 | wrap-around when incrementing a 64-bit counter. | ||
| 1050 | <li> Both the compiler and the CPU can reorder memory accesses. | ||
| 1051 | Where it matters, RCU must use compiler directives and | ||
| 1052 | memory-barrier instructions to preserve ordering. | ||
| 1053 | <li> Conflicting writes to memory locations in any given cache line | ||
| 1054 | will result in expensive cache misses. | ||
| 1055 | Greater numbers of concurrent writes and more-frequent | ||
| 1056 | concurrent writes will result in more dramatic slowdowns. | ||
| 1057 | RCU is therefore obligated to use algorithms that have | ||
| 1058 | sufficient locality to avoid significant performance and | ||
| 1059 | scalability problems. | ||
| 1060 | <li> As a rough rule of thumb, only one CPU's worth of processing | ||
| 1061 | may be carried out under the protection of any given exclusive | ||
| 1062 | lock. | ||
| 1063 | RCU must therefore use scalable locking designs. | ||
| 1064 | <li>	Counters are finite, especially on 32-bit systems. | ||
| 1065 | RCU's use of counters must therefore tolerate counter wrap (see | ||
| 1066 | the wrap-tolerant comparison sketched just after this list), | ||
| 1067 | or be designed such that counter wrap would take far more | ||
| 1068 | time than a single system is likely to run. | ||
| 1069 | of a century much less so. | ||
| 1070 | As an example of the latter, RCU's dyntick-idle nesting counter | ||
| 1071 | allows 54 bits for interrupt nesting level (this counter | ||
| 1072 | is 64 bits even on a 32-bit system). | ||
| 1073 | Overflowing this counter requires 2<sup>54</sup> | ||
| 1074 | half-interrupts on a given CPU without that CPU ever going idle. | ||
| 1075 | If a half-interrupt happened every microsecond, it would take | ||
| 1076 | 570 years of runtime to overflow this counter, which is currently | ||
| 1077 | believed to be an acceptably long time. | ||
| 1078 | <li> Linux systems can have thousands of CPUs running a single | ||
| 1079 | Linux kernel in a single shared-memory environment. | ||
| 1080 | RCU must therefore pay close attention to high-end scalability. | ||
| 1081 | </ol> | ||
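| | |||
| | <p> | ||
| | As an example of the counter-wrap tolerance called for above, the | ||
| | following is a lightly annotated rendering of the | ||
| | <tt>ULONG_CMP_GE()</tt> and <tt>ULONG_CMP_LT()</tt> macros from | ||
| | <tt>include/linux/rcupdate.h</tt>, which treat one counter as having | ||
| | reached another as long as the two are within half the counter space | ||
| | of each other: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 /* "a >= b", tolerating counter wrap. */ | ||
| | 2 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | ||
| | 3 | ||
| | 4 /* "a < b", tolerating counter wrap. */ | ||
| | 5 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
| | </pre> | ||
| | </blockquote> | ||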
| 1082 | |||
| 1083 | <p> | ||
| 1084 | This last parallelism fact of life means that RCU must pay special | ||
| 1085 | attention to the preceding facts of life. | ||
| 1086 | The idea that Linux might scale to systems with thousands of CPUs would | ||
| 1087 | have been met with some skepticism in the 1990s, but these requirements | ||
| 1088 | would otherwise have been unsurprising, even in the early 1990s. | ||
| 1089 | |||
| 1090 | <h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> | ||
| 1091 | |||
| 1092 | <p> | ||
| 1093 | These sections list quality-of-implementation requirements. | ||
| 1094 | Although an RCU implementation that ignores these requirements could | ||
| 1095 | still be used, it would likely be subject to limitations that would | ||
| 1096 | make it inappropriate for industrial-strength production use. | ||
| 1097 | Classes of quality-of-implementation requirements are as follows: | ||
| 1098 | |||
| 1099 | <ol> | ||
| 1100 | <li> <a href="#Specialization">Specialization</a> | ||
| 1101 | <li> <a href="#Performance and Scalability">Performance and Scalability</a> | ||
| 1102 | <li> <a href="#Composability">Composability</a> | ||
| 1103 | <li> <a href="#Corner Cases">Corner Cases</a> | ||
| 1104 | </ol> | ||
| 1105 | |||
| 1106 | <p> | ||
| 1107 | These classes are covered in the following sections. | ||
| 1108 | |||
| 1109 | <h3><a name="Specialization">Specialization</a></h3> | ||
| 1110 | |||
| 1111 | <p> | ||
| 1112 | RCU is and always has been intended primarily for read-mostly situations, as | ||
| 1113 | illustrated by the following figure. | ||
| 1114 | This means that RCU's read-side primitives are optimized, often at the | ||
| 1115 | expense of its update-side primitives. | ||
| 1116 | |||
| 1117 | <p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> | ||
| 1118 | |||
| 1119 | <p> | ||
| 1120 | This focus on read-mostly situations means that RCU must interoperate | ||
| 1121 | with other synchronization primitives. | ||
| 1122 | For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> | ||
| 1123 | examples discussed earlier use RCU to protect readers and locking to | ||
| 1124 | coordinate updaters. | ||
| 1125 | However, the need extends much farther, requiring that a variety of | ||
| 1126 | synchronization primitives be legal within RCU read-side critical sections, | ||
| 1127 | including spinlocks, sequence locks, atomic operations, reference | ||
| 1128 | counters, and memory barriers. | ||
| 1129 | |||
| 1130 | <p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> | ||
| 1131 | What about sleeping locks? | ||
| 1132 | <br><a href="#qq11answer">Answer</a> | ||
| 1133 | |||
| 1134 | <p> | ||
| 1135 | It often comes as a surprise that many algorithms do not require a | ||
| 1136 | consistent view of data, but can instead operate correctly on stale or | ||
| 1137 | inconsistent data, with network routing being the poster child. | ||
| 1138 | Internet routing algorithms take significant time to propagate | ||
| 1139 | updates, so that by the time an update arrives at a given system, | ||
| 1140 | that system has been sending network traffic the wrong way for | ||
| 1141 | a considerable length of time. | ||
| 1142 | Having a few threads continue to send traffic the wrong way for a | ||
| 1143 | few more milliseconds is clearly not a problem: In the worst case, | ||
| 1144 | TCP retransmissions will eventually get the data where it needs to go. | ||
| 1145 | In general, when tracking the state of the universe outside of the | ||
| 1146 | computer, some level of inconsistency must be tolerated due to | ||
| 1147 | speed-of-light delays if nothing else. | ||
| 1148 | |||
| 1149 | <p> | ||
| 1150 | Furthermore, uncertainty about external state is inherent in many cases. | ||
| 1151 | For example, a pair of veterinarians might use heartbeat to determine | ||
| 1152 | whether or not a given cat was alive. | ||
| 1153 | But how long should they wait after the last heartbeat to decide that | ||
| 1154 | the cat is in fact dead? | ||
| 1155 | Waiting less than 400 milliseconds makes no sense because this would | ||
| 1156 | mean that a relaxed cat would be considered to cycle between death | ||
| 1157 | and life more than 100 times per minute. | ||
| 1158 | Moreover, just as with human beings, a cat's heart might stop for | ||
| 1159 | some period of time, so the exact wait period is a judgment call. | ||
| 1160 | One of our pair of veterinarians might wait 30 seconds before pronouncing | ||
| 1161 | the cat dead, while the other might insist on waiting a full minute. | ||
| 1162 | The two veterinarians would then disagree on the state of the cat during | ||
| 1163 | the final 30 seconds of the minute following the last heartbeat, as | ||
| 1164 | fancifully illustrated below: | ||
| 1165 | |||
| 1166 | <p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> | ||
| 1167 | |||
| 1168 | <p> | ||
| 1169 | Interestingly enough, this same situation applies to hardware. | ||
| 1170 | When push comes to shove, how do we tell whether or not some | ||
| 1171 | external server has failed? | ||
| 1172 | We send messages to it periodically, and declare it failed if we | ||
| 1173 | don't receive a response within a given period of time. | ||
| 1174 | Policy decisions can usually tolerate short | ||
| 1175 | periods of inconsistency. | ||
| 1176 | The policy was decided some time ago, and is only now being put into | ||
| 1177 | effect, so a few milliseconds of delay is normally inconsequential. | ||
| 1178 | |||
| 1179 | <p> | ||
| 1180 | However, there are algorithms that absolutely must see consistent data. | ||
| 1181 | For example, the translation from a user-level System V semaphore | ||
| 1182 | ID to the corresponding in-kernel data structure is protected by RCU, | ||
| 1183 | but it is absolutely forbidden to update a semaphore that has just been | ||
| 1184 | removed. | ||
| 1185 | In the Linux kernel, this need for consistency is accommodated by acquiring | ||
| 1186 | spinlocks located in the in-kernel data structure from within | ||
| 1187 | the RCU read-side critical section, and this is indicated by the | ||
| 1188 | green box in the figure above. | ||
| 1189 | Many other techniques may be used, and are in fact used within the | ||
| 1190 | Linux kernel. | ||
| 1191 | |||
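| | <p> | ||
| | As a rough sketch of this spinlock-based approach, consider the | ||
| | following, in which the hypothetical <tt>lookup_sem()</tt> function | ||
| | and <tt>struct sem_obj</tt> stand in for the actual System V | ||
| | semaphore machinery: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 struct sem_obj { | ||
| | 2 spinlock_t lock; | ||
| | 3 bool deleted; /* Set before RCU-visible removal. */ | ||
| | 4 int value; | ||
| | 5 }; | ||
| | 6 | ||
| | 7 bool sem_update(int id, int delta) | ||
| | 8 { | ||
| | 9 struct sem_obj *s; | ||
| | 10 bool ret = false; | ||
| | 11 | ||
| | 12 rcu_read_lock(); | ||
| | 13 s = lookup_sem(id); /* RCU-protected lookup. */ | ||
| | 14 if (s) { | ||
| | 15 spin_lock(&s->lock); /* Legal within an RCU reader. */ | ||
| | 16 if (!s->deleted) { /* Reject just-removed semaphores. */ | ||
| | 17 s->value += delta; | ||
| | 18 ret = true; | ||
| | 19 } | ||
| | 20 spin_unlock(&s->lock); | ||
| | 21 } | ||
| | 22 rcu_read_unlock(); | ||
| | 23 return ret; | ||
| | 24 } | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||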
| 1192 | <p> | ||
| 1193 | In short, RCU is not required to maintain consistency, and other | ||
| 1194 | mechanisms may be used in concert with RCU when consistency is required. | ||
| 1195 | RCU's specialization allows it to do its job extremely well, and its | ||
| 1196 | ability to interoperate with other synchronization mechanisms allows | ||
| 1197 | the right mix of synchronization tools to be used for a given job. | ||
| 1198 | |||
| 1199 | <h3><a name="Performance and Scalability">Performance and Scalability</a></h3> | ||
| 1200 | |||
| 1201 | <p> | ||
| 1202 | Energy efficiency is a critical component of performance today, | ||
| 1203 | and Linux-kernel RCU implementations must therefore avoid unnecessarily | ||
| 1204 | awakening idle CPUs. | ||
| 1205 | I cannot claim that this requirement was premeditated. | ||
| 1206 | In fact, I learned of it during a telephone conversation in which I | ||
| 1207 | was given “frank and open” feedback on the importance | ||
| 1208 | of energy efficiency in battery-powered systems and on specific | ||
| 1209 | energy-efficiency shortcomings of the Linux-kernel RCU implementation. | ||
| 1210 | In my experience, the battery-powered embedded community will consider | ||
| 1211 | any unnecessary wakeups to be extremely unfriendly acts. | ||
| 1212 | So much so that mere Linux-kernel-mailing-list posts are | ||
| 1213 | insufficient to vent their ire. | ||
| 1214 | |||
| 1215 | <p> | ||
| 1216 | Memory consumption is not particularly important in most | ||
| 1217 | situations, and has become decreasingly | ||
| 1218 | so as memory sizes have expanded and memory | ||
| 1219 | costs have plummeted. | ||
| 1220 | However, as I learned from Matt Mackall's | ||
| 1221 | <a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> | ||
| 1222 | efforts, memory footprint is critically important on single-CPU systems with | ||
| 1223 | non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus | ||
| 1224 | <a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> | ||
| 1225 | was born. | ||
| 1226 | Josh Triplett has since taken over the small-memory banner with his | ||
| 1227 | <a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> | ||
| 1228 | project, which resulted in | ||
| 1229 | <a href="#Sleepable RCU">SRCU</a> | ||
| 1230 | becoming optional for those kernels not needing it. | ||
| 1231 | |||
| 1232 | <p> | ||
| 1233 | The remaining performance requirements are, for the most part, | ||
| 1234 | unsurprising. | ||
| 1235 | For example, in keeping with RCU's read-side specialization, | ||
| 1236 | <tt>rcu_dereference()</tt> should have negligible overhead (for | ||
| 1237 | example, suppression of a few minor compiler optimizations). | ||
| 1238 | Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and | ||
| 1239 | <tt>rcu_read_unlock()</tt> should have exactly zero overhead. | ||
| 1240 | |||
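| | <p> | ||
| | To see why exactly zero overhead is feasible, note that in | ||
| | <tt>CONFIG_PREEMPT=n</tt> kernels these two primitives reduce to | ||
| | roughly the following (a simplified sketch rather than the exact | ||
| | Linux-kernel source): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 static inline void rcu_read_lock(void) | ||
| | 2 { | ||
| | 3 preempt_disable(); /* Just barrier() when CONFIG_PREEMPT=n. */ | ||
| | 4 } | ||
| | 5 | ||
| | 6 static inline void rcu_read_unlock(void) | ||
| | 7 { | ||
| | 8 preempt_enable(); /* Likewise generates no code. */ | ||
| | 9 } | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||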
| 1241 | <p> | ||
| 1242 | In preemptible environments, in the case where the RCU read-side | ||
| 1243 | critical section was not preempted (as will be the case for the | ||
| 1244 | highest-priority real-time process), <tt>rcu_read_lock()</tt> and | ||
| 1245 | <tt>rcu_read_unlock()</tt> should have minimal overhead. | ||
| 1246 | In particular, they should not contain atomic read-modify-write | ||
| 1247 | operations, memory-barrier instructions, preemption disabling, | ||
| 1248 | interrupt disabling, or backwards branches. | ||
| 1249 | However, in the case where the RCU read-side critical section was preempted, | ||
| 1250 | <tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. | ||
| 1251 | This is why it is better to nest an RCU read-side critical section | ||
| 1252 | within a preempt-disable region than vice versa, at least in cases | ||
| 1253 | where that critical section is short enough to avoid unduly degrading | ||
| 1254 | real-time latencies. | ||
| 1255 | |||
| 1256 | <p> | ||
| 1257 | The <tt>synchronize_rcu()</tt> grace-period-wait primitive is | ||
| 1258 | optimized for throughput. | ||
| 1259 | It may therefore incur several milliseconds of latency in addition to | ||
| 1260 | the duration of the longest RCU read-side critical section. | ||
| 1261 | On the other hand, multiple concurrent invocations of | ||
| 1262 | <tt>synchronize_rcu()</tt> are required to use batching optimizations | ||
| 1263 | so that they can be satisfied by a single underlying grace-period-wait | ||
| 1264 | operation. | ||
| 1265 | For example, in the Linux kernel, it is not unusual for a single | ||
| 1266 | grace-period-wait operation to serve more than | ||
| 1267 | <a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> | ||
| 1268 | of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation | ||
| 1269 | overhead down to nearly zero. | ||
| 1270 | However, the grace-period optimization is also required to avoid | ||
| 1271 | measurable degradation of real-time scheduling and interrupt latencies. | ||
| 1272 | |||
| 1273 | <p> | ||
| 1274 | In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> | ||
| 1275 | latencies are unacceptable. | ||
| 1276 | In these cases, <tt>synchronize_rcu_expedited()</tt> may be used | ||
| 1277 | instead, reducing the grace-period latency down to a few tens of | ||
| 1278 | microseconds on small systems, at least in cases where the RCU read-side | ||
| 1279 | critical sections are short. | ||
| 1280 | There are currently no special latency requirements for | ||
| 1281 | <tt>synchronize_rcu_expedited()</tt> on large systems, but, | ||
| 1282 | consistent with the empirical nature of the RCU specification, | ||
| 1283 | that is subject to change. | ||
| 1284 | However, there most definitely are scalability requirements: | ||
| 1285 | A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 | ||
| 1286 | CPUs should at least make reasonable forward progress. | ||
| 1287 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> | ||
| 1288 | is permitted to impose modest degradation of real-time latency | ||
| 1289 | on non-idle online CPUs. | ||
| 1290 | That said, it will likely be necessary to take further steps to reduce this | ||
| 1291 | degradation, hopefully to roughly that of a scheduling-clock interrupt. | ||
| 1292 | |||
| 1293 | <p> | ||
| 1294 | There are a number of situations where even | ||
| 1295 | <tt>synchronize_rcu_expedited()</tt>'s reduced grace-period | ||
| 1296 | latency is unacceptable. | ||
| 1297 | In these situations, the asynchronous <tt>call_rcu()</tt> can be | ||
| 1298 | used in place of <tt>synchronize_rcu()</tt> as follows: | ||
| 1299 | |||
| 1300 | <blockquote> | ||
| 1301 | <pre> | ||
| 1302 | 1 struct foo { | ||
| 1303 | 2 int a; | ||
| 1304 | 3 int b; | ||
| 1305 | 4 struct rcu_head rh; | ||
| 1306 | 5 }; | ||
| 1307 | 6 | ||
| 1308 | 7 static void remove_gp_cb(struct rcu_head *rhp) | ||
| 1309 | 8 { | ||
| 1310 | 9 struct foo *p = container_of(rhp, struct foo, rh); | ||
| 1311 | 10 | ||
| 1312 | 11 kfree(p); | ||
| 1313 | 12 } | ||
| 1314 | 13 | ||
| 1315 | 14 bool remove_gp_asynchronous(void) | ||
| 1316 | 15 { | ||
| 1317 | 16 struct foo *p; | ||
| 1318 | 17 | ||
| 1319 | 18 spin_lock(&gp_lock); | ||
| 1320 | 19 p = rcu_access_pointer(gp); | ||
| 1321 | 20 if (!p) { | ||
| 1322 | 21 spin_unlock(&gp_lock); | ||
| 1323 | 22 return false; | ||
| 1324 | 23 } | ||
| 1325 | 24 rcu_assign_pointer(gp, NULL); | ||
| 1326 | 25 call_rcu(&p->rh, remove_gp_cb); | ||
| 1327 | 26 spin_unlock(&gp_lock); | ||
| 1328 | 27 return true; | ||
| 1329 | 28 } | ||
| 1330 | </pre> | ||
| 1331 | </blockquote> | ||
| 1332 | |||
| 1333 | <p> | ||
| 1334 | A definition of <tt>struct foo</tt> is finally needed, and appears | ||
| 1335 | on lines 1-5. | ||
| 1336 | The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> | ||
| 1337 | on line 25, and will be invoked after the end of a subsequent | ||
| 1338 | grace period. | ||
| 1339 | This gets the same effect as <tt>remove_gp_synchronous()</tt>, | ||
| 1340 | but without forcing the updater to wait for a grace period to elapse. | ||
| 1341 | The <tt>call_rcu()</tt> function may be used in a number of | ||
| 1342 | situations where neither <tt>synchronize_rcu()</tt> nor | ||
| 1343 | <tt>synchronize_rcu_expedited()</tt> would be legal, | ||
| 1344 | including within preempt-disable code, <tt>local_bh_disable()</tt> code, | ||
| 1345 | interrupt-disable code, and interrupt handlers. | ||
| 1346 | However, even <tt>call_rcu()</tt> is illegal within NMI handlers. | ||
| 1347 | The callback function (<tt>remove_gp_cb()</tt> in this case) will be | ||
| 1348 | executed within a softirq (software interrupt) environment within the | ||
| 1349 | Linux kernel, | ||
| 1350 | either within a real softirq handler or under the protection | ||
| 1351 | of <tt>local_bh_disable()</tt>. | ||
| 1352 | In both the Linux kernel and in userspace, it is bad practice to | ||
| 1353 | write an RCU callback function that takes too long. | ||
| 1354 | Long-running operations should be relegated to separate threads or | ||
| 1355 | (in the Linux kernel) workqueues. | ||
| 1356 | |||
| 1357 | <p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> | ||
| 1358 | Why does line 19 use <tt>rcu_access_pointer()</tt>? | ||
| 1359 | After all, <tt>call_rcu()</tt> on line 25 stores into the | ||
| 1360 | structure, which would interact badly with concurrent insertions. | ||
| 1361 | Doesn't this mean that <tt>rcu_dereference()</tt> is required? | ||
| 1362 | <br><a href="#qq12answer">Answer</a> | ||
| 1363 | |||
| 1364 | <p> | ||
| 1365 | However, all that <tt>remove_gp_cb()</tt> is doing is | ||
| 1366 | invoking <tt>kfree()</tt> on the data element. | ||
| 1367 | This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, | ||
| 1368 | which allows “fire and forget” operation as shown below: | ||
| 1369 | |||
| 1370 | <blockquote> | ||
| 1371 | <pre> | ||
| 1372 | 1 struct foo { | ||
| 1373 | 2 int a; | ||
| 1374 | 3 int b; | ||
| 1375 | 4 struct rcu_head rh; | ||
| 1376 | 5 }; | ||
| 1377 | 6 | ||
| 1378 | 7 bool remove_gp_faf(void) | ||
| 1379 | 8 { | ||
| 1380 | 9 struct foo *p; | ||
| 1381 | 10 | ||
| 1382 | 11 spin_lock(&gp_lock); | ||
| 1383 | 12 p = rcu_dereference(gp); | ||
| 1384 | 13 if (!p) { | ||
| 1385 | 14 spin_unlock(&gp_lock); | ||
| 1386 | 15 return false; | ||
| 1387 | 16 } | ||
| 1388 | 17 rcu_assign_pointer(gp, NULL); | ||
| 1389 | 18 kfree_rcu(p, rh); | ||
| 1390 | 19 spin_unlock(&gp_lock); | ||
| 1391 | 20 return true; | ||
| 1392 | 21 } | ||
| 1393 | </pre> | ||
| 1394 | </blockquote> | ||
| 1395 | |||
| 1396 | <p> | ||
| 1397 | Note that <tt>remove_gp_faf()</tt> simply invokes | ||
| 1398 | <tt>kfree_rcu()</tt> and proceeds, without any need to pay any | ||
| 1399 | further attention to the subsequent grace period and <tt>kfree()</tt>. | ||
| 1400 | It is permissible to invoke <tt>kfree_rcu()</tt> from the same | ||
| 1401 | environments as for <tt>call_rcu()</tt>. | ||
| 1402 | Interestingly enough, DYNIX/ptx had the equivalents of | ||
| 1403 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not | ||
| 1404 | <tt>synchronize_rcu()</tt>. | ||
| 1405 | This was due to the fact that RCU was not heavily used within DYNIX/ptx, | ||
| 1406 | so the very few places that needed something like | ||
| 1407 | <tt>synchronize_rcu()</tt> simply open-coded it. | ||
| 1408 | |||
| 1409 | <p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> | ||
| 1410 | Earlier it was claimed that <tt>call_rcu()</tt> and | ||
| 1411 | <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked | ||
| 1412 | by readers. | ||
| 1413 | But how can that be correct, given that the invocation of the callback | ||
| 1414 | and the freeing of the memory (respectively) must still wait for | ||
| 1415 | a grace period to elapse? | ||
| 1416 | <br><a href="#qq13answer">Answer</a> | ||
| 1417 | |||
| 1418 | <p> | ||
| 1419 | But what if the updater must wait for the completion of code to be | ||
| 1420 | executed after the end of the grace period, but has other tasks | ||
| 1421 | that can be carried out in the meantime? | ||
| 1422 | The polling-style <tt>get_state_synchronize_rcu()</tt> and | ||
| 1423 | <tt>cond_synchronize_rcu()</tt> functions may be used for this | ||
| 1424 | purpose, as shown below: | ||
| 1425 | |||
| 1426 | <blockquote> | ||
| 1427 | <pre> | ||
| 1428 | 1 bool remove_gp_poll(void) | ||
| 1429 | 2 { | ||
| 1430 | 3 struct foo *p; | ||
| 1431 | 4 unsigned long s; | ||
| 1432 | 5 | ||
| 1433 | 6 spin_lock(&gp_lock); | ||
| 1434 | 7 p = rcu_access_pointer(gp); | ||
| 1435 | 8 if (!p) { | ||
| 1436 | 9 spin_unlock(&gp_lock); | ||
| 1437 | 10 return false; | ||
| 1438 | 11 } | ||
| 1439 | 12 rcu_assign_pointer(gp, NULL); | ||
| 1440 | 13 spin_unlock(&gp_lock); | ||
| 1441 | 14 s = get_state_synchronize_rcu(); | ||
| 1442 | 15 do_something_while_waiting(); | ||
| 1443 | 16 cond_synchronize_rcu(s); | ||
| 1444 | 17 kfree(p); | ||
| 1445 | 18 return true; | ||
| 1446 | 19 } | ||
| 1447 | </pre> | ||
| 1448 | </blockquote> | ||
| 1449 | |||
| 1450 | <p> | ||
| 1451 | On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a | ||
| 1452 | “cookie” from RCU, | ||
| 1453 | then line 15 carries out other tasks, | ||
| 1454 | and finally, line 16 returns immediately if a grace period has | ||
| 1455 | elapsed in the meantime, but otherwise waits as required. | ||
| 1456 | The need for <tt>get_state_synchronize_rcu()</tt> and | ||
| 1457 | <tt>cond_synchronize_rcu()</tt> has appeared quite recently, | ||
| 1458 | so it is too early to tell whether they will stand the test of time. | ||
| 1459 | |||
| 1460 | <p> | ||
| 1461 | RCU thus provides a range of tools to allow updaters to strike the | ||
| 1462 | required tradeoff between latency, flexibility, and CPU overhead. | ||
| 1463 | |||
| 1464 | <h3><a name="Composability">Composability</a></h3> | ||
| 1465 | |||
| 1466 | <p> | ||
| 1467 | Composability has received much attention in recent years, perhaps in part | ||
| 1468 | due to the collision of multicore hardware with object-oriented techniques | ||
| 1469 | designed in single-threaded environments for single-threaded use. | ||
| 1470 | And in theory, RCU read-side critical sections may be composed, and in | ||
| 1471 | fact may be nested arbitrarily deeply. | ||
| 1472 | In practice, as with all real-world implementations of composable | ||
| 1473 | constructs, there are limitations. | ||
| 1474 | |||
| 1475 | <p> | ||
| 1476 | Implementations of RCU for which <tt>rcu_read_lock()</tt> | ||
| 1477 | and <tt>rcu_read_unlock()</tt> generate no code, such as | ||
| 1478 | Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be | ||
| 1479 | nested arbitrarily deeply. | ||
| 1480 | After all, there is no overhead. | ||
| 1481 | Except that if all these instances of <tt>rcu_read_lock()</tt> | ||
| 1482 | and <tt>rcu_read_unlock()</tt> are visible to the compiler, | ||
| 1483 | compilation will eventually fail due to exhausting memory, | ||
| 1484 | mass storage, or user patience, whichever comes first. | ||
| 1485 | If the nesting is not visible to the compiler, as is the case with | ||
| 1486 | mutually recursive functions each in its own translation unit, | ||
| 1487 | stack overflow will result. | ||
| 1488 | If the nesting takes the form of loops, either the control variable | ||
| 1489 | will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. | ||
| 1490 | Nevertheless, this class of RCU implementations is one | ||
| 1491 | of the most composable constructs in existence. | ||
| 1492 | |||
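| | <p> | ||
| | For example, nesting works as one would hope, with the reader | ||
| | remaining protected until the outermost <tt>rcu_read_unlock()</tt>: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 rcu_read_lock(); /* Start outer critical section. */ | ||
| | 2 rcu_read_lock(); /* Nesting is perfectly legal... */ | ||
| | 3 do_something(); | ||
| | 4 rcu_read_unlock(); /* ...and this does not end protection, */ | ||
| | 5 rcu_read_unlock(); /* but this outermost unlock does. */ | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||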
| 1493 | <p> | ||
| 1494 | RCU implementations that explicitly track nesting depth | ||
| 1495 | are limited by the nesting-depth counter. | ||
| 1496 | For example, the Linux kernel's preemptible RCU limits nesting to | ||
| 1497 | <tt>INT_MAX</tt>. | ||
| 1498 | This should suffice for almost all practical purposes. | ||
| 1499 | That said, a consecutive pair of RCU read-side critical sections | ||
| 1500 | between which there is an operation that waits for a grace period | ||
| 1501 | cannot be enclosed in another RCU read-side critical section. | ||
| 1502 | This is because it is not legal to wait for a grace period within | ||
| 1503 | an RCU read-side critical section: To do so would result either | ||
| 1504 | in deadlock or | ||
| 1505 | in RCU implicitly splitting the enclosing RCU read-side critical | ||
| 1506 | section, neither of which is conducive to a long-lived and prosperous | ||
| 1507 | kernel. | ||
| 1508 | |||
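| | <p> | ||
| | In other words, the first of the following two sketches is legal, | ||
| | but the second is emphatically not: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 /* Legal: wait for a grace period between two readers. */ | ||
| | 2 rcu_read_lock(); | ||
| | 3 do_something_1(); | ||
| | 4 rcu_read_unlock(); | ||
| | 5 synchronize_rcu(); | ||
| | 6 rcu_read_lock(); | ||
| | 7 do_something_2(); | ||
| | 8 rcu_read_unlock(); | ||
| | 9 | ||
| | 10 /* BUG: wait for a grace period within an enclosing reader. */ | ||
| | 11 rcu_read_lock(); | ||
| | 12 do_something_1(); | ||
| | 13 synchronize_rcu(); /* Waits on all readers, including this one! */ | ||
| | 14 do_something_2(); | ||
| | 15 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||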
| 1509 | <p> | ||
| 1510 | It is worth noting that RCU is not alone in limiting composability. | ||
| 1511 | For example, many transactional-memory implementations prohibit | ||
| 1512 | composing a pair of transactions separated by an irrevocable | ||
| 1513 | operation (for example, a network receive operation). | ||
| 1514 | For another example, lock-based critical sections can be composed | ||
| 1515 | surprisingly freely, but only if deadlock is avoided. | ||
| 1516 | |||
| 1517 | <p> | ||
| 1518 | In short, although RCU read-side critical sections are highly composable, | ||
| 1519 | care is required in some situations, just as is the case for any other | ||
| 1520 | composable synchronization mechanism. | ||
| 1521 | |||
| 1522 | <h3><a name="Corner Cases">Corner Cases</a></h3> | ||
| 1523 | |||
| 1524 | <p> | ||
| 1525 | A given RCU workload might have an endless and intense stream of | ||
| 1526 | RCU read-side critical sections, perhaps even so intense that there | ||
| 1527 | was never a point in time during which there was not at least one | ||
| 1528 | RCU read-side critical section in flight. | ||
| 1529 | RCU cannot allow this situation to block grace periods: As long as | ||
| 1530 | all the RCU read-side critical sections are finite, grace periods | ||
| 1531 | must also be finite. | ||
| 1532 | |||
| 1533 | <p> | ||
| 1534 | That said, preemptible RCU implementations could potentially result | ||
| 1535 | in RCU read-side critical sections being preempted for long durations, | ||
| 1536 | which has the effect of creating a long-duration RCU read-side | ||
| 1537 | critical section. | ||
| 1538 | This situation can arise only in heavily loaded systems, but systems using | ||
| 1539 | real-time priorities are of course more vulnerable. | ||
| 1540 | Therefore, RCU priority boosting is provided to help deal with this | ||
| 1541 | case. | ||
| 1542 | That said, the exact requirements on RCU priority boosting will likely | ||
| 1543 | evolve as more experience accumulates. | ||
| 1544 | |||
| 1545 | <p> | ||
| 1546 | Other workloads might have very high update rates. | ||
| 1547 | Although one can argue that such workloads should instead use | ||
| 1548 | something other than RCU, the fact remains that RCU must | ||
| 1549 | handle such workloads gracefully. | ||
| 1550 | This requirement is another factor driving batching of grace periods, | ||
| 1551 | but it is also the driving force behind the checks for large numbers | ||
| 1552 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. | ||
| 1553 | Finally, high update rates should not delay RCU read-side critical | ||
| 1554 | sections, although some read-side delays can occur when using | ||
| 1555 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use | ||
| 1556 | of <tt>try_stop_cpus()</tt>. | ||
| 1557 | (In the future, <tt>synchronize_rcu_expedited()</tt> will be | ||
| 1558 | converted to use lighter-weight inter-processor interrupts (IPIs), | ||
| 1559 | but this will still disturb readers, though to a much smaller degree.) | ||
| 1560 | |||
| 1561 | <p> | ||
| 1562 | Although all three of these corner cases were understood in the early | ||
| 1563 | 1990s, a simple user-level test consisting of <tt>close(open(path))</tt> | ||
| 1564 | in a tight loop suddenly provided, in the early 2000s, a much | ||
| 1565 | deeper appreciation of the | ||
| 1566 | high-update-rate corner case. | ||
| 1567 | This test also motivated addition of some RCU code to react to high update | ||
| 1568 | rates: for example, if a given CPU finds itself with more than 10,000 | ||
| 1569 | RCU callbacks queued, RCU will take evasive action by | ||
| 1570 | more aggressively starting grace periods and more aggressively forcing | ||
| 1571 | completion of grace-period processing. | ||
| 1572 | This evasive action causes the grace period to complete more quickly, | ||
| 1573 | but at the cost of restricting RCU's batching optimizations, thus | ||
| 1574 | increasing the CPU overhead incurred by that grace period. | ||
| 1575 | |||
| 1576 | <h2><a name="Software-Engineering Requirements"> | ||
| 1577 | Software-Engineering Requirements</a></h2> | ||
| 1578 | |||
| 1579 | <p> | ||
| 1580 | Between Murphy's Law and “To err is human”, it is necessary to | ||
| 1581 | guard against mishaps and misuse: | ||
| 1582 | |||
| 1583 | <ol> | ||
| 1584 | <li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> | ||
| 1585 | everywhere that it is needed, so kernels built with | ||
| 1586 | <tt>CONFIG_PROVE_RCU=y</tt> will splat if | ||
| 1587 | <tt>rcu_dereference()</tt> is used outside of an | ||
| 1588 | RCU read-side critical section. | ||
| 1589 | Update-side code can use <tt>rcu_dereference_protected()</tt>, | ||
| 1590 | which takes a | ||
| 1591 | <a href="https://lwn.net/Articles/371986/">lockdep expression</a> | ||
| 1592 | to indicate what is providing the protection. | ||
| 1593 | If the indicated protection is not provided, a lockdep splat | ||
| 1594 | is emitted. | ||
| 1595 | |||
| 1596 | <p> | ||
| 1597 | Code shared between readers and updaters can use | ||
| 1598 | <tt>rcu_dereference_check()</tt>, which also takes a | ||
| 1599 | lockdep expression, and emits a lockdep splat if neither | ||
| 1600 | <tt>rcu_read_lock()</tt> nor the indicated protection | ||
| 1601 | is in place (both primitives are sketched following this list). | ||
| 1602 | In addition, <tt>rcu_dereference_raw()</tt> is used in those | ||
| 1603 | (hopefully rare) cases where the required protection cannot | ||
| 1604 | be easily described. | ||
| 1605 | Finally, <tt>rcu_read_lock_held()</tt> is provided to | ||
| 1606 | allow a function to verify that it has been invoked within | ||
| 1607 | an RCU read-side critical section. | ||
| 1608 | I was made aware of this set of requirements shortly after Thomas | ||
| 1609 | Gleixner audited a number of RCU uses. | ||
| 1610 | <li> A given function might wish to check for RCU-related preconditions | ||
| 1611 | upon entry, before using any other RCU API. | ||
| 1612 | The <tt>rcu_lockdep_assert()</tt> macro does this job, | ||
| 1613 | asserting the expression in kernels having lockdep enabled | ||
| 1614 | and doing nothing otherwise. | ||
| 1615 | <li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> | ||
| 1616 | and <tt>rcu_dereference()</tt>, perhaps (incorrectly) | ||
| 1617 | substituting a simple assignment. | ||
| 1618 | To catch this sort of error, a given RCU-protected pointer may be | ||
| 1619 | tagged with <tt>__rcu</tt>, after which running sparse | ||
| 1620 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain | ||
| 1621 | about simple-assignment accesses to that pointer. | ||
| 1622 | Arnd Bergmann made me aware of this requirement, and also | ||
| 1623 | supplied the needed | ||
| 1624 | <a href="https://lwn.net/Articles/376011/">patch series</a>. | ||
| 1625 | <li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> | ||
| 1626 | will splat if a data element is passed to <tt>call_rcu()</tt> | ||
| 1627 | twice in a row, without a grace period in between. | ||
| 1628 | (This error is similar to a double free.) | ||
| 1629 | The corresponding <tt>rcu_head</tt> structures that are | ||
| 1630 | dynamically allocated are automatically tracked, but | ||
| 1631 | <tt>rcu_head</tt> structures allocated on the stack | ||
| 1632 | must be initialized with <tt>init_rcu_head_on_stack()</tt> | ||
| 1633 | and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. | ||
| 1634 | Similarly, statically allocated non-stack <tt>rcu_head</tt> | ||
| 1635 | structures must be initialized with <tt>init_rcu_head()</tt> | ||
| 1636 | and cleaned up with <tt>destroy_rcu_head()</tt>. | ||
| 1637 | Mathieu Desnoyers made me aware of this requirement, and also | ||
| 1638 | supplied the needed | ||
| 1639 | <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. | ||
| 1640 | <li> An infinite loop in an RCU read-side critical section will | ||
| 1641 | eventually trigger an RCU CPU stall warning splat, with | ||
| 1642 | the duration of “eventually” being controlled by the | ||
| 1643 | <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, | ||
| 1644 | alternatively, by the | ||
| 1645 | <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs | ||
| 1646 | parameter. | ||
| 1647 | However, RCU is not obligated to produce this splat | ||
| 1648 | unless there is a grace period waiting on that particular | ||
| 1649 | RCU read-side critical section. | ||
| 1650 | <p> | ||
| 1651 | Some extreme workloads might intentionally delay | ||
| 1652 | RCU grace periods, and systems running those workloads can | ||
| 1653 | be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> | ||
| 1654 | to suppress the splats. | ||
| 1655 | This kernel parameter may also be set via <tt>sysfs</tt>. | ||
| 1656 | Furthermore, RCU CPU stall warnings are counter-productive | ||
| 1657 | during sysrq dumps and during panics. | ||
| 1658 | RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and | ||
| 1659 | <tt>rcu_sysrq_end()</tt> API members to be called before | ||
| 1660 | and after long sysrq dumps. | ||
| 1661 | RCU also supplies the <tt>rcu_panic()</tt> notifier that is | ||
| 1662 | automatically invoked at the beginning of a panic to suppress | ||
| 1663 | further RCU CPU stall warnings. | ||
| 1664 | |||
| 1665 | <p> | ||
| 1666 | This requirement made itself known in the early 1990s, pretty | ||
| 1667 | much the first time that it was necessary to debug a CPU stall. | ||
| 1668 | That said, the initial implementation in DYNIX/ptx was quite | ||
| 1669 | generic in comparison with that of Linux. | ||
| 1670 | <li> Although it would be very good to detect pointers leaking out | ||
| 1671 | of RCU read-side critical sections, there is currently no | ||
| 1672 | good way of doing this. | ||
| 1673 | One complication is the need to distinguish between pointers | ||
| 1674 | leaking and pointers that have been handed off from RCU to | ||
| 1675 | some other synchronization mechanism, for example, reference | ||
| 1676 | counting. | ||
| 1677 | <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related | ||
| 1678 | information is provided via both debugfs and event tracing. | ||
| 1679 | <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and | ||
| 1680 | <tt>rcu_dereference()</tt> to create typical linked | ||
| 1681 | data structures can be surprisingly error-prone. | ||
| 1682 | Therefore, RCU-protected | ||
| 1683 | <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> | ||
| 1684 | and, more recently, RCU-protected | ||
| 1685 | <a href="https://lwn.net/Articles/612100/">hash tables</a> | ||
| 1686 | are available. | ||
| 1687 | Many other special-purpose RCU-protected data structures are | ||
| 1688 | available in the Linux kernel and the userspace RCU library. | ||
| 1689 | <li> Some linked structures are created at compile time, but still | ||
| 1690 | require <tt>__rcu</tt> checking. | ||
| 1691 | The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this | ||
| 1692 | purpose. | ||
| 1693 | <li> It is not necessary to use <tt>rcu_assign_pointer()</tt> | ||
| 1694 | when creating linked structures that are to be published via | ||
| 1695 | a single external pointer. | ||
| 1696 | The <tt>RCU_INIT_POINTER()</tt> macro is provided for | ||
| 1697 | this task and also for assigning <tt>NULL</tt> pointers | ||
| 1698 | at runtime. | ||
| 1699 | </ol> | ||
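| | |||
| | <p> | ||
| | As a brief sketch of the lockdep-checked API members called out in | ||
| | the first two items above (reusing the <tt>gp</tt> pointer and | ||
| | <tt>gp_lock</tt> from the earlier examples): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 struct foo __rcu *gp; /* __rcu tag enables sparse checking. */ | ||
| | 2 DEFINE_SPINLOCK(gp_lock); | ||
| | 3 struct foo *p; | ||
| | 4 | ||
| | 5 /* Update side: lockdep verifies that gp_lock is really held. */ | ||
| | 6 p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock)); | ||
| | 7 | ||
| | 8 /* Shared code: either rcu_read_lock() or gp_lock suffices. */ | ||
| | 9 p = rcu_dereference_check(gp, lockdep_is_held(&gp_lock)); | ||
| | 10 | ||
| | 11 /* Entry-time precondition check for a read-side function. */ | ||
| | 12 rcu_lockdep_assert(rcu_read_lock_held(), "need rcu_read_lock()"); | ||
| | </pre> | ||
| | </blockquote> | ||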
| 1700 | |||
| 1701 | <p> | ||
| 1702 | This is not a hard-and-fast list: RCU's diagnostic capabilities will | ||
| 1703 | continue to be guided by the number and type of usage bugs found | ||
| 1704 | in real-world RCU usage. | ||
| 1705 | |||
| 1706 | <h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> | ||
| 1707 | |||
| 1708 | <p> | ||
| 1709 | The Linux kernel provides an interesting environment for all kinds of | ||
| 1710 | software, including RCU. | ||
| 1711 | Some of the relevant points of interest are as follows: | ||
| 1712 | |||
| 1713 | <ol> | ||
| 1714 | <li> <a href="#Configuration">Configuration</a>. | ||
| 1715 | <li> <a href="#Firmware Interface">Firmware Interface</a>. | ||
| 1716 | <li> <a href="#Early Boot">Early Boot</a>. | ||
| 1717 | <li> <a href="#Interrupts and NMIs"> | ||
| 1718 | Interrupts and non-maskable interrupts (NMIs)</a>. | ||
| 1719 | <li> <a href="#Loadable Modules">Loadable Modules</a>. | ||
| 1720 | <li> <a href="#Hotplug CPU">Hotplug CPU</a>. | ||
| 1721 | <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. | ||
| 1722 | <li> <a href="#Tracing and RCU">Tracing and RCU</a>. | ||
| 1723 | <li> <a href="#Energy Efficiency">Energy Efficiency</a>. | ||
| 1724 | <li> <a href="#Memory Efficiency">Memory Efficiency</a>. | ||
| 1725 | <li> <a href="#Performance, Scalability, Response Time, and Reliability"> | ||
| 1726 | Performance, Scalability, Response Time, and Reliability</a>. | ||
| 1727 | </ol> | ||
| 1728 | |||
| 1729 | <p> | ||
| 1730 | This list is probably incomplete, but it does give a feel for the | ||
| 1731 | most notable Linux-kernel complications. | ||
| 1732 | Each of the following sections covers one of the above topics. | ||
| 1733 | |||
| 1734 | <h3><a name="Configuration">Configuration</a></h3> | ||
| 1735 | |||
| 1736 | <p> | ||
| 1737 | RCU's goal is automatic configuration, so that almost nobody | ||
| 1738 | needs to worry about RCU's <tt>Kconfig</tt> options. | ||
| 1739 | And for almost all users, RCU does in fact work well | ||
| 1740 | “out of the box.” | ||
| 1741 | |||
| 1742 | <p> | ||
| 1743 | However, there are specialized use cases that are handled by | ||
| 1744 | kernel boot parameters and <tt>Kconfig</tt> options. | ||
| 1745 | Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users | ||
| 1746 | about new <tt>Kconfig</tt> options, which requires that almost all of | ||
| 1747 | them be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. | ||
| 1748 | |||
| 1749 | <p> | ||
| 1750 | This all should be quite obvious, but the fact remains that | ||
| 1751 | Linus Torvalds recently had to | ||
| 1752 | <a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> | ||
| 1753 | me of this requirement. | ||
| 1754 | |||
| 1755 | <h3><a name="Firmware Interface">Firmware Interface</a></h3> | ||
| 1756 | |||
| 1757 | <p> | ||
| 1758 | In many cases, the kernel obtains information about the system from the | ||
| 1759 | firmware, and sometimes things are lost in translation. | ||
| 1760 | Or the translation is accurate, but the original message is bogus. | ||
| 1761 | |||
| 1762 | <p> | ||
| 1763 | For example, some systems' firmware overreports the number of CPUs, | ||
| 1764 | sometimes by a large factor. | ||
| 1765 | If RCU naively believed the firmware, as it used to do, | ||
| 1766 | it would create too many per-CPU kthreads. | ||
| 1767 | Although the resulting system will still run correctly, the extra | ||
| 1768 | kthreads needlessly consume memory and can cause confusion | ||
| 1769 | when they show up in <tt>ps</tt> listings. | ||
| 1770 | |||
| 1771 | <p> | ||
| 1772 | RCU must therefore wait for a given CPU to actually come online before | ||
| 1773 | it can allow itself to believe that the CPU actually exists. | ||
| 1774 | The resulting “ghost CPUs” (which are never going to | ||
| 1775 | come online) cause a number of | ||
| 1776 | <a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. | ||
| 1777 | |||
| 1778 | <h3><a name="Early Boot">Early Boot</a></h3> | ||
| 1779 | |||
| 1780 | <p> | ||
| 1781 | The Linux kernel's boot sequence is an interesting process, | ||
| 1782 | and RCU is used early, even before <tt>rcu_init()</tt> | ||
| 1783 | is invoked. | ||
| 1784 | In fact, a number of RCU's primitives can be used as soon as the | ||
| 1785 | initial task's <tt>task_struct</tt> is available and the | ||
| 1786 | boot CPU's per-CPU variables are set up. | ||
| 1787 | The read-side primitives (<tt>rcu_read_lock()</tt>, | ||
| 1788 | <tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, | ||
| 1789 | and <tt>rcu_access_pointer()</tt>) will operate normally very early on, | ||
| 1790 | as will <tt>rcu_assign_pointer()</tt>. | ||
| 1791 | |||
| 1792 | <p> | ||
| 1793 | Although <tt>call_rcu()</tt> may be invoked at any | ||
| 1794 | time during boot, callbacks are not guaranteed to be invoked until after | ||
| 1795 | the scheduler is fully up and running. | ||
| 1796 | This delay in callback invocation is due to the fact that RCU does not | ||
| 1797 | invoke callbacks until it is fully initialized, and this full initialization | ||
| 1798 | cannot occur until after the scheduler has initialized itself to the | ||
| 1799 | point where RCU can spawn and run its kthreads. | ||
| 1800 | In theory, it would be possible to invoke callbacks earlier; | ||
| 1801 | however, this is not a panacea because there would be severe restrictions | ||
| 1802 | on what operations those callbacks could invoke. | ||
| 1803 | |||
| 1804 | <p> | ||
| 1805 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, | ||
| 1806 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> | ||
| 1807 | (<a href="#Bottom-Half Flavor">discussed below</a>), | ||
| 1808 | and | ||
| 1809 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> | ||
| 1810 | will all operate normally | ||
| 1811 | during very early boot, the reason being that there is only one CPU | ||
| 1812 | and preemption is disabled. | ||
| 1813 | This means that a call to <tt>synchronize_rcu()</tt> (or friends) | ||
| 1814 | itself is a quiescent | ||
| 1815 | state and thus a grace period, so the early-boot implementation can | ||
| 1816 | be a no-op. | ||
| 1817 | |||
| 1818 | <p> | ||
| 1819 | Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> | ||
| 1820 | continue to operate normally through the remainder of boot, courtesy | ||
| 1821 | of the fact that preemption is disabled across their RCU read-side | ||
| 1822 | critical sections and also courtesy of the fact that there is still | ||
| 1823 | only one CPU. | ||
| 1824 | However, once the scheduler starts initializing, preemption is enabled. | ||
| 1825 | There is still only a single CPU, but the fact that preemption is enabled | ||
| 1826 | means that the no-op implementation of <tt>synchronize_rcu()</tt> no | ||
| 1827 | longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. | ||
| 1828 | Therefore, as soon as the scheduler starts initializing, the early-boot | ||
| 1829 | fastpath is disabled. | ||
| 1830 | This means that <tt>synchronize_rcu()</tt> switches to its runtime | ||
| 1831 | mode of operation where it posts callbacks, which in turn means that | ||
| 1832 | any call to <tt>synchronize_rcu()</tt> will block until the corresponding | ||
| 1833 | callback is invoked. | ||
| 1834 | Unfortunately, the callback cannot be invoked until RCU's runtime | ||
| 1835 | grace-period machinery is up and running, which cannot happen until | ||
| 1836 | the scheduler has initialized itself sufficiently to allow RCU's | ||
| 1837 | kthreads to be spawned. | ||
| 1838 | Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler | ||
| 1839 | initialization can result in deadlock. | ||
| 1840 | |||
| 1841 | <p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> | ||
| 1842 | So what happens with <tt>synchronize_rcu()</tt> during | ||
| 1843 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | ||
| 1844 | kernels? | ||
| 1845 | <br><a href="#qq14answer">Answer</a> | ||
| 1846 | |||
| 1847 | <p> | ||
| 1848 | I learned of these boot-time requirements as a result of a series of | ||
| 1849 | system hangs. | ||
| 1850 | |||
| 1851 | <h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> | ||
| 1852 | |||
| 1853 | <p> | ||
| 1854 | The Linux kernel has interrupts, and RCU read-side critical sections are | ||
| 1855 | legal within interrupt handlers and within interrupt-disabled regions | ||
| 1856 | of code, as are invocations of <tt>call_rcu()</tt>. | ||
| 1857 | |||
| 1858 | <p> | ||
| 1859 | Some Linux-kernel architectures can enter an interrupt handler from | ||
| 1860 | non-idle process context, and then just never leave it, instead stealthily | ||
| 1861 | transitioning back to process context. | ||
| 1862 | This trick is sometimes used to invoke system calls from inside the kernel. | ||
| 1863 | These “half-interrupts” mean that RCU has to be very careful | ||
| 1864 | about how it counts interrupt nesting levels. | ||
| 1865 | I learned of this requirement the hard way during a rewrite | ||
| 1866 | of RCU's dyntick-idle code. | ||
| 1867 | |||
| 1868 | <p> | ||
| 1869 | The Linux kernel has non-maskable interrupts (NMIs), and | ||
| 1870 | RCU read-side critical sections are legal within NMI handlers. | ||
| 1871 | Thankfully, RCU update-side primitives, including | ||
| 1872 | <tt>call_rcu()</tt>, are prohibited within NMI handlers. | ||
| 1873 | |||
| 1874 | <p> | ||
| 1875 | The name notwithstanding, some Linux-kernel architectures | ||
| 1876 | can have nested NMIs, which RCU must handle correctly. | ||
| 1877 | Andy Lutomirski | ||
| 1878 | <a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> | ||
| 1879 | with this requirement; | ||
| 1880 | he also kindly surprised me with | ||
| 1881 | <a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> | ||
| 1882 | that meets this requirement. | ||
| 1883 | |||
| 1884 | <h3><a name="Loadable Modules">Loadable Modules</a></h3> | ||
| 1885 | |||
| 1886 | <p> | ||
| 1887 | The Linux kernel has loadable modules, and these modules can | ||
| 1888 | also be unloaded. | ||
| 1889 | After a given module has been unloaded, any attempt to call | ||
| 1890 | one of its functions results in a segmentation fault. | ||
| 1891 | The module-unload functions must therefore cancel any | ||
| 1892 | delayed calls to loadable-module functions, for example, | ||
| 1893 | any outstanding <tt>mod_timer()</tt> must be dealt with | ||
| 1894 | via <tt>del_timer_sync()</tt> or similar. | ||
| 1895 | |||
| 1896 | <p> | ||
| 1897 | Unfortunately, there is no way to cancel an RCU callback; | ||
| 1898 | once you invoke <tt>call_rcu()</tt>, the callback function is | ||
| 1899 | going to eventually be invoked, unless the system goes down first. | ||
| 1900 | Because it is normally considered socially irresponsible to crash the system | ||
| 1901 | in response to a module unload request, we need some other way | ||
| 1902 | to deal with in-flight RCU callbacks. | ||
| 1903 | |||
| 1904 | <p> | ||
| 1905 | RCU therefore provides | ||
| 1906 | <tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, | ||
| 1907 | which waits until all in-flight RCU callbacks have been invoked. | ||
| 1908 | If a module uses <tt>call_rcu()</tt>, its exit function should therefore | ||
| 1909 | prevent any future invocation of <tt>call_rcu()</tt>, then invoke | ||
| 1910 | <tt>rcu_barrier()</tt>. | ||
| 1911 | In theory, the underlying module-unload code could invoke | ||
| 1912 | <tt>rcu_barrier()</tt> unconditionally, but in practice this would | ||
| 1913 | incur unacceptable latencies. | ||
| 1914 | |||
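| | <p> | ||
| | A module's exit function might therefore look something like the | ||
| | following sketch, in which the <tt>mymod_</tt> names are of course | ||
| | hypothetical: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 static void __exit mymod_exit(void) | ||
| | 2 { | ||
| | 3 mymod_unpublish_all(); /* No further call_rcu() invocations. */ | ||
| | 4 rcu_barrier(); /* Wait for in-flight callbacks. */ | ||
| | 5 /* Only now may module text and data safely vanish. */ | ||
| | 6 } | ||
| | 7 module_exit(mymod_exit); | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||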
| 1915 | <p> | ||
| 1916 | Nikita Danilov noted this requirement for an analogous filesystem-unmount | ||
| 1917 | situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. | ||
| 1918 | The need for <tt>rcu_barrier()</tt> for module unloading became | ||
| 1919 | apparent later. | ||
| 1920 | |||
| 1921 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> | ||
| 1922 | |||
| 1923 | <p> | ||
| 1924 | The Linux kernel supports CPU hotplug, which means that CPUs | ||
| 1925 | can come and go. | ||
| 1926 | It is of course illegal to use any RCU API member from an offline CPU. | ||
| 1927 | This requirement was present from day one in DYNIX/ptx, but | ||
| 1928 | on the other hand, the Linux kernel's CPU-hotplug implementation | ||
| 1929 | is “interesting.” | ||
| 1930 | |||
| 1931 | <p> | ||
| 1932 | The Linux-kernel CPU-hotplug implementation has notifiers that | ||
| 1933 | are used to allow the various kernel subsystems (including RCU) | ||
| 1934 | to respond appropriately to a given CPU-hotplug operation. | ||
| 1935 | Most RCU operations may be invoked from CPU-hotplug notifiers, | ||
| 1936 | including even normal synchronous grace-period operations | ||
| 1937 | such as <tt>synchronize_rcu()</tt>. | ||
| 1938 | However, expedited grace-period operations such as | ||
| 1939 | <tt>synchronize_rcu_expedited()</tt> are not supported, | ||
| 1940 | due to the fact that current implementations block CPU-hotplug | ||
| 1941 | operations, which could result in deadlock. | ||
| 1942 | |||
| 1943 | <p> | ||
| 1944 | In addition, all-callback-wait operations such as | ||
| 1945 | <tt>rcu_barrier()</tt> are also not supported, due to the | ||
| 1946 | fact that there are phases of CPU-hotplug operations where | ||
| 1947 | the outgoing CPU's callbacks will not be invoked until after | ||
| 1948 | the CPU-hotplug operation ends, which could also result in deadlock. | ||
| 1949 | |||
| 1950 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> | ||
| 1951 | |||
| 1952 | <p> | ||
| 1953 | RCU depends on the scheduler, and the scheduler uses RCU to | ||
| 1954 | protect some of its data structures. | ||
| 1955 | This means the scheduler is forbidden from acquiring | ||
| 1956 | the runqueue locks and the priority-inheritance locks | ||
| 1957 | in the middle of an outermost RCU read-side critical section unless either | ||
| 1958 | (1) it releases them before exiting that same | ||
| 1959 | RCU read-side critical section, or | ||
| 1960 | (2) interrupts are disabled across | ||
| 1961 | that entire RCU read-side critical section. | ||
| 1962 | This same prohibition also applies (recursively!) to any lock that is acquired | ||
| 1963 | while holding any lock to which this prohibition applies. | ||
| 1964 | Adhering to this rule prevents preemptible RCU from invoking | ||
| 1965 | <tt>rcu_read_unlock_special()</tt> while either runqueue or | ||
| 1966 | priority-inheritance locks are held, thus avoiding deadlock. | ||
| 1967 | |||
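| | <p> | ||
| | For example, here is a sketch of option (2), with the hypothetical | ||
| | <tt>some_sched_lock</tt> and <tt>do_scheduler_work()</tt> standing | ||
| | in for the actual scheduler machinery: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 unsigned long flags; | ||
| | 2 | ||
| | 3 local_irq_save(flags); /* Interrupts off across the... */ | ||
| | 4 rcu_read_lock(); /* ...entire outermost reader. */ | ||
| | 5 raw_spin_lock(&some_sched_lock); | ||
| | 6 do_scheduler_work(); | ||
| | 7 raw_spin_unlock(&some_sched_lock); | ||
| | 8 rcu_read_unlock(); | ||
| | 9 local_irq_restore(flags); | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||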
| 1968 | <p> | ||
| 1969 | Prior to v4.4, it was only necessary to disable preemption across | ||
| 1970 | RCU read-side critical sections that acquired scheduler locks. | ||
| 1971 | In v4.4, expedited grace periods started using IPIs, and these | ||
| 1972 | IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath. | ||
| 1973 | Therefore, this expedited-grace-period change required disabling of | ||
| 1974 | interrupts, not just preemption. | ||
| 1975 | |||
| 1976 | <p> | ||
| 1977 | For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> | ||
| 1978 | implementation must be written carefully to avoid similar deadlocks. | ||
| 1979 | In particular, <tt>rcu_read_unlock()</tt> must tolerate an | ||
| 1980 | interrupt where the interrupt handler invokes both | ||
| 1981 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
| 1982 | This possibility requires <tt>rcu_read_unlock()</tt> to use | ||
| 1983 | negative nesting levels to avoid destructive recursion via | ||
| 1984 | the interrupt handler's use of RCU. | ||
| 1985 | |||
| 1986 | <p> | ||
| 1987 | This pair of mutual scheduler-RCU requirements came as a | ||
| 1988 | <a href="https://lwn.net/Articles/453002/">complete surprise</a>. | ||
| 1989 | |||
| 1990 | <p> | ||
| 1991 | As noted above, RCU makes use of kthreads, and it is necessary to | ||
| 1992 | avoid excessive CPU-time accumulation by these kthreads. | ||
| 1993 | This requirement was no surprise, but RCU's violation of it | ||
| 1994 | when running context-switch-heavy workloads when built with | ||
| 1995 | <tt>CONFIG_NO_HZ_FULL=y</tt> | ||
| 1996 | <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. | ||
| 1997 | RCU has made good progress towards meeting this requirement, even | ||
| 1998 | for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, | ||
| 1999 | but there is room for further improvement. | ||
| 2000 | |||
| 2001 | <h3><a name="Tracing and RCU">Tracing and RCU</a></h3> | ||
| 2002 | |||
| 2003 | <p> | ||
| 2004 | It is possible to use tracing on RCU code, but tracing itself | ||
| 2005 | uses RCU. | ||
| 2006 | For this reason, <tt>rcu_dereference_raw_notrace()</tt> | ||
| 2007 | is provided for use by tracing, which avoids the destructive | ||
| 2008 | recursion that could otherwise ensue. | ||
| 2009 | This API is also used by virtualization in some architectures, | ||
| 2010 | where RCU readers execute in environments in which tracing | ||
| 2011 | cannot be used. | ||
| 2012 | The tracing folks both located the requirement and provided the | ||
| 2013 | needed fix, so this surprise requirement was relatively painless. | ||
| 2014 | |||
| 2015 | <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> | ||
| 2016 | |||
| 2017 | <p> | ||
| 2018 | Interrupting idle CPUs is considered socially unacceptable, | ||
| 2019 | especially by people with battery-powered embedded systems. | ||
| 2020 | RCU therefore conserves energy by detecting which CPUs are | ||
| 2021 | idle, including tracking CPUs that have been interrupted from idle. | ||
| 2022 | This is a large part of the energy-efficiency requirement, | ||
| 2023 | so I learned of this via an irate phone call. | ||
| 2024 | |||
| 2025 | <p> | ||
| 2026 | Because RCU avoids interrupting idle CPUs, it is illegal to | ||
| 2027 | execute an RCU read-side critical section on an idle CPU. | ||
| 2028 | (Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat | ||
| 2029 | if you try it.) | ||
| 2030 | The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> | ||
| 2031 | event tracing are provided to work around this restriction. | ||
| 2032 | In addition, <tt>rcu_is_watching()</tt> may be used to | ||
| 2033 | test whether or not it is currently legal to run RCU read-side | ||
| 2034 | critical sections on this CPU. | ||
| 2035 | I learned of the need for diagnostics on the one hand | ||
| 2036 | and <tt>RCU_NONIDLE()</tt> on the other while inspecting | ||
| 2037 | idle-loop code. | ||
| 2038 | Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, | ||
| 2039 | which is used quite heavily in the idle loop. | ||
| 2040 | |||
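| | <p> | ||
| | For example, idle-loop code needing a momentary RCU reader might use | ||
| | something like the following sketch, in which | ||
| | <tt>do_something_with_rcu()</tt> is hypothetical: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 if (rcu_is_watching()) | ||
| | 2 do_something_with_rcu(); /* RCU readers already legal. */ | ||
| | 3 else | ||
| | 4 RCU_NONIDLE(do_something_with_rcu()); /* Momentarily exit idle. */ | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||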
| 2041 | <p> | ||
| 2042 | It is similarly socially unacceptable to interrupt an | ||
| 2043 | <tt>nohz_full</tt> CPU running in userspace. | ||
| 2044 | RCU must therefore track <tt>nohz_full</tt> userspace | ||
| 2045 | execution. | ||
| 2046 | And in | ||
| 2047 | <a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> | ||
| 2048 | kernels, RCU must separately track idle CPUs on the one hand and | ||
| 2049 | CPUs that are either idle or executing in userspace on the other. | ||
| 2050 | In both cases, RCU must be able to sample state at two points in | ||
| 2051 | time, and be able to determine whether or not some other CPU spent | ||
| 2052 | any time idle and/or executing in userspace. | ||
| 2053 | |||
| 2054 | <p> | ||
| 2055 | These energy-efficiency requirements have proven quite difficult to | ||
| 2056 | understand and to meet. For example, there have been more than five | ||
| 2057 | clean-sheet rewrites of RCU's energy-efficiency code, the last of | ||
| 2058 | which was finally able to demonstrate | ||
| 2059 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. | ||
| 2060 | As noted earlier, | ||
| 2061 | I learned of many of these requirements via angry phone calls: | ||
| 2062 | Flaming me on the Linux-kernel mailing list was apparently not | ||
| 2063 | sufficient to fully vent the callers' ire at RCU's energy-efficiency bugs! | ||
| 2064 | |||
| 2065 | <h3><a name="Memory Efficiency">Memory Efficiency</a></h3> | ||
| 2066 | |||
| 2067 | <p> | ||
| 2068 | Although small-memory non-realtime systems can simply use Tiny RCU, | ||
| 2069 | code size is only one aspect of memory efficiency. | ||
| 2070 | Another aspect is the size of the <tt>rcu_head</tt> structure | ||
| 2071 | used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. | ||
| 2072 | Although this structure contains nothing more than a pair of pointers, | ||
| 2073 | it does appear in many RCU-protected data structures, including | ||
| 2074 | some that are size critical. | ||
| 2075 | The <tt>page</tt> structure is a case in point, as evidenced by | ||
| 2076 | the many occurrences of the <tt>union</tt> keyword within that structure. | ||
| 2077 | |||
| 2078 | <p> | ||
| 2079 | This need for memory efficiency is one reason that RCU uses hand-crafted | ||
| 2080 | singly linked lists to track the <tt>rcu_head</tt> structures that | ||
| 2081 | are waiting for a grace period to elapse. | ||
| 2082 | It is also the reason why <tt>rcu_head</tt> structures do not contain | ||
| 2083 | debug information, such as fields tracking the file and line of the | ||
| 2084 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. | ||
| 2085 | Although this information might appear in debug-only kernel builds at some | ||
| 2086 | point, in the meantime, the <tt>->func</tt> field will often provide | ||
| 2087 | the needed debug information. | ||
| 2088 | |||
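| | <p> | ||
| | As a hedged sketch (the <tt>foo</tt> structure, the <tt>fp</tt> | ||
| | pointer, and the free function are hypothetical), the two-pointer | ||
| | cost looks like this in practice: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 struct foo { | ||
| |  2   int a; | ||
| |  3   struct rcu_head rh;  /* Just a pair of pointers. */ | ||
| |  4 }; | ||
| |  5 | ||
| |  6 static void free_foo_rcu(struct rcu_head *rhp) | ||
| |  7 { | ||
| |  8   kfree(container_of(rhp, struct foo, rh)); | ||
| |  9 } | ||
| | 10 | ||
| | 11 /* Updater posts the callback; the ->func field aids debugging. */ | ||
| | 12 call_rcu(&fp->rh, free_foo_rcu); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||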
| 2089 | <p> | ||
| 2090 | However, in some cases, the need for memory efficiency leads to even | ||
| 2091 | more extreme measures. | ||
| 2092 | Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field | ||
| 2093 | shares storage with a great many other structures that are used at | ||
| 2094 | various points in the corresponding page's lifetime. | ||
| 2095 | In order to correctly resolve certain | ||
| 2096 | <a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, | ||
| 2097 | the Linux kernel's memory-management subsystem needs a particular bit | ||
| 2098 | to remain zero during all phases of grace-period processing, | ||
| 2099 | and that bit happens to map to the bottom bit of the | ||
| 2100 | <tt>rcu_head</tt> structure's <tt>->next</tt> field. | ||
| 2101 | RCU makes this guarantee as long as <tt>call_rcu()</tt> | ||
| 2102 | is used to post the callback, as opposed to <tt>kfree_rcu()</tt> | ||
| 2103 | or some future “lazy” | ||
| 2104 | variant of <tt>call_rcu()</tt> that might one day be created for | ||
| 2105 | energy-efficiency purposes. | ||
| 2106 | |||
| 2107 | <h3><a name="Performance, Scalability, Response Time, and Reliability"> | ||
| 2108 | Performance, Scalability, Response Time, and Reliability</a></h3> | ||
| 2109 | |||
| 2110 | <p> | ||
| 2111 | Expanding on the | ||
| 2112 | <a href="#Performance and Scalability">earlier discussion</a>, | ||
| 2113 | RCU is used heavily by hot code paths in performance-critical | ||
| 2114 | portions of the Linux kernel's networking, security, virtualization, | ||
| 2115 | and scheduling subsystems. | ||
| 2116 | RCU must therefore use efficient implementations, especially in its | ||
| 2117 | read-side primitives. | ||
| 2118 | To that end, it would be good if preemptible RCU's implementation | ||
| 2119 | of <tt>rcu_read_lock()</tt> could be inlined; however, doing | ||
| 2120 | this requires resolving <tt>#include</tt> issues with the | ||
| 2121 | <tt>task_struct</tt> structure. | ||
| 2122 | |||
| 2123 | <p> | ||
| 2124 | The Linux kernel supports hardware configurations with up to | ||
| 2125 | 4096 CPUs, which means that RCU must be extremely scalable. | ||
| 2126 | Algorithms that involve frequent acquisitions of global locks or | ||
| 2127 | frequent atomic operations on global variables simply cannot be | ||
| 2128 | tolerated within the RCU implementation. | ||
| 2129 | RCU therefore makes heavy use of a combining tree based on the | ||
| 2130 | <tt>rcu_node</tt> structure. | ||
| 2131 | RCU is required to tolerate all CPUs continuously invoking any | ||
| 2132 | combination of RCU's runtime primitives with minimal per-operation | ||
| 2133 | overhead. | ||
| 2134 | In fact, in many cases, increasing load must <i>decrease</i> the | ||
| 2135 | per-operation overhead; witness the batching optimizations for | ||
| 2136 | <tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, | ||
| 2137 | <tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. | ||
| 2138 | As a general rule, RCU must cheerfully accept whatever the | ||
| 2139 | rest of the Linux kernel decides to throw at it. | ||
| 2140 | |||
| 2141 | <p> | ||
| 2142 | The Linux kernel is used for real-time workloads, especially | ||
| 2143 | in conjunction with the | ||
| 2144 | <a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. | ||
| 2145 | The real-time-latency response requirements are such that the | ||
| 2146 | traditional approach of disabling preemption across RCU | ||
| 2147 | read-side critical sections is inappropriate. | ||
| 2148 | Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore | ||
| 2149 | use an RCU implementation that allows RCU read-side critical | ||
| 2150 | sections to be preempted. | ||
| 2151 | This requirement made its presence known after users made it | ||
| 2152 | clear that an earlier | ||
| 2153 | <a href="https://lwn.net/Articles/107930/">real-time patch</a> | ||
| 2154 | did not meet their needs, in conjunction with some | ||
| 2155 | <a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> | ||
| 2156 | encountered by a very early version of the -rt patchset. | ||
| 2157 | |||
| 2158 | <p> | ||
| 2159 | In addition, RCU must make do with a sub-100-microsecond real-time latency | ||
| 2160 | budget. | ||
| 2161 | In fact, on smaller systems with the -rt patchset, the Linux kernel | ||
| 2162 | provides sub-20-microsecond real-time latencies for the whole kernel, | ||
| 2163 | including RCU. | ||
| 2164 | RCU's scalability and latency must therefore be sufficient for | ||
| 2165 | these sorts of configurations. | ||
| 2166 | To my surprise, the sub-100-microsecond real-time latency budget | ||
| 2167 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> | ||
| 2168 | applies to even the largest systems [PDF]</a>, | ||
| 2169 | up to and including systems with 4096 CPUs. | ||
| 2170 | This real-time requirement motivated the grace-period kthread, which | ||
| 2171 | also simplified handling of a number of race conditions. | ||
| 2172 | |||
| 2173 | <p> | ||
| 2174 | Finally, RCU's status as a synchronization primitive means that | ||
| 2175 | any RCU failure can result in arbitrary memory corruption that can be | ||
| 2176 | extremely difficult to debug. | ||
| 2177 | This means that RCU must be extremely reliable, which in | ||
| 2178 | practice also means that RCU must have an aggressive stress-test | ||
| 2179 | suite. | ||
| 2180 | This stress-test suite is called <tt>rcutorture</tt>. | ||
| 2181 | |||
| 2182 | <p> | ||
| 2183 | Although the need for <tt>rcutorture</tt> was no surprise, | ||
| 2184 | the current immense popularity of the Linux kernel is posing | ||
| 2185 | interesting—and perhaps unprecedented—validation | ||
| 2186 | challenges. | ||
| 2187 | To see this, keep in mind that there are well over one billion | ||
| 2188 | instances of the Linux kernel running today, given Android | ||
| 2189 | smartphones, Linux-powered televisions, and servers. | ||
| 2190 | This number can be expected to increase sharply with the advent of | ||
| 2191 | the celebrated Internet of Things. | ||
| 2192 | |||
| 2193 | <p> | ||
| 2194 | Suppose that RCU contains a race condition that manifests on average | ||
| 2195 | once per million years of runtime. | ||
| 2196 | Across a billion-instance installed base, that is roughly 1,000 | ||
| 2197 | failures per year, or about three per <i>day</i>. | ||
| 2198 | RCU could simply hide behind hardware error rates, given that no one | ||
| 2199 | should really expect their smartphone to last for a million years. | ||
| 2200 | However, anyone taking too much comfort from this thought should | ||
| 2201 | consider the fact that in most jurisdictions, a successful multi-year | ||
| 2202 | test of a given mechanism, which might include a Linux kernel, | ||
| 2203 | suffices for a number of types of safety-critical certifications. | ||
| 2204 | In fact, rumor has it that the Linux kernel is already being used | ||
| 2205 | in production for safety-critical applications. | ||
| 2206 | I don't know about you, but I would feel quite bad if a bug in RCU | ||
| 2207 | killed someone. | ||
| 2208 | Which might explain my recent focus on validation and verification. | ||
| 2209 | |||
| 2210 | <h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> | ||
| 2211 | |||
| 2212 | <p> | ||
| 2213 | One of the more surprising things about RCU is that there are now | ||
| 2214 | no fewer than five <i>flavors</i>, or API families. | ||
| 2215 | In addition, the primary flavor that has been the sole focus up to | ||
| 2216 | this point has two different implementations, non-preemptible and | ||
| 2217 | preemptible. | ||
| 2218 | The other four flavors are listed below, with requirements for each | ||
| 2219 | described in a separate section. | ||
| 2220 | |||
| 2221 | <ol> | ||
| 2222 | <li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> | ||
| 2223 | <li> <a href="#Sched Flavor">Sched Flavor</a> | ||
| 2224 | <li> <a href="#Sleepable RCU">Sleepable RCU</a> | ||
| 2225 | <li> <a href="#Tasks RCU">Tasks RCU</a> | ||
| 2226 | </ol> | ||
| 2227 | |||
| 2228 | <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> | ||
| 2229 | |||
| 2230 | <p> | ||
| 2231 | The softirq-disable (AKA “bottom-half”, | ||
| 2232 | hence the “_bh” abbreviations) | ||
| 2233 | flavor of RCU, or <i>RCU-bh</i>, was developed by | ||
| 2234 | Dipankar Sarma to provide a flavor of RCU that could withstand the | ||
| 2235 | network-based denial-of-service attacks researched by Robert | ||
| 2236 | Olsson. | ||
| 2237 | These attacks placed so much networking load on the system | ||
| 2238 | that some of the CPUs never exited softirq execution, | ||
| 2239 | which in turn prevented those CPUs from ever executing a context switch, | ||
| 2240 | which, in the RCU implementation of that time, prevented grace periods | ||
| 2241 | from ever ending. | ||
| 2242 | The result was an out-of-memory condition and a system hang. | ||
| 2243 | |||
| 2244 | <p> | ||
| 2245 | The solution was the creation of RCU-bh, which does | ||
| 2246 | <tt>local_bh_disable()</tt> | ||
| 2247 | across its read-side critical sections, and which uses the transition | ||
| 2248 | from one type of softirq processing to another as a quiescent state | ||
| 2249 | in addition to context switch, idle, user mode, and offline. | ||
| 2250 | This means that RCU-bh grace periods can complete even when some of | ||
| 2251 | the CPUs execute in softirq indefinitely, thus allowing algorithms | ||
| 2252 | based on RCU-bh to withstand network-based denial-of-service attacks. | ||
| 2253 | |||
| 2254 | <p> | ||
| 2255 | Because | ||
| 2256 | <tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> | ||
| 2257 | disable and re-enable softirq handlers, any attempt to start a softirq | ||
| 2258 | handler during the | ||
| 2259 | RCU-bh read-side critical section will be deferred. | ||
| 2260 | In this case, <tt>rcu_read_unlock_bh()</tt> | ||
| 2261 | will invoke softirq processing, which can take considerable time. | ||
| 2262 | One can of course argue that this softirq overhead should be associated | ||
| 2263 | with the code following the RCU-bh read-side critical section rather | ||
| 2264 | than <tt>rcu_read_unlock_bh()</tt>, but the fact | ||
| 2265 | is that most profiling tools cannot be expected to make this sort | ||
| 2266 | of fine distinction. | ||
| 2267 | For example, suppose that a three-millisecond-long RCU-bh read-side | ||
| 2268 | critical section executes during a time of heavy networking load. | ||
| 2269 | There will very likely be an attempt to invoke at least one softirq | ||
| 2270 | handler during that three milliseconds, but any such invocation will | ||
| 2271 | be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. | ||
| 2272 | This can of course make it appear at first glance as if | ||
| 2273 | <tt>rcu_read_unlock_bh()</tt> was executing very slowly. | ||
| 2274 | |||
| 2275 | <p> | ||
| 2276 | The | ||
| 2277 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> | ||
| 2278 | includes | ||
| 2279 | <tt>rcu_read_lock_bh()</tt>, | ||
| 2280 | <tt>rcu_read_unlock_bh()</tt>, | ||
| 2281 | <tt>rcu_dereference_bh()</tt>, | ||
| 2282 | <tt>rcu_dereference_bh_check()</tt>, | ||
| 2283 | <tt>synchronize_rcu_bh()</tt>, | ||
| 2284 | <tt>synchronize_rcu_bh_expedited()</tt>, | ||
| 2285 | <tt>call_rcu_bh()</tt>, | ||
| 2286 | <tt>rcu_barrier_bh()</tt>, and | ||
| 2287 | <tt>rcu_read_lock_bh_held()</tt>. | ||
| 2288 | |||
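| | <p> | ||
| | A minimal usage sketch (the <tt>gp</tt> pointer and | ||
| | <tt>do_something_with()</tt> are hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 rcu_read_lock_bh();  /* Also disables softirq handlers. */ | ||
| |  2 p = rcu_dereference_bh(gp); | ||
| |  3 if (p) | ||
| |  4   do_something_with(p); | ||
| |  5 rcu_read_unlock_bh();  /* May run deferred softirq handlers. */ | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||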
| 2289 | <h3><a name="Sched Flavor">Sched Flavor</a></h3> | ||
| 2290 | |||
| 2291 | <p> | ||
| 2292 | Before preemptible RCU, waiting for an RCU grace period had the | ||
| 2293 | side effect of also waiting for all pre-existing interrupt | ||
| 2294 | and NMI handlers. | ||
| 2295 | However, there are legitimate preemptible-RCU implementations that | ||
| 2296 | do not have this property, given that any point in the code outside | ||
| 2297 | of an RCU read-side critical section can be a quiescent state. | ||
| 2298 | Therefore, <i>RCU-sched</i> was created, which follows “classic” | ||
| 2299 | RCU in that an RCU-sched grace period waits for pre-existing | ||
| 2300 | interrupt and NMI handlers. | ||
| 2301 | In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched | ||
| 2302 | APIs have identical implementations, while kernels built with | ||
| 2303 | <tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. | ||
| 2304 | |||
| 2305 | <p> | ||
| 2306 | Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
| 2307 | <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> | ||
| 2308 | disable and re-enable preemption, respectively. | ||
| 2309 | This means that if there was a preemption attempt during the | ||
| 2310 | RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> | ||
| 2311 | will enter the scheduler, with all the latency and overhead entailed. | ||
| 2312 | Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look | ||
| 2313 | as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. | ||
| 2314 | However, the highest-priority task won't be preempted, so that task | ||
| 2315 | will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. | ||
| 2316 | |||
| 2317 | <p> | ||
| 2318 | The | ||
| 2319 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> | ||
| 2320 | includes | ||
| 2321 | <tt>rcu_read_lock_sched()</tt>, | ||
| 2322 | <tt>rcu_read_unlock_sched()</tt>, | ||
| 2323 | <tt>rcu_read_lock_sched_notrace()</tt>, | ||
| 2324 | <tt>rcu_read_unlock_sched_notrace()</tt>, | ||
| 2325 | <tt>rcu_dereference_sched()</tt>, | ||
| 2326 | <tt>rcu_dereference_sched_check()</tt>, | ||
| 2327 | <tt>synchronize_sched()</tt>, | ||
| 2328 | <tt>synchronize_rcu_sched_expedited()</tt>, | ||
| 2329 | <tt>call_rcu_sched()</tt>, | ||
| 2330 | <tt>rcu_barrier_sched()</tt>, and | ||
| 2331 | <tt>rcu_read_lock_sched_held()</tt>. | ||
| 2332 | However, anything that disables preemption also marks an RCU-sched | ||
| 2333 | read-side critical section, including | ||
| 2334 | <tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, | ||
| 2335 | <tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, | ||
| 2336 | and so on. | ||
| 2337 | |||
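| | <p> | ||
| | For example, a hedged sketch of an RCU-sched reader and updater | ||
| | (the <tt>gp</tt> pointer and <tt>do_something_with()</tt> are | ||
| | hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 /* Reader: any preemption-disabled region qualifies. */ | ||
| |  2 rcu_read_lock_sched(); | ||
| |  3 p = rcu_dereference_sched(gp); | ||
| |  4 if (p) | ||
| |  5   do_something_with(p); | ||
| |  6 rcu_read_unlock_sched(); | ||
| |  7 | ||
| |  8 /* Updater: waits for all such regions, plus irq and NMI handlers. */ | ||
| |  9 synchronize_sched(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||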
| 2338 | <h3><a name="Sleepable RCU">Sleepable RCU</a></h3> | ||
| 2339 | |||
| 2340 | <p> | ||
| 2341 | For well over a decade, someone saying “I need to block within | ||
| 2342 | an RCU read-side critical section” was a reliable indication | ||
| 2343 | that this someone did not understand RCU. | ||
| 2344 | After all, if you are always blocking in an RCU read-side critical | ||
| 2345 | section, you can probably afford to use a higher-overhead synchronization | ||
| 2346 | mechanism. | ||
| 2347 | However, that changed with the advent of the Linux kernel's notifiers, | ||
| 2348 | whose RCU read-side critical | ||
| 2349 | sections almost never sleep, but sometimes need to. | ||
| 2350 | This resulted in the introduction of | ||
| 2351 | <a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, | ||
| 2352 | or <i>SRCU</i>. | ||
| 2353 | |||
| 2354 | <p> | ||
| 2355 | SRCU allows different domains to be defined, with each such domain | ||
| 2356 | defined by an instance of an <tt>srcu_struct</tt> structure. | ||
| 2357 | A pointer to this structure must be passed in to each SRCU function, | ||
| 2358 | for example, <tt>synchronize_srcu(&ss)</tt>, where | ||
| 2359 | <tt>ss</tt> is the <tt>srcu_struct</tt> structure. | ||
| 2360 | The key benefit of these domains is that a slow SRCU reader in one | ||
| 2361 | domain does not delay an SRCU grace period in some other domain. | ||
| 2362 | That said, one consequence of these domains is that read-side code | ||
| 2363 | must pass a “cookie” from <tt>srcu_read_lock()</tt> | ||
| 2364 | to <tt>srcu_read_unlock()</tt>, for example, as follows: | ||
| 2365 | |||
| 2366 | <blockquote> | ||
| 2367 | <pre> | ||
| 2368 | 1 int idx; | ||
| 2369 | 2 | ||
| 2370 | 3 idx = srcu_read_lock(&ss); | ||
| 2371 | 4 do_something(); | ||
| 2372 | 5 srcu_read_unlock(&ss, idx); | ||
| 2373 | </pre> | ||
| 2374 | </blockquote> | ||
| 2375 | |||
| 2376 | <p> | ||
| 2377 | As noted above, it is legal to block within SRCU read-side critical sections; | ||
| 2378 | however, with great power comes great responsibility. | ||
| 2379 | If you block forever in one of a given domain's SRCU read-side critical | ||
| 2380 | sections, then that domain's grace periods will also be blocked forever. | ||
| 2381 | Of course, one good way to block forever is to deadlock, which can | ||
| 2382 | happen if any operation in a given domain's SRCU read-side critical | ||
| 2383 | section can block waiting, either directly or indirectly, for that domain's | ||
| 2384 | grace period to elapse. | ||
| 2385 | For example, this results in a self-deadlock: | ||
| 2386 | |||
| 2387 | <blockquote> | ||
| 2388 | <pre> | ||
| 2389 | 1 int idx; | ||
| 2390 | 2 | ||
| 2391 | 3 idx = srcu_read_lock(&ss); | ||
| 2392 | 4 do_something(); | ||
| 2393 | 5 synchronize_srcu(&ss); | ||
| 2394 | 6 srcu_read_unlock(&ss, idx); | ||
| 2395 | </pre> | ||
| 2396 | </blockquote> | ||
| 2397 | |||
| 2398 | <p> | ||
| 2399 | However, if line 5 acquired a mutex that was held across | ||
| 2400 | a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, | ||
| 2401 | deadlock would still be possible. | ||
| 2402 | Furthermore, if line 5 acquired a mutex that was held across | ||
| 2403 | a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, | ||
| 2404 | and if an <tt>ss1</tt>-domain SRCU read-side critical section | ||
| 2405 | acquired another mutex that was held across an <tt>ss</tt>-domain | ||
| 2406 | <tt>synchronize_srcu()</tt>, | ||
| 2407 | deadlock would again be possible. | ||
| 2408 | Such a deadlock cycle could extend across an arbitrarily large number | ||
| 2409 | of different SRCU domains. | ||
| 2410 | Again, with great power comes great responsibility. | ||
| 2411 | |||
| 2412 | <p> | ||
| 2413 | Unlike the other RCU flavors, SRCU read-side critical sections can | ||
| 2414 | run on idle and even offline CPUs. | ||
| 2415 | This ability requires that <tt>srcu_read_lock()</tt> and | ||
| 2416 | <tt>srcu_read_unlock()</tt> contain memory barriers, which means | ||
| 2417 | that SRCU readers will run a bit slower than would RCU readers. | ||
| 2418 | It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> | ||
| 2419 | API, which, in combination with <tt>srcu_read_unlock()</tt>, | ||
| 2420 | guarantees a full memory barrier. | ||
| 2421 | |||
| 2422 | <p> | ||
| 2423 | The | ||
| 2424 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> | ||
| 2425 | includes | ||
| 2426 | <tt>srcu_read_lock()</tt>, | ||
| 2427 | <tt>srcu_read_unlock()</tt>, | ||
| 2428 | <tt>srcu_dereference()</tt>, | ||
| 2429 | <tt>srcu_dereference_check()</tt>, | ||
| 2430 | <tt>synchronize_srcu()</tt>, | ||
| 2431 | <tt>synchronize_srcu_expedited()</tt>, | ||
| 2432 | <tt>call_srcu()</tt>, | ||
| 2433 | <tt>srcu_barrier()</tt>, and | ||
| 2434 | <tt>srcu_read_lock_held()</tt>. | ||
| 2435 | It also includes | ||
| 2436 | <tt>DEFINE_SRCU()</tt>, | ||
| 2437 | <tt>DEFINE_STATIC_SRCU()</tt>, and | ||
| 2438 | <tt>init_srcu_struct()</tt> | ||
| 2439 | APIs for defining and initializing <tt>srcu_struct</tt> structures. | ||
| 2440 | |||
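| | <p> | ||
| | For example, a hedged sketch of defining and using a private SRCU | ||
| | domain (the domain name, the <tt>gp</tt> pointer, and the | ||
| | possibly-sleeping work are hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 DEFINE_STATIC_SRCU(my_srcu); | ||
| |  2 | ||
| |  3 /* Reader, which is permitted to block: */ | ||
| |  4 idx = srcu_read_lock(&my_srcu); | ||
| |  5 p = srcu_dereference(gp, &my_srcu); | ||
| |  6 if (p) | ||
| |  7   do_something_that_may_sleep(p); | ||
| |  8 srcu_read_unlock(&my_srcu, idx); | ||
| |  9 | ||
| | 10 /* Updater, which waits only on my_srcu's readers: */ | ||
| | 11 synchronize_srcu(&my_srcu); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||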
| 2441 | <h3><a name="Tasks RCU">Tasks RCU</a></h3> | ||
| 2442 | |||
| 2443 | <p> | ||
| 2444 | Some forms of tracing use “trampolines” to handle the | ||
| 2445 | binary rewriting required to install different types of probes. | ||
| 2446 | It would be good to be able to free old trampolines, which sounds | ||
| 2447 | like a job for some form of RCU. | ||
| 2448 | However, because it is necessary to be able to install a trace | ||
| 2449 | anywhere in the code, it is not possible to use read-side markers | ||
| 2450 | such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
| 2451 | In addition, it does not work to have these markers in the trampoline | ||
| 2452 | itself, because there would need to be instructions following | ||
| 2453 | <tt>rcu_read_unlock()</tt>. | ||
| 2454 | Although <tt>synchronize_rcu()</tt> would guarantee that execution | ||
| 2455 | reached the <tt>rcu_read_unlock()</tt>, it would not be able to | ||
| 2456 | guarantee that execution had completely left the trampoline. | ||
| 2457 | |||
| 2458 | <p> | ||
| 2459 | The solution, in the form of | ||
| 2460 | <a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, | ||
| 2461 | is to have implicit | ||
| 2462 | read-side critical sections that are delimited by voluntary context | ||
| 2463 | switches, that is, calls to <tt>schedule()</tt>, | ||
| 2464 | <tt>cond_resched_rcu_qs()</tt>, and | ||
| 2465 | <tt>synchronize_rcu_tasks()</tt>. | ||
| 2466 | In addition, transitions to and from userspace execution also delimit | ||
| 2467 | tasks-RCU read-side critical sections. | ||
| 2468 | |||
| 2469 | <p> | ||
| 2470 | The tasks-RCU API is quite compact, consisting only of | ||
| 2471 | <tt>call_rcu_tasks()</tt>, | ||
| 2472 | <tt>synchronize_rcu_tasks()</tt>, and | ||
| 2473 | <tt>rcu_barrier_tasks()</tt>. | ||
| 2474 | |||
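| | <p> | ||
| | A hedged sketch of freeing an old trampoline (the <tt>tramp</tt> | ||
| | structure and the <tt>old_tramp</tt> pointer are hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 static void free_trampoline(struct rcu_head *rhp) | ||
| |  2 { | ||
| |  3   kfree(container_of(rhp, struct tramp, rh)); | ||
| |  4 } | ||
| |  5 | ||
| |  6 /* After unhooking the trampoline from all trace sites: */ | ||
| |  7 call_rcu_tasks(&old_tramp->rh, free_trampoline); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||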
| 2475 | <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> | ||
| 2476 | |||
| 2477 | <p> | ||
| 2478 | One of the tricks that RCU uses to attain update-side scalability is | ||
| 2479 | to increase grace-period latency with increasing numbers of CPUs. | ||
| 2480 | If this becomes a serious problem, it will be necessary to rework the | ||
| 2481 | grace-period state machine so as to avoid the need for the additional | ||
| 2482 | latency. | ||
| 2483 | |||
| 2484 | <p> | ||
| 2485 | Expedited grace periods scan the CPUs, so their latency and overhead | ||
| 2486 | increase with increasing numbers of CPUs. | ||
| 2487 | If this becomes a serious problem on large systems, it will be necessary | ||
| 2488 | to do some redesign to avoid this scalability problem. | ||
| 2489 | |||
| 2490 | <p> | ||
| 2491 | RCU disables CPU hotplug in a few places, perhaps most notably in the | ||
| 2492 | expedited grace-period and <tt>rcu_barrier()</tt> operations. | ||
| 2493 | If there is a strong reason to use expedited grace periods in CPU-hotplug | ||
| 2494 | notifiers, it will be necessary to avoid disabling CPU hotplug. | ||
| 2495 | This would introduce some complexity, so there had better be a <i>very</i> | ||
| 2496 | good reason. | ||
| 2497 | |||
| 2498 | <p> | ||
| 2499 | The tradeoff between grace-period latency on the one hand and interruptions | ||
| 2500 | of other CPUs on the other hand may need to be re-examined. | ||
| 2501 | The desire is of course for zero grace-period latency as well as zero | ||
| 2502 | interprocessor interrupts undertaken during an expedited grace period | ||
| 2503 | operation. | ||
| 2504 | While this ideal is unlikely to be achievable, it is quite possible that | ||
| 2505 | further improvements can be made. | ||
| 2506 | |||
| 2507 | <p> | ||
| 2508 | The multiprocessor implementations of RCU use a combining tree that | ||
| 2509 | groups CPUs so as to reduce lock contention and increase cache locality. | ||
| 2510 | However, this combining tree does not spread its memory across NUMA | ||
| 2511 | nodes nor does it align the CPU groups with hardware features such | ||
| 2512 | as sockets or cores. | ||
| 2513 | Such spreading and alignment is currently believed to be unnecessary | ||
| 2514 | because the hotpath read-side primitives do not access the combining | ||
| 2515 | tree, nor does <tt>call_rcu()</tt> in the common case. | ||
| 2516 | If you believe that your architecture needs such spreading and alignment, | ||
| 2517 | then your architecture should also benefit from the | ||
| 2518 | <tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set | ||
| 2519 | to the number of CPUs in a socket, NUMA node, or whatever. | ||
| 2520 | If the number of CPUs is too large, use a fraction of the number of | ||
| 2521 | CPUs. | ||
| 2522 | If the number of CPUs is a large prime number, well, that certainly | ||
| 2523 | is an “interesting” architectural choice! | ||
| 2524 | More flexible arrangements might be considered, but only if | ||
| 2525 | <tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only | ||
| 2526 | if the inadequacy has been demonstrated by a carefully run and | ||
| 2527 | realistic system-level workload. | ||
| 2528 | |||
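| | <p> | ||
| | For example, on a hypothetical system built from 16-CPU sockets, the | ||
| | kernel might be booted with: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | rcutree.rcu_fanout_leaf=16 | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||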
| 2529 | <p> | ||
| 2530 | Please note that arrangements that require RCU to remap CPU numbers will | ||
| 2531 | require extremely good demonstration of need and full exploration of | ||
| 2532 | alternatives. | ||
| 2533 | |||
| 2534 | <p> | ||
| 2535 | There is an embarrassingly large number of flavors of RCU, and this | ||
| 2536 | number has been increasing over time. | ||
| 2537 | Perhaps it will be possible to combine some at some future date. | ||
| 2538 | |||
| 2539 | <p> | ||
| 2540 | RCU's various kthreads are reasonably recent additions. | ||
| 2541 | It is quite likely that adjustments will be required to more gracefully | ||
| 2542 | handle extreme loads. | ||
| 2543 | It might also be necessary to be able to relate CPU utilization by | ||
| 2544 | RCU's kthreads and softirq handlers to the code that instigated this | ||
| 2545 | CPU utilization. | ||
| 2546 | For example, RCU callback overhead might be charged back to the | ||
| 2547 | originating <tt>call_rcu()</tt> instance, though probably not | ||
| 2548 | in production kernels. | ||
| 2549 | |||
| 2550 | <h2><a name="Summary">Summary</a></h2> | ||
| 2551 | |||
| 2552 | <p> | ||
| 2553 | This document has presented more than two decades' worth of RCU | ||
| 2554 | requirements. | ||
| 2555 | Given that the requirements keep changing, this will not be the last | ||
| 2556 | word on this subject, but at least it serves to get an important | ||
| 2557 | subset of the requirements set forth. | ||
| 2558 | |||
| 2559 | <h2><a name="Acknowledgments">Acknowledgments</a></h2> | ||
| 2560 | |||
| 2561 | I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, | ||
| 2562 | Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and | ||
| 2563 | Andy Lutomirski for their help in rendering | ||
| 2564 | this article human readable, and to Michelle Rankin for her support | ||
| 2565 | of this effort. | ||
| 2566 | Other contributions are acknowledged in the Linux kernel's git archive. | ||
| 2567 | The cartoon is copyright (c) 2013 by Melissa Broussard, | ||
| 2568 | and is provided | ||
| 2569 | under the terms of the Creative Commons Attribution-Share Alike 3.0 | ||
| 2570 | United States license. | ||
| 2571 | |||
| 2572 | <h3><a name="Answers to Quick Quizzes"> | ||
| 2573 | Answers to Quick Quizzes</a></h3> | ||
| 2574 | |||
| 2575 | <a name="qq1answer"></a> | ||
| 2576 | <p><b>Quick Quiz 1</b>: | ||
| 2577 | Wait a minute! | ||
| 2578 | You said that updaters can make useful forward progress concurrently | ||
| 2579 | with readers, but pre-existing readers will block | ||
| 2580 | <tt>synchronize_rcu()</tt>!!! | ||
| 2581 | Just who are you trying to fool??? | ||
| 2582 | |||
| 2583 | |||
| 2584 | </p><p><b>Answer</b>: | ||
| 2585 | First, if updaters do not wish to be blocked by readers, they can use | ||
| 2586 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will | ||
| 2587 | be discussed later. | ||
| 2588 | Second, even when using <tt>synchronize_rcu()</tt>, the other | ||
| 2589 | update-side code does run concurrently with readers, whether pre-existing | ||
| 2590 | or not. | ||
| 2591 | |||
| 2592 | |||
| 2593 | </p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> | ||
| 2594 | |||
| 2595 | <a name="qq2answer"></a> | ||
| 2596 | <p><b>Quick Quiz 2</b>: | ||
| 2597 | Why is the <tt>synchronize_rcu()</tt> on line 28 needed? | ||
| 2598 | |||
| 2599 | |||
| 2600 | </p><p><b>Answer</b>: | ||
| 2601 | Without that extra grace period, memory reordering could result in | ||
| 2602 | <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> | ||
| 2603 | concurrently with the last bits of <tt>recovery()</tt>. | ||
| 2604 | |||
| 2605 | |||
| 2606 | </p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> | ||
| 2607 | |||
| 2608 | <a name="qq3answer"></a> | ||
| 2609 | <p><b>Quick Quiz 3</b>: | ||
| 2610 | But <tt>rcu_assign_pointer()</tt> does nothing to prevent the | ||
| 2611 | two assignments to <tt>p->a</tt> and <tt>p->b</tt> | ||
| 2612 | from being reordered. | ||
| 2613 | Can't that also cause problems? | ||
| 2614 | |||
| 2615 | |||
| 2616 | </p><p><b>Answer</b>: | ||
| 2617 | No, it cannot. | ||
| 2618 | The readers cannot see either of these two fields until | ||
| 2619 | the assignment to <tt>gp</tt>, by which time both fields are | ||
| 2620 | fully initialized. | ||
| 2621 | So reordering the assignments | ||
| 2622 | to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly | ||
| 2623 | cause any problems. | ||
| 2624 | |||
| 2625 | |||
| 2626 | </p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> | ||
| 2627 | |||
| 2628 | <a name="qq4answer"></a> | ||
| 2629 | <p><b>Quick Quiz 4</b>: | ||
| 2630 | Without the <tt>rcu_dereference()</tt> or the | ||
| 2631 | <tt>rcu_access_pointer()</tt>, what destructive optimizations | ||
| 2632 | might the compiler make use of? | ||
| 2633 | |||
| 2634 | |||
| 2635 | </p><p><b>Answer</b>: | ||
| 2636 | Let's start with what happens to <tt>do_something_gp()</tt> | ||
| 2637 | if it fails to use <tt>rcu_dereference()</tt>. | ||
| 2638 | It could reuse a value formerly fetched from this same pointer. | ||
| 2639 | It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time | ||
| 2640 | manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise | ||
| 2641 | mash-up of two distinct pointer values. | ||
| 2642 | It might even use value-speculation optimizations, where it makes a wrong | ||
| 2643 | guess, but by the time it gets around to checking the value, an update | ||
| 2644 | has changed the pointer to match the wrong guess. | ||
| 2645 | Too bad about any dereferences that returned pre-initialization garbage | ||
| 2646 | in the meantime! | ||
| 2647 | |||
| 2648 | <p> | ||
| 2649 | For <tt>remove_gp_synchronous()</tt>, as long as all modifications | ||
| 2650 | to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, | ||
| 2651 | the above optimizations are harmless. | ||
| 2652 | However, | ||
| 2653 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, | ||
| 2654 | <tt>sparse</tt> will complain if you | ||
| 2655 | define <tt>gp</tt> with <tt>__rcu</tt> and then | ||
| 2656 | access it without using | ||
| 2657 | either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. | ||
| 2658 | |||
| 2659 | |||
| 2660 | </p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> | ||
| 2661 | |||
| 2662 | <a name="qq5answer"></a> | ||
| 2663 | <p><b>Quick Quiz 5</b>: | ||
| 2664 | Given that multiple CPUs can start RCU read-side critical sections | ||
| 2665 | at any time without any ordering whatsoever, how can RCU possibly tell whether | ||
| 2666 | or not a given RCU read-side critical section starts before a | ||
| 2667 | given instance of <tt>synchronize_rcu()</tt>? | ||
| 2668 | |||
| 2669 | |||
| 2670 | </p><p><b>Answer</b>: | ||
| 2671 | If RCU cannot tell whether or not a given | ||
| 2672 | RCU read-side critical section starts before a | ||
| 2673 | given instance of <tt>synchronize_rcu()</tt>, | ||
| 2674 | then it must assume that the RCU read-side critical section | ||
| 2675 | started first. | ||
| 2676 | In other words, a given instance of <tt>synchronize_rcu()</tt> | ||
| 2677 | can avoid waiting on a given RCU read-side critical section only | ||
| 2678 | if it can prove that <tt>synchronize_rcu()</tt> started first. | ||
| 2679 | |||
| 2680 | |||
| 2681 | </p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> | ||
| 2682 | |||
| 2683 | <a name="qq6answer"></a> | ||
| 2684 | <p><b>Quick Quiz 6</b>: | ||
| 2685 | The first and second guarantees require unbelievably strict ordering! | ||
| 2686 | Are all these memory barriers <i> really</i> required? | ||
| 2687 | |||
| 2688 | |||
| 2689 | </p><p><b>Answer</b>: | ||
| 2690 | Yes, they really are required. | ||
| 2691 | To see why the first guarantee is required, consider the following | ||
| 2692 | sequence of events: | ||
| 2693 | |||
| 2694 | <ol> | ||
| 2695 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
| 2696 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
| 2697 | /* Very likely to return p. */</tt> | ||
| 2698 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
| 2699 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
| 2700 | <li> CPU 1: <tt>do_something_with(q->a); | ||
| 2701 | /* No smp_mb(), so might happen after kfree(). */</tt> | ||
| 2702 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
| 2703 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
| 2704 | <li> CPU 0: <tt>kfree(p);</tt> | ||
| 2705 | </ol> | ||
| 2706 | |||
| 2707 | <p> | ||
| 2708 | Therefore, there absolutely must be a full memory barrier between the | ||
| 2709 | end of the RCU read-side critical section and the end of the | ||
| 2710 | grace period. | ||
| 2711 | |||
| 2712 | <p> | ||
| 2713 | The sequence of events demonstrating the necessity of the second rule | ||
| 2714 | is roughly similar: | ||
| 2715 | |||
| 2716 | <ol> | ||
| 2717 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
| 2718 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
| 2719 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
| 2720 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
| 2721 | /* Might return p if no memory barrier. */</tt> | ||
| 2722 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
| 2723 | <li> CPU 0: <tt>kfree(p);</tt> | ||
| 2724 | <li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> | ||
| 2725 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
| 2726 | </ol> | ||
| 2727 | |||
| 2728 | <p> | ||
| 2729 | And similarly, without a memory barrier between the beginning of the | ||
| 2730 | grace period and the beginning of the RCU read-side critical section, | ||
| 2731 | CPU 1 might end up accessing the freelist. | ||
| 2732 | |||
| 2733 | <p> | ||
| 2734 | The “as if” rule of course applies, so that any implementation | ||
| 2735 | that acts as if the appropriate memory barriers were in place is a | ||
| 2736 | correct implementation. | ||
| 2737 | That said, it is much easier to fool yourself into believing that you have | ||
| 2738 | adhered to the as-if rule than it is to actually adhere to it! | ||
| 2739 | |||
| 2740 | |||
| 2741 | </p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> | ||
| 2742 | |||
| 2743 | <a name="qq7answer"></a> | ||
| 2744 | <p><b>Quick Quiz 7</b>: | ||
| 2745 | But how does the upgrade-to-write operation exclude other readers? | ||
| 2746 | |||
| 2747 | |||
| 2748 | </p><p><b>Answer</b>: | ||
| 2749 | It doesn't, just like normal RCU updates, which also do not exclude | ||
| 2750 | RCU readers. | ||
| 2751 | |||
| 2752 | |||
| 2753 | </p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> | ||
| 2754 | |||
| 2755 | <a name="qq8answer"></a> | ||
| 2756 | <p><b>Quick Quiz 8</b>: | ||
| 2757 | Can't the compiler also reorder this code? | ||
| 2758 | |||
| 2759 | |||
| 2760 | </p><p><b>Answer</b>: | ||
| 2761 | No, the volatile casts in <tt>READ_ONCE()</tt> and | ||
| 2762 | <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in | ||
| 2763 | this particular case. | ||
| 2764 | |||
| 2765 | |||
| 2766 | </p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> | ||
| 2767 | |||
| 2768 | <a name="qq9answer"></a> | ||
| 2769 | <p><b>Quick Quiz 9</b>: | ||
| 2770 | Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed. | ||
| 2771 | Would the updater be able to rely on this? | ||
| 2772 | |||
| 2773 | |||
| 2774 | </p><p><b>Answer</b>: | ||
| 2775 | No. | ||
| 2776 | Even if <tt>synchronize_rcu()</tt> were to wait until | ||
| 2777 | all readers had completed, a new reader might start immediately after | ||
| 2778 | <tt>synchronize_rcu()</tt> completed. | ||
| 2779 | Therefore, the code following | ||
| 2780 | <tt>synchronize_rcu()</tt> cannot rely on there being no readers | ||
| 2781 | in any case. | ||
| 2782 | |||
| 2783 | |||
| 2784 | </p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> | ||
| 2785 | |||
| 2786 | <a name="qq10answer"></a> | ||
| 2787 | <p><b>Quick Quiz 10</b>: | ||
| 2788 | How long a sequence of grace periods, each separated by an RCU read-side | ||
| 2789 | critical section, would be required to partition the RCU read-side | ||
| 2790 | critical sections at the beginning and end of the chain? | ||
| 2791 | |||
| 2792 | |||
| 2793 | </p><p><b>Answer</b>: | ||
| 2794 | In theory, an infinite number. | ||
| 2795 | In practice, an unknown number that is sensitive to both implementation | ||
| 2796 | details and timing considerations. | ||
| 2797 | Therefore, even in practice, RCU users must abide by the theoretical rather | ||
| 2798 | than the practical answer. | ||
| 2799 | |||
| 2800 | |||
| 2801 | </p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> | ||
| 2802 | |||
| 2803 | <a name="qq11answer"></a> | ||
| 2804 | <p><b>Quick Quiz 11</b>: | ||
| 2805 | What about sleeping locks? | ||
| 2806 | |||
| 2807 | |||
| 2808 | </p><p><b>Answer</b>: | ||
| 2809 | These are forbidden within Linux-kernel RCU read-side critical sections | ||
| 2810 | because it is not legal to place a quiescent state (in this case, | ||
| 2811 | voluntary context switch) within an RCU read-side critical section. | ||
| 2812 | However, sleeping locks may be used within userspace RCU read-side critical | ||
| 2813 | sections, and also within Linux-kernel sleepable RCU | ||
| 2814 | <a href="#Sleepable RCU">(SRCU)</a> | ||
| 2815 | read-side critical sections. | ||
| 2816 | In addition, the -rt patchset turns spinlocks into sleeping locks so | ||
| 2817 | that the corresponding critical sections can be preempted, which | ||
| 2818 | also means that these sleeplockified spinlocks (but not other sleeping locks!) | ||
| 2819 | may be acquired within -rt-Linux-kernel RCU read-side critical sections. | ||
| 2820 | |||
| 2821 | <p> | ||
| 2822 | Note that it <i>is</i> legal for a normal RCU read-side critical section | ||
| 2823 | to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), | ||
| 2824 | but only as long as it does not loop indefinitely attempting to | ||
| 2825 | conditionally acquire that sleeping lock. | ||
| 2826 | The key point is that things like <tt>mutex_trylock()</tt> | ||
| 2827 | either return with the mutex held, or return an error indication if | ||
| 2828 | the mutex was not immediately available. | ||
| 2829 | Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. | ||
| 2830 | |||
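| | <p> | ||
| | For example (a minimal sketch; <tt>my_mutex</tt> and | ||
| | <tt>do_something()</tt> are hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 rcu_read_lock(); | ||
| |  2 if (mutex_trylock(&my_mutex)) {  /* Never sleeps. */ | ||
| |  3   do_something(); | ||
| |  4   mutex_unlock(&my_mutex); | ||
| |  5 } | ||
| |  6 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||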
| 2831 | |||
| 2832 | </p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> | ||
| 2833 | |||
| 2834 | <a name="qq12answer"></a> | ||
| 2835 | <p><b>Quick Quiz 12</b>: | ||
| 2836 | Why does line 19 use <tt>rcu_access_pointer()</tt>? | ||
| 2837 | After all, <tt>call_rcu()</tt> on line 25 stores into the | ||
| 2838 | structure, which would interact badly with concurrent insertions. | ||
| 2839 | Doesn't this mean that <tt>rcu_dereference()</tt> is required? | ||
| 2840 | |||
| 2841 | |||
| 2842 | </p><p><b>Answer</b>: | ||
| 2843 | Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes | ||
| 2844 | any changes, including any insertions that <tt>rcu_dereference()</tt> | ||
| 2845 | would protect against. | ||
| 2846 | Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> | ||
| 2847 | is released on line 25, which in turn means that | ||
| 2848 | <tt>rcu_access_pointer()</tt> suffices. | ||
| 2849 | |||
| 2850 | |||
| 2851 | </p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> | ||
| 2852 | |||
| 2853 | <a name="qq13answer"></a> | ||
| 2854 | <p><b>Quick Quiz 13</b>: | ||
| 2855 | Earlier it was claimed that <tt>call_rcu()</tt> and | ||
| 2856 | <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked | ||
| 2857 | by readers. | ||
| 2858 | But how can that be correct, given that the invocation of the callback | ||
| 2859 | and the freeing of the memory (respectively) must still wait for | ||
| 2860 | a grace period to elapse? | ||
| 2861 | |||
| 2862 | |||
| 2863 | </p><p><b>Answer</b>: | ||
| 2864 | We could define things this way, but keep in mind that this sort of | ||
| 2865 | definition would say that updates in garbage-collected languages | ||
| 2866 | cannot complete until the next time the garbage collector runs, | ||
| 2867 | which does not seem at all reasonable. | ||
| 2868 | The key point is that in most cases, an updater using either | ||
| 2869 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the | ||
| 2870 | next update as soon as it has invoked <tt>call_rcu()</tt> or | ||
| 2871 | <tt>kfree_rcu()</tt>, without having to wait for a subsequent | ||
| 2872 | grace period. | ||
| 2873 | |||
| 2874 | |||
| 2875 | </p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> | ||
| 2876 | |||
| 2877 | <a name="qq14answer"></a> | ||
| 2878 | <p><b>Quick Quiz 14</b>: | ||
| 2879 | So what happens with <tt>synchronize_rcu()</tt> during | ||
| 2880 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | ||
| 2881 | kernels? | ||
| 2882 | |||
| 2883 | |||
| 2884 | </p><p><b>Answer</b>: | ||
| 2885 | In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> | ||
| 2886 | maps directly to <tt>synchronize_sched()</tt>. | ||
| 2887 | Therefore, <tt>synchronize_rcu()</tt> works normally throughout | ||
| 2888 | boot in <tt>CONFIG_PREEMPT=n</tt> kernels. | ||
| 2889 | However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
| 2890 | so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> | ||
| 2891 | during scheduler initialization. | ||
| 2892 | |||
| 2893 | |||
| 2894 | </p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> | ||
| 2895 | |||
| 2896 | |||
| 2897 | </body></html> | ||
diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx new file mode 100644 index 000000000000..3a97ba490c42 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx | |||
| @@ -0,0 +1,2741 @@ | |||
| 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" | ||
| 2 | "http://www.w3.org/TR/html4/loose.dtd"> | ||
| 3 | <html> | ||
| 4 | <head><title>A Tour Through RCU's Requirements [LWN.net]</title> | ||
| 5 | <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> | ||
| 6 | |||
| 7 | <h1>A Tour Through RCU's Requirements</h1> | ||
| 8 | |||
| 9 | <p>Copyright IBM Corporation, 2015</p> | ||
| 10 | <p>Author: Paul E. McKenney</p> | ||
| 11 | <p><i>The initial version of this document appeared in the | ||
| 12 | <a href="https://lwn.net/">LWN</a> articles | ||
| 13 | <a href="https://lwn.net/Articles/652156/">here</a>, | ||
| 14 | <a href="https://lwn.net/Articles/652677/">here</a>, and | ||
| 15 | <a href="https://lwn.net/Articles/653326/">here</a>.</i></p> | ||
| 16 | |||
| 17 | <h2>Introduction</h2> | ||
| 18 | |||
| 19 | <p> | ||
| 20 | Read-copy update (RCU) is a synchronization mechanism that is often | ||
| 21 | used as a replacement for reader-writer locking. | ||
| 22 | RCU is unusual in that updaters do not block readers, | ||
| 23 | which means that RCU's read-side primitives can be exceedingly fast | ||
| 24 | and scalable. | ||
| 25 | In addition, updaters can make useful forward progress concurrently | ||
| 26 | with readers. | ||
| 27 | However, all this concurrency between RCU readers and updaters does raise | ||
| 28 | the question of exactly what RCU readers are doing, which in turn | ||
| 29 | raises the question of exactly what RCU's requirements are. | ||
| 30 | |||
| 31 | <p> | ||
| 32 | This document therefore summarizes RCU's requirements, and can be thought | ||
| 33 | of as an informal, high-level specification for RCU. | ||
| 34 | It is important to understand that RCU's specification is primarily | ||
| 35 | empirical in nature; | ||
| 36 | in fact, I learned about many of these requirements the hard way. | ||
| 37 | This situation might cause some consternation; however, not only | ||
| 38 | has this learning process been a lot of fun, but it has also been | ||
| 39 | a great privilege to work with so many people willing to apply | ||
| 40 | technologies in interesting new ways. | ||
| 41 | |||
| 42 | <p> | ||
| 43 | All that aside, here are the categories of currently known RCU requirements: | ||
| 44 | </p> | ||
| 45 | |||
| 46 | <ol> | ||
| 47 | <li> <a href="#Fundamental Requirements"> | ||
| 48 | Fundamental Requirements</a> | ||
| 49 | <li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> | ||
| 50 | <li> <a href="#Parallelism Facts of Life"> | ||
| 51 | Parallelism Facts of Life</a> | ||
| 52 | <li> <a href="#Quality-of-Implementation Requirements"> | ||
| 53 | Quality-of-Implementation Requirements</a> | ||
| 54 | <li> <a href="#Linux Kernel Complications"> | ||
| 55 | Linux Kernel Complications</a> | ||
| 56 | <li> <a href="#Software-Engineering Requirements"> | ||
| 57 | Software-Engineering Requirements</a> | ||
| 58 | <li> <a href="#Other RCU Flavors"> | ||
| 59 | Other RCU Flavors</a> | ||
| 60 | <li> <a href="#Possible Future Changes"> | ||
| 61 | Possible Future Changes</a> | ||
| 62 | </ol> | ||
| 63 | |||
| 64 | <p> | ||
| 65 | This is followed by a <a href="#Summary">summary</a>, | ||
| 66 | which is in turn followed by the inevitable | ||
| 67 | <a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. | ||
| 68 | |||
| 69 | <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> | ||
| 70 | |||
| 71 | <p> | ||
| 72 | RCU's fundamental requirements are the closest thing RCU has to hard | ||
| 73 | mathematical requirements. | ||
| 74 | These are: | ||
| 75 | |||
| 76 | <ol> | ||
| 77 | <li> <a href="#Grace-Period Guarantee"> | ||
| 78 | Grace-Period Guarantee</a> | ||
| 79 | <li> <a href="#Publish-Subscribe Guarantee"> | ||
| 80 | Publish-Subscribe Guarantee</a> | ||
| 81 | <li> <a href="#Memory-Barrier Guarantees"> | ||
| 82 | Memory-Barrier Guarantees</a> | ||
| 83 | <li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> | ||
| 84 | RCU Primitives Guaranteed to Execute Unconditionally</a> | ||
| 85 | <li> <a href="#Guaranteed Read-to-Write Upgrade"> | ||
| 86 | Guaranteed Read-to-Write Upgrade</a> | ||
| 87 | </ol> | ||
| 88 | |||
| 89 | <h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> | ||
| 90 | |||
| 91 | <p> | ||
| 92 | RCU's grace-period guarantee is unusual in being premeditated: | ||
| 93 | Jack Slingwine and I had this guarantee firmly in mind when we started | ||
| 94 | work on RCU (then called “rclock”) in the early 1990s. | ||
| 95 | That said, the past two decades of experience with RCU have produced | ||
| 96 | a much more detailed understanding of this guarantee. | ||
| 97 | |||
| 98 | <p> | ||
| 99 | RCU's grace-period guarantee allows updaters to wait for the completion | ||
| 100 | of all pre-existing RCU read-side critical sections. | ||
| 101 | An RCU read-side critical section | ||
| 102 | begins with the marker <tt>rcu_read_lock()</tt> and ends with | ||
| 103 | the marker <tt>rcu_read_unlock()</tt>. | ||
| 104 | These markers may be nested, and RCU treats a nested set as one | ||
| 105 | big RCU read-side critical section. | ||
| 106 | Production-quality implementations of <tt>rcu_read_lock()</tt> and | ||
| 107 | <tt>rcu_read_unlock()</tt> are extremely lightweight, and in | ||
| 108 | fact have exactly zero overhead in Linux kernels built for production | ||
| 109 | use with <tt>CONFIG_PREEMPT=n</tt>. | ||
| 110 | |||
| 111 | <p> | ||
| 112 | This guarantee allows ordering to be enforced with extremely low | ||
| 113 | overhead to readers, for example: | ||
| 114 | |||
| 115 | <blockquote> | ||
| 116 | <pre> | ||
| 117 | 1 int x, y; | ||
| 118 | 2 | ||
| 119 | 3 void thread0(void) | ||
| 120 | 4 { | ||
| 121 | 5 rcu_read_lock(); | ||
| 122 | 6 r1 = READ_ONCE(x); | ||
| 123 | 7 r2 = READ_ONCE(y); | ||
| 124 | 8 rcu_read_unlock(); | ||
| 125 | 9 } | ||
| 126 | 10 | ||
| 127 | 11 void thread1(void) | ||
| 128 | 12 { | ||
| 129 | 13 WRITE_ONCE(x, 1); | ||
| 130 | 14 synchronize_rcu(); | ||
| 131 | 15 WRITE_ONCE(y, 1); | ||
| 132 | 16 } | ||
| 133 | </pre> | ||
| 134 | </blockquote> | ||
| 135 | |||
| 136 | <p> | ||
| 137 | Because the <tt>synchronize_rcu()</tt> on line 14 waits for | ||
| 138 | all pre-existing readers, any instance of <tt>thread0()</tt> that | ||
| 139 | loads a value of zero from <tt>x</tt> must complete before | ||
| 140 | <tt>thread1()</tt> stores to <tt>y</tt>, so that instance must | ||
| 141 | also load a value of zero from <tt>y</tt>. | ||
| 142 | Similarly, any instance of <tt>thread0()</tt> that loads a value of | ||
| 143 | one from <tt>y</tt> must have started after the | ||
| 144 | <tt>synchronize_rcu()</tt> started, and must therefore also load | ||
| 145 | a value of one from <tt>x</tt>. | ||
| 146 | Therefore, the outcome: | ||
| 147 | <blockquote> | ||
| 148 | <pre> | ||
| 149 | (r1 == 0 && r2 == 1) | ||
| 150 | </pre> | ||
| 151 | </blockquote> | ||
| 152 | cannot happen. | ||
| 153 | |||
| 154 | <p>@@QQ@@ | ||
| 155 | Wait a minute! | ||
| 156 | You said that updaters can make useful forward progress concurrently | ||
| 157 | with readers, but pre-existing readers will block | ||
| 158 | <tt>synchronize_rcu()</tt>!!! | ||
| 159 | Just who are you trying to fool??? | ||
| 160 | <p>@@QQA@@ | ||
| 161 | First, if updaters do not wish to be blocked by readers, they can use | ||
| 162 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will | ||
| 163 | be discussed later. | ||
| 164 | Second, even when using <tt>synchronize_rcu()</tt>, the other | ||
| 165 | update-side code does run concurrently with readers, whether pre-existing | ||
| 166 | or not. | ||
| 167 | <p>@@QQE@@ | ||
| 168 | |||
| 169 | <p> | ||
| 170 | This scenario resembles one of the first uses of RCU in | ||
| 171 | <a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, | ||
| 172 | which managed a distributed lock manager's transition into | ||
| 173 | a state suitable for handling recovery from node failure, | ||
| 174 | more or less as follows: | ||
| 175 | |||
| 176 | <blockquote> | ||
| 177 | <pre> | ||
| 178 | 1 #define STATE_NORMAL 0 | ||
| 179 | 2 #define STATE_WANT_RECOVERY 1 | ||
| 180 | 3 #define STATE_RECOVERING 2 | ||
| 181 | 4 #define STATE_WANT_NORMAL 3 | ||
| 182 | 5 | ||
| 183 | 6 int state = STATE_NORMAL; | ||
| 184 | 7 | ||
| 185 | 8 void do_something_dlm(void) | ||
| 186 | 9 { | ||
| 187 | 10 int state_snap; | ||
| 188 | 11 | ||
| 189 | 12 rcu_read_lock(); | ||
| 190 | 13 state_snap = READ_ONCE(state); | ||
| 191 | 14 if (state_snap == STATE_NORMAL) | ||
| 192 | 15 do_something(); | ||
| 193 | 16 else | ||
| 194 | 17 do_something_carefully(); | ||
| 195 | 18 rcu_read_unlock(); | ||
| 196 | 19 } | ||
| 197 | 20 | ||
| 198 | 21 void start_recovery(void) | ||
| 199 | 22 { | ||
| 200 | 23 WRITE_ONCE(state, STATE_WANT_RECOVERY); | ||
| 201 | 24 synchronize_rcu(); | ||
| 202 | 25 WRITE_ONCE(state, STATE_RECOVERING); | ||
| 203 | 26 recovery(); | ||
| 204 | 27 WRITE_ONCE(state, STATE_WANT_NORMAL); | ||
| 205 | 28 synchronize_rcu(); | ||
| 206 | 29 WRITE_ONCE(state, STATE_NORMAL); | ||
| 207 | 30 } | ||
| 208 | </pre> | ||
| 209 | </blockquote> | ||
| 210 | |||
| 211 | <p> | ||
| 212 | The RCU read-side critical section in <tt>do_something_dlm()</tt> | ||
| 213 | works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> | ||
| 214 | to guarantee that <tt>do_something()</tt> never runs concurrently | ||
| 215 | with <tt>recovery()</tt>, but with little or no synchronization | ||
| 216 | overhead in <tt>do_something_dlm()</tt>. | ||
| 217 | |||
| 218 | <p>@@QQ@@ | ||
| 219 | Why is the <tt>synchronize_rcu()</tt> on line 28 needed? | ||
| 220 | <p>@@QQA@@ | ||
| 221 | Without that extra grace period, memory reordering could result in | ||
| 222 | <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> | ||
| 223 | concurrently with the last bits of <tt>recovery()</tt>. | ||
| 224 | <p>@@QQE@@ | ||
| 225 | |||
| 226 | <p> | ||
| 227 | In order to avoid fatal problems such as deadlocks, | ||
| 228 | an RCU read-side critical section must not contain calls to | ||
| 229 | <tt>synchronize_rcu()</tt>. | ||
| 230 | Similarly, an RCU read-side critical section must not | ||
| 231 | contain anything that waits, directly or indirectly, on completion of | ||
| 232 | an invocation of <tt>synchronize_rcu()</tt>. | ||
| 233 | |||
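| | <p> | ||
| | For example, this minimal sketch would self-deadlock, because the | ||
| | grace period cannot complete until the enclosing reader does: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| |  1 rcu_read_lock(); | ||
| |  2 synchronize_rcu();  /* Waits on the enclosing reader: deadlock. */ | ||
| |  3 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||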
| 234 | <p> | ||
| 235 | Although RCU's grace-period guarantee is useful in and of itself, with | ||
| 236 | <a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, | ||
| 237 | it would be good to be able to use RCU to coordinate read-side | ||
| 238 | access to linked data structures. | ||
| 239 | For this, the grace-period guarantee is not sufficient, as can | ||
| 240 | be seen in function <tt>add_gp_buggy()</tt> below. | ||
| 241 | We will look at the reader's code later, but in the meantime, just think of | ||
| 242 | the reader as locklessly picking up the <tt>gp</tt> pointer, | ||
| 243 | and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the | ||
| 244 | <tt>->a</tt> and <tt>->b</tt> fields. | ||
| 245 | |||
| 246 | <blockquote> | ||
| 247 | <pre> | ||
| 248 | 1 bool add_gp_buggy(int a, int b) | ||
| 249 | 2 { | ||
| 250 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
| 251 | 4 if (!p) | ||
| 252 | 5 return -ENOMEM; | ||
| 253 | 6 spin_lock(&gp_lock); | ||
| 254 | 7 if (rcu_access_pointer(gp)) { | ||
| 255 | 8 spin_unlock(&gp_lock); | ||
| 256 | 9 return false; | ||
| 257 | 10 } | ||
| 258 | 11 p->a = a; | ||
| 259 | 12 p->b = b; | ||
| 260 | 13 gp = p; /* ORDERING BUG */ | ||
| 261 | 14 spin_unlock(&gp_lock); | ||
| 262 | 15 return true; | ||
| 263 | 16 } | ||
| 264 | </pre> | ||
| 265 | </blockquote> | ||
| 266 | |||
| 267 | <p> | ||
| 268 | The problem is that both the compiler and weakly ordered CPUs are within | ||
| 269 | their rights to reorder this code as follows: | ||
| 270 | |||
| 271 | <blockquote> | ||
| 272 | <pre> | ||
| 273 | 1 bool add_gp_buggy_optimized(int a, int b) | ||
| 274 | 2 { | ||
| 275 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
| 276 | 4 if (!p) | ||
| 277 | 5 return false; | ||
| 278 | 6 spin_lock(&gp_lock); | ||
| 279 | 7 if (rcu_access_pointer(gp)) { | ||
| 280 | 8 spin_unlock(&gp_lock); | ||
| 281 | 9 return false; | ||
| 282 | 10 } | ||
| 283 | <b>11 gp = p; /* ORDERING BUG */ | ||
| 284 | 12 p->a = a; | ||
| 285 | 13 p->b = b;</b> | ||
| 286 | 14 spin_unlock(&gp_lock); | ||
| 287 | 15 return true; | ||
| 288 | 16 } | ||
| 289 | </pre> | ||
| 290 | </blockquote> | ||
| 291 | |||
| 292 | <p> | ||
| 293 | If an RCU reader fetches <tt>gp</tt> just after | ||
| 294 | <tt>add_gp_buggy_optimized</tt> executes line 11, | ||
| 295 | it will see garbage in the <tt>->a</tt> and <tt>->b</tt> | ||
| 296 | fields. | ||
| 297 | And this is but one of many ways in which compiler and hardware optimizations | ||
| 298 | could cause trouble. | ||
| 299 | Therefore, we clearly need some way to prevent the compiler and the CPU from | ||
| 300 | reordering in this manner, which brings us to the publish-subscribe | ||
| 301 | guarantee discussed in the next section. | ||
| 302 | |||
| 303 | <h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> | ||
| 304 | |||
| 305 | <p> | ||
| 306 | RCU's publish-subscribe guarantee allows data to be inserted | ||
| 307 | into a linked data structure without disrupting RCU readers. | ||
| 308 | The updater uses <tt>rcu_assign_pointer()</tt> to insert the | ||
| 309 | new data, and readers use <tt>rcu_dereference()</tt> to | ||
| 310 | access data, whether new or old. | ||
| 311 | The following shows an example of insertion: | ||
| 312 | |||
| 313 | <blockquote> | ||
| 314 | <pre> | ||
| 315 | 1 bool add_gp(int a, int b) | ||
| 316 | 2 { | ||
| 317 | 3 p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
| 318 | 4 if (!p) | ||
| 319 | 5 return false; | ||
| 320 | 6 spin_lock(&gp_lock); | ||
| 321 | 7 if (rcu_access_pointer(gp)) { | ||
| 322 | 8 spin_unlock(&gp_lock); | ||
| 323 | 9 return false; | ||
| 324 | 10 } | ||
| 325 | 11 p->a = a; | ||
| 326 | 12 p->b = b; | ||
| 327 | 13 rcu_assign_pointer(gp, p); | ||
| 328 | 14 spin_unlock(&gp_lock); | ||
| 329 | 15 return true; | ||
| 330 | 16 } | ||
| 331 | </pre> | ||
| 332 | </blockquote> | ||
| 333 | |||
| 334 | <p> | ||
| 335 | The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually | ||
| 336 | equivalent to a simple assignment statement, but also guarantees | ||
| 337 | that its assignment will | ||
| 338 | happen after the two assignments in lines 11 and 12, | ||
| 339 | similar to the C11 <tt>memory_order_release</tt> store operation. | ||
| 340 | It also prevents any number of “interesting” compiler | ||
| 341 | optimizations, for example, the use of <tt>gp</tt> as a scratch | ||
| 342 | location immediately preceding the assignment. | ||
| 343 | |||
| 344 | <p>@@QQ@@ | ||
| 345 | But <tt>rcu_assign_pointer()</tt> does nothing to prevent the | ||
| 346 | two assignments to <tt>p->a</tt> and <tt>p->b</tt> | ||
| 347 | from being reordered. | ||
| 348 | Can't that also cause problems? | ||
| 349 | <p>@@QQA@@ | ||
| 350 | No, it cannot. | ||
| 351 | The readers cannot see either of these two fields until | ||
| 352 | the assignment to <tt>gp</tt>, by which time both fields are | ||
| 353 | fully initialized. | ||
| 354 | So reordering the assignments | ||
| 355 | to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly | ||
| 356 | cause any problems. | ||
| 357 | <p>@@QQE@@ | ||
| 358 | |||
| 359 | <p> | ||
| 360 | It is tempting to assume that the reader need not do anything special | ||
| 361 | to control its accesses to the RCU-protected data, | ||
| 362 | as shown in <tt>do_something_gp_buggy()</tt> below: | ||
| 363 | |||
| 364 | <blockquote> | ||
| 365 | <pre> | ||
| 366 | 1 bool do_something_gp_buggy(void) | ||
| 367 | 2 { | ||
| 368 | 3 rcu_read_lock(); | ||
| 369 | 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ | ||
| 370 | 5 if (p) { | ||
| 371 | 6 do_something(p->a, p->b); | ||
| 372 | 7 rcu_read_unlock(); | ||
| 373 | 8 return true; | ||
| 374 | 9 } | ||
| 375 | 10 rcu_read_unlock(); | ||
| 376 | 11 return false; | ||
| 377 | 12 } | ||
| 378 | </pre> | ||
| 379 | </blockquote> | ||
| 380 | |||
| 381 | <p> | ||
| 382 | However, this temptation must be resisted because there are a | ||
| 383 | surprisingly large number of ways that the compiler | ||
| 384 | (to say nothing of | ||
| 385 | <a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) | ||
| 386 | can trip this code up. | ||
| 387 | For but one example, if the compiler were short of registers, it | ||
| 388 | might choose to refetch from <tt>gp</tt> rather than keeping | ||
| 389 | a separate copy in <tt>p</tt> as follows: | ||
| 390 | |||
| 391 | <blockquote> | ||
| 392 | <pre> | ||
| 393 | 1 bool do_something_gp_buggy_optimized(void) | ||
| 394 | 2 { | ||
| 395 | 3 rcu_read_lock(); | ||
| 396 | 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ | ||
| 397 | <b> 5 do_something(gp->a, gp->b);</b> | ||
| 398 | 6 rcu_read_unlock(); | ||
| 399 | 7 return true; | ||
| 400 | 8 } | ||
| 401 | 9 rcu_read_unlock(); | ||
| 402 | 10 return false; | ||
| 403 | 11 } | ||
| 404 | </pre> | ||
| 405 | </blockquote> | ||
| 406 | |||
| 407 | <p> | ||
| 408 | If this function ran concurrently with a series of updates that | ||
| 409 | replaced the current structure with a new one, | ||
| 410 | the fetches of <tt>gp->a</tt> | ||
| 411 | and <tt>gp->b</tt> might well come from two different structures, | ||
| 412 | which could cause serious confusion. | ||
| 413 | To prevent this (and much else besides), <tt>do_something_gp()</tt> uses | ||
| 414 | <tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: | ||
| 415 | |||
| 416 | <blockquote> | ||
| 417 | <pre> | ||
| 418 | 1 bool do_something_gp(void) | ||
| 419 | 2 { | ||
| 420 | 3 rcu_read_lock(); | ||
| 421 | 4 p = rcu_dereference(gp); | ||
| 422 | 5 if (p) { | ||
| 423 | 6 do_something(p->a, p->b); | ||
| 424 | 7 rcu_read_unlock(); | ||
| 425 | 8 return true; | ||
| 426 | 9 } | ||
| 427 | 10 rcu_read_unlock(); | ||
| 428 | 11 return false; | ||
| 429 | 12 } | ||
| 430 | </pre> | ||
| 431 | </blockquote> | ||
| 432 | |||
| 433 | <p> | ||
| 434 | The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) | ||
| 435 | memory barriers in the Linux kernel. | ||
| 436 | Should a | ||
| 437 | <a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> | ||
| 438 | ever appear, then <tt>rcu_dereference()</tt> could be implemented | ||
| 439 | as a <tt>memory_order_consume</tt> load. | ||
| 440 | Regardless of the exact implementation, a pointer fetched by | ||
| 441 | <tt>rcu_dereference()</tt> may not be used outside of the | ||
| 442 | outermost RCU read-side critical section containing that | ||
| 443 | <tt>rcu_dereference()</tt>, unless protection of | ||
| 444 | the corresponding data element has been passed from RCU to some | ||
| 445 | other synchronization mechanism, most commonly locking or | ||
| 446 | <a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. | ||
| 447 | |||
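| | <p> | ||
| | The following hypothetical sketch (not taken from actual kernel code) | ||
| | shows one way to hand off protection from RCU to reference counting, | ||
| | assuming a <tt>struct foo</tt> with a <tt>refcnt</tt> field and an | ||
| | update side that does not drop its own reference until after a grace | ||
| | period has elapsed: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 struct foo *get_gp(void) | ||
| | 2 { | ||
| | 3 struct foo *p; | ||
| | 4 | ||
| | 5 rcu_read_lock(); | ||
| | 6 p = rcu_dereference(gp); | ||
| | 7 if (p) | ||
| | 8 atomic_inc(&p->refcnt); /* Hand off before rcu_read_unlock(). */ | ||
| | 9 rcu_read_unlock(); | ||
| | 10 return p; /* Caller may now use p outside the critical section. */ | ||
| | 11 } | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||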
| 448 | <p> | ||
| 449 | In short, updaters use <tt>rcu_assign_pointer()</tt> and readers | ||
| 450 | use <tt>rcu_dereference()</tt>, and these two RCU API elements | ||
| 451 | work together to ensure that readers have a consistent view of | ||
| 452 | newly added data elements. | ||
| 453 | |||
| 454 | <p> | ||
| 455 | Of course, it is also necessary to remove elements from RCU-protected | ||
| 456 | data structures, for example, using the following process: | ||
| 457 | |||
| 458 | <ol> | ||
| 459 | <li> Remove the data element from the enclosing structure. | ||
| 460 | <li> Wait for all pre-existing RCU read-side critical sections | ||
| 461 | to complete (because only pre-existing readers can possibly have | ||
| 462 | a reference to the newly removed data element). | ||
| 463 | <li> At this point, only the updater has a reference to the | ||
| 464 | newly removed data element, so it can safely reclaim | ||
| 465 | the data element, for example, by passing it to <tt>kfree()</tt>. | ||
| 466 | </ol> | ||
| 467 | |||
| 468 | This process is implemented by <tt>remove_gp_synchronous()</tt>: | ||
| 469 | |||
| 470 | <blockquote> | ||
| 471 | <pre> | ||
| 472 | 1 bool remove_gp_synchronous(void) | ||
| 473 | 2 { | ||
| 474 | 3 struct foo *p; | ||
| 475 | 4 | ||
| 476 | 5 spin_lock(&gp_lock); | ||
| 477 | 6 p = rcu_access_pointer(gp); | ||
| 478 | 7 if (!p) { | ||
| 479 | 8 spin_unlock(&gp_lock); | ||
| 480 | 9 return false; | ||
| 481 | 10 } | ||
| 482 | 11 rcu_assign_pointer(gp, NULL); | ||
| 483 | 12 spin_unlock(&gp_lock); | ||
| 484 | 13 synchronize_rcu(); | ||
| 485 | 14 kfree(p); | ||
| 486 | 15 return true; | ||
| 487 | 16 } | ||
| 488 | </pre> | ||
| 489 | </blockquote> | ||
| 490 | |||
| 491 | <p> | ||
| 492 | This function is straightforward, with line 13 waiting for a grace | ||
| 493 | period before line 14 frees the old data element. | ||
| 494 | This waiting ensures that readers will reach line 7 of | ||
| 495 | <tt>do_something_gp()</tt> before the data element referenced by | ||
| 496 | <tt>p</tt> is freed. | ||
| 497 | The <tt>rcu_access_pointer()</tt> on line 6 is similar to | ||
| 498 | <tt>rcu_dereference()</tt>, except that: | ||
| 499 | |||
| 500 | <ol> | ||
| 501 | <li> The value returned by <tt>rcu_access_pointer()</tt> | ||
| 502 | cannot be dereferenced. | ||
| 503 | If you want to access the value pointed to as well as | ||
| 504 | the pointer itself, use <tt>rcu_dereference()</tt> | ||
| 505 | instead of <tt>rcu_access_pointer()</tt>. | ||
| 506 | <li> The call to <tt>rcu_access_pointer()</tt> need not be | ||
| 507 | protected. | ||
| 508 | In contrast, <tt>rcu_dereference()</tt> must either be | ||
| 509 | within an RCU read-side critical section or in a code | ||
| 510 | segment where the pointer cannot change, for example, in | ||
| 511 | code protected by the corresponding update-side lock. | ||
| 512 | </ol> | ||
| 513 | |||
| 514 | <p>@@QQ@@ | ||
| 515 | Without the <tt>rcu_dereference()</tt> or the | ||
| 516 | <tt>rcu_access_pointer()</tt>, what destructive optimizations | ||
| 517 | might the compiler make use of? | ||
| 518 | <p>@@QQA@@ | ||
| 519 | Let's start with what happens to <tt>do_something_gp()</tt> | ||
| 520 | if it fails to use <tt>rcu_dereference()</tt>. | ||
| 521 | It could reuse a value formerly fetched from this same pointer. | ||
| 522 | It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time | ||
| 523 | manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise | ||
| 524 | mash-up of two distinct pointer values. | ||
| 525 | It might even use value-speculation optimizations, where it makes a wrong | ||
| 526 | guess, but by the time it gets around to checking the value, an update | ||
| 527 | has changed the pointer to match the wrong guess. | ||
| 528 | Too bad about any dereferences that returned pre-initialization garbage | ||
| 529 | in the meantime! | ||
| 530 | |||
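| | <p> | ||
| | For example, given the plain load on line 4 of | ||
| | <tt>do_something_gp_buggy()</tt>, the compiler would be within its | ||
| | rights to generate code resembling the following (an illustrative | ||
| | sketch of possible generated code, assuming 32-bit pointers for | ||
| | brevity, and most definitely not real compiler output): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 ((char *)&p)[0] = ((char *)&gp)[0]; /* Load tearing: gp is read */ | ||
| | 2 ((char *)&p)[1] = ((char *)&gp)[1]; /* one byte at a time, so a */ | ||
| | 3 ((char *)&p)[2] = ((char *)&gp)[2]; /* concurrent update leaves */ | ||
| | 4 ((char *)&p)[3] = ((char *)&gp)[3]; /* p a mash-up of two values. */ | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||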
| 531 | <p> | ||
| 532 | For <tt>remove_gp_synchronous()</tt>, as long as all modifications | ||
| 533 | to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, | ||
| 534 | the above optimizations are harmless. | ||
| 535 | However, | ||
| 536 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, | ||
| 537 | <tt>sparse</tt> will complain if you | ||
| 538 | define <tt>gp</tt> with <tt>__rcu</tt> and then | ||
| 539 | access it without using | ||
| 540 | either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. | ||
| 541 | <p>@@QQE@@ | ||
| 542 | |||
| 543 | <p> | ||
| 544 | In short, RCU's publish-subscribe guarantee is provided by the combination | ||
| 545 | of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. | ||
| 546 | This guarantee allows data elements to be safely added to RCU-protected | ||
| 547 | linked data structures without disrupting RCU readers. | ||
| 548 | This guarantee can be used in combination with the grace-period | ||
| 549 | guarantee to also allow data elements to be removed from RCU-protected | ||
| 550 | linked data structures, again without disrupting RCU readers. | ||
| 551 | |||
| 552 | <p> | ||
| 553 | This guarantee was only partially premeditated. | ||
| 554 | DYNIX/ptx used an explicit memory barrier for publication, but had nothing | ||
| 555 | resembling <tt>rcu_dereference()</tt> for subscription, nor did it | ||
| 556 | have anything resembling the <tt>smp_read_barrier_depends()</tt> | ||
| 557 | that was later subsumed into <tt>rcu_dereference()</tt>. | ||
| 558 | The need for these operations made itself known quite suddenly at a | ||
| 559 | late-1990s meeting with the DEC Alpha architects, back in the days when | ||
| 560 | DEC was still a free-standing company. | ||
| 561 | It took the Alpha architects a good hour to convince me that any sort | ||
| 562 | of barrier would ever be needed, and it then took me a good <i>two</i> hours | ||
| 563 | to convince them that their documentation did not make this point clear. | ||
| 564 | More recent work with the C and C++ standards committees has provided | ||
| 565 | much education on tricks and traps from the compiler. | ||
| 566 | In short, compilers were much less tricky in the early 1990s, but in | ||
| 567 | 2015, don't even think about omitting <tt>rcu_dereference()</tt>! | ||
| 568 | |||
| 569 | <h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> | ||
| 570 | |||
| 571 | <p> | ||
| 572 | The previous section's simple linked-data-structure scenario clearly | ||
| 573 | demonstrates the need for RCU's stringent memory-ordering guarantees on | ||
| 574 | systems with more than one CPU: | ||
| 575 | |||
| 576 | <ol> | ||
| 577 | <li> Each CPU that has an RCU read-side critical section that | ||
| 578 | begins before <tt>synchronize_rcu()</tt> starts is | ||
| 579 | guaranteed to execute a full memory barrier between the time | ||
| 580 | that the RCU read-side critical section ends and the time that | ||
| 581 | <tt>synchronize_rcu()</tt> returns. | ||
| 582 | Without this guarantee, a pre-existing RCU read-side critical section | ||
| 583 | might hold a reference to the newly removed <tt>struct foo</tt> | ||
| 584 | after the <tt>kfree()</tt> on line 14 of | ||
| 585 | <tt>remove_gp_synchronous()</tt>. | ||
| 586 | <li> Each CPU that has an RCU read-side critical section that ends | ||
| 587 | after <tt>synchronize_rcu()</tt> returns is guaranteed | ||
| 588 | to execute a full memory barrier between the time that | ||
| 589 | <tt>synchronize_rcu()</tt> begins and the time that the RCU | ||
| 590 | read-side critical section begins. | ||
| 591 | Without this guarantee, a later RCU read-side critical section | ||
| 592 | running after the <tt>kfree()</tt> on line 14 of | ||
| 593 | <tt>remove_gp_synchronous()</tt> might | ||
| 594 | later run <tt>do_something_gp()</tt> and find the | ||
| 595 | newly deleted <tt>struct foo</tt>. | ||
| 596 | <li> If the task invoking <tt>synchronize_rcu()</tt> remains | ||
| 597 | on a given CPU, then that CPU is guaranteed to execute a full | ||
| 598 | memory barrier sometime during the execution of | ||
| 599 | <tt>synchronize_rcu()</tt>. | ||
| 600 | This guarantee ensures that the <tt>kfree()</tt> on | ||
| 601 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
| 602 | execute after the removal on line 11. | ||
| 603 | <li> If the task invoking <tt>synchronize_rcu()</tt> migrates | ||
| 604 | among a group of CPUs during that invocation, then each of the | ||
| 605 | CPUs in that group is guaranteed to execute a full memory barrier | ||
| 606 | sometime during the execution of <tt>synchronize_rcu()</tt>. | ||
| 607 | This guarantee also ensures that the <tt>kfree()</tt> on | ||
| 608 | line 14 of <tt>remove_gp_synchronous()</tt> really does | ||
| 609 | execute after the removal on | ||
| 610 | line 11, even in the case where the thread executing the | ||
| 611 | <tt>synchronize_rcu()</tt> migrates in the meantime. | ||
| 612 | </ol> | ||
| 613 | |||
| 614 | <p>@@QQ@@ | ||
| 615 | Given that multiple CPUs can start RCU read-side critical sections | ||
| 616 | at any time without any ordering whatsoever, how can RCU possibly tell whether | ||
| 617 | or not a given RCU read-side critical section starts before a | ||
| 618 | given instance of <tt>synchronize_rcu()</tt>? | ||
| 619 | <p>@@QQA@@ | ||
| 620 | If RCU cannot tell whether or not a given | ||
| 621 | RCU read-side critical section starts before a | ||
| 622 | given instance of <tt>synchronize_rcu()</tt>, | ||
| 623 | then it must assume that the RCU read-side critical section | ||
| 624 | started first. | ||
| 625 | In other words, a given instance of <tt>synchronize_rcu()</tt> | ||
| 626 | can avoid waiting on a given RCU read-side critical section only | ||
| 627 | if it can prove that <tt>synchronize_rcu()</tt> started first. | ||
| 628 | <p>@@QQE@@ | ||
| 629 | |||
| 630 | <p>@@QQ@@ | ||
| 631 | The first and second guarantees require unbelievably strict ordering! | ||
| 632 | Are all these memory barriers <i> really</i> required? | ||
| 633 | <p>@@QQA@@ | ||
| 634 | Yes, they really are required. | ||
| 635 | To see why the first guarantee is required, consider the following | ||
| 636 | sequence of events: | ||
| 637 | |||
| 638 | <ol> | ||
| 639 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
| 640 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
| 641 | /* Very likely to return p. */</tt> | ||
| 642 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
| 643 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
| 644 | <li> CPU 1: <tt>do_something_with(q->a); | ||
| 645 | /* No smp_mb(), so might happen after kfree(). */</tt> | ||
| 646 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
| 647 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
| 648 | <li> CPU 0: <tt>kfree(p);</tt> | ||
| 649 | </ol> | ||
| 650 | |||
| 651 | <p> | ||
| 652 | Therefore, there absolutely must be a full memory barrier between the | ||
| 653 | end of the RCU read-side critical section and the end of the | ||
| 654 | grace period. | ||
| 655 | |||
| 656 | <p> | ||
| 657 | The sequence of events demonstrating the necessity of the second rule | ||
| 658 | is roughly similar: | ||
| 659 | |||
| 660 | <ol> | ||
| 661 | <li> CPU 0: <tt>list_del_rcu(p);</tt> | ||
| 662 | <li> CPU 0: <tt>synchronize_rcu()</tt> starts. | ||
| 663 | <li> CPU 1: <tt>rcu_read_lock()</tt> | ||
| 664 | <li> CPU 1: <tt>q = rcu_dereference(gp); | ||
| 665 | /* Might return p if no memory barrier. */</tt> | ||
| 666 | <li> CPU 0: <tt>synchronize_rcu()</tt> returns. | ||
| 667 | <li> CPU 0: <tt>kfree(p);</tt> | ||
| 668 | <li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> | ||
| 669 | <li> CPU 1: <tt>rcu_read_unlock()</tt> | ||
| 670 | </ol> | ||
| 671 | |||
| 672 | <p> | ||
| 673 | And similarly, without a memory barrier between the beginning of the | ||
| 674 | grace period and the beginning of the RCU read-side critical section, | ||
| 675 | CPU 1 might end up accessing the freelist. | ||
| 676 | |||
| 677 | <p> | ||
| 678 | The “as if” rule of course applies, so that any implementation | ||
| 679 | that acts as if the appropriate memory barriers were in place is a | ||
| 680 | correct implementation. | ||
| 681 | That said, it is much easier to fool yourself into believing that you have | ||
| 682 | adhered to the as-if rule than it is to actually adhere to it! | ||
| 683 | <p>@@QQE@@ | ||
| 684 | |||
| 685 | <p> | ||
| 686 | Note that these memory-barrier requirements do not replace the fundamental | ||
| 687 | RCU requirement that a grace period wait for all pre-existing readers. | ||
| 688 | On the contrary, the memory barriers called out in this section must operate in | ||
| 689 | such a way as to <i>enforce</i> this fundamental requirement. | ||
| 690 | Of course, different implementations enforce this requirement in different | ||
| 691 | ways, but enforce it they must. | ||
| 692 | |||
| 693 | <h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> | ||
| 694 | |||
| 695 | <p> | ||
| 696 | The common-case RCU primitives are unconditional. | ||
| 697 | They are invoked, they do their job, and they return, with no possibility | ||
| 698 | of error, and no need to retry. | ||
| 699 | This is a key RCU design philosophy. | ||
| 700 | |||
| 701 | <p> | ||
| 702 | However, this philosophy is pragmatic rather than pigheaded. | ||
| 703 | If someone comes up with a good justification for a particular conditional | ||
| 704 | RCU primitive, it might well be implemented and added. | ||
| 705 | After all, this guarantee was reverse-engineered, not premeditated. | ||
| 706 | The unconditional nature of the RCU primitives was initially an | ||
| 707 | accident of implementation, and later experience with conditional | ||
| 708 | synchronization primitives caused me to elevate this | ||
| 709 | accident to a guarantee. | ||
| 710 | Therefore, the justification for adding a conditional primitive to | ||
| 711 | RCU would need to be based on detailed and compelling use cases. | ||
| 712 | |||
| 713 | <h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> | ||
| 714 | |||
| 715 | <p> | ||
| 716 | As far as RCU is concerned, it is always possible to carry out an | ||
| 717 | update within an RCU read-side critical section. | ||
| 718 | For example, that RCU read-side critical section might search for | ||
| 719 | a given data element, and then might acquire the update-side | ||
| 720 | spinlock in order to update that element, all while remaining | ||
| 721 | in that RCU read-side critical section. | ||
| 722 | Of course, it is necessary to exit the RCU read-side critical section | ||
| 723 | before invoking <tt>synchronize_rcu()</tt>; however, this | ||
| 724 | inconvenience can be avoided through use of the | ||
| 725 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members | ||
| 726 | described later in this document. | ||
| 727 | |||
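| | <p> | ||
| | A hypothetical sketch of this pattern, reusing <tt>gp</tt> and | ||
| | <tt>gp_lock</tt> from the earlier examples (a production version | ||
| | would also need to recheck, while holding the lock, that the element | ||
| | is still reachable): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 bool upgrade_and_set_b(int b) | ||
| | 2 { | ||
| | 3 struct foo *p; | ||
| | 4 | ||
| | 5 rcu_read_lock(); /* Read-side search... */ | ||
| | 6 p = rcu_dereference(gp); | ||
| | 7 if (!p) { | ||
| | 8 rcu_read_unlock(); | ||
| | 9 return false; | ||
| | 10 } | ||
| | 11 spin_lock(&gp_lock); /* ...upgraded to an update. */ | ||
| | 12 WRITE_ONCE(p->b, b); | ||
| | 13 spin_unlock(&gp_lock); | ||
| | 14 rcu_read_unlock(); | ||
| | 15 return true; | ||
| | 16 } | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||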
| 728 | <p>@@QQ@@ | ||
| 729 | But how does the upgrade-to-write operation exclude other readers? | ||
| 730 | <p>@@QQA@@ | ||
| 731 | It doesn't, just like normal RCU updates, which also do not exclude | ||
| 732 | RCU readers. | ||
| 733 | <p>@@QQE@@ | ||
| 734 | |||
| 735 | <p> | ||
| 736 | This guarantee allows lookup code to be shared between read-side | ||
| 737 | and update-side code, and was premeditated, appearing in the earliest | ||
| 738 | DYNIX/ptx RCU documentation. | ||
| 739 | |||
| 740 | <h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> | ||
| 741 | |||
| 742 | <p> | ||
| 743 | RCU provides extremely lightweight readers, and its read-side guarantees, | ||
| 744 | though quite useful, are correspondingly lightweight. | ||
| 745 | It is therefore all too easy to assume that RCU is guaranteeing more | ||
| 746 | than it really is. | ||
| 747 | Of course, the list of things that RCU does not guarantee is infinitely | ||
| 748 | long; however, the following sections list a few non-guarantees that | ||
| 749 | have caused confusion. | ||
| 750 | Except where otherwise noted, these non-guarantees were premeditated. | ||
| 751 | |||
| 752 | <ol> | ||
| 753 | <li> <a href="#Readers Impose Minimal Ordering"> | ||
| 754 | Readers Impose Minimal Ordering</a> | ||
| 755 | <li> <a href="#Readers Do Not Exclude Updaters"> | ||
| 756 | Readers Do Not Exclude Updaters</a> | ||
| 757 | <li> <a href="#Updaters Only Wait For Old Readers"> | ||
| 758 | Updaters Only Wait For Old Readers</a> | ||
| 759 | <li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> | ||
| 760 | Grace Periods Don't Partition Read-Side Critical Sections</a> | ||
| 761 | <li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> | ||
| 762 | Read-Side Critical Sections Don't Partition Grace Periods</a> | ||
| 763 | <li> <a href="#Disabling Preemption Does Not Block Grace Periods"> | ||
| 764 | Disabling Preemption Does Not Block Grace Periods</a> | ||
| 765 | </ol> | ||
| 766 | |||
| 767 | <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> | ||
| 768 | |||
| 769 | <p> | ||
| 770 | Reader-side markers such as <tt>rcu_read_lock()</tt> and | ||
| 771 | <tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees | ||
| 772 | except through their interaction with the grace-period APIs such as | ||
| 773 | <tt>synchronize_rcu()</tt>. | ||
| 774 | To see this, consider the following pair of threads: | ||
| 775 | |||
| 776 | <blockquote> | ||
| 777 | <pre> | ||
| 778 | 1 void thread0(void) | ||
| 779 | 2 { | ||
| 780 | 3 rcu_read_lock(); | ||
| 781 | 4 WRITE_ONCE(x, 1); | ||
| 782 | 5 rcu_read_unlock(); | ||
| 783 | 6 rcu_read_lock(); | ||
| 784 | 7 WRITE_ONCE(y, 1); | ||
| 785 | 8 rcu_read_unlock(); | ||
| 786 | 9 } | ||
| 787 | 10 | ||
| 788 | 11 void thread1(void) | ||
| 789 | 12 { | ||
| 790 | 13 rcu_read_lock(); | ||
| 791 | 14 r1 = READ_ONCE(y); | ||
| 792 | 15 rcu_read_unlock(); | ||
| 793 | 16 rcu_read_lock(); | ||
| 794 | 17 r2 = READ_ONCE(x); | ||
| 795 | 18 rcu_read_unlock(); | ||
| 796 | 19 } | ||
| 797 | </pre> | ||
| 798 | </blockquote> | ||
| 799 | |||
| 800 | <p> | ||
| 801 | After <tt>thread0()</tt> and <tt>thread1()</tt> execute | ||
| 802 | concurrently, it is quite possible to have | ||
| 803 | |||
| 804 | <blockquote> | ||
| 805 | <pre> | ||
| 806 | (r1 == 1 && r2 == 0) | ||
| 807 | </pre> | ||
| 808 | </blockquote> | ||
| 809 | |||
| 810 | (that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), | ||
| 811 | which would not be possible if <tt>rcu_read_lock()</tt> and | ||
| 812 | <tt>rcu_read_unlock()</tt> had much in the way of ordering | ||
| 813 | properties. | ||
| 814 | But they do not, so the CPU is within its rights | ||
| 815 | to do significant reordering. | ||
| 816 | This is by design: Any significant ordering constraints would slow down | ||
| 817 | these fast-path APIs. | ||
| 818 | |||
| 819 | <p>@@QQ@@ | ||
| 820 | Can't the compiler also reorder this code? | ||
| 821 | <p>@@QQA@@ | ||
| 822 | No, the volatile casts in <tt>READ_ONCE()</tt> and | ||
| 823 | <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in | ||
| 824 | this particular case. | ||
| 825 | <p>@@QQE@@ | ||
| 826 | |||
| 827 | <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> | ||
| 828 | |||
| 829 | <p> | ||
| 830 | Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> | ||
| 831 | exclude updates. | ||
| 832 | All they do is to prevent grace periods from ending. | ||
| 833 | The following example illustrates this: | ||
| 834 | |||
| 835 | <blockquote> | ||
| 836 | <pre> | ||
| 837 | 1 void thread0(void) | ||
| 838 | 2 { | ||
| 839 | 3 rcu_read_lock(); | ||
| 840 | 4 r1 = READ_ONCE(y); | ||
| 841 | 5 if (r1) { | ||
| 842 | 6 do_something_with_nonzero_x(); | ||
| 843 | 7 r2 = READ_ONCE(x); | ||
| 844 | 8 WARN_ON(!r2); /* BUG!!! */ | ||
| 845 | 9 } | ||
| 846 | 10 rcu_read_unlock(); | ||
| 847 | 11 } | ||
| 848 | 12 | ||
| 849 | 13 void thread1(void) | ||
| 850 | 14 { | ||
| 851 | 15 spin_lock(&my_lock); | ||
| 852 | 16 WRITE_ONCE(x, 1); | ||
| 853 | 17 WRITE_ONCE(y, 1); | ||
| 854 | 18 spin_unlock(&my_lock); | ||
| 855 | 19 } | ||
| 856 | </pre> | ||
| 857 | </blockquote> | ||
| 858 | |||
| 859 | <p> | ||
| 860 | If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> | ||
| 861 | excluded the <tt>thread1()</tt> function's update, | ||
| 862 | the <tt>WARN_ON()</tt> could never fire. | ||
| 863 | But the fact is that <tt>rcu_read_lock()</tt> does not exclude | ||
| 864 | much of anything aside from subsequent grace periods, of which | ||
| 865 | <tt>thread1()</tt> has none, so the | ||
| 866 | <tt>WARN_ON()</tt> can and does fire. | ||
| 867 | |||
| 868 | <h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> | ||
| 869 | |||
| 870 | <p> | ||
| 871 | It might be tempting to assume that after <tt>synchronize_rcu()</tt> | ||
| 872 | completes, there are no readers executing. | ||
| 873 | This temptation must be avoided because | ||
| 874 | new readers can start immediately after <tt>synchronize_rcu()</tt> | ||
| 875 | starts, and <tt>synchronize_rcu()</tt> is under no | ||
| 876 | obligation to wait for these new readers. | ||
| 877 | |||
| 878 | <p>@@QQ@@ | ||
| 879 | Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed. | ||
| 880 | Would the updater be able to rely on this? | ||
| 881 | <p>@@QQA@@ | ||
| 882 | No. | ||
| 883 | Even if <tt>synchronize_rcu()</tt> were to wait until | ||
| 884 | all readers had completed, a new reader might start immediately after | ||
| 885 | <tt>synchronize_rcu()</tt> completed. | ||
| 886 | Therefore, the code following | ||
| 887 | <tt>synchronize_rcu()</tt> cannot rely on there being no readers | ||
| 888 | in any case. | ||
| 889 | <p>@@QQE@@ | ||
| 890 | |||
| 891 | <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> | ||
| 892 | Grace Periods Don't Partition Read-Side Critical Sections</a></h3> | ||
| 893 | |||
| 894 | <p> | ||
| 895 | It is tempting to assume that if any part of one RCU read-side critical | ||
| 896 | section precedes a given grace period, and if any part of another RCU | ||
| 897 | read-side critical section follows that same grace period, then all of | ||
| 898 | the first RCU read-side critical section must precede all of the second. | ||
| 899 | However, this just isn't the case: A single grace period does not | ||
| 900 | partition the set of RCU read-side critical sections. | ||
| 901 | An example of this situation can be illustrated as follows, where | ||
| 902 | <tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: | ||
| 903 | |||
| 904 | <blockquote> | ||
| 905 | <pre> | ||
| 906 | 1 void thread0(void) | ||
| 907 | 2 { | ||
| 908 | 3 rcu_read_lock(); | ||
| 909 | 4 WRITE_ONCE(a, 1); | ||
| 910 | 5 WRITE_ONCE(b, 1); | ||
| 911 | 6 rcu_read_unlock(); | ||
| 912 | 7 } | ||
| 913 | 8 | ||
| 914 | 9 void thread1(void) | ||
| 915 | 10 { | ||
| 916 | 11 r1 = READ_ONCE(a); | ||
| 917 | 12 synchronize_rcu(); | ||
| 918 | 13 WRITE_ONCE(c, 1); | ||
| 919 | 14 } | ||
| 920 | 15 | ||
| 921 | 16 void thread2(void) | ||
| 922 | 17 { | ||
| 923 | 18 rcu_read_lock(); | ||
| 924 | 19 r2 = READ_ONCE(b); | ||
| 925 | 20 r3 = READ_ONCE(c); | ||
| 926 | 21 rcu_read_unlock(); | ||
| 927 | 22 } | ||
| 928 | </pre> | ||
| 929 | </blockquote> | ||
| 930 | |||
| 931 | <p> | ||
| 932 | It turns out that the outcome: | ||
| 933 | |||
| 934 | <blockquote> | ||
| 935 | <pre> | ||
| 936 | (r1 == 1 && r2 == 0 && r3 == 1) | ||
| 937 | </pre> | ||
| 938 | </blockquote> | ||
| 939 | |||
| 940 | is entirely possible. | ||
| 941 | The following figure shows how this can happen, with each circled | ||
| 942 | <tt>QS</tt> indicating the point at which RCU recorded a | ||
| 943 | <i>quiescent state</i> for each thread, that is, a state in which | ||
| 944 | RCU knows that the thread cannot be in the midst of an RCU read-side | ||
| 945 | critical section that started before the current grace period: | ||
| 946 | |||
| 947 | <p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> | ||
| 948 | |||
| 949 | <p> | ||
| 950 | If it is necessary to partition RCU read-side critical sections in this | ||
| 951 | manner, it is necessary to use two grace periods, where the first | ||
| 952 | grace period is known to end before the second grace period starts: | ||
| 953 | |||
| 954 | <blockquote> | ||
| 955 | <pre> | ||
| 956 | 1 void thread0(void) | ||
| 957 | 2 { | ||
| 958 | 3 rcu_read_lock(); | ||
| 959 | 4 WRITE_ONCE(a, 1); | ||
| 960 | 5 WRITE_ONCE(b, 1); | ||
| 961 | 6 rcu_read_unlock(); | ||
| 962 | 7 } | ||
| 963 | 8 | ||
| 964 | 9 void thread1(void) | ||
| 965 | 10 { | ||
| 966 | 11 r1 = READ_ONCE(a); | ||
| 967 | 12 synchronize_rcu(); | ||
| 968 | 13 WRITE_ONCE(c, 1); | ||
| 969 | 14 } | ||
| 970 | 15 | ||
| 971 | 16 void thread2(void) | ||
| 972 | 17 { | ||
| 973 | 18 r2 = READ_ONCE(c); | ||
| 974 | 19 synchronize_rcu(); | ||
| 975 | 20 WRITE_ONCE(d, 1); | ||
| 976 | 21 } | ||
| 977 | 22 | ||
| 978 | 23 void thread3(void) | ||
| 979 | 24 { | ||
| 980 | 25 rcu_read_lock(); | ||
| 981 | 26 r3 = READ_ONCE(b); | ||
| 982 | 27 r4 = READ_ONCE(d); | ||
| 983 | 28 rcu_read_unlock(); | ||
| 984 | 29 } | ||
| 985 | </pre> | ||
| 986 | </blockquote> | ||
| 987 | |||
| 988 | <p> | ||
| 989 | Here, if <tt>(r1 == 1)</tt>, then | ||
| 990 | <tt>thread0()</tt>'s write to <tt>b</tt> must happen | ||
| 991 | before the end of <tt>thread1()</tt>'s grace period. | ||
| 992 | If in addition <tt>(r4 == 1)</tt>, then | ||
| 993 | <tt>thread3()</tt>'s read from <tt>b</tt> must happen | ||
| 994 | after the beginning of <tt>thread2()</tt>'s grace period. | ||
| 995 | If it is also the case that <tt>(r2 == 1)</tt>, then the | ||
| 996 | end of <tt>thread1()</tt>'s grace period must precede the | ||
| 997 | beginning of <tt>thread2()</tt>'s grace period. | ||
| 998 | This means that the two RCU read-side critical sections cannot overlap, | ||
| 999 | guaranteeing that <tt>(r3 == 1)</tt>. | ||
| 1000 | As a result, the outcome: | ||
| 1001 | |||
| 1002 | <blockquote> | ||
| 1003 | <pre> | ||
| 1004 | (r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) | ||
| 1005 | </pre> | ||
| 1006 | </blockquote> | ||
| 1007 | |||
| 1008 | cannot happen. | ||
| 1009 | |||
| 1010 | <p> | ||
| 1011 | This non-requirement was also non-premeditated, but became apparent | ||
| 1012 | when studying RCU's interaction with memory ordering. | ||
| 1013 | |||
| 1014 | <h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> | ||
| 1015 | Read-Side Critical Sections Don't Partition Grace Periods</a></h3> | ||
| 1016 | |||
| 1017 | <p> | ||
| 1018 | It is also tempting to assume that if an RCU read-side critical section | ||
| 1019 | happens between a pair of grace periods, then those grace periods cannot | ||
| 1020 | overlap. | ||
| 1021 | However, this temptation leads nowhere good, as can be illustrated by | ||
| 1022 | the following, with all variables initially zero: | ||
| 1023 | |||
| 1024 | <blockquote> | ||
| 1025 | <pre> | ||
| 1026 | 1 void thread0(void) | ||
| 1027 | 2 { | ||
| 1028 | 3 rcu_read_lock(); | ||
| 1029 | 4 WRITE_ONCE(a, 1); | ||
| 1030 | 5 WRITE_ONCE(b, 1); | ||
| 1031 | 6 rcu_read_unlock(); | ||
| 1032 | 7 } | ||
| 1033 | 8 | ||
| 1034 | 9 void thread1(void) | ||
| 1035 | 10 { | ||
| 1036 | 11 r1 = READ_ONCE(a); | ||
| 1037 | 12 synchronize_rcu(); | ||
| 1038 | 13 WRITE_ONCE(c, 1); | ||
| 1039 | 14 } | ||
| 1040 | 15 | ||
| 1041 | 16 void thread2(void) | ||
| 1042 | 17 { | ||
| 1043 | 18 rcu_read_lock(); | ||
| 1044 | 19 WRITE_ONCE(d, 1); | ||
| 1045 | 20 r2 = READ_ONCE(c); | ||
| 1046 | 21 rcu_read_unlock(); | ||
| 1047 | 22 } | ||
| 1048 | 23 | ||
| 1049 | 24 void thread3(void) | ||
| 1050 | 25 { | ||
| 1051 | 26 r3 = READ_ONCE(d); | ||
| 1052 | 27 synchronize_rcu(); | ||
| 1053 | 28 WRITE_ONCE(e, 1); | ||
| 1054 | 29 } | ||
| 1055 | 30 | ||
| 1056 | 31 void thread4(void) | ||
| 1057 | 32 { | ||
| 1058 | 33 rcu_read_lock(); | ||
| 1059 | 34 r4 = READ_ONCE(b); | ||
| 1060 | 35 r5 = READ_ONCE(e); | ||
| 1061 | 36 rcu_read_unlock(); | ||
| 1062 | 37 } | ||
| 1063 | </pre> | ||
| 1064 | </blockquote> | ||
| 1065 | |||
| 1066 | <p> | ||
| 1067 | In this case, the outcome: | ||
| 1068 | |||
| 1069 | <blockquote> | ||
| 1070 | <pre> | ||
| 1071 | (r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) | ||
| 1072 | </pre> | ||
| 1073 | </blockquote> | ||
| 1074 | |||
| 1075 | is entirely possible, as illustrated below: | ||
| 1076 | |||
| 1077 | <p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> | ||
| 1078 | |||
| 1079 | <p> | ||
| 1080 | Again, an RCU read-side critical section can overlap almost all of a | ||
| 1081 | given grace period, just so long as it does not overlap the entire | ||
| 1082 | grace period. | ||
| 1083 | As a result, an RCU read-side critical section cannot partition a pair | ||
| 1084 | of RCU grace periods. | ||
| 1085 | |||
| 1086 | <p>@@QQ@@ | ||
| 1087 | How long would a sequence of grace periods, each separated by an RCU | ||
| 1088 | read-side critical section, need to be in order to partition the RCU | ||
| 1089 | read-side critical sections at the beginning and end of the chain? | ||
| 1090 | <p>@@QQA@@ | ||
| 1091 | In theory, an infinite number. | ||
| 1092 | In practice, an unknown number that is sensitive to both implementation | ||
| 1093 | details and timing considerations. | ||
| 1094 | Therefore, even in practice, RCU users must abide by the theoretical rather | ||
| 1095 | than the practical answer. | ||
| 1096 | <p>@@QQE@@ | ||
| 1097 | |||
| 1098 | <h3><a name="Disabling Preemption Does Not Block Grace Periods"> | ||
| 1099 | Disabling Preemption Does Not Block Grace Periods</a></h3> | ||
| 1100 | |||
| 1101 | <p> | ||
| 1102 | There was a time when disabling preemption on any given CPU would block | ||
| 1103 | subsequent grace periods. | ||
| 1104 | However, this was an accident of implementation and is not a requirement. | ||
| 1105 | And in the current Linux-kernel implementation, disabling preemption | ||
| 1106 | on a given CPU in fact does not block grace periods, as Oleg Nesterov | ||
| 1107 | <a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. | ||
| 1108 | |||
| 1109 | <p> | ||
| 1110 | If you need a preempt-disable region to block grace periods, you need to add | ||
| 1111 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example | ||
| 1112 | as follows: | ||
| 1113 | |||
| 1114 | <blockquote> | ||
| 1115 | <pre> | ||
| 1116 | 1 preempt_disable(); | ||
| 1117 | 2 rcu_read_lock(); | ||
| 1118 | 3 do_something(); | ||
| 1119 | 4 rcu_read_unlock(); | ||
| 1120 | 5 preempt_enable(); | ||
| 1121 | 6 | ||
| 1122 | 7 /* Spinlocks implicitly disable preemption. */ | ||
| 1123 | 8 spin_lock(&mylock); | ||
| 1124 | 9 rcu_read_lock(); | ||
| 1125 | 10 do_something(); | ||
| 1126 | 11 rcu_read_unlock(); | ||
| 1127 | 12 spin_unlock(&mylock); | ||
| 1128 | </pre> | ||
| 1129 | </blockquote> | ||
| 1130 | |||
| 1131 | <p> | ||
| 1132 | In theory, you could enter the RCU read-side critical section first, | ||
| 1133 | but it is more efficient to keep the entire RCU read-side critical | ||
| 1134 | section contained in the preempt-disable region as shown above. | ||
| 1135 | Of course, RCU read-side critical sections that extend outside of | ||
| 1136 | preempt-disable regions will work correctly, but such critical sections | ||
| 1137 | can be preempted, which forces <tt>rcu_read_unlock()</tt> to do | ||
| 1138 | more work. | ||
| 1139 | And no, this is <i>not</i> an invitation to enclose all of your RCU | ||
| 1140 | read-side critical sections within preempt-disable regions, because | ||
| 1141 | doing so would degrade real-time response. | ||
| 1142 | |||
| 1143 | <p> | ||
| 1144 | This non-requirement appeared with preemptible RCU. | ||
| 1145 | If you need a grace period that waits on non-preemptible code regions, use | ||
| 1146 | <a href="#Sched Flavor">RCU-sched</a>. | ||
| 1147 | |||
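| | <p> | ||
| | A minimal sketch of the RCU-sched pattern (assuming this era's | ||
| | <tt>synchronize_sched()</tt> API): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 /* Reader: any preempt-disable region is a critical section. */ | ||
| | 2 preempt_disable(); | ||
| | 3 do_something(); | ||
| | 4 preempt_enable(); | ||
| | 5 | ||
| | 6 /* Updater: waits for all pre-existing preempt-disable regions. */ | ||
| | 7 synchronize_sched(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||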
| 1148 | <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> | ||
| 1149 | |||
| 1150 | <p> | ||
| 1151 | These parallelism facts of life are by no means specific to RCU, but | ||
| 1152 | the RCU implementation must abide by them. | ||
| 1153 | They therefore bear repeating: | ||
| 1154 | |||
| 1155 | <ol> | ||
| 1156 | <li> Any CPU or task may be delayed at any time, | ||
| 1157 | and any attempts to avoid these delays by disabling | ||
| 1158 | preemption, interrupts, or whatever are completely futile. | ||
| 1159 | This is most obvious in preemptible user-level | ||
| 1160 | environments and in virtualized environments (where | ||
| 1161 | a given guest OS's VCPUs can be preempted at any time by | ||
| 1162 | the underlying hypervisor), but can also happen in bare-metal | ||
| 1163 | environments due to ECC errors, NMIs, and other hardware | ||
| 1164 | events. | ||
| 1165 | Although a delay of more than about 20 seconds can result | ||
| 1166 | in splats, the RCU implementation is obligated to use | ||
| 1167 | algorithms that can tolerate extremely long delays, but where | ||
| 1168 | “extremely long” is not long enough to allow | ||
| 1169 | wrap-around when incrementing a 64-bit counter. | ||
| 1170 | <li> Both the compiler and the CPU can reorder memory accesses. | ||
| 1171 | Where it matters, RCU must use compiler directives and | ||
| 1172 | memory-barrier instructions to preserve ordering. | ||
| 1173 | <li> Conflicting writes to memory locations in any given cache line | ||
| 1174 | will result in expensive cache misses. | ||
| 1175 | Greater numbers of concurrent writes and more-frequent | ||
| 1176 | concurrent writes will result in more dramatic slowdowns. | ||
| 1177 | RCU is therefore obligated to use algorithms that have | ||
| 1178 | sufficient locality to avoid significant performance and | ||
| 1179 | scalability problems. | ||
| 1180 | <li> As a rough rule of thumb, only one CPU's worth of processing | ||
| 1181 | may be carried out under the protection of any given exclusive | ||
| 1182 | lock. | ||
| 1183 | RCU must therefore use scalable locking designs. | ||
| 1184 | <li> Counters are finite, especially on 32-bit systems. | ||
| 1185 | RCU's use of counters must therefore tolerate counter wrap, | ||
| 1186 | or be designed such that counter wrap would take way more | ||
| 1187 | time than a single system is likely to run. | ||
| 1188 | An uptime of ten years is quite possible, a runtime | ||
| 1189 | of a century much less so. | ||
| 1190 | As an example of the latter, RCU's dyntick-idle nesting counter | ||
| 1191 | allows 54 bits for interrupt nesting level (this counter | ||
| 1192 | is 64 bits even on a 32-bit system). | ||
| 1193 | Overflowing this counter requires 2<sup>54</sup> | ||
| 1194 | half-interrupts on a given CPU without that CPU ever going idle. | ||
| 1195 | If a half-interrupt happened every microsecond, it would take | ||
| 1196 | 570 years of runtime to overflow this counter (see the calculation | ||
| 1197 | following this list), which is currently believed to be an acceptably long time. | ||
| 1198 | <li> Linux systems can have thousands of CPUs running a single | ||
| 1199 | Linux kernel in a single shared-memory environment. | ||
| 1200 | RCU must therefore pay close attention to high-end scalability. | ||
| 1201 | </ol> | ||
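| | <p> | ||
| | The 570-year figure from the list above can be checked with a bit of | ||
| | arithmetic, approximating one year as 3.16 * 10<sup>7</sup> seconds: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 2^54 microseconds = 1.80 * 10^16 microseconds | ||
| | = 1.80 * 10^10 seconds | ||
| | ~= 571 years | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||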
| 1202 | |||
| 1203 | <p> | ||
| 1204 | This last parallelism fact of life means that RCU must pay special | ||
| 1205 | attention to the preceding facts of life. | ||
| 1206 | The idea that Linux might scale to systems with thousands of CPUs would | ||
| 1207 | have been met with some skepticism in the 1990s, but these requirements | ||
| 1208 | would otherwise have been unsurprising, even in the early 1990s. | ||
| 1209 | |||
| 1210 | <h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> | ||
| 1211 | |||
| 1212 | <p> | ||
| 1213 | These sections list quality-of-implementation requirements. | ||
| 1214 | Although an RCU implementation that ignores these requirements could | ||
| 1215 | still be used, it would likely be subject to limitations that would | ||
| 1216 | make it inappropriate for industrial-strength production use. | ||
| 1217 | Classes of quality-of-implementation requirements are as follows: | ||
| 1218 | |||
| 1219 | <ol> | ||
| 1220 | <li> <a href="#Specialization">Specialization</a> | ||
| 1221 | <li> <a href="#Performance and Scalability">Performance and Scalability</a> | ||
| 1222 | <li> <a href="#Composability">Composability</a> | ||
| 1223 | <li> <a href="#Corner Cases">Corner Cases</a> | ||
| 1224 | </ol> | ||
| 1225 | |||
| 1226 | <p> | ||
| 1227 | These classes are covered in the following sections. | ||
| 1228 | |||
| 1229 | <h3><a name="Specialization">Specialization</a></h3> | ||
| 1230 | |||
| 1231 | <p> | ||
| 1232 | RCU is and always has been intended primarily for read-mostly situations, as | ||
| 1233 | illustrated by the following figure. | ||
| 1234 | This means that RCU's read-side primitives are optimized, often at the | ||
| 1235 | expense of its update-side primitives. | ||
| 1236 | |||
| 1237 | <p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> | ||
| 1238 | |||
| 1239 | <p> | ||
| 1240 | This focus on read-mostly situations means that RCU must interoperate | ||
| 1241 | with other synchronization primitives. | ||
| 1242 | For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> | ||
| 1243 | examples discussed earlier use RCU to protect readers and locking to | ||
| 1244 | coordinate updaters. | ||
| 1245 | However, the need extends much further, requiring that a variety of | ||
| 1246 | synchronization primitives be legal within RCU read-side critical sections, | ||
| 1247 | including spinlocks, sequence locks, atomic operations, reference | ||
| 1248 | counters, and memory barriers. | ||
| 1249 | |||
| 1250 | <p>@@QQ@@ | ||
| 1251 | What about sleeping locks? | ||
| 1252 | <p>@@QQA@@ | ||
| 1253 | These are forbidden within Linux-kernel RCU read-side critical sections | ||
| 1254 | because it is not legal to place a quiescent state (in this case, | ||
| 1255 | voluntary context switch) within an RCU read-side critical section. | ||
| 1256 | However, sleeping locks may be used within userspace RCU read-side critical | ||
| 1257 | sections, and also within Linux-kernel sleepable RCU | ||
| 1258 | <a href="#Sleepable RCU">(SRCU)</a> | ||
| 1259 | read-side critical sections. | ||
| 1260 | In addition, the -rt patchset turns spinlocks into sleeping locks so | ||
| 1261 | that the corresponding critical sections can be preempted, which | ||
| 1262 | also means that these sleeplockified spinlocks (but not other sleeping locks!) | ||
| 1263 | may be acquired within -rt-Linux-kernel RCU read-side critical sections. | ||
| 1264 | |||
| 1265 | <p> | ||
| 1266 | Note that it <i>is</i> legal for a normal RCU read-side critical section | ||
| 1267 | to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), | ||
| 1268 | but only as long as it does not loop indefinitely attempting to | ||
| 1269 | conditionally acquire that sleeping lock. | ||
| 1270 | The key point is that things like <tt>mutex_trylock()</tt> | ||
| 1271 | either return with the mutex held, or return an error indication if | ||
| 1272 | the mutex was not immediately available. | ||
| 1273 | Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. | ||
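| | |||
| | <p> | ||
| | For example (a hypothetical sketch, assuming a <tt>struct foo</tt> | ||
| | containing its own <tt>lock</tt> mutex): | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 rcu_read_lock(); | ||
| | 2 p = rcu_dereference(gp); | ||
| | 3 if (p && mutex_trylock(&p->lock)) { /* Never sleeps. */ | ||
| | 4 do_something_with(p); | ||
| | 5 mutex_unlock(&p->lock); | ||
| | 6 } | ||
| | 7 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||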
| 1274 | <p>@@QQE@@ | ||
| 1275 | |||
| 1276 | <p> | ||
| 1277 | It often comes as a surprise that many algorithms do not require a | ||
| 1278 | consistent view of data, | ||
| 1279 | with network routing being the poster child. | ||
| 1280 | Internet routing algorithms take significant time to propagate | ||
| 1281 | updates, so that by the time an update arrives at a given system, | ||
| 1282 | that system has been sending network traffic the wrong way for | ||
| 1283 | a considerable length of time. | ||
| 1284 | Having a few threads continue to send traffic the wrong way for a | ||
| 1285 | few more milliseconds is clearly not a problem: In the worst case, | ||
| 1286 | TCP retransmissions will eventually get the data where it needs to go. | ||
| 1287 | In general, when tracking the state of the universe outside of the | ||
| 1288 | computer, some level of inconsistency must be tolerated due to | ||
| 1289 | speed-of-light delays if nothing else. | ||
| 1290 | |||
| 1291 | <p> | ||
| 1292 | Furthermore, uncertainty about external state is inherent in many cases. | ||
| 1293 | For example, a pair of veterinarians might use heartbeat to determine | ||
| 1294 | whether or not a given cat was alive. | ||
| 1295 | But how long should they wait after the last heartbeat to decide that | ||
| 1296 | the cat is in fact dead? | ||
| 1297 | Waiting less than 400 milliseconds makes no sense because this would | ||
| 1298 | mean that a relaxed cat would be considered to cycle between death | ||
| 1299 | and life more than 100 times per minute. | ||
| 1300 | Moreover, just as with human beings, a cat's heart might stop for | ||
| 1301 | some period of time, so the exact wait period is a judgment call. | ||
| 1302 | One of our pair of veterinarians might wait 30 seconds before pronouncing | ||
| 1303 | the cat dead, while the other might insist on waiting a full minute. | ||
| 1304 | The two veterinarians would then disagree on the state of the cat during | ||
| 1305 | the final 30 seconds of the minute following the last heartbeat, as | ||
| 1306 | fancifully illustrated below: | ||
| 1307 | |||
| 1308 | <p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> | ||
| 1309 | |||
| 1310 | <p> | ||
| 1311 | Interestingly enough, this same situation applies to hardware. | ||
| 1312 | When push comes to shove, how do we tell whether or not some | ||
| 1313 | external server has failed? | ||
| 1314 | We send messages to it periodically, and declare it failed if we | ||
| 1315 | don't receive a response within a given period of time. | ||
| 1316 | Policy decisions can usually tolerate short | ||
| 1317 | periods of inconsistency. | ||
| 1318 | The policy was decided some time ago, and is only now being put into | ||
| 1319 | effect, so a few milliseconds of delay is normally inconsequential. | ||
| 1320 | |||
| 1321 | <p> | ||
| 1322 | However, there are algorithms that absolutely must see consistent data. | ||
| 1323 | For example, the translation between a user-level SystemV semaphore | ||
| 1324 | ID to the corresponding in-kernel data structure is protected by RCU, | ||
| 1325 | but it is absolutely forbidden to update a semaphore that has just been | ||
| 1326 | removed. | ||
| 1327 | In the Linux kernel, this need for consistency is accommodated by acquiring | ||
| 1328 | spinlocks located in the in-kernel data structure from within | ||
| 1329 | the RCU read-side critical section, and this is indicated by the | ||
| 1330 | green box in the figure above. | ||
| 1331 | Many other techniques may be used, and are in fact used within the | ||
| 1332 | Linux kernel. | ||
| 1333 | |||
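| | <p> | ||
| | A hypothetical sketch of this pattern, assuming a per-element | ||
| | <tt>lock</tt> and a <tt>deleted</tt> flag that the update side sets, | ||
| | under that same lock, before removing the element: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 rcu_read_lock(); | ||
| | 2 sma = sem_lookup(id); /* Hypothetical RCU-protected lookup. */ | ||
| | 3 if (sma) { | ||
| | 4 spin_lock(&sma->lock); | ||
| | 5 if (!sma->deleted) | ||
| | 6 update_semaphore(sma); /* Consistent: cannot race with removal. */ | ||
| | 7 spin_unlock(&sma->lock); | ||
| | 8 } | ||
| | 9 rcu_read_unlock(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||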
| 1334 | <p> | ||
| 1335 | In short, RCU is not required to maintain consistency, and other | ||
| 1336 | mechanisms may be used in concert with RCU when consistency is required. | ||
| 1337 | RCU's specialization allows it to do its job extremely well, and its | ||
| 1338 | ability to interoperate with other synchronization mechanisms allows | ||
| 1339 | the right mix of synchronization tools to be used for a given job. | ||
| 1340 | |||
| 1341 | <h3><a name="Performance and Scalability">Performance and Scalability</a></h3> | ||
| 1342 | |||
| 1343 | <p> | ||
| 1344 | Energy efficiency is a critical component of performance today, | ||
| 1345 | and Linux-kernel RCU implementations must therefore avoid unnecessarily | ||
| 1346 | awakening idle CPUs. | ||
| 1347 | I cannot claim that this requirement was premeditated. | ||
| 1348 | In fact, I learned of it during a telephone conversation in which I | ||
| 1349 | was given “frank and open” feedback on the importance | ||
| 1350 | of energy efficiency in battery-powered systems and on specific | ||
| 1351 | energy-efficiency shortcomings of the Linux-kernel RCU implementation. | ||
| 1352 | In my experience, the battery-powered embedded community will consider | ||
| 1353 | any unnecessary wakeups to be extremely unfriendly acts. | ||
| 1354 | So much so that mere Linux-kernel-mailing-list posts are | ||
| 1355 | insufficient to vent their ire. | ||
| 1356 | |||
| 1357 | <p> | ||
| 1358 | Memory consumption is not particularly important in most | ||
| 1359 | situations, and has become decreasingly | ||
| 1360 | so as memory sizes have expanded and memory | ||
| 1361 | costs have plummeted. | ||
| 1362 | However, as I learned from Matt Mackall's | ||
| 1363 | <a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> | ||
| 1364 | efforts, memory footprint is critically important on single-CPU systems with | ||
| 1365 | non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus | ||
| 1366 | <a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> | ||
| 1367 | was born. | ||
| 1368 | Josh Triplett has since taken over the small-memory banner with his | ||
| 1369 | <a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> | ||
| 1370 | project, which resulted in | ||
| 1371 | <a href="#Sleepable RCU">SRCU</a> | ||
| 1372 | becoming optional for those kernels not needing it. | ||
| 1373 | |||
| 1374 | <p> | ||
| 1375 | The remaining performance requirements are, for the most part, | ||
| 1376 | unsurprising. | ||
| 1377 | For example, in keeping with RCU's read-side specialization, | ||
| 1378 | <tt>rcu_dereference()</tt> should have negligible overhead (for | ||
| 1379 | example, suppression of a few minor compiler optimizations). | ||
| 1380 | Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and | ||
| 1381 | <tt>rcu_read_unlock()</tt> should have exactly zero overhead. | ||
| 1382 | |||
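| | <p> | ||
| | A simplified sketch (emphatically not the actual kernel source) of | ||
| | why zero overhead is achievable in non-preemptible builds: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| | 1 /* With CONFIG_PREEMPT=n (debug options aside), preempt_disable() | ||
| | 2 and preempt_enable() are compiler barriers, so these functions | ||
| | 3 emit no instructions at all. */ | ||
| | 4 static inline void rcu_read_lock(void) | ||
| | 5 { | ||
| | 6 preempt_disable(); | ||
| | 7 } | ||
| | 8 | ||
| | 9 static inline void rcu_read_unlock(void) | ||
| | 10 { | ||
| | 11 preempt_enable(); | ||
| | 12 } | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||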
| 1383 | <p> | ||
| 1384 | In preemptible environments, in the case where the RCU read-side | ||
| 1385 | critical section was not preempted (as will be the case for the | ||
| 1386 | highest-priority real-time process), <tt>rcu_read_lock()</tt> and | ||
| 1387 | <tt>rcu_read_unlock()</tt> should have minimal overhead. | ||
| 1388 | In particular, they should not contain atomic read-modify-write | ||
| 1389 | operations, memory-barrier instructions, preemption disabling, | ||
| 1390 | interrupt disabling, or backwards branches. | ||
| 1391 | However, in the case where the RCU read-side critical section was preempted, | ||
| 1392 | <tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. | ||
| 1393 | This is why it is better to nest an RCU read-side critical section | ||
| 1394 | within a preempt-disable region than vice versa, at least in cases | ||
| 1395 | where that critical section is short enough to avoid unduly degrading | ||
| 1396 | real-time latencies. | ||
| 1397 | |||
| 1398 | <p> | ||
| 1399 | The <tt>synchronize_rcu()</tt> grace-period-wait primitive is | ||
| 1400 | optimized for throughput. | ||
| 1401 | It may therefore incur several milliseconds of latency in addition to | ||
| 1402 | the duration of the longest RCU read-side critical section. | ||
| 1403 | On the other hand, multiple concurrent invocations of | ||
| 1404 | <tt>synchronize_rcu()</tt> are required to use batching optimizations | ||
| 1405 | so that they can be satisfied by a single underlying grace-period-wait | ||
| 1406 | operation. | ||
| 1407 | For example, in the Linux kernel, it is not unusual for a single | ||
| 1408 | grace-period-wait operation to serve more than | ||
| 1409 | <a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> | ||
| 1410 | of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation | ||
| 1411 | overhead down to nearly zero. | ||
| 1412 | However, the grace-period optimization is also required to avoid | ||
| 1413 | measurable degradation of real-time scheduling and interrupt latencies. | ||
| 1414 | |||
| 1415 | <p> | ||
| 1416 | In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> | ||
| 1417 | latencies are unacceptable. | ||
| 1418 | In these cases, <tt>synchronize_rcu_expedited()</tt> may be used | ||
| 1419 | instead, reducing the grace-period latency down to a few tens of | ||
| 1420 | microseconds on small systems, at least in cases where the RCU read-side | ||
| 1421 | critical sections are short. | ||
| 1422 | There are currently no special latency requirements for | ||
| 1423 | <tt>synchronize_rcu_expedited()</tt> on large systems, but, | ||
| 1424 | consistent with the empirical nature of the RCU specification, | ||
| 1425 | that is subject to change. | ||
| 1426 | However, there most definitely are scalability requirements: | ||
| 1427 | A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 | ||
| 1428 | CPUs should at least make reasonable forward progress. | ||
| 1429 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> | ||
| 1430 | is permitted to impose modest degradation of real-time latency | ||
| 1431 | on non-idle online CPUs. | ||
| 1432 | That said, it will likely be necessary to take further steps to reduce this | ||
| 1433 | degradation, hopefully to roughly that of a scheduling-clock interrupt. | ||
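| | |||
| | <p> | ||
| | As a sketch, an expedited variant of the earlier | ||
| | <tt>remove_gp_synchronous()</tt> example might look as follows, | ||
| | again assuming the <tt>gp</tt> pointer and <tt>gp_lock</tt> | ||
| | from those examples: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 bool remove_gp_expedited(void) | ||
| |  2 { | ||
| |  3 struct foo *p; | ||
| |  4 | ||
| |  5 spin_lock(&gp_lock); | ||
| |  6 p = rcu_access_pointer(gp); | ||
| |  7 if (!p) { | ||
| |  8 spin_unlock(&gp_lock); | ||
| |  9 return false; | ||
| | 10 } | ||
| | 11 rcu_assign_pointer(gp, NULL); | ||
| | 12 spin_unlock(&gp_lock); | ||
| | 13 synchronize_rcu_expedited(); /* Wait, but more urgently. */ | ||
| | 14 kfree(p); | ||
| | 15 return true; | ||
| | 16 } | ||
| | </pre> | ||
| | </blockquote> | ||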
| 1434 | |||
| 1435 | <p> | ||
| 1436 | There are a number of situations where even | ||
| 1437 | <tt>synchronize_rcu_expedited()</tt>'s reduced grace-period | ||
| 1438 | latency is unacceptable. | ||
| 1439 | In these situations, the asynchronous <tt>call_rcu()</tt> can be | ||
| 1440 | used in place of <tt>synchronize_rcu()</tt> as follows: | ||
| 1441 | |||
| 1442 | <blockquote> | ||
| 1443 | <pre> | ||
| 1444 | 1 struct foo { | ||
| 1445 | 2 int a; | ||
| 1446 | 3 int b; | ||
| 1447 | 4 struct rcu_head rh; | ||
| 1448 | 5 }; | ||
| 1449 | 6 | ||
| 1450 | 7 static void remove_gp_cb(struct rcu_head *rhp) | ||
| 1451 | 8 { | ||
| 1452 | 9 struct foo *p = container_of(rhp, struct foo, rh); | ||
| 1453 | 10 | ||
| 1454 | 11 kfree(p); | ||
| 1455 | 12 } | ||
| 1456 | 13 | ||
| 1457 | 14 bool remove_gp_asynchronous(void) | ||
| 1458 | 15 { | ||
| 1459 | 16 struct foo *p; | ||
| 1460 | 17 | ||
| 1461 | 18 spin_lock(&gp_lock); | ||
| 1462 | 19 p = rcu_access_pointer(gp); | ||
| 1463 | 20 if (!p) { | ||
| 1464 | 21 spin_unlock(&gp_lock); | ||
| 1465 | 22 return false; | ||
| 1466 | 23 } | ||
| 1467 | 24 rcu_assign_pointer(gp, NULL); | ||
| 1468 | 25 call_rcu(&p->rh, remove_gp_cb); | ||
| 1469 | 26 spin_unlock(&gp_lock); | ||
| 1470 | 27 return true; | ||
| 1471 | 28 } | ||
| 1472 | </pre> | ||
| 1473 | </blockquote> | ||
| 1474 | |||
| 1475 | <p> | ||
| 1476 | A definition of <tt>struct foo</tt> is finally needed, and appears | ||
| 1477 | on lines 1-5. | ||
| 1478 | The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> | ||
| 1479 | on line 25, and will be invoked after the end of a subsequent | ||
| 1480 | grace period. | ||
| 1481 | This gets the same effect as <tt>remove_gp_synchronous()</tt>, | ||
| 1482 | but without forcing the updater to wait for a grace period to elapse. | ||
| 1483 | The <tt>call_rcu()</tt> function may be used in a number of | ||
| 1484 | situations where neither <tt>synchronize_rcu()</tt> nor | ||
| 1485 | <tt>synchronize_rcu_expedited()</tt> would be legal, | ||
| 1486 | including within preempt-disable code, <tt>local_bh_disable()</tt> code, | ||
| 1487 | interrupt-disable code, and interrupt handlers. | ||
| 1488 | However, even <tt>call_rcu()</tt> is illegal within NMI handlers. | ||
| 1489 | The callback function (<tt>remove_gp_cb()</tt> in this case) will be | ||
| 1490 | executed within a softirq (software interrupt) environment within the | ||
| 1491 | Linux kernel, | ||
| 1492 | either within a real softirq handler or under the protection | ||
| 1493 | of <tt>local_bh_disable()</tt>. | ||
| 1494 | In both the Linux kernel and in userspace, it is bad practice to | ||
| 1495 | write an RCU callback function that takes too long. | ||
| 1496 | Long-running operations should be relegated to separate threads or | ||
| 1497 | (in the Linux kernel) workqueues. | ||
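| | |||
| | <p> | ||
| | For example, a minimal sketch of an RCU callback that defers its | ||
| | long-running portion to a workqueue might look as follows, with a | ||
| | <tt>work_struct</tt> field added to <tt>struct foo</tt> and with | ||
| | <tt>long_running_cleanup()</tt> being hypothetical: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 struct foo { | ||
| |  2 int a; | ||
| |  3 struct rcu_head rh; | ||
| |  4 struct work_struct work; | ||
| |  5 }; | ||
| |  6 | ||
| |  7 static void long_running_cleanup(struct work_struct *wp) | ||
| |  8 { | ||
| |  9 struct foo *p = container_of(wp, struct foo, work); | ||
| | 10 | ||
| | 11 /* Lengthy processing is fine here, but not in softirq. */ | ||
| | 12 kfree(p); | ||
| | 13 } | ||
| | 14 | ||
| | 15 static void remove_gp_deferred_cb(struct rcu_head *rhp) | ||
| | 16 { | ||
| | 17 struct foo *p = container_of(rhp, struct foo, rh); | ||
| | 18 | ||
| | 19 INIT_WORK(&p->work, long_running_cleanup); | ||
| | 20 schedule_work(&p->work); | ||
| | 21 } | ||
| | </pre> | ||
| | </blockquote> | ||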
| 1498 | |||
| 1499 | <p>@@QQ@@ | ||
| 1500 | Why does line 19 use <tt>rcu_access_pointer()</tt>? | ||
| 1501 | After all, <tt>call_rcu()</tt> on line 25 stores into the | ||
| 1502 | structure, which would interact badly with concurrent insertions. | ||
| 1503 | Doesn't this mean that <tt>rcu_dereference()</tt> is required? | ||
| 1504 | <p>@@QQA@@ | ||
| 1505 | Presumably the <tt>gp_lock</tt> acquired on line 18 excludes | ||
| 1506 | any changes, including any insertions that <tt>rcu_dereference()</tt> | ||
| 1507 | would protect against. | ||
| 1508 | Therefore, any insertions will be delayed until after <tt>gp_lock</tt> | ||
| 1509 | is released on line 26, which in turn means that | ||
| 1510 | <tt>rcu_access_pointer()</tt> suffices. | ||
| 1511 | <p>@@QQE@@ | ||
| 1512 | |||
| 1513 | <p> | ||
| 1514 | However, all that <tt>remove_gp_cb()</tt> is doing is | ||
| 1515 | invoking <tt>kfree()</tt> on the data element. | ||
| 1516 | This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, | ||
| 1517 | which allows “fire and forget” operation as shown below: | ||
| 1518 | |||
| 1519 | <blockquote> | ||
| 1520 | <pre> | ||
| 1521 | 1 struct foo { | ||
| 1522 | 2 int a; | ||
| 1523 | 3 int b; | ||
| 1524 | 4 struct rcu_head rh; | ||
| 1525 | 5 }; | ||
| 1526 | 6 | ||
| 1527 | 7 bool remove_gp_faf(void) | ||
| 1528 | 8 { | ||
| 1529 | 9 struct foo *p; | ||
| 1530 | 10 | ||
| 1531 | 11 spin_lock(&gp_lock); | ||
| 1532 | 12 p = rcu_access_pointer(gp); | ||
| 1533 | 13 if (!p) { | ||
| 1534 | 14 spin_unlock(&gp_lock); | ||
| 1535 | 15 return false; | ||
| 1536 | 16 } | ||
| 1537 | 17 rcu_assign_pointer(gp, NULL); | ||
| 1538 | 18 kfree_rcu(p, rh); | ||
| 1539 | 19 spin_unlock(&gp_lock); | ||
| 1540 | 20 return true; | ||
| 1541 | 21 } | ||
| 1542 | </pre> | ||
| 1543 | </blockquote> | ||
| 1544 | |||
| 1545 | <p> | ||
| 1546 | Note that <tt>remove_gp_faf()</tt> simply invokes | ||
| 1547 | <tt>kfree_rcu()</tt> and proceeds, without any need to pay any | ||
| 1548 | further attention to the subsequent grace period and <tt>kfree()</tt>. | ||
| 1549 | It is permissible to invoke <tt>kfree_rcu()</tt> from the same | ||
| 1550 | environments as for <tt>call_rcu()</tt>. | ||
| 1551 | Interestingly enough, DYNIX/ptx had the equivalents of | ||
| 1552 | <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not | ||
| 1553 | <tt>synchronize_rcu()</tt>. | ||
| 1554 | This was due to the fact that RCU was not heavily used within DYNIX/ptx, | ||
| 1555 | so the very few places that needed something like | ||
| 1556 | <tt>synchronize_rcu()</tt> simply open-coded it. | ||
| 1557 | |||
| 1558 | <p>@@QQ@@ | ||
| 1559 | Earlier it was claimed that <tt>call_rcu()</tt> and | ||
| 1560 | <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked | ||
| 1561 | by readers. | ||
| 1562 | But how can that be correct, given that the invocation of the callback | ||
| 1563 | and the freeing of the memory (respectively) must still wait for | ||
| 1564 | a grace period to elapse? | ||
| 1565 | <p>@@QQA@@ | ||
| 1566 | We could define things this way, but keep in mind that this sort of | ||
| 1567 | definition would say that updates in garbage-collected languages | ||
| 1568 | cannot complete until the next time the garbage collector runs, | ||
| 1569 | which does not seem at all reasonable. | ||
| 1570 | The key point is that in most cases, an updater using either | ||
| 1571 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the | ||
| 1572 | next update as soon as it has invoked <tt>call_rcu()</tt> or | ||
| 1573 | <tt>kfree_rcu()</tt>, without having to wait for a subsequent | ||
| 1574 | grace period. | ||
| 1575 | <p>@@QQE@@ | ||
| 1576 | |||
| 1577 | <p> | ||
| 1578 | But what if the updater must wait for the completion of code to be | ||
| 1579 | executed after the end of the grace period, but has other tasks | ||
| 1580 | that can be carried out in the meantime? | ||
| 1581 | The polling-style <tt>get_state_synchronize_rcu()</tt> and | ||
| 1582 | <tt>cond_synchronize_rcu()</tt> functions may be used for this | ||
| 1583 | purpose, as shown below: | ||
| 1584 | |||
| 1585 | <blockquote> | ||
| 1586 | <pre> | ||
| 1587 | 1 bool remove_gp_poll(void) | ||
| 1588 | 2 { | ||
| 1589 | 3 struct foo *p; | ||
| 1590 | 4 unsigned long s; | ||
| 1591 | 5 | ||
| 1592 | 6 spin_lock(&gp_lock); | ||
| 1593 | 7 p = rcu_access_pointer(gp); | ||
| 1594 | 8 if (!p) { | ||
| 1595 | 9 spin_unlock(&gp_lock); | ||
| 1596 | 10 return false; | ||
| 1597 | 11 } | ||
| 1598 | 12 rcu_assign_pointer(gp, NULL); | ||
| 1599 | 13 spin_unlock(&gp_lock); | ||
| 1600 | 14 s = get_state_synchronize_rcu(); | ||
| 1601 | 15 do_something_while_waiting(); | ||
| 1602 | 16 cond_synchronize_rcu(s); | ||
| 1603 | 17 kfree(p); | ||
| 1604 | 18 return true; | ||
| 1605 | 19 } | ||
| 1606 | </pre> | ||
| 1607 | </blockquote> | ||
| 1608 | |||
| 1609 | <p> | ||
| 1610 | On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a | ||
| 1611 | “cookie” from RCU, | ||
| 1612 | then line 15 carries out other tasks, | ||
| 1613 | and finally, line 16 returns immediately if a grace period has | ||
| 1614 | elapsed in the meantime, but otherwise waits as required. | ||
| 1615 | The need for <tt>get_state_synchronize_rcu()</tt> and | ||
| 1616 | <tt>cond_synchronize_rcu()</tt> has appeared quite recently, | ||
| 1617 | so it is too early to tell whether they will stand the test of time. | ||
| 1618 | |||
| 1619 | <p> | ||
| 1620 | RCU thus provides a range of tools to allow updaters to strike the | ||
| 1621 | required tradeoff among latency, flexibility, and CPU overhead. | ||
| 1622 | |||
| 1623 | <h3><a name="Composability">Composability</a></h3> | ||
| 1624 | |||
| 1625 | <p> | ||
| 1626 | Composability has received much attention in recent years, perhaps in part | ||
| 1627 | due to the collision of multicore hardware with object-oriented techniques | ||
| 1628 | designed in single-threaded environments for single-threaded use. | ||
| 1629 | And in theory, RCU read-side critical sections may be composed, and in | ||
| 1630 | fact may be nested arbitrarily deeply. | ||
| 1631 | In practice, as with all real-world implementations of composable | ||
| 1632 | constructs, there are limitations. | ||
| 1633 | |||
| 1634 | <p> | ||
| 1635 | Implementations of RCU for which <tt>rcu_read_lock()</tt> | ||
| 1636 | and <tt>rcu_read_unlock()</tt> generate no code, such as | ||
| 1637 | Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be | ||
| 1638 | nested arbitrarily deeply. | ||
| 1639 | After all, there is no overhead. | ||
| 1640 | Except that if all these instances of <tt>rcu_read_lock()</tt> | ||
| 1641 | and <tt>rcu_read_unlock()</tt> are visible to the compiler, | ||
| 1642 | compilation will eventually fail due to exhausting memory, | ||
| 1643 | mass storage, or user patience, whichever comes first. | ||
| 1644 | If the nesting is not visible to the compiler, as is the case with | ||
| 1645 | mutually recursive functions each in its own translation unit, | ||
| 1646 | stack overflow will result. | ||
| 1647 | If the nesting takes the form of loops, either the control variable | ||
| 1648 | will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. | ||
| 1649 | Nevertheless, this class of RCU implementations is one | ||
| 1650 | of the most composable constructs in existence. | ||
| 1651 | |||
| 1652 | <p> | ||
| 1653 | RCU implementations that explicitly track nesting depth | ||
| 1654 | are limited by the nesting-depth counter. | ||
| 1655 | For example, the Linux kernel's preemptible RCU limits nesting to | ||
| 1656 | <tt>INT_MAX</tt>. | ||
| 1657 | This should suffice for almost all practical purposes. | ||
| 1658 | That said, a consecutive pair of RCU read-side critical sections | ||
| 1659 | between which there is an operation that waits for a grace period | ||
| 1660 | cannot be enclosed in another RCU read-side critical section. | ||
| 1661 | This is because it is not legal to wait for a grace period within | ||
| 1662 | an RCU read-side critical section: To do so would result either | ||
| 1663 | in deadlock or | ||
| 1664 | in RCU implicitly splitting the enclosing RCU read-side critical | ||
| 1665 | section, neither of which is conducive to a long-lived and prosperous | ||
| 1666 | kernel. | ||
| 1667 | |||
| 1668 | <p> | ||
| 1669 | It is worth noting that RCU is not alone in limiting composability. | ||
| 1670 | For example, many transactional-memory implementations prohibit | ||
| 1671 | composing a pair of transactions separated by an irrevocable | ||
| 1672 | operation (for example, a network receive operation). | ||
| 1673 | For another example, lock-based critical sections can be composed | ||
| 1674 | surprisingly freely, but only if deadlock is avoided. | ||
| 1675 | |||
| 1676 | <p> | ||
| 1677 | In short, although RCU read-side critical sections are highly composable, | ||
| 1678 | care is required in some situations, just as is the case for any other | ||
| 1679 | composable synchronization mechanism. | ||
| 1680 | |||
| 1681 | <h3><a name="Corner Cases">Corner Cases</a></h3> | ||
| 1682 | |||
| 1683 | <p> | ||
| 1684 | A given RCU workload might have an endless and intense stream of | ||
| 1685 | RCU read-side critical sections, perhaps even so intense that there | ||
| 1686 | was never a point in time during which there was not at least one | ||
| 1687 | RCU read-side critical section in flight. | ||
| 1688 | RCU cannot allow this situation to block grace periods: As long as | ||
| 1689 | all the RCU read-side critical sections are finite, grace periods | ||
| 1690 | must also be finite. | ||
| 1691 | |||
| 1692 | <p> | ||
| 1693 | That said, preemptible RCU implementations could potentially result | ||
| 1694 | in RCU read-side critical sections being preempted for long durations, | ||
| 1695 | which has the effect of creating a long-duration RCU read-side | ||
| 1696 | critical section. | ||
| 1697 | This situation can arise only in heavily loaded systems, but systems using | ||
| 1698 | real-time priorities are of course more vulnerable. | ||
| 1699 | Therefore, RCU priority boosting is provided to help deal with this | ||
| 1700 | case. | ||
| 1701 | That said, the exact requirements on RCU priority boosting will likely | ||
| 1702 | evolve as more experience accumulates. | ||
| 1703 | |||
| 1704 | <p> | ||
| 1705 | Other workloads might have very high update rates. | ||
| 1706 | Although one can argue that such workloads should instead use | ||
| 1707 | something other than RCU, the fact remains that RCU must | ||
| 1708 | handle such workloads gracefully. | ||
| 1709 | This requirement is another factor driving batching of grace periods, | ||
| 1710 | but it is also the driving force behind the checks for large numbers | ||
| 1711 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. | ||
| 1712 | Finally, high update rates should not delay RCU read-side critical | ||
| 1713 | sections, although some read-side delays can occur when using | ||
| 1714 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use | ||
| 1715 | of <tt>try_stop_cpus()</tt>. | ||
| 1716 | (In the future, <tt>synchronize_rcu_expedited()</tt> will be | ||
| 1717 | converted to use lighter-weight inter-processor interrupts (IPIs), | ||
| 1718 | but this will still disturb readers, though to a much smaller degree.) | ||
| 1719 | |||
| 1720 | <p> | ||
| 1721 | Although all three of these corner cases were understood in the early | ||
| 1722 | 1990s, a simple user-level test consisting of <tt>close(open(path))</tt> | ||
| 1723 | in a tight loop | ||
| 1724 | in the early 2000s suddenly provided a much deeper appreciation of the | ||
| 1725 | high-update-rate corner case. | ||
| 1726 | This test also motivated addition of some RCU code to react to high update | ||
| 1727 | rates: For example, if a given CPU finds itself with more than 10,000 | ||
| 1728 | RCU callbacks queued, RCU will take evasive action by | ||
| 1729 | more aggressively starting grace periods and more aggressively forcing | ||
| 1730 | completion of grace-period processing. | ||
| 1731 | This evasive action causes the grace period to complete more quickly, | ||
| 1732 | but at the cost of restricting RCU's batching optimizations, thus | ||
| 1733 | increasing the CPU overhead incurred by that grace period. | ||
| 1734 | |||
| 1735 | <h2><a name="Software-Engineering Requirements"> | ||
| 1736 | Software-Engineering Requirements</a></h2> | ||
| 1737 | |||
| 1738 | <p> | ||
| 1739 | Between Murphy's Law and “To err is human”, it is necessary to | ||
| 1740 | guard against mishaps and misuse: | ||
| 1741 | |||
| 1742 | <ol> | ||
| 1743 | <li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> | ||
| 1744 | everywhere that it is needed, so kernels built with | ||
| 1745 | <tt>CONFIG_PROVE_RCU=y</tt> will splat if | ||
| 1746 | <tt>rcu_dereference()</tt> is used outside of an | ||
| 1747 | RCU read-side critical section. | ||
| 1748 | Update-side code can use <tt>rcu_dereference_protected()</tt>, | ||
| 1749 | which takes a | ||
| 1750 | <a href="https://lwn.net/Articles/371986/">lockdep expression</a> | ||
| 1751 | to indicate what is providing the protection. | ||
| 1752 | If the indicated protection is not provided, a lockdep splat | ||
| 1753 | is emitted. | ||
| 1754 | |||
| 1755 | <p> | ||
| 1756 | Code shared between readers and updaters can use | ||
| 1757 | <tt>rcu_dereference_check()</tt>, which also takes a | ||
| 1758 | lockdep expression, and emits a lockdep splat if neither | ||
| 1759 | <tt>rcu_read_lock()</tt> nor the indicated protection | ||
| 1760 | is in place. | ||
| 1761 | In addition, <tt>rcu_dereference_raw()</tt> is used in those | ||
| 1762 | (hopefully rare) cases where the required protection cannot | ||
| 1763 | be easily described. | ||
| 1764 | Finally, <tt>rcu_read_lock_held()</tt> is provided to | ||
| 1765 | allow a function to verify that it has been invoked within | ||
| 1766 | an RCU read-side critical section. | ||
| 1767 | I was made aware of this set of requirements shortly after Thomas | ||
| 1768 | Gleixner audited a number of RCU uses. | ||
| 1769 | <li> A given function might wish to check for RCU-related preconditions | ||
| 1770 | upon entry, before using any other RCU API. | ||
| 1771 | The <tt>rcu_lockdep_assert()</tt> macro does this job, | ||
| 1772 | asserting its expression argument in kernels having lockdep enabled | ||
| 1773 | and doing nothing otherwise. | ||
| 1774 | <li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> | ||
| 1775 | and <tt>rcu_dereference()</tt>, perhaps (incorrectly) | ||
| 1776 | substituting a simple assignment. | ||
| 1777 | To catch this sort of error, a given RCU-protected pointer may be | ||
| 1778 | tagged with <tt>__rcu</tt>, after which running sparse | ||
| 1779 | with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain | ||
| 1780 | about simple-assignment accesses to that pointer. | ||
| 1781 | Arnd Bergmann made me aware of this requirement, and also | ||
| 1782 | supplied the needed | ||
| 1783 | <a href="https://lwn.net/Articles/376011/">patch series</a>. | ||
| 1784 | <li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> | ||
| 1785 | will splat if a data element is passed to <tt>call_rcu()</tt> | ||
| 1786 | twice in a row, without a grace period in between. | ||
| 1787 | (This error is similar to a double free.) | ||
| 1788 | The corresponding <tt>rcu_head</tt> structures that are | ||
| 1789 | dynamically allocated are automatically tracked, but | ||
| 1790 | <tt>rcu_head</tt> structures allocated on the stack | ||
| 1791 | must be initialized with <tt>init_rcu_head_on_stack()</tt> | ||
| 1792 | and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. | ||
| 1793 | Similarly, statically allocated non-stack <tt>rcu_head</tt> | ||
| 1794 | structures must be initialized with <tt>init_rcu_head()</tt> | ||
| 1795 | and cleaned up with <tt>destroy_rcu_head()</tt>. | ||
| 1796 | Mathieu Desnoyers made me aware of this requirement, and also | ||
| 1797 | supplied the needed | ||
| 1798 | <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. | ||
| 1799 | <li> An infinite loop in an RCU read-side critical section will | ||
| 1800 | eventually trigger an RCU CPU stall warning splat, with | ||
| 1801 | the duration of “eventually” being controlled by the | ||
| 1802 | <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, | ||
| 1803 | alternatively, by the | ||
| 1804 | <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs | ||
| 1805 | parameter. | ||
| 1806 | However, RCU is not obligated to produce this splat | ||
| 1807 | unless there is a grace period waiting on that particular | ||
| 1808 | RCU read-side critical section. | ||
| 1809 | <p> | ||
| 1810 | Some extreme workloads might intentionally delay | ||
| 1811 | RCU grace periods, and systems running those workloads can | ||
| 1812 | be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> | ||
| 1813 | to suppress the splats. | ||
| 1814 | This kernel parameter may also be set via <tt>sysfs</tt>. | ||
| 1815 | Furthermore, RCU CPU stall warnings are counter-productive | ||
| 1816 | during sysrq dumps and during panics. | ||
| 1817 | RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and | ||
| 1818 | <tt>rcu_sysrq_end()</tt> API members to be called before | ||
| 1819 | and after long sysrq dumps. | ||
| 1820 | RCU also supplies the <tt>rcu_panic()</tt> notifier that is | ||
| 1821 | automatically invoked at the beginning of a panic to suppress | ||
| 1822 | further RCU CPU stall warnings. | ||
| 1823 | |||
| 1824 | <p> | ||
| 1825 | This requirement made itself known in the early 1990s, pretty | ||
| 1826 | much the first time that it was necessary to debug a CPU stall. | ||
| 1827 | That said, the initial implementation in DYNIX/ptx was quite | ||
| 1828 | generic in comparison with that of Linux. | ||
| 1829 | <li> Although it would be very good to detect pointers leaking out | ||
| 1830 | of RCU read-side critical sections, there is currently no | ||
| 1831 | good way of doing this. | ||
| 1832 | One complication is the need to distinguish between pointers | ||
| 1833 | leaking and pointers that have been handed off from RCU to | ||
| 1834 | some other synchronization mechanism, for example, reference | ||
| 1835 | counting. | ||
| 1836 | <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related | ||
| 1837 | information is provided via both debugfs and event tracing. | ||
| 1838 | <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and | ||
| 1839 | <tt>rcu_dereference()</tt> to create typical linked | ||
| 1840 | data structures can be surprisingly error-prone. | ||
| 1841 | Therefore, RCU-protected | ||
| 1842 | <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> | ||
| 1843 | and, more recently, RCU-protected | ||
| 1844 | <a href="https://lwn.net/Articles/612100/">hash tables</a> | ||
| 1845 | are available. | ||
| 1846 | Many other special-purpose RCU-protected data structures are | ||
| 1847 | available in the Linux kernel and the userspace RCU library. | ||
| 1848 | <li> Some linked structures are created at compile time, but still | ||
| 1849 | require <tt>__rcu</tt> checking. | ||
| 1850 | The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this | ||
| 1851 | purpose. | ||
| 1852 | <li> It is not necessary to use <tt>rcu_assign_pointer()</tt> | ||
| 1853 | when creating linked structures that are to be published via | ||
| 1854 | a single external pointer. | ||
| 1855 | The <tt>RCU_INIT_POINTER()</tt> macro is provided for | ||
| 1856 | this task and also for assigning <tt>NULL</tt> pointers | ||
| 1857 | at runtime. | ||
| | (A sketch combining several of the above APIs appears | ||
| | just after this list.) | ||
| 1858 | </ol> | ||
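| | |||
| | <p> | ||
| | The following minimal sketch pulls several of these diagnostic | ||
| | APIs together; the <tt>foo_holder</tt> structure and | ||
| | <tt>holder_lock</tt> are hypothetical: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 struct foo_holder { | ||
| |  2 struct foo __rcu *fp; /* __rcu enables sparse checks. */ | ||
| |  3 }; | ||
| |  4 | ||
| |  5 static struct foo default_foo; | ||
| |  6 static struct foo_holder holder = { | ||
| |  7 RCU_POINTER_INITIALIZER(fp, &default_foo), | ||
| |  8 }; | ||
| |  9 static DEFINE_SPINLOCK(holder_lock); | ||
| | 10 | ||
| | 11 int get_a_updater(void) /* Caller holds holder_lock. */ | ||
| | 12 { | ||
| | 13 struct foo *p; | ||
| | 14 | ||
| | 15 p = rcu_dereference_protected(holder.fp, | ||
| | 16 lockdep_is_held(&holder_lock)); | ||
| | 17 return p ? p->a : -1; | ||
| | 18 } | ||
| | 19 | ||
| | 20 int get_a_any(void) /* Lock or rcu_read_lock() held. */ | ||
| | 21 { | ||
| | 22 struct foo *p; | ||
| | 23 | ||
| | 24 p = rcu_dereference_check(holder.fp, | ||
| | 25 lockdep_is_held(&holder_lock)); | ||
| | 26 return p ? p->a : -1; | ||
| | 27 } | ||
| | </pre> | ||
| | </blockquote> | ||
| | |||
| | <p> | ||
| | Passing an inaccurate lockdep expression (or omitting it entirely) | ||
| | to these primitives is exactly the sort of error that the resulting | ||
| | lockdep splats are intended to catch. | ||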
| 1859 | |||
| 1860 | <p> | ||
| 1861 | This is not a hard-and-fast list: RCU's diagnostic capabilities will | ||
| 1862 | continue to be guided by the number and type of usage bugs found | ||
| 1863 | in real-world RCU usage. | ||
| 1864 | |||
| 1865 | <h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> | ||
| 1866 | |||
| 1867 | <p> | ||
| 1868 | The Linux kernel provides an interesting environment for all kinds of | ||
| 1869 | software, including RCU. | ||
| 1870 | Some of the relevant points of interest are as follows: | ||
| 1871 | |||
| 1872 | <ol> | ||
| 1873 | <li> <a href="#Configuration">Configuration</a>. | ||
| 1874 | <li> <a href="#Firmware Interface">Firmware Interface</a>. | ||
| 1875 | <li> <a href="#Early Boot">Early Boot</a>. | ||
| 1876 | <li> <a href="#Interrupts and NMIs"> | ||
| 1877 | Interrupts and non-maskable interrupts (NMIs)</a>. | ||
| 1878 | <li> <a href="#Loadable Modules">Loadable Modules</a>. | ||
| 1879 | <li> <a href="#Hotplug CPU">Hotplug CPU</a>. | ||
| 1880 | <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. | ||
| 1881 | <li> <a href="#Tracing and RCU">Tracing and RCU</a>. | ||
| 1882 | <li> <a href="#Energy Efficiency">Energy Efficiency</a>. | ||
| 1883 | <li> <a href="#Memory Efficiency">Memory Efficiency</a>. | ||
| 1884 | <li> <a href="#Performance, Scalability, Response Time, and Reliability"> | ||
| 1885 | Performance, Scalability, Response Time, and Reliability</a>. | ||
| 1886 | </ol> | ||
| 1887 | |||
| 1888 | <p> | ||
| 1889 | This list is probably incomplete, but it does give a feel for the | ||
| 1890 | most notable Linux-kernel complications. | ||
| 1891 | Each of the following sections covers one of the above topics. | ||
| 1892 | |||
| 1893 | <h3><a name="Configuration">Configuration</a></h3> | ||
| 1894 | |||
| 1895 | <p> | ||
| 1896 | RCU's goal is automatic configuration, so that almost nobody | ||
| 1897 | needs to worry about RCU's <tt>Kconfig</tt> options. | ||
| 1898 | And for almost all users, RCU does in fact work well | ||
| 1899 | “out of the box.” | ||
| 1900 | |||
| 1901 | <p> | ||
| 1902 | However, there are specialized use cases that are handled by | ||
| 1903 | kernel boot parameters and <tt>Kconfig</tt> options. | ||
| 1904 | Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users | ||
| 1905 | about new <tt>Kconfig</tt> options, which requires that almost all of them | ||
| 1906 | be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. | ||
| 1907 | |||
| 1908 | <p> | ||
| 1909 | This all should be quite obvious, but the fact remains that | ||
| 1910 | Linus Torvalds recently had to | ||
| 1911 | <a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> | ||
| 1912 | me of this requirement. | ||
| 1913 | |||
| 1914 | <h3><a name="Firmware Interface">Firmware Interface</a></h3> | ||
| 1915 | |||
| 1916 | <p> | ||
| 1917 | In many cases, the kernel obtains information about the system from the | ||
| 1918 | firmware, and sometimes things are lost in translation. | ||
| 1919 | Or the translation is accurate, but the original message is bogus. | ||
| 1920 | |||
| 1921 | <p> | ||
| 1922 | For example, some systems' firmware overreports the number of CPUs, | ||
| 1923 | sometimes by a large factor. | ||
| 1924 | If RCU naively believed the firmware, as it used to do, | ||
| 1925 | it would create too many per-CPU kthreads. | ||
| 1926 | Although the resulting system will still run correctly, the extra | ||
| 1927 | kthreads needlessly consume memory and can cause confusion | ||
| 1928 | when they show up in <tt>ps</tt> listings. | ||
| 1929 | |||
| 1930 | <p> | ||
| 1931 | RCU must therefore wait for a given CPU to actually come online before | ||
| 1932 | it can allow itself to believe that the CPU actually exists. | ||
| 1933 | The resulting “ghost CPUs” (which are never going to | ||
| 1934 | come online) cause a number of | ||
| 1935 | <a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. | ||
| 1936 | |||
| 1937 | <h3><a name="Early Boot">Early Boot</a></h3> | ||
| 1938 | |||
| 1939 | <p> | ||
| 1940 | The Linux kernel's boot sequence is an interesting process, | ||
| 1941 | and RCU is used early, even before <tt>rcu_init()</tt> | ||
| 1942 | is invoked. | ||
| 1943 | In fact, a number of RCU's primitives can be used as soon as the | ||
| 1944 | initial task's <tt>task_struct</tt> is available and the | ||
| 1945 | boot CPU's per-CPU variables are set up. | ||
| 1946 | The read-side primitives (<tt>rcu_read_lock()</tt>, | ||
| 1947 | <tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, | ||
| 1948 | and <tt>rcu_access_pointer()</tt>) will operate normally very early on, | ||
| 1949 | as will <tt>rcu_assign_pointer()</tt>. | ||
| 1950 | |||
| 1951 | <p> | ||
| 1952 | Although <tt>call_rcu()</tt> may be invoked at any | ||
| 1953 | time during boot, callbacks are not guaranteed to be invoked until after | ||
| 1954 | the scheduler is fully up and running. | ||
| 1955 | This delay in callback invocation is due to the fact that RCU does not | ||
| 1956 | invoke callbacks until it is fully initialized, and this full initialization | ||
| 1957 | cannot occur until after the scheduler has initialized itself to the | ||
| 1958 | point where RCU can spawn and run its kthreads. | ||
| 1959 | In theory, it would be possible to invoke callbacks earlier; | ||
| 1960 | however, this is not a panacea because there would be severe restrictions | ||
| 1961 | on what operations those callbacks could invoke. | ||
| 1962 | |||
| 1963 | <p> | ||
| 1964 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, | ||
| 1965 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> | ||
| 1966 | (<a href="#Bottom-Half Flavor">discussed below</a>), | ||
| 1967 | and | ||
| 1968 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> | ||
| 1969 | will all operate normally | ||
| 1970 | during very early boot, the reason being that there is only one CPU | ||
| 1971 | and preemption is disabled. | ||
| 1972 | This means that a call to <tt>synchronize_rcu()</tt> (or friends) | ||
| 1973 | is itself a quiescent | ||
| 1974 | state and thus a grace period, so the early-boot implementation can | ||
| 1975 | be a no-op. | ||
| 1976 | |||
| 1977 | <p> | ||
| 1978 | Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> | ||
| 1979 | continue to operate normally through the remainder of boot, courtesy | ||
| 1980 | of the fact that preemption is disabled across their RCU read-side | ||
| 1981 | critical sections and also courtesy of the fact that there is still | ||
| 1982 | only one CPU. | ||
| 1983 | However, once the scheduler starts initializing, preemption is enabled. | ||
| 1984 | There is still only a single CPU, but the fact that preemption is enabled | ||
| 1985 | means that the no-op implementation of <tt>synchronize_rcu()</tt> no | ||
| 1986 | longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. | ||
| 1987 | Therefore, as soon as the scheduler starts initializing, the early-boot | ||
| 1988 | fastpath is disabled. | ||
| 1989 | This means that <tt>synchronize_rcu()</tt> switches to its runtime | ||
| 1990 | mode of operation where it posts callbacks, which in turn means that | ||
| 1991 | any call to <tt>synchronize_rcu()</tt> will block until the corresponding | ||
| 1992 | callback is invoked. | ||
| 1993 | Unfortunately, the callback cannot be invoked until RCU's runtime | ||
| 1994 | grace-period machinery is up and running, which cannot happen until | ||
| 1995 | the scheduler has initialized itself sufficiently to allow RCU's | ||
| 1996 | kthreads to be spawned. | ||
| 1997 | Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler | ||
| 1998 | initialization can result in deadlock. | ||
| 1999 | |||
| 2000 | <p>@@QQ@@ | ||
| 2001 | So what happens with <tt>synchronize_rcu()</tt> during | ||
| 2002 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | ||
| 2003 | kernels? | ||
| 2004 | <p>@@QQA@@ | ||
| 2005 | In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> | ||
| 2006 | maps directly to <tt>synchronize_sched()</tt>. | ||
| 2007 | Therefore, <tt>synchronize_rcu()</tt> works normally throughout | ||
| 2008 | boot in <tt>CONFIG_PREEMPT=n</tt> kernels. | ||
| 2009 | However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
| 2010 | so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> | ||
| 2011 | during scheduler initialization. | ||
| 2012 | <p>@@QQE@@ | ||
| 2013 | |||
| 2014 | <p> | ||
| 2015 | I learned of these boot-time requirements as a result of a series of | ||
| 2016 | system hangs. | ||
| 2017 | |||
| 2018 | <h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> | ||
| 2019 | |||
| 2020 | <p> | ||
| 2021 | The Linux kernel has interrupts, and RCU read-side critical sections are | ||
| 2022 | legal within interrupt handlers and within interrupt-disabled regions | ||
| 2023 | of code, as are invocations of <tt>call_rcu()</tt>. | ||
| 2024 | |||
| 2025 | <p> | ||
| 2026 | Some Linux-kernel architectures can enter an interrupt handler from | ||
| 2027 | non-idle process context, and then just never leave it, instead stealthily | ||
| 2028 | transitioning back to process context. | ||
| 2029 | This trick is sometimes used to invoke system calls from inside the kernel. | ||
| 2030 | These “half-interrupts” mean that RCU has to be very careful | ||
| 2031 | about how it counts interrupt nesting levels. | ||
| 2032 | I learned of this requirement the hard way during a rewrite | ||
| 2033 | of RCU's dyntick-idle code. | ||
| 2034 | |||
| 2035 | <p> | ||
| 2036 | The Linux kernel has non-maskable interrupts (NMIs), and | ||
| 2037 | RCU read-side critical sections are legal within NMI handlers. | ||
| 2038 | Thankfully, RCU update-side primitives, including | ||
| 2039 | <tt>call_rcu()</tt>, are prohibited within NMI handlers. | ||
| 2040 | |||
| 2041 | <p> | ||
| 2042 | The name notwithstanding, some Linux-kernel architectures | ||
| 2043 | can have nested NMIs, which RCU must handle correctly. | ||
| 2044 | Andy Lutomirski | ||
| 2045 | <a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> | ||
| 2046 | with this requirement; | ||
| 2047 | he also kindly surprised me with | ||
| 2048 | <a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> | ||
| 2049 | that meets this requirement. | ||
| 2050 | |||
| 2051 | <h3><a name="Loadable Modules">Loadable Modules</a></h3> | ||
| 2052 | |||
| 2053 | <p> | ||
| 2054 | The Linux kernel has loadable modules, and these modules can | ||
| 2055 | also be unloaded. | ||
| 2056 | After a given module has been unloaded, any attempt to call | ||
| 2057 | one of its functions results in a segmentation fault. | ||
| 2058 | The module-unload functions must therefore cancel any | ||
| 2059 | delayed calls to loadable-module functions, for example, | ||
| 2060 | any outstanding <tt>mod_timer()</tt> must be dealt with | ||
| 2061 | via <tt>del_timer_sync()</tt> or similar. | ||
| 2062 | |||
| 2063 | <p> | ||
| 2064 | Unfortunately, there is no way to cancel an RCU callback; | ||
| 2065 | once you invoke <tt>call_rcu()</tt>, the callback function is | ||
| 2066 | going to eventually be invoked, unless the system goes down first. | ||
| 2067 | Because it is normally considered socially irresponsible to crash the system | ||
| 2068 | in response to a module unload request, we need some other way | ||
| 2069 | to deal with in-flight RCU callbacks. | ||
| 2070 | |||
| 2071 | <p> | ||
| 2072 | RCU therefore provides | ||
| 2073 | <tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, | ||
| 2074 | which waits until all in-flight RCU callbacks have been invoked. | ||
| 2075 | If a module uses <tt>call_rcu()</tt>, its exit function should therefore | ||
| 2076 | prevent any future invocation of <tt>call_rcu()</tt>, then invoke | ||
| 2077 | <tt>rcu_barrier()</tt>. | ||
| 2078 | In theory, the underlying module-unload code could invoke | ||
| 2079 | <tt>rcu_barrier()</tt> unconditionally, but in practice this would | ||
| 2080 | incur unacceptable latencies. | ||
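| | |||
| | <p> | ||
| | A minimal sketch of such an exit function follows, with the | ||
| | <tt>my_stop_new_callbacks()</tt> helper being hypothetical: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 static void __exit my_module_exit(void) | ||
| |  2 { | ||
| |  3 my_stop_new_callbacks(); /* No new call_rcu() after this. */ | ||
| |  4 rcu_barrier(); /* Wait for in-flight RCU callbacks. */ | ||
| |  5 /* It is now safe to free the module's data structures. */ | ||
| |  6 } | ||
| | </pre> | ||
| | </blockquote> | ||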
| 2081 | |||
| 2082 | <p> | ||
| 2083 | Nikita Danilov noted this requirement for an analogous filesystem-unmount | ||
| 2084 | situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. | ||
| 2085 | The need for <tt>rcu_barrier()</tt> for module unloading became | ||
| 2086 | apparent later. | ||
| 2087 | |||
| 2088 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> | ||
| 2089 | |||
| 2090 | <p> | ||
| 2091 | The Linux kernel supports CPU hotplug, which means that CPUs | ||
| 2092 | can come and go. | ||
| 2093 | It is of course illegal to use any RCU API member from an offline CPU. | ||
| 2094 | This requirement was present from day one in DYNIX/ptx, but | ||
| 2095 | on the other hand, the Linux kernel's CPU-hotplug implementation | ||
| 2096 | is “interesting.” | ||
| 2097 | |||
| 2098 | <p> | ||
| 2099 | The Linux-kernel CPU-hotplug implementation has notifiers that | ||
| 2100 | are used to allow the various kernel subsystems (including RCU) | ||
| 2101 | to respond appropriately to a given CPU-hotplug operation. | ||
| 2102 | Most RCU operations may be invoked from CPU-hotplug notifiers, | ||
| 2103 | including even normal synchronous grace-period operations | ||
| 2104 | such as <tt>synchronize_rcu()</tt>. | ||
| 2105 | However, expedited grace-period operations such as | ||
| 2106 | <tt>synchronize_rcu_expedited()</tt> are not supported, | ||
| 2107 | due to the fact that current implementations block CPU-hotplug | ||
| 2108 | operations, which could result in deadlock. | ||
| 2109 | |||
| 2110 | <p> | ||
| 2111 | In addition, all-callback-wait operations such as | ||
| 2112 | <tt>rcu_barrier()</tt> are also not supported, due to the | ||
| 2113 | fact that there are phases of CPU-hotplug operations where | ||
| 2114 | the outgoing CPU's callbacks will not be invoked until after | ||
| 2115 | the CPU-hotplug operation ends, which could also result in deadlock. | ||
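| | |||
| | <p> | ||
| | A sketch of a hypothetical CPU-hotplug notifier observing these | ||
| | rules might look as follows: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 static int my_cpu_notifier(struct notifier_block *nb, | ||
| |  2 unsigned long action, void *hcpu) | ||
| |  3 { | ||
| |  4 if (action == CPU_DOWN_PREPARE) | ||
| |  5 synchronize_rcu(); /* Legal: normal grace period. */ | ||
| |  6 /* Illegal here: synchronize_rcu_expedited(), rcu_barrier(). */ | ||
| |  7 return NOTIFY_OK; | ||
| |  8 } | ||
| | </pre> | ||
| | </blockquote> | ||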
| 2116 | |||
| 2117 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> | ||
| 2118 | |||
| 2119 | <p> | ||
| 2120 | RCU depends on the scheduler, and the scheduler uses RCU to | ||
| 2121 | protect some of its data structures. | ||
| 2122 | This means the scheduler is forbidden from acquiring | ||
| 2123 | the runqueue locks and the priority-inheritance locks | ||
| 2124 | in the middle of an outermost RCU read-side critical section unless either | ||
| 2125 | (1) it releases them before exiting that same | ||
| 2126 | RCU read-side critical section, or | ||
| 2127 | (2) interrupts are disabled across | ||
| 2128 | that entire RCU read-side critical section. | ||
| 2129 | This same prohibition also applies (recursively!) to any lock that is acquired | ||
| 2130 | while holding any lock to which this prohibition applies. | ||
| 2131 | Adhering to this rule prevents preemptible RCU from invoking | ||
| 2132 | <tt>rcu_read_unlock_special()</tt> while either runqueue or | ||
| 2133 | priority-inheritance locks are held, thus avoiding deadlock. | ||
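| | |||
| | <p> | ||
| | In sketch form, with a hypothetical scheduler-side | ||
| | <tt>sched_lock</tt>, the two permitted patterns are as follows: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 /* Option (1): release before rcu_read_unlock(). */ | ||
| |  2 rcu_read_lock(); | ||
| |  3 raw_spin_lock(&sched_lock); | ||
| |  4 /* ... */ | ||
| |  5 raw_spin_unlock(&sched_lock); | ||
| |  6 rcu_read_unlock(); | ||
| |  7 | ||
| |  8 /* Option (2): interrupts disabled across the section. */ | ||
| |  9 local_irq_save(flags); /* flags: unsigned long. */ | ||
| | 10 rcu_read_lock(); | ||
| | 11 raw_spin_lock(&sched_lock); | ||
| | 12 /* ... */ | ||
| | 13 rcu_read_unlock(); | ||
| | 14 raw_spin_unlock(&sched_lock); | ||
| | 15 local_irq_restore(flags); | ||
| | </pre> | ||
| | </blockquote> | ||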
| 2134 | |||
| 2135 | <p> | ||
| 2136 | Prior to v4.4, it was only necessary to disable preemption across | ||
| 2137 | RCU read-side critical sections that acquired scheduler locks. | ||
| 2138 | In v4.4, expedited grace periods started using IPIs, and these | ||
| 2139 | IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath. | ||
| 2140 | Therefore, this expedited-grace-period change required disabling of | ||
| 2141 | interrupts, not just preemption. | ||
| 2142 | |||
| 2143 | <p> | ||
| 2144 | For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> | ||
| 2145 | implementation must be written carefully to avoid similar deadlocks. | ||
| 2146 | In particular, <tt>rcu_read_unlock()</tt> must tolerate an | ||
| 2147 | interrupt where the interrupt handler invokes both | ||
| 2148 | <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
| 2149 | This possibility requires <tt>rcu_read_unlock()</tt> to use | ||
| 2150 | negative nesting levels to avoid destructive recursion via the | ||
| 2151 | interrupt handler's use of RCU. | ||
| 2152 | |||
| 2153 | <p> | ||
| 2154 | This pair of mutual scheduler-RCU requirements came as a | ||
| 2155 | <a href="https://lwn.net/Articles/453002/">complete surprise</a>. | ||
| 2156 | |||
| 2157 | <p> | ||
| 2158 | As noted above, RCU makes use of kthreads, and it is necessary to | ||
| 2159 | avoid excessive CPU-time accumulation by these kthreads. | ||
| 2160 | This requirement was no surprise, but RCU's violation of it | ||
| 2161 | when running context-switch-heavy workloads when built with | ||
| 2162 | <tt>CONFIG_NO_HZ_FULL=y</tt> | ||
| 2163 | <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. | ||
| 2164 | RCU has made good progress towards meeting this requirement, even | ||
| 2165 | for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, | ||
| 2166 | but there is room for further improvement. | ||
| 2167 | |||
| 2168 | <h3><a name="Tracing and RCU">Tracing and RCU</a></h3> | ||
| 2169 | |||
| 2170 | <p> | ||
| 2171 | It is possible to use tracing on RCU code, but tracing itself | ||
| 2172 | uses RCU. | ||
| 2173 | For this reason, <tt>rcu_dereference_raw_notrace()</tt> | ||
| 2174 | is provided for use by tracing, which avoids the destructive | ||
| 2175 | recursion that could otherwise ensue. | ||
| 2176 | This API is also used by virtualization in some architectures, | ||
| 2177 | where RCU readers execute in environments in which tracing | ||
| 2178 | cannot be used. | ||
| 2179 | The tracing folks both located the requirement and provided the | ||
| 2180 | needed fix, so this surprise requirement was relatively painless. | ||
| 2181 | |||
| 2182 | <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> | ||
| 2183 | |||
| 2184 | <p> | ||
| 2185 | Interrupting idle CPUs is considered socially unacceptable, | ||
| 2186 | especially by people with battery-powered embedded systems. | ||
| 2187 | RCU therefore conserves energy by detecting which CPUs are | ||
| 2188 | idle, including tracking CPUs that have been interrupted from idle. | ||
| 2189 | This is a large part of the energy-efficiency requirement, | ||
| 2190 | so I learned of this via an irate phone call. | ||
| 2191 | |||
| 2192 | <p> | ||
| 2193 | Because RCU avoids interrupting idle CPUs, it is illegal to | ||
| 2194 | execute an RCU read-side critical section on an idle CPU. | ||
| 2195 | (Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat | ||
| 2196 | if you try it.) | ||
| 2197 | The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> | ||
| 2198 | event tracing are provided to work around this restriction. | ||
| 2199 | In addition, <tt>rcu_is_watching()</tt> may be used to | ||
| 2200 | test whether or not it is currently legal to run RCU read-side | ||
| 2201 | critical sections on this CPU. | ||
| 2202 | I learned of the need for diagnostics on the one hand | ||
| 2203 | and <tt>RCU_NONIDLE()</tt> on the other while inspecting | ||
| 2204 | idle-loop code. | ||
| 2205 | Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, | ||
| 2206 | which is used quite heavily in the idle loop. | ||
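| | |||
| | <p> | ||
| | For example, an event might be traced from the idle loop roughly | ||
| | as follows, with <tt>trace_my_event()</tt> being a hypothetical | ||
| | tracepoint: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 /* Momentarily exit RCU's idle state for the tracepoint. */ | ||
| |  2 RCU_NONIDLE(trace_my_event(cpu)); | ||
| |  3 | ||
| |  4 /* Diagnostic: are RCU readers currently legal on this CPU? */ | ||
| |  5 if (rcu_is_watching()) | ||
| |  6 pr_info("RCU is watching this CPU.\n"); | ||
| | </pre> | ||
| | </blockquote> | ||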
| 2207 | |||
| 2208 | <p> | ||
| 2209 | It is similarly socially unacceptable to interrupt an | ||
| 2210 | <tt>nohz_full</tt> CPU running in userspace. | ||
| 2211 | RCU must therefore track <tt>nohz_full</tt> userspace | ||
| 2212 | execution. | ||
| 2213 | And in | ||
| 2214 | <a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> | ||
| 2215 | kernels, RCU must separately track idle CPUs on the one hand and | ||
| 2216 | CPUs that are either idle or executing in userspace on the other. | ||
| 2217 | In both cases, RCU must be able to sample state at two points in | ||
| 2218 | time, and be able to determine whether or not some other CPU spent | ||
| 2219 | any time idle and/or executing in userspace. | ||
| 2220 | |||
| 2221 | <p> | ||
| 2222 | These energy-efficiency requirements have proven quite difficult to | ||
| 2223 | understand and to meet; for example, there have been more than five | ||
| 2224 | clean-sheet rewrites of RCU's energy-efficiency code, the last of | ||
| 2225 | which was finally able to demonstrate | ||
| 2226 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. | ||
| 2227 | As noted earlier, | ||
| 2228 | I learned of many of these requirements via angry phone calls: | ||
| 2229 | Flaming me on the Linux-kernel mailing list was apparently not | ||
| 2230 | sufficient for the callers to fully vent their ire at RCU's | ||
| | energy-efficiency bugs! | ||
| 2231 | |||
| 2232 | <h3><a name="Memory Efficiency">Memory Efficiency</a></h3> | ||
| 2233 | |||
| 2234 | <p> | ||
| 2235 | Although small-memory non-realtime systems can simply use Tiny RCU, | ||
| 2236 | code size is only one aspect of memory efficiency. | ||
| 2237 | Another aspect is the size of the <tt>rcu_head</tt> structure | ||
| 2238 | used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. | ||
| 2239 | Although this structure contains nothing more than a pair of pointers, | ||
| 2240 | it does appear in many RCU-protected data structures, including | ||
| 2241 | some that are size critical. | ||
| 2242 | The <tt>page</tt> structure is a case in point, as evidenced by | ||
| 2243 | the many occurrences of the <tt>union</tt> keyword within that structure. | ||
| 2244 | |||
| 2245 | <p> | ||
| 2246 | This need for memory efficiency is one reason that RCU uses hand-crafted | ||
| 2247 | singly linked lists to track the <tt>rcu_head</tt> structures that | ||
| 2248 | are waiting for a grace period to elapse. | ||
| 2249 | It is also the reason why <tt>rcu_head</tt> structures do not contain | ||
| 2250 | debug information, such as fields tracking the file and line of the | ||
| 2251 | <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. | ||
| 2252 | Although this information might appear in debug-only kernel builds at some | ||
| 2253 | point, in the meantime, the <tt>->func</tt> field will often provide | ||
| 2254 | the needed debug information. | ||
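| | |||
| | <p> | ||
| | Slightly simplified, that pair of pointers is as follows: | ||
| | |||
| | <blockquote> | ||
| | <pre> | ||
| |  1 struct rcu_head { | ||
| |  2 struct rcu_head *next; /* Next on singly linked list. */ | ||
| |  3 void (*func)(struct rcu_head *head); /* Post-GP function. */ | ||
| |  4 }; | ||
| | </pre> | ||
| | </blockquote> | ||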
| 2255 | |||
| 2256 | <p> | ||
| 2257 | However, in some cases, the need for memory efficiency leads to even | ||
| 2258 | more extreme measures. | ||
| 2259 | Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field | ||
| 2260 | shares storage with a great many other structures that are used at | ||
| 2261 | various points in the corresponding page's lifetime. | ||
| 2262 | In order to correctly resolve certain | ||
| 2263 | <a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, | ||
| 2264 | the Linux kernel's memory-management subsystem needs a particular bit | ||
| 2265 | to remain zero during all phases of grace-period processing, | ||
| 2266 | and that bit happens to map to the bottom bit of the | ||
| 2267 | <tt>rcu_head</tt> structure's <tt>->next</tt> field. | ||
| 2268 | RCU makes this guarantee as long as <tt>call_rcu()</tt> | ||
| 2269 | is used to post the callback, as opposed to <tt>kfree_rcu()</tt> | ||
| 2270 | or some future “lazy” | ||
| 2271 | variant of <tt>call_rcu()</tt> that might one day be created for | ||
| 2272 | energy-efficiency purposes. | ||
| 2273 | |||
| 2274 | <h3><a name="Performance, Scalability, Response Time, and Reliability"> | ||
| 2275 | Performance, Scalability, Response Time, and Reliability</a></h3> | ||
| 2276 | |||
| 2277 | <p> | ||
| 2278 | Expanding on the | ||
| 2279 | <a href="#Performance and Scalability">earlier discussion</a>, | ||
| 2280 | RCU is used heavily by hot code paths in performance-critical | ||
| 2281 | portions of the Linux kernel's networking, security, virtualization, | ||
| 2282 | and scheduling code paths. | ||
| 2283 | RCU must therefore use efficient implementations, especially in its | ||
| 2284 | read-side primitives. | ||
| 2285 | To that end, it would be good if preemptible RCU's implementation | ||
| 2286 | of <tt>rcu_read_lock()</tt> could be inlined; however, doing | ||
| 2287 | this requires resolving <tt>#include</tt> issues with the | ||
| 2288 | <tt>task_struct</tt> structure. | ||
| 2289 | |||
| 2290 | <p> | ||
| 2291 | The Linux kernel supports hardware configurations with up to | ||
| 2292 | 4096 CPUs, which means that RCU must be extremely scalable. | ||
| 2293 | Algorithms that involve frequent acquisitions of global locks or | ||
| 2294 | frequent atomic operations on global variables simply cannot be | ||
| 2295 | tolerated within the RCU implementation. | ||
| 2296 | RCU therefore makes heavy use of a combining tree based on the | ||
| 2297 | <tt>rcu_node</tt> structure. | ||
| 2298 | RCU is required to tolerate all CPUs continuously invoking any | ||
| 2299 | combination of RCU's runtime primitives with minimal per-operation | ||
| 2300 | overhead. | ||
| 2301 | In fact, in many cases, increasing load must <i>decrease</i> the | ||
| 2302 | per-operation overhead; witness the batching optimizations for | ||
| 2303 | <tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, | ||
| 2304 | <tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. | ||
| 2305 | As a general rule, RCU must cheerfully accept whatever the | ||
| 2306 | rest of the Linux kernel decides to throw at it. | ||
| 2307 | |||
| 2308 | <p> | ||
| 2309 | The Linux kernel is used for real-time workloads, especially | ||
| 2310 | in conjunction with the | ||
| 2311 | <a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. | ||
| 2312 | The real-time-latency response requirements are such that the | ||
| 2313 | traditional approach of disabling preemption across RCU | ||
| 2314 | read-side critical sections is inappropriate. | ||
| 2315 | Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore | ||
| 2316 | use an RCU implementation that allows RCU read-side critical | ||
| 2317 | sections to be preempted. | ||
| 2318 | This requirement made its presence known after users made it | ||
| 2319 | clear that an earlier | ||
| 2320 | <a href="https://lwn.net/Articles/107930/">real-time patch</a> | ||
| 2321 | did not meet their needs, in conjunction with some | ||
| 2322 | <a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> | ||
| 2323 | encountered by a very early version of the -rt patchset. | ||
| 2324 | |||
| 2325 | <p> | ||
| 2326 | In addition, RCU must make do with a sub-100-microsecond real-time latency | ||
| 2327 | budget. | ||
| 2328 | In fact, on smaller systems with the -rt patchset, the Linux kernel | ||
| 2329 | provides sub-20-microsecond real-time latencies for the whole kernel, | ||
| 2330 | including RCU. | ||
| 2331 | RCU's scalability and latency must therefore be sufficient for | ||
| 2332 | these sorts of configurations. | ||
| 2333 | To my surprise, the sub-100-microsecond real-time latency budget | ||
| 2334 | <a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> | ||
| 2335 | applies to even the largest systems [PDF]</a>, | ||
| 2336 | up to and including systems with 4096 CPUs. | ||
| 2337 | This real-time requirement motivated the grace-period kthread, which | ||
| 2338 | also simplified handling of a number of race conditions. | ||
| 2339 | |||
| 2340 | <p> | ||
| 2341 | Finally, RCU's status as a synchronization primitive means that | ||
| 2342 | any RCU failure can result in arbitrary memory corruption that can be | ||
| 2343 | extremely difficult to debug. | ||
| 2344 | This means that RCU must be extremely reliable, which in | ||
| 2345 | practice also means that RCU must have an aggressive stress-test | ||
| 2346 | suite. | ||
| 2347 | This stress-test suite is called <tt>rcutorture</tt>. | ||
| 2348 | |||
| 2349 | <p> | ||
| 2350 | Although the need for <tt>rcutorture</tt> was no surprise, | ||
| 2351 | the current immense popularity of the Linux kernel is posing | ||
| 2352 | interesting—and perhaps unprecedented—validation | ||
| 2353 | challenges. | ||
| 2354 | To see this, keep in mind that there are well over one billion | ||
| 2355 | instances of the Linux kernel running today, given Android | ||
| 2356 | smartphones, Linux-powered televisions, and servers. | ||
| 2357 | This number can be expected to increase sharply with the advent of | ||
| 2358 | the celebrated Internet of Things. | ||
| 2359 | |||
| 2360 | <p> | ||
| 2361 | Suppose that RCU contains a race condition that manifests on average | ||
| 2362 | once per million years of runtime. | ||
| 2363 | This bug will be occurring about three times per <i>day</i> across | ||
| 2364 | the installed base. | ||
| 2365 | RCU could simply hide behind hardware error rates, given that no one | ||
| 2366 | should really expect their smartphone to last for a million years. | ||
| 2367 | However, anyone taking too much comfort from this thought should | ||
| 2368 | consider the fact that in most jurisdictions, a successful multi-year | ||
| 2369 | test of a given mechanism, which might include a Linux kernel, | ||
| 2370 | suffices for a number of types of safety-critical certifications. | ||
| 2371 | In fact, rumor has it that the Linux kernel is already being used | ||
| 2372 | in production for safety-critical applications. | ||
| 2373 | I don't know about you, but I would feel quite bad if a bug in RCU | ||
| 2374 | killed someone. | ||
| 2375 | Which might explain my recent focus on validation and verification. | ||
| 2376 | |||
| 2377 | <h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> | ||
| 2378 | |||
| 2379 | <p> | ||
| 2380 | One of the more surprising things about RCU is that there are now | ||
| 2381 | no fewer than five <i>flavors</i>, or API families. | ||
| 2382 | In addition, the primary flavor that has been the sole focus up to | ||
| 2383 | this point has two different implementations, non-preemptible and | ||
| 2384 | preemptible. | ||
| 2385 | The other four flavors are listed below, with requirements for each | ||
| 2386 | described in a separate section. | ||
| 2387 | |||
| 2388 | <ol> | ||
| 2389 | <li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> | ||
| 2390 | <li> <a href="#Sched Flavor">Sched Flavor</a> | ||
| 2391 | <li> <a href="#Sleepable RCU">Sleepable RCU</a> | ||
| 2392 | <li> <a href="#Tasks RCU">Tasks RCU</a> | ||
| 2393 | </ol> | ||
| 2394 | |||
| 2395 | <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> | ||
| 2396 | |||
| 2397 | <p> | ||
| 2398 | The softirq-disable (AKA “bottom-half”, | ||
| 2399 | hence the “_bh” abbreviations) | ||
| 2400 | flavor of RCU, or <i>RCU-bh</i>, was developed by | ||
| 2401 | Dipankar Sarma to provide a flavor of RCU that could withstand the | ||
| 2402 | network-based denial-of-service attacks researched by Robert | ||
| 2403 | Olsson. | ||
| 2404 | These attacks placed so much networking load on the system | ||
| 2405 | that some of the CPUs never exited softirq execution, | ||
| 2406 | which in turn prevented those CPUs from ever executing a context switch, | ||
| 2407 | which, in the RCU implementation of that time, prevented grace periods | ||
| 2408 | from ever ending. | ||
| 2409 | The result was an out-of-memory condition and a system hang. | ||
| 2410 | |||
| 2411 | <p> | ||
| 2412 | The solution was the creation of RCU-bh, which does | ||
| 2413 | <tt>local_bh_disable()</tt> | ||
| 2414 | across its read-side critical sections, and which uses the transition | ||
| 2415 | from one type of softirq processing to another as a quiescent state | ||
| 2416 | in addition to context switch, idle, user mode, and offline. | ||
| 2417 | This means that RCU-bh grace periods can complete even when some of | ||
| 2418 | the CPUs execute in softirq indefinitely, thus allowing algorithms | ||
| 2419 | based on RCU-bh to withstand network-based denial-of-service attacks. | ||
| 2420 | |||
| 2421 | <p> | ||
| 2422 | Because | ||
| 2423 | <tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> | ||
| 2424 | disable and re-enable softirq handlers, any attempt to start a softirq | ||
| 2425 | handler during an | ||
| 2426 | RCU-bh read-side critical section will be deferred. | ||
| 2427 | In this case, <tt>rcu_read_unlock_bh()</tt> | ||
| 2428 | will invoke softirq processing, which can take considerable time. | ||
| 2429 | One can of course argue that this softirq overhead should be associated | ||
| 2430 | with the code following the RCU-bh read-side critical section rather | ||
| 2431 | than <tt>rcu_read_unlock_bh()</tt>, but the fact | ||
| 2432 | is that most profiling tools cannot be expected to make this sort | ||
| 2433 | of fine distinction. | ||
| 2434 | For example, suppose that a three-millisecond-long RCU-bh read-side | ||
| 2435 | critical section executes during a time of heavy networking load. | ||
| 2436 | There will very likely be an attempt to invoke at least one softirq | ||
| 2437 | handler during that three milliseconds, but any such invocation will | ||
| 2438 | be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. | ||
| 2439 | This can of course make it appear at first glance as if | ||
| 2440 | <tt>rcu_read_unlock_bh()</tt> was executing very slowly. | ||
| 2441 | |||
| 2442 | <p> | ||
| 2443 | The | ||
| 2444 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> | ||
| 2445 | includes | ||
| 2446 | <tt>rcu_read_lock_bh()</tt>, | ||
| 2447 | <tt>rcu_read_unlock_bh()</tt>, | ||
| 2448 | <tt>rcu_dereference_bh()</tt>, | ||
| 2449 | <tt>rcu_dereference_bh_check()</tt>, | ||
| 2450 | <tt>synchronize_rcu_bh()</tt>, | ||
| 2451 | <tt>synchronize_rcu_bh_expedited()</tt>, | ||
| 2452 | <tt>call_rcu_bh()</tt>, | ||
| 2453 | <tt>rcu_barrier_bh()</tt>, and | ||
| 2454 | <tt>rcu_read_lock_bh_held()</tt>. | ||
| 2455 | |||
| 2456 | <h3><a name="Sched Flavor">Sched Flavor</a></h3> | ||
| 2457 | |||
| 2458 | <p> | ||
| 2459 | Before preemptible RCU, waiting for an RCU grace period had the | ||
| 2460 | side effect of also waiting for all pre-existing interrupt | ||
| 2461 | and NMI handlers. | ||
| 2462 | However, there are legitimate preemptible-RCU implementations that | ||
| 2463 | do not have this property, given that any point in the code outside | ||
| 2464 | of an RCU read-side critical section can be a quiescent state. | ||
| 2465 | Therefore, <i>RCU-sched</i> was created, which follows “classic” | ||
| 2466 | RCU in that an RCU-sched grace period waits for pre-existing | ||
| 2467 | interrupt and NMI handlers. | ||
| 2468 | In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched | ||
| 2469 | APIs have identical implementations, while kernels built with | ||
| 2470 | <tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. | ||
| 2471 | |||
| 2472 | <p> | ||
| 2473 | Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, | ||
| 2474 | <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> | ||
| 2475 | disable and re-enable preemption, respectively. | ||
| 2476 | This means that if there was a preemption attempt during the | ||
| 2477 | RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> | ||
| 2478 | will enter the scheduler, with all the attendant latency and overhead. | ||
| 2479 | Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look | ||
| 2480 | as if <tt>rcu_read_unlock_sched()</tt> were executing very slowly. | ||
| 2481 | However, the highest-priority task won't be preempted, so that task | ||
| 2482 | will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. | ||
| 2483 | |||
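| | <p> | ||
| | A minimal RCU-sched reader resembles its RCU-bh counterpart (again, | ||
| | <tt>gp</tt> and <tt>do_something_with()</tt> are hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | 1 rcu_read_lock_sched(); | ||
| | 2 p = rcu_dereference_sched(gp); /* gp is hypothetical. */ | ||
| | 3 if (p) | ||
| | 4 do_something_with(p); /* Hypothetical helper. */ | ||
| | 5 rcu_read_unlock_sched(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||
| | <p> | ||
| | A corresponding updater would invoke <tt>synchronize_sched()</tt> or | ||
| | <tt>call_rcu_sched()</tt> before freeing the element. | ||
| | | ||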
| 2484 | <p> | ||
| 2485 | The | ||
| 2486 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> | ||
| 2487 | includes | ||
| 2488 | <tt>rcu_read_lock_sched()</tt>, | ||
| 2489 | <tt>rcu_read_unlock_sched()</tt>, | ||
| 2490 | <tt>rcu_read_lock_sched_notrace()</tt>, | ||
| 2491 | <tt>rcu_read_unlock_sched_notrace()</tt>, | ||
| 2492 | <tt>rcu_dereference_sched()</tt>, | ||
| 2493 | <tt>rcu_dereference_sched_check()</tt>, | ||
| 2494 | <tt>synchronize_sched()</tt>, | ||
| 2495 | <tt>synchronize_rcu_sched_expedited()</tt>, | ||
| 2496 | <tt>call_rcu_sched()</tt>, | ||
| 2497 | <tt>rcu_barrier_sched()</tt>, and | ||
| 2498 | <tt>rcu_read_lock_sched_held()</tt>. | ||
| 2499 | However, anything that disables preemption also marks an RCU-sched | ||
| 2500 | read-side critical section, including | ||
| 2501 | <tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, | ||
| 2502 | <tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, | ||
| 2503 | and so on. | ||
| 2504 | |||
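| | <p> | ||
| | For example, the following sketch is a legitimate RCU-sched read-side | ||
| | critical section even though it never invokes | ||
| | <tt>rcu_read_lock_sched()</tt> (as before, <tt>gp</tt> and | ||
| | <tt>do_something_with()</tt> are hypothetical): | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | 1 preempt_disable(); /* Implicitly begins the critical section. */ | ||
| | 2 p = rcu_dereference_sched(gp); | ||
| | 3 if (p) | ||
| | 4 do_something_with(p); | ||
| | 5 preempt_enable(); /* Implicitly ends it. */ | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||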
| 2505 | <h3><a name="Sleepable RCU">Sleepable RCU</a></h3> | ||
| 2506 | |||
| 2507 | <p> | ||
| 2508 | For well over a decade, someone saying “I need to block within | ||
| 2509 | an RCU read-side critical section” was a reliable indication | ||
| 2510 | that this someone did not understand RCU. | ||
| 2511 | After all, if you are always blocking in an RCU read-side critical | ||
| 2512 | section, you can probably afford to use a higher-overhead synchronization | ||
| 2513 | mechanism. | ||
| 2514 | However, that changed with the advent of the Linux kernel's notifiers, | ||
| 2515 | whose RCU read-side critical | ||
| 2516 | sections almost never sleep, but sometimes need to. | ||
| 2517 | This resulted in the introduction of | ||
| 2518 | <a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, | ||
| 2519 | or <i>SRCU</i>. | ||
| 2520 | |||
| 2521 | <p> | ||
| 2522 | SRCU allows different domains to be defined, with each such domain | ||
| 2523 | defined by an instance of an <tt>srcu_struct</tt> structure. | ||
| 2524 | A pointer to this structure must be passed in to each SRCU function, | ||
| 2525 | for example, <tt>synchronize_srcu(&ss)</tt>, where | ||
| 2526 | <tt>ss</tt> is the <tt>srcu_struct</tt> structure. | ||
| 2527 | The key benefit of these domains is that a slow SRCU reader in one | ||
| 2528 | domain does not delay an SRCU grace period in some other domain. | ||
| 2529 | That said, one consequence of these domains is that read-side code | ||
| 2530 | must pass a “cookie” from <tt>srcu_read_lock()</tt> | ||
| 2531 | to <tt>srcu_read_unlock()</tt>, for example, as follows: | ||
| 2532 | |||
| 2533 | <blockquote> | ||
| 2534 | <pre> | ||
| 2535 | 1 int idx; | ||
| 2536 | 2 | ||
| 2537 | 3 idx = srcu_read_lock(&ss); | ||
| 2538 | 4 do_something(); | ||
| 2539 | 5 srcu_read_unlock(&ss, idx); | ||
| 2540 | </pre> | ||
| 2541 | </blockquote> | ||
| 2542 | |||
| 2543 | <p> | ||
| 2544 | As noted above, it is legal to block within SRCU read-side critical | ||
| 2545 | sections; however, with great power comes great responsibility. | ||
| 2546 | If you block forever in one of a given domain's SRCU read-side critical | ||
| 2547 | sections, then that domain's grace periods will also be blocked forever. | ||
| 2548 | Of course, one good way to block forever is to deadlock, which can | ||
| 2549 | happen if any operation in a given domain's SRCU read-side critical | ||
| 2550 | section can block waiting, either directly or indirectly, for that domain's | ||
| 2551 | grace period to elapse. | ||
| 2552 | For example, this results in a self-deadlock: | ||
| 2553 | |||
| 2554 | <blockquote> | ||
| 2555 | <pre> | ||
| 2556 | 1 int idx; | ||
| 2557 | 2 | ||
| 2558 | 3 idx = srcu_read_lock(&ss); | ||
| 2559 | 4 do_something(); | ||
| 2560 | 5 synchronize_srcu(&ss); | ||
| 2561 | 6 srcu_read_unlock(&ss, idx); | ||
| 2562 | </pre> | ||
| 2563 | </blockquote> | ||
| 2564 | |||
| 2565 | <p> | ||
| 2566 | However, if line 5 acquired a mutex that was held across | ||
| 2567 | a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, | ||
| 2568 | deadlock would still be possible. | ||
| 2569 | Furthermore, if line 5 acquired a mutex that was held across | ||
| 2570 | a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, | ||
| 2571 | and if an <tt>ss1</tt>-domain SRCU read-side critical section | ||
| 2572 | acquired another mutex that was held across an <tt>ss</tt>-domain | ||
| 2573 | <tt>synchronize_srcu()</tt>, | ||
| 2574 | deadlock would again be possible. | ||
| 2575 | Such a deadlock cycle could extend across an arbitrarily large number | ||
| 2576 | of different SRCU domains. | ||
| 2577 | Again, with great power comes great responsibility. | ||
| 2578 | |||
| 2579 | <p> | ||
| 2580 | Unlike the other RCU flavors, SRCU read-side critical sections can | ||
| 2581 | run on idle and even offline CPUs. | ||
| 2582 | This ability requires that <tt>srcu_read_lock()</tt> and | ||
| 2583 | <tt>srcu_read_unlock()</tt> contain memory barriers, which means | ||
| 2584 | that SRCU readers will run a bit slower than would RCU readers. | ||
| 2585 | It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> | ||
| 2586 | API, which, in combination with <tt>srcu_read_unlock()</tt>, | ||
| 2587 | guarantees a full memory barrier. | ||
| 2588 | |||
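| | <p> | ||
| | For example, in this sketch (with <tt>do_something()</tt> and | ||
| | <tt>do_something_else()</tt> hypothetical), lines 3 and 4 together | ||
| | order everything before line 3 against everything after line 4: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | 1 idx = srcu_read_lock(&ss); | ||
| | 2 do_something(); | ||
| | 3 srcu_read_unlock(&ss, idx); | ||
| | 4 smp_mb__after_srcu_read_unlock(); /* Full barrier. */ | ||
| | 5 do_something_else(); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||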
| 2589 | <p> | ||
| 2590 | The | ||
| 2591 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> | ||
| 2592 | includes | ||
| 2593 | <tt>srcu_read_lock()</tt>, | ||
| 2594 | <tt>srcu_read_unlock()</tt>, | ||
| 2595 | <tt>srcu_dereference()</tt>, | ||
| 2596 | <tt>srcu_dereference_check()</tt>, | ||
| 2597 | <tt>synchronize_srcu()</tt>, | ||
| 2598 | <tt>synchronize_srcu_expedited()</tt>, | ||
| 2599 | <tt>call_srcu()</tt>, | ||
| 2600 | <tt>srcu_barrier()</tt>, and | ||
| 2601 | <tt>srcu_read_lock_held()</tt>. | ||
| 2602 | It also includes | ||
| 2603 | <tt>DEFINE_SRCU()</tt>, | ||
| 2604 | <tt>DEFINE_STATIC_SRCU()</tt>, and | ||
| 2605 | <tt>init_srcu_struct()</tt> | ||
| 2606 | APIs for defining and initializing <tt>srcu_struct</tt> structures. | ||
| 2607 | |||
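| | <p> | ||
| | As a sketch, a statically allocated domain might be defined and used | ||
| | as follows, where <tt>my_srcu</tt>, <tt>gp</tt>, and | ||
| | <tt>do_something_with()</tt> are hypothetical names: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | 1 DEFINE_STATIC_SRCU(my_srcu); | ||
| | 2 | ||
| | 3 int idx; | ||
| | 4 | ||
| | 5 idx = srcu_read_lock(&my_srcu); | ||
| | 6 do_something_with(srcu_dereference(gp, &my_srcu)); | ||
| | 7 srcu_read_unlock(&my_srcu, idx); | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||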
| 2608 | <h3><a name="Tasks RCU">Tasks RCU</a></h3> | ||
| 2609 | |||
| 2610 | <p> | ||
| 2611 | Some forms of tracing use “trampolines” to handle the | ||
| 2612 | binary rewriting required to install different types of probes. | ||
| 2613 | It would be good to be able to free old trampolines, which sounds | ||
| 2614 | like a job for some form of RCU. | ||
| 2615 | However, because it is necessary to be able to install a trace | ||
| 2616 | anywhere in the code, it is not possible to use read-side markers | ||
| 2617 | such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. | ||
| 2618 | In addition, it does not work to have these markers in the trampoline | ||
| 2619 | itself, because there would need to be instructions following | ||
| 2620 | <tt>rcu_read_unlock()</tt>. | ||
| 2621 | Although <tt>synchronize_rcu()</tt> would guarantee that execution | ||
| 2622 | reached the <tt>rcu_read_unlock()</tt>, it would not be able to | ||
| 2623 | guarantee that execution had completely left the trampoline. | ||
| 2624 | |||
| 2625 | <p> | ||
| 2626 | The solution, in the form of | ||
| 2627 | <a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, | ||
| 2628 | is to have implicit | ||
| 2629 | read-side critical sections that are delimited by voluntary context | ||
| 2630 | switches, that is, calls to <tt>schedule()</tt>, | ||
| 2631 | <tt>cond_resched_rcu_qs()</tt>, and | ||
| 2632 | <tt>synchronize_rcu_tasks()</tt>. | ||
| 2633 | In addition, transitions to and from userspace execution also delimit | ||
| 2634 | tasks-RCU read-side critical sections. | ||
| 2635 | |||
| 2636 | <p> | ||
| 2637 | The tasks-RCU API is quite compact, consisting only of | ||
| 2638 | <tt>call_rcu_tasks()</tt>, | ||
| 2639 | <tt>synchronize_rcu_tasks()</tt>, and | ||
| 2640 | <tt>rcu_barrier_tasks()</tt>. | ||
| 2641 | |||
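| | <p> | ||
| | For example, a tracer might retire a trampoline as in the following | ||
| | sketch, where <tt>remove_probes_pointing_at()</tt> and | ||
| | <tt>free_trampoline()</tt> are hypothetical helpers: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | 1 remove_probes_pointing_at(tramp); /* Hypothetical. */ | ||
| | 2 synchronize_rcu_tasks(); /* Wait for tasks to leave tramp. */ | ||
| | 3 free_trampoline(tramp); /* Hypothetical. */ | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||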
| 2642 | <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> | ||
| 2643 | |||
| 2644 | <p> | ||
| 2645 | One of the tricks that RCU uses to attain update-side scalability is | ||
| 2646 | to increase grace-period latency with increasing numbers of CPUs. | ||
| 2647 | If this becomes a serious problem, it will be necessary to rework the | ||
| 2648 | grace-period state machine so as to avoid the need for the additional | ||
| 2649 | latency. | ||
| 2650 | |||
| 2651 | <p> | ||
| 2652 | Expedited grace periods scan the CPUs, so their latency and overhead | ||
| 2653 | increase with increasing numbers of CPUs. | ||
| 2654 | If this becomes a serious problem on large systems, it will be necessary | ||
| 2655 | to do some redesign to avoid this scalability problem. | ||
| 2656 | |||
| 2657 | <p> | ||
| 2658 | RCU disables CPU hotplug in a few places, perhaps most notably in the | ||
| 2659 | expedited grace-period and <tt>rcu_barrier()</tt> operations. | ||
| 2660 | If there is a strong reason to use expedited grace periods in CPU-hotplug | ||
| 2661 | notifiers, it will be necessary to avoid disabling CPU hotplug. | ||
| 2662 | This would introduce some complexity, so there had better be a <i>very</i> | ||
| 2663 | good reason. | ||
| 2664 | |||
| 2665 | <p> | ||
| 2666 | The tradeoff between grace-period latency on the one hand and interruptions | ||
| 2667 | of other CPUs on the other hand may need to be re-examined. | ||
| 2668 | The desire is of course for zero grace-period latency as well as zero | ||
| 2669 | interprocessor interrupts undertaken during an expedited grace period | ||
| 2670 | operation. | ||
| 2671 | While this ideal is unlikely to be achievable, it is quite possible that | ||
| 2672 | further improvements can be made. | ||
| 2673 | |||
| 2674 | <p> | ||
| 2675 | The multiprocessor implementations of RCU use a combining tree that | ||
| 2676 | groups CPUs so as to reduce lock contention and increase cache locality. | ||
| 2677 | However, this combining tree does not spread its memory across NUMA | ||
| 2678 | nodes nor does it align the CPU groups with hardware features such | ||
| 2679 | as sockets or cores. | ||
| 2680 | Such spreading and alignment is currently believed to be unnecessary | ||
| 2681 | because the hotpath read-side primitives do not access the combining | ||
| 2682 | tree, nor does <tt>call_rcu()</tt> in the common case. | ||
| 2683 | If you believe that your architecture needs such spreading and alignment, | ||
| 2684 | then your architecture should also benefit from the | ||
| 2685 | <tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set | ||
| 2686 | to the number of CPUs in a socket, NUMA node, or whatever. | ||
| 2687 | If the number of CPUs is too large, use a fraction of the number of | ||
| 2688 | CPUs. | ||
| 2689 | If the number of CPUs is a large prime number, well, that certainly | ||
| 2690 | is an “interesting” architectural choice! | ||
| 2691 | More flexible arrangements might be considered, but only if | ||
| 2692 | <tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only | ||
| 2693 | if the inadequacy has been demonstrated by a carefully run and | ||
| 2694 | realistic system-level workload. | ||
| 2695 | |||
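| | <p> | ||
| | For example, on a hypothetical system with 16 CPUs per socket, one | ||
| | might boot with: | ||
| | | ||
| | <blockquote> | ||
| | <pre> | ||
| | rcutree.rcu_fanout_leaf=16 | ||
| | </pre> | ||
| | </blockquote> | ||
| | | ||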
| 2696 | <p> | ||
| 2697 | Please note that arrangements that require RCU to remap CPU numbers will | ||
| 2698 | require extremely good demonstration of need and full exploration of | ||
| 2699 | alternatives. | ||
| 2700 | |||
| 2701 | <p> | ||
| 2702 | There is an embarrassingly large number of flavors of RCU, and this | ||
| 2703 | number has been increasing over time. | ||
| 2704 | Perhaps it will be possible to combine some at some future date. | ||
| 2705 | |||
| 2706 | <p> | ||
| 2707 | RCU's various kthreads are reasonably recent additions. | ||
| 2708 | It is quite likely that adjustments will be required to more gracefully | ||
| 2709 | handle extreme loads. | ||
| 2710 | It might also be necessary to be able to relate CPU utilization by | ||
| 2711 | RCU's kthreads and softirq handlers to the code that instigated this | ||
| 2712 | CPU utilization. | ||
| 2713 | For example, RCU callback overhead might be charged back to the | ||
| 2714 | originating <tt>call_rcu()</tt> instance, though probably not | ||
| 2715 | in production kernels. | ||
| 2716 | |||
| 2717 | <h2><a name="Summary">Summary</a></h2> | ||
| 2718 | |||
| 2719 | <p> | ||
| 2720 | This document has presented more than two decades' worth of RCU | ||
| 2721 | requirements. | ||
| 2722 | Given that the requirements keep changing, this will not be the last | ||
| 2723 | word on this subject, but at least it serves to get an important | ||
| 2724 | subset of the requirements set forth. | ||
| 2725 | |||
| 2726 | <h2><a name="Acknowledgments">Acknowledgments</a></h2> | ||
| 2727 | |||
| 2728 | I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, | ||
| 2729 | Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and | ||
| 2730 | Andy Lutomirski for their help in rendering | ||
| 2731 | this article human readable, and to Michelle Rankin for her support | ||
| 2732 | of this effort. | ||
| 2733 | Other contributions are acknowledged in the Linux kernel's git archive. | ||
| 2734 | The cartoon is copyright (c) 2013 by Melissa Broussard, | ||
| 2735 | and is provided | ||
| 2736 | under the terms of the Creative Commons Attribution-Share Alike 3.0 | ||
| 2737 | United States license. | ||
| 2738 | |||
| 2739 | <p>@@QQAL@@ | ||
| 2740 | |||
| 2741 | </body></html> | ||
diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh new file mode 100755 index 000000000000..d354f069559b --- /dev/null +++ b/Documentation/RCU/Design/htmlqqz.sh | |||
| @@ -0,0 +1,108 @@ | |||
| 1 | #!/bin/sh | ||
| 2 | # | ||
| 3 | # Usage: sh htmlqqz.sh file | ||
| 4 | # | ||
| 5 | # Extracts and converts quick quizzes in a proto-HTML document file.htmlx. | ||
| 6 | # Commands, all of which must be on a line by themselves: | ||
| 7 | # | ||
| 8 | # "<p>@@QQ@@": Start of a quick quiz. | ||
| 9 | # "<p>@@QQA@@": Start of a quick-quiz answer. | ||
| 10 | # "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. | ||
| 11 | # "<p>@@QQAL@@": Place to put quick-quiz answer list. | ||
| 12 | # | ||
| 13 | # Places the result in file.html. | ||
| 14 | # | ||
| 15 | # This program is free software; you can redistribute it and/or modify | ||
| 16 | # it under the terms of the GNU General Public License as published by | ||
| 17 | # the Free Software Foundation; either version 2 of the License, or | ||
| 18 | # (at your option) any later version. | ||
| 19 | # | ||
| 20 | # This program is distributed in the hope that it will be useful, | ||
| 21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 23 | # GNU General Public License for more details. | ||
| 24 | # | ||
| 25 | # You should have received a copy of the GNU General Public License | ||
| 26 | # along with this program; if not, you can access it online at | ||
| 27 | # http://www.gnu.org/licenses/gpl-2.0.html. | ||
| 28 | # | ||
| 29 | # Copyright (c) 2013 Paul E. McKenney, IBM Corporation. | ||
| 30 | |||
| 31 | fn=$1 | ||
| 32 | if test ! -r $fn.htmlx | ||
| 33 | then | ||
| 34 | echo "Error: $fn.htmlx unreadable." | ||
| 35 | exit 1 | ||
| 36 | fi | ||
| 37 | |||
| 38 | echo "<!-- DO NOT HAND EDIT. -->" > $fn.html | ||
| 39 | echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html | ||
| 40 | awk < $fn.htmlx >> $fn.html ' | ||
| 41 | |||
| 42 | state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" { | ||
| 43 | print $0; | ||
| 44 | if ($0 ~ /^<p>@@QQ/) | ||
| 45 | print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr" | ||
| 46 | next; | ||
| 47 | } | ||
| 48 | |||
| 49 | state == "" && $1 == "<p>@@QQ@@" { | ||
| 50 | qqn++; | ||
| 51 | qqlineno = NR; | ||
| 52 | haveqq = 1; | ||
| 53 | state = "qq"; | ||
| 54 | print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>" | ||
| 55 | next; | ||
| 56 | } | ||
| 57 | |||
| 58 | state == "qq" && $1 != "<p>@@QQA@@" { | ||
| 59 | qq[qqn] = qq[qqn] $0 "\n"; | ||
| 60 | print $0 | ||
| 61 | if ($0 ~ /^<p>@@QQ/) | ||
| 62 | print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr" | ||
| 63 | next; | ||
| 64 | } | ||
| 65 | |||
| 66 | state == "qq" && $1 == "<p>@@QQA@@" { | ||
| 67 | state = "qqa"; | ||
| 68 | print "<br><a href=\"#qq" qqn "answer\">Answer</a>" | ||
| 69 | next; | ||
| 70 | } | ||
| 71 | |||
| 72 | state == "qqa" && $1 != "<p>@@QQE@@" { | ||
| 73 | qqa[qqn] = qqa[qqn] $0 "\n"; | ||
| 74 | if ($0 ~ /^<p>@@QQ/) | ||
| 75 | print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr" | ||
| 76 | next; | ||
| 77 | } | ||
| 78 | |||
| 79 | state == "qqa" && $1 == "<p>@@QQE@@" { | ||
| 80 | state = ""; | ||
| 81 | next; | ||
| 82 | } | ||
| 83 | |||
| 84 | state == "" && $1 == "<p>@@QQAL@@" { | ||
| 85 | haveqq = ""; | ||
| 86 | print "<h3><a name=\"Answers to Quick Quizzes\">" | ||
| 87 | print "Answers to Quick Quizzes</a></h3>" | ||
| 88 | print ""; | ||
| 89 | for (i = 1; i <= qqn; i++) { | ||
| 90 | print "<a name=\"qq" i "answer\"></a>" | ||
| 91 | print "<p><b>Quick Quiz " i "</b>:" | ||
| 92 | print qq[i]; | ||
| 93 | print ""; | ||
| 94 | print "</p><p><b>Answer</b>:" | ||
| 95 | print qqa[i]; | ||
| 96 | print ""; | ||
| 97 | print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>" | ||
| 98 | print ""; | ||
| 99 | } | ||
| 100 | next; | ||
| 101 | } | ||
| 102 | |||
| 103 | END { | ||
| 104 | if (state != "") | ||
| 105 | print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" | ||
| 106 | else if (haveqq) | ||
| 107 | print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr" | ||
| 108 | }' | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d18fc8..d8186da15ca1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
| @@ -3296,18 +3296,35 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
| 3296 | rcutorture.verbose= [KNL] | 3296 | rcutorture.verbose= [KNL] |
| 3297 | Enable additional printk() statements. | 3297 | Enable additional printk() statements. |
| 3298 | 3298 | ||
| 3299 | rcupdate.rcu_cpu_stall_suppress= [KNL] | ||
| 3300 | Suppress RCU CPU stall warning messages. | ||
| 3301 | |||
| 3302 | rcupdate.rcu_cpu_stall_timeout= [KNL] | ||
| 3303 | Set timeout for RCU CPU stall warning messages. | ||
| 3304 | |||
| 3299 | rcupdate.rcu_expedited= [KNL] | 3305 | rcupdate.rcu_expedited= [KNL] |
| 3300 | Use expedited grace-period primitives, for | 3306 | Use expedited grace-period primitives, for |
| 3301 | example, synchronize_rcu_expedited() instead | 3307 | example, synchronize_rcu_expedited() instead |
| 3302 | of synchronize_rcu(). This reduces latency, | 3308 | of synchronize_rcu(). This reduces latency, |
| 3303 | but can increase CPU utilization, degrade | 3309 | but can increase CPU utilization, degrade |
| 3304 | real-time latency, and degrade energy efficiency. | 3310 | real-time latency, and degrade energy efficiency. |
| 3305 | 3311 | No effect on CONFIG_TINY_RCU kernels. | |
| 3306 | rcupdate.rcu_cpu_stall_suppress= [KNL] | 3312 | |
| 3307 | Suppress RCU CPU stall warning messages. | 3313 | rcupdate.rcu_normal= [KNL] |
| 3308 | 3314 | Use only normal grace-period primitives, | |
| 3309 | rcupdate.rcu_cpu_stall_timeout= [KNL] | 3315 | for example, synchronize_rcu() instead of |
| 3310 | Set timeout for RCU CPU stall warning messages. | 3316 | synchronize_rcu_expedited(). This improves |
| 3317 | real-time latency, CPU utilization, and | ||
| 3318 | energy efficiency, but can expose users to | ||
| 3319 | increased grace-period latency. This parameter | ||
| 3320 | overrides rcupdate.rcu_expedited. No effect on | ||
| 3321 | CONFIG_TINY_RCU kernels. | ||
| 3322 | |||
| 3323 | rcupdate.rcu_normal_after_boot= [KNL] | ||
| 3324 | Once boot has completed (that is, after | ||
| 3325 | rcu_end_inkernel_boot() has been invoked), use | ||
| 3326 | only normal grace-period primitives. No effect | ||
| 3327 | on CONFIG_TINY_RCU kernels. | ||
| 3311 | 3328 | ||
| 3312 | rcupdate.rcu_task_stall_timeout= [KNL] | 3329 | rcupdate.rcu_task_stall_timeout= [KNL] |
| 3313 | Set timeout in jiffies for RCU task stall warning | 3330 | Set timeout in jiffies for RCU task stall warning |
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index aef9487303d0..85304ebd187c 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
| @@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU: | |||
| 194 | (*) On any given CPU, dependent memory accesses will be issued in order, with | 194 | (*) On any given CPU, dependent memory accesses will be issued in order, with |
| 195 | respect to itself. This means that for: | 195 | respect to itself. This means that for: |
| 196 | 196 | ||
| 197 | WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); | 197 | Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); |
| 198 | 198 | ||
| 199 | the CPU will issue the following memory operations: | 199 | the CPU will issue the following memory operations: |
| 200 | 200 | ||
| @@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU: | |||
| 202 | 202 | ||
| 203 | and always in that order. On most systems, smp_read_barrier_depends() | 203 | and always in that order. On most systems, smp_read_barrier_depends() |
| 204 | does nothing, but it is required for DEC Alpha. The READ_ONCE() | 204 | does nothing, but it is required for DEC Alpha. The READ_ONCE() |
| 205 | and WRITE_ONCE() are required to prevent compiler mischief. Please | 205 | is required to prevent compiler mischief. Please note that you |
| 206 | note that you should normally use something like rcu_dereference() | 206 | should normally use something like rcu_dereference() instead of |
| 207 | instead of open-coding smp_read_barrier_depends(). | 207 | open-coding smp_read_barrier_depends(). |
| 208 | 208 | ||
| 209 | (*) Overlapping loads and stores within a particular CPU will appear to be | 209 | (*) Overlapping loads and stores within a particular CPU will appear to be |
| 210 | ordered within that CPU. This means that for: | 210 | ordered within that CPU. This means that for: |
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 5381a728d23e..e5139402e7f8 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c | |||
| @@ -133,6 +133,12 @@ static void sysrq_handle_crash(int key) | |||
| 133 | { | 133 | { |
| 134 | char *killer = NULL; | 134 | char *killer = NULL; |
| 135 | 135 | ||
| 136 | /* we need to release the RCU read lock here, | ||
| 137 | * otherwise we get an annoying | ||
| 138 | * 'BUG: sleeping function called from invalid context' | ||
| 139 | * complaint from the kernel before the panic. | ||
| 140 | */ | ||
| 141 | rcu_read_unlock(); | ||
| 136 | panic_on_oops = 1; /* force panic */ | 142 | panic_on_oops = 1; /* force panic */ |
| 137 | wmb(); | 143 | wmb(); |
| 138 | *killer = 1; | 144 | *killer = 1; |
diff --git a/include/linux/list.h b/include/linux/list.h index 993395a2e55c..5356f4d661a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | 24 | ||
| 25 | static inline void INIT_LIST_HEAD(struct list_head *list) | 25 | static inline void INIT_LIST_HEAD(struct list_head *list) |
| 26 | { | 26 | { |
| 27 | list->next = list; | 27 | WRITE_ONCE(list->next, list); |
| 28 | list->prev = list; | 28 | list->prev = list; |
| 29 | } | 29 | } |
| 30 | 30 | ||
| @@ -42,7 +42,7 @@ static inline void __list_add(struct list_head *new, | |||
| 42 | next->prev = new; | 42 | next->prev = new; |
| 43 | new->next = next; | 43 | new->next = next; |
| 44 | new->prev = prev; | 44 | new->prev = prev; |
| 45 | prev->next = new; | 45 | WRITE_ONCE(prev->next, new); |
| 46 | } | 46 | } |
| 47 | #else | 47 | #else |
| 48 | extern void __list_add(struct list_head *new, | 48 | extern void __list_add(struct list_head *new, |
| @@ -186,7 +186,7 @@ static inline int list_is_last(const struct list_head *list, | |||
| 186 | */ | 186 | */ |
| 187 | static inline int list_empty(const struct list_head *head) | 187 | static inline int list_empty(const struct list_head *head) |
| 188 | { | 188 | { |
| 189 | return head->next == head; | 189 | return READ_ONCE(head->next) == head; |
| 190 | } | 190 | } |
| 191 | 191 | ||
| 192 | /** | 192 | /** |
| @@ -608,7 +608,7 @@ static inline int hlist_unhashed(const struct hlist_node *h) | |||
| 608 | 608 | ||
| 609 | static inline int hlist_empty(const struct hlist_head *h) | 609 | static inline int hlist_empty(const struct hlist_head *h) |
| 610 | { | 610 | { |
| 611 | return !h->first; | 611 | return !READ_ONCE(h->first); |
| 612 | } | 612 | } |
| 613 | 613 | ||
| 614 | static inline void __hlist_del(struct hlist_node *n) | 614 | static inline void __hlist_del(struct hlist_node *n) |
| @@ -642,7 +642,7 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) | |||
| 642 | n->next = first; | 642 | n->next = first; |
| 643 | if (first) | 643 | if (first) |
| 644 | first->pprev = &n->next; | 644 | first->pprev = &n->next; |
| 645 | h->first = n; | 645 | WRITE_ONCE(h->first, n); |
| 646 | n->pprev = &h->first; | 646 | n->pprev = &h->first; |
| 647 | } | 647 | } |
| 648 | 648 | ||
| @@ -653,14 +653,14 @@ static inline void hlist_add_before(struct hlist_node *n, | |||
| 653 | n->pprev = next->pprev; | 653 | n->pprev = next->pprev; |
| 654 | n->next = next; | 654 | n->next = next; |
| 655 | next->pprev = &n->next; | 655 | next->pprev = &n->next; |
| 656 | *(n->pprev) = n; | 656 | WRITE_ONCE(*(n->pprev), n); |
| 657 | } | 657 | } |
| 658 | 658 | ||
| 659 | static inline void hlist_add_behind(struct hlist_node *n, | 659 | static inline void hlist_add_behind(struct hlist_node *n, |
| 660 | struct hlist_node *prev) | 660 | struct hlist_node *prev) |
| 661 | { | 661 | { |
| 662 | n->next = prev->next; | 662 | n->next = prev->next; |
| 663 | prev->next = n; | 663 | WRITE_ONCE(prev->next, n); |
| 664 | n->pprev = &prev->next; | 664 | n->pprev = &prev->next; |
| 665 | 665 | ||
| 666 | if (n->next) | 666 | if (n->next) |
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 8132214e8efd..ee7229a6c06a 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h | |||
| @@ -70,7 +70,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, | |||
| 70 | 70 | ||
| 71 | static inline int hlist_bl_empty(const struct hlist_bl_head *h) | 71 | static inline int hlist_bl_empty(const struct hlist_bl_head *h) |
| 72 | { | 72 | { |
| 73 | return !((unsigned long)h->first & ~LIST_BL_LOCKMASK); | 73 | return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); |
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | static inline void hlist_bl_add_head(struct hlist_bl_node *n, | 76 | static inline void hlist_bl_add_head(struct hlist_bl_node *n, |
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 444d2b1313bd..b01fe1009084 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h | |||
| @@ -57,7 +57,7 @@ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) | |||
| 57 | 57 | ||
| 58 | static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) | 58 | static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) |
| 59 | { | 59 | { |
| 60 | return is_a_nulls(h->first); | 60 | return is_a_nulls(READ_ONCE(h->first)); |
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, | 63 | static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, |
diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed540986019..14ec1652daf4 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h | |||
| @@ -179,32 +179,31 @@ static inline void list_replace_rcu(struct list_head *old, | |||
| 179 | } | 179 | } |
| 180 | 180 | ||
| 181 | /** | 181 | /** |
| 182 | * list_splice_init_rcu - splice an RCU-protected list into an existing list. | 182 | * __list_splice_init_rcu - join an RCU-protected list into an existing list. |
| 183 | * @list: the RCU-protected list to splice | 183 | * @list: the RCU-protected list to splice |
| 184 | * @head: the place in the list to splice the first list into | 184 | * @prev: points to the last element of the existing list |
| 185 | * @next: points to the first element of the existing list | ||
| 185 | * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... | 186 | * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... |
| 186 | * | 187 | * |
| 187 | * @head can be RCU-read traversed concurrently with this function. | 188 | * The list pointed to by @prev and @next can be RCU-read traversed |
| 189 | * concurrently with this function. | ||
| 188 | * | 190 | * |
| 189 | * Note that this function blocks. | 191 | * Note that this function blocks. |
| 190 | * | 192 | * |
| 191 | * Important note: the caller must take whatever action is necessary to | 193 | * Important note: the caller must take whatever action is necessary to prevent |
| 192 | * prevent any other updates to @head. In principle, it is possible | 194 | * any other updates to the existing list. In principle, it is possible to |
| 193 | * to modify the list as soon as sync() begins execution. | 195 | * modify the list as soon as sync() begins execution. If this sort of thing |
| 194 | * If this sort of thing becomes necessary, an alternative version | 196 | * becomes necessary, an alternative version based on call_rcu() could be |
| 195 | * based on call_rcu() could be created. But only if -really- | 197 | * created. But only if -really- needed -- there is no shortage of RCU API |
| 196 | * needed -- there is no shortage of RCU API members. | 198 | * members. |
| 197 | */ | 199 | */ |
| 198 | static inline void list_splice_init_rcu(struct list_head *list, | 200 | static inline void __list_splice_init_rcu(struct list_head *list, |
| 199 | struct list_head *head, | 201 | struct list_head *prev, |
| 200 | void (*sync)(void)) | 202 | struct list_head *next, |
| 203 | void (*sync)(void)) | ||
| 201 | { | 204 | { |
| 202 | struct list_head *first = list->next; | 205 | struct list_head *first = list->next; |
| 203 | struct list_head *last = list->prev; | 206 | struct list_head *last = list->prev; |
| 204 | struct list_head *at = head->next; | ||
| 205 | |||
| 206 | if (list_empty(list)) | ||
| 207 | return; | ||
| 208 | 207 | ||
| 209 | /* | 208 | /* |
| 210 | * "first" and "last" tracking list, so initialize it. RCU readers | 209 | * "first" and "last" tracking list, so initialize it. RCU readers |
| @@ -231,10 +230,40 @@ static inline void list_splice_init_rcu(struct list_head *list, | |||
| 231 | * this function. | 230 | * this function. |
| 232 | */ | 231 | */ |
| 233 | 232 | ||
| 234 | last->next = at; | 233 | last->next = next; |
| 235 | rcu_assign_pointer(list_next_rcu(head), first); | 234 | rcu_assign_pointer(list_next_rcu(prev), first); |
| 236 | first->prev = head; | 235 | first->prev = prev; |
| 237 | at->prev = last; | 236 | next->prev = last; |
| 237 | } | ||
| 238 | |||
| 239 | /** | ||
| 240 | * list_splice_init_rcu - splice an RCU-protected list into an existing list, | ||
| 241 | * designed for stacks. | ||
| 242 | * @list: the RCU-protected list to splice | ||
| 243 | * @head: the place in the existing list to splice the first list into | ||
| 244 | * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... | ||
| 245 | */ | ||
| 246 | static inline void list_splice_init_rcu(struct list_head *list, | ||
| 247 | struct list_head *head, | ||
| 248 | void (*sync)(void)) | ||
| 249 | { | ||
| 250 | if (!list_empty(list)) | ||
| 251 | __list_splice_init_rcu(list, head, head->next, sync); | ||
| 252 | } | ||
| 253 | |||
| 254 | /** | ||
| 255 | * list_splice_tail_init_rcu - splice an RCU-protected list into an existing | ||
| 256 | * list, designed for queues. | ||
| 257 | * @list: the RCU-protected list to splice | ||
| 258 | * @head: the place in the existing list to splice the first list into | ||
| 259 | * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... | ||
| 260 | */ | ||
| 261 | static inline void list_splice_tail_init_rcu(struct list_head *list, | ||
| 262 | struct list_head *head, | ||
| 263 | void (*sync)(void)) | ||
| 264 | { | ||
| 265 | if (!list_empty(list)) | ||
| 266 | __list_splice_init_rcu(list, head->prev, head, sync); | ||
| 238 | } | 267 | } |
| 239 | 268 | ||
| 240 | /** | 269 | /** |
| @@ -305,6 +334,42 @@ static inline void list_splice_init_rcu(struct list_head *list, | |||
| 305 | pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) | 334 | pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) |
| 306 | 335 | ||
| 307 | /** | 336 | /** |
| 337 | * list_entry_lockless - get the struct for this entry | ||
| 338 | * @ptr: the &struct list_head pointer. | ||
| 339 | * @type: the type of the struct this is embedded in. | ||
| 340 | * @member: the name of the list_head within the struct. | ||
| 341 | * | ||
| 342 | * This primitive may safely run concurrently with the _rcu list-mutation | ||
| 343 | * primitives such as list_add_rcu(), but requires some implicit RCU | ||
| 344 | * read-side guarding. One example is running within a special | ||
| 345 | * exception-time environment where preemption is disabled and where | ||
| 346 | * lockdep cannot be invoked (in which case updaters must use RCU-sched, | ||
| 347 | * as in synchronize_sched(), call_rcu_sched(), and friends). Another | ||
| 348 | * example is when items are added to the list, but never deleted. | ||
| 349 | */ | ||
| 350 | #define list_entry_lockless(ptr, type, member) \ | ||
| 351 | container_of((typeof(ptr))lockless_dereference(ptr), type, member) | ||
| 352 | |||
| 353 | /** | ||
| 354 | * list_for_each_entry_lockless - iterate over rcu list of given type | ||
| 355 | * @pos: the type * to use as a loop cursor. | ||
| 356 | * @head: the head for your list. | ||
| 357 | * @member: the name of the list_struct within the struct. | ||
| 358 | * | ||
| 359 | * This primitive may safely run concurrently with the _rcu list-mutation | ||
| 360 | * primitives such as list_add_rcu(), but requires some implicit RCU | ||
| 361 | * read-side guarding. One example is running within a special | ||
| 362 | * exception-time environment where preemption is disabled and where | ||
| 363 | * lockdep cannot be invoked (in which case updaters must use RCU-sched, | ||
| 364 | * as in synchronize_sched(), call_rcu_sched(), and friends). Another | ||
| 365 | * example is when items are added to the list, but never deleted. | ||
| 366 | */ | ||
| 367 | #define list_for_each_entry_lockless(pos, head, member) \ | ||
| 368 | for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ | ||
| 369 | &pos->member != (head); \ | ||
| 370 | pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) | ||
| 371 | |||
| 372 | /** | ||
| 308 | * list_for_each_entry_continue_rcu - continue iteration over list of given type | 373 | * list_for_each_entry_continue_rcu - continue iteration over list of given type |
| 309 | * @pos: the type * to use as a loop cursor. | 374 | * @pos: the type * to use as a loop cursor. |
| 310 | * @head: the head for your list. | 375 | * @head: the head for your list. |
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a0189ba67fde..14e6f47ee16f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h | |||
| @@ -48,10 +48,17 @@ | |||
| 48 | 48 | ||
| 49 | #include <asm/barrier.h> | 49 | #include <asm/barrier.h> |
| 50 | 50 | ||
| 51 | #ifndef CONFIG_TINY_RCU | ||
| 51 | extern int rcu_expedited; /* for sysctl */ | 52 | extern int rcu_expedited; /* for sysctl */ |
| 53 | extern int rcu_normal; /* also for sysctl */ | ||
| 54 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 52 | 55 | ||
| 53 | #ifdef CONFIG_TINY_RCU | 56 | #ifdef CONFIG_TINY_RCU |
| 54 | /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ | 57 | /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ |
| 58 | static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ | ||
| 59 | { | ||
| 60 | return true; | ||
| 61 | } | ||
| 55 | static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ | 62 | static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ |
| 56 | { | 63 | { |
| 57 | return false; | 64 | return false; |
| @@ -65,6 +72,7 @@ static inline void rcu_unexpedite_gp(void) | |||
| 65 | { | 72 | { |
| 66 | } | 73 | } |
| 67 | #else /* #ifdef CONFIG_TINY_RCU */ | 74 | #else /* #ifdef CONFIG_TINY_RCU */ |
| 75 | bool rcu_gp_is_normal(void); /* Internal RCU use. */ | ||
| 68 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ | 76 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ |
| 69 | void rcu_expedite_gp(void); | 77 | void rcu_expedite_gp(void); |
| 70 | void rcu_unexpedite_gp(void); | 78 | void rcu_unexpedite_gp(void); |
| @@ -321,7 +329,6 @@ static inline int rcu_preempt_depth(void) | |||
| 321 | 329 | ||
| 322 | /* Internal to kernel */ | 330 | /* Internal to kernel */ |
| 323 | void rcu_init(void); | 331 | void rcu_init(void); |
| 324 | void rcu_end_inkernel_boot(void); | ||
| 325 | void rcu_sched_qs(void); | 332 | void rcu_sched_qs(void); |
| 326 | void rcu_bh_qs(void); | 333 | void rcu_bh_qs(void); |
| 327 | void rcu_check_callbacks(int user); | 334 | void rcu_check_callbacks(int user); |
| @@ -329,6 +336,12 @@ struct notifier_block; | |||
| 329 | int rcu_cpu_notify(struct notifier_block *self, | 336 | int rcu_cpu_notify(struct notifier_block *self, |
| 330 | unsigned long action, void *hcpu); | 337 | unsigned long action, void *hcpu); |
| 331 | 338 | ||
| 339 | #ifndef CONFIG_TINY_RCU | ||
| 340 | void rcu_end_inkernel_boot(void); | ||
| 341 | #else /* #ifndef CONFIG_TINY_RCU */ | ||
| 342 | static inline void rcu_end_inkernel_boot(void) { } | ||
| 343 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 344 | |||
| 332 | #ifdef CONFIG_RCU_STALL_COMMON | 345 | #ifdef CONFIG_RCU_STALL_COMMON |
| 333 | void rcu_sysrq_start(void); | 346 | void rcu_sysrq_start(void); |
| 334 | void rcu_sysrq_end(void); | 347 | void rcu_sysrq_end(void); |
| @@ -379,9 +392,9 @@ static inline void rcu_init_nohz(void) | |||
| 379 | */ | 392 | */ |
| 380 | #define RCU_NONIDLE(a) \ | 393 | #define RCU_NONIDLE(a) \ |
| 381 | do { \ | 394 | do { \ |
| 382 | rcu_irq_enter(); \ | 395 | rcu_irq_enter_irqson(); \ |
| 383 | do { a; } while (0); \ | 396 | do { a; } while (0); \ |
| 384 | rcu_irq_exit(); \ | 397 | rcu_irq_exit_irqson(); \ |
| 385 | } while (0) | 398 | } while (0) |
| 386 | 399 | ||
| 387 | /* | 400 | /* |
| @@ -741,7 +754,7 @@ static inline void rcu_preempt_sleep_check(void) | |||
| 741 | * The tracing infrastructure traces RCU (we want that), but unfortunately | 754 | * The tracing infrastructure traces RCU (we want that), but unfortunately |
| 742 | * some of the RCU checks causes tracing to lock up the system. | 755 | * some of the RCU checks causes tracing to lock up the system. |
| 743 | * | 756 | * |
| 744 | * The tracing version of rcu_dereference_raw() must not call | 757 | * The no-tracing version of rcu_dereference_raw() must not call |
| 745 | * rcu_read_lock_held(). | 758 | * rcu_read_lock_held(). |
| 746 | */ | 759 | */ |
| 747 | #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) | 760 | #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) |
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4c1aaf9cce7b..64809aea661c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h | |||
| @@ -181,6 +181,14 @@ static inline void rcu_irq_enter(void) | |||
| 181 | { | 181 | { |
| 182 | } | 182 | } |
| 183 | 183 | ||
| 184 | static inline void rcu_irq_exit_irqson(void) | ||
| 185 | { | ||
| 186 | } | ||
| 187 | |||
| 188 | static inline void rcu_irq_enter_irqson(void) | ||
| 189 | { | ||
| 190 | } | ||
| 191 | |||
| 184 | static inline void rcu_irq_exit(void) | 192 | static inline void rcu_irq_exit(void) |
| 185 | { | 193 | { |
| 186 | } | 194 | } |
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 60d15a080d7c..ad1eda9fa4da 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h | |||
| @@ -37,7 +37,7 @@ void rcu_cpu_stall_reset(void); | |||
| 37 | /* | 37 | /* |
| 38 | * Note a virtualization-based context switch. This is simply a | 38 | * Note a virtualization-based context switch. This is simply a |
| 39 | * wrapper around rcu_note_context_switch(), which allows TINY_RCU | 39 | * wrapper around rcu_note_context_switch(), which allows TINY_RCU |
| 40 | * to save a few bytes. | 40 | * to save a few bytes. The caller must have disabled interrupts. |
| 41 | */ | 41 | */ |
| 42 | static inline void rcu_virt_note_context_switch(int cpu) | 42 | static inline void rcu_virt_note_context_switch(int cpu) |
| 43 | { | 43 | { |
| @@ -97,6 +97,8 @@ void rcu_idle_enter(void); | |||
| 97 | void rcu_idle_exit(void); | 97 | void rcu_idle_exit(void); |
| 98 | void rcu_irq_enter(void); | 98 | void rcu_irq_enter(void); |
| 99 | void rcu_irq_exit(void); | 99 | void rcu_irq_exit(void); |
| 100 | void rcu_irq_enter_irqson(void); | ||
| 101 | void rcu_irq_exit_irqson(void); | ||
| 100 | 102 | ||
| 101 | void exit_rcu(void); | 103 | void exit_rcu(void); |
| 102 | 104 | ||
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 696a339c592c..7834a8a8bf1e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h | |||
| @@ -171,8 +171,8 @@ extern void syscall_unregfunc(void); | |||
| 171 | TP_PROTO(data_proto), \ | 171 | TP_PROTO(data_proto), \ |
| 172 | TP_ARGS(data_args), \ | 172 | TP_ARGS(data_args), \ |
| 173 | TP_CONDITION(cond), \ | 173 | TP_CONDITION(cond), \ |
| 174 | rcu_irq_enter(), \ | 174 | rcu_irq_enter_irqson(), \ |
| 175 | rcu_irq_exit()); \ | 175 | rcu_irq_exit_irqson()); \ |
| 176 | } | 176 | } |
| 177 | #else | 177 | #else |
| 178 | #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) | 178 | #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) |
diff --git a/init/main.c b/init/main.c index 9e64d7097f1a..c6ebefafa496 100644 --- a/init/main.c +++ b/init/main.c | |||
| @@ -943,6 +943,8 @@ static int __ref kernel_init(void *unused) | |||
| 943 | 943 | ||
| 944 | flush_delayed_fput(); | 944 | flush_delayed_fput(); |
| 945 | 945 | ||
| 946 | rcu_end_inkernel_boot(); | ||
| 947 | |||
| 946 | if (ramdisk_execute_command) { | 948 | if (ramdisk_execute_command) { |
| 947 | ret = run_init_process(ramdisk_execute_command); | 949 | ret = run_init_process(ramdisk_execute_command); |
| 948 | if (!ret) | 950 | if (!ret) |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464061..152da4a48867 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -20,7 +20,7 @@ | |||
| 20 | #include <linux/capability.h> | 20 | #include <linux/capability.h> |
| 21 | #include <linux/compiler.h> | 21 | #include <linux/compiler.h> |
| 22 | 22 | ||
| 23 | #include <linux/rcupdate.h> /* rcu_expedited */ | 23 | #include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ |
| 24 | 24 | ||
| 25 | #define KERNEL_ATTR_RO(_name) \ | 25 | #define KERNEL_ATTR_RO(_name) \ |
| 26 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | 26 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
| @@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj, | |||
| 144 | } | 144 | } |
| 145 | KERNEL_ATTR_RO(fscaps); | 145 | KERNEL_ATTR_RO(fscaps); |
| 146 | 146 | ||
| 147 | #ifndef CONFIG_TINY_RCU | ||
| 147 | int rcu_expedited; | 148 | int rcu_expedited; |
| 148 | static ssize_t rcu_expedited_show(struct kobject *kobj, | 149 | static ssize_t rcu_expedited_show(struct kobject *kobj, |
| 149 | struct kobj_attribute *attr, char *buf) | 150 | struct kobj_attribute *attr, char *buf) |
| 150 | { | 151 | { |
| 151 | return sprintf(buf, "%d\n", rcu_expedited); | 152 | return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); |
| 152 | } | 153 | } |
| 153 | static ssize_t rcu_expedited_store(struct kobject *kobj, | 154 | static ssize_t rcu_expedited_store(struct kobject *kobj, |
| 154 | struct kobj_attribute *attr, | 155 | struct kobj_attribute *attr, |
| @@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, | |||
| 161 | } | 162 | } |
| 162 | KERNEL_ATTR_RW(rcu_expedited); | 163 | KERNEL_ATTR_RW(rcu_expedited); |
| 163 | 164 | ||
| 165 | int rcu_normal; | ||
| 166 | static ssize_t rcu_normal_show(struct kobject *kobj, | ||
| 167 | struct kobj_attribute *attr, char *buf) | ||
| 168 | { | ||
| 169 | return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); | ||
| 170 | } | ||
| 171 | static ssize_t rcu_normal_store(struct kobject *kobj, | ||
| 172 | struct kobj_attribute *attr, | ||
| 173 | const char *buf, size_t count) | ||
| 174 | { | ||
| 175 | if (kstrtoint(buf, 0, &rcu_normal)) | ||
| 176 | return -EINVAL; | ||
| 177 | |||
| 178 | return count; | ||
| 179 | } | ||
| 180 | KERNEL_ATTR_RW(rcu_normal); | ||
| 181 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 182 | |||
| 164 | /* | 183 | /* |
| 165 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 184 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
| 166 | */ | 185 | */ |
| @@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = { | |||
| 202 | &kexec_crash_size_attr.attr, | 221 | &kexec_crash_size_attr.attr, |
| 203 | &vmcoreinfo_attr.attr, | 222 | &vmcoreinfo_attr.attr, |
| 204 | #endif | 223 | #endif |
| 224 | #ifndef CONFIG_TINY_RCU | ||
| 205 | &rcu_expedited_attr.attr, | 225 | &rcu_expedited_attr.attr, |
| 226 | &rcu_normal_attr.attr, | ||
| 227 | #endif | ||
| 206 | NULL | 228 | NULL |
| 207 | }; | 229 | }; |
| 208 | 230 | ||
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e260df..d2988d047d66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; | |||
| 162 | #define RTWS_SYNC 7 | 162 | #define RTWS_SYNC 7 |
| 163 | #define RTWS_STUTTER 8 | 163 | #define RTWS_STUTTER 8 |
| 164 | #define RTWS_STOPPING 9 | 164 | #define RTWS_STOPPING 9 |
| 165 | static const char * const rcu_torture_writer_state_names[] = { | ||
| 166 | "RTWS_FIXED_DELAY", | ||
| 167 | "RTWS_DELAY", | ||
| 168 | "RTWS_REPLACE", | ||
| 169 | "RTWS_DEF_FREE", | ||
| 170 | "RTWS_EXP_SYNC", | ||
| 171 | "RTWS_COND_GET", | ||
| 172 | "RTWS_COND_SYNC", | ||
| 173 | "RTWS_SYNC", | ||
| 174 | "RTWS_STUTTER", | ||
| 175 | "RTWS_STOPPING", | ||
| 176 | }; | ||
| 177 | |||
| 178 | static const char *rcu_torture_writer_state_getname(void) | ||
| 179 | { | ||
| 180 | unsigned int i = READ_ONCE(rcu_torture_writer_state); | ||
| 181 | |||
| 182 | if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) | ||
| 183 | return "???"; | ||
| 184 | return rcu_torture_writer_state_names[i]; | ||
| 185 | } | ||
| 165 | 186 | ||
| 166 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | 187 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) |
| 167 | #define RCUTORTURE_RUNNABLE_INIT 1 | 188 | #define RCUTORTURE_RUNNABLE_INIT 1 |
| @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) | |||
| 1307 | 1328 | ||
| 1308 | rcutorture_get_gp_data(cur_ops->ttype, | 1329 | rcutorture_get_gp_data(cur_ops->ttype, |
| 1309 | &flags, &gpnum, &completed); | 1330 | &flags, &gpnum, &completed); |
| 1310 | pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", | 1331 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", |
| 1332 | rcu_torture_writer_state_getname(), | ||
| 1311 | rcu_torture_writer_state, | 1333 | rcu_torture_writer_state, |
| 1312 | gpnum, completed, flags); | 1334 | gpnum, completed, flags); |
| 1313 | show_rcu_gp_kthreads(); | 1335 | show_rcu_gp_kthreads(); |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a41b..9b9cdd549caa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
| @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
| 489 | */ | 489 | */ |
| 490 | void synchronize_srcu(struct srcu_struct *sp) | 490 | void synchronize_srcu(struct srcu_struct *sp) |
| 491 | { | 491 | { |
| 492 | __synchronize_srcu(sp, rcu_gp_is_expedited() | 492 | __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) |
| 493 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | 493 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT |
| 494 | : SYNCHRONIZE_SRCU_TRYCOUNT); | 494 | : SYNCHRONIZE_SRCU_TRYCOUNT); |
| 495 | } | 495 | } |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..e41dd4131f7a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); | |||
| 68 | 68 | ||
| 69 | /* Data structures. */ | 69 | /* Data structures. */ |
| 70 | 70 | ||
| 71 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | ||
| 72 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | ||
| 73 | static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; | ||
| 74 | |||
| 75 | /* | 71 | /* |
| 76 | * In order to export the rcu_state name to the tracing tools, it | 72 | * In order to export the rcu_state name to the tracing tools, it |
| 77 | * needs to be added in the __tracepoint_string section. | 73 | * needs to be added in the __tracepoint_string section. |
| @@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
| 246 | */ | 242 | */ |
| 247 | void rcu_sched_qs(void) | 243 | void rcu_sched_qs(void) |
| 248 | { | 244 | { |
| 249 | unsigned long flags; | 245 | if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) |
| 250 | 246 | return; | |
| 251 | if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { | 247 | trace_rcu_grace_period(TPS("rcu_sched"), |
| 252 | trace_rcu_grace_period(TPS("rcu_sched"), | 248 | __this_cpu_read(rcu_sched_data.gpnum), |
| 253 | __this_cpu_read(rcu_sched_data.gpnum), | 249 | TPS("cpuqs")); |
| 254 | TPS("cpuqs")); | 250 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); |
| 255 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); | 251 | if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) |
| 256 | if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) | 252 | return; |
| 257 | return; | 253 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); |
| 258 | local_irq_save(flags); | 254 | rcu_report_exp_rdp(&rcu_sched_state, |
| 259 | if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { | 255 | this_cpu_ptr(&rcu_sched_data), true); |
| 260 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); | ||
| 261 | rcu_report_exp_rdp(&rcu_sched_state, | ||
| 262 | this_cpu_ptr(&rcu_sched_data), | ||
| 263 | true); | ||
| 264 | } | ||
| 265 | local_irq_restore(flags); | ||
| 266 | } | ||
| 267 | } | 256 | } |
| 268 | 257 | ||
| 269 | void rcu_bh_qs(void) | 258 | void rcu_bh_qs(void) |
| @@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | |||
| 300 | * We inform the RCU core by emulating a zero-duration dyntick-idle | 289 | * We inform the RCU core by emulating a zero-duration dyntick-idle |
| 301 | * period, which we in turn do by incrementing the ->dynticks counter | 290 | * period, which we in turn do by incrementing the ->dynticks counter |
| 302 | * by two. | 291 | * by two. |
| 292 | * | ||
| 293 | * The caller must have disabled interrupts. | ||
| 303 | */ | 294 | */ |
| 304 | static void rcu_momentary_dyntick_idle(void) | 295 | static void rcu_momentary_dyntick_idle(void) |
| 305 | { | 296 | { |
| 306 | unsigned long flags; | ||
| 307 | struct rcu_data *rdp; | 297 | struct rcu_data *rdp; |
| 308 | struct rcu_dynticks *rdtp; | 298 | struct rcu_dynticks *rdtp; |
| 309 | int resched_mask; | 299 | int resched_mask; |
| 310 | struct rcu_state *rsp; | 300 | struct rcu_state *rsp; |
| 311 | 301 | ||
| 312 | local_irq_save(flags); | ||
| 313 | |||
| 314 | /* | 302 | /* |
| 315 | * Yes, we can lose flag-setting operations. This is OK, because | 303 | * Yes, we can lose flag-setting operations. This is OK, because |
| 316 | * the flag will be set again after some delay. | 304 | * the flag will be set again after some delay. |
| @@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) | |||
| 340 | smp_mb__after_atomic(); /* Later stuff after QS. */ | 328 | smp_mb__after_atomic(); /* Later stuff after QS. */ |
| 341 | break; | 329 | break; |
| 342 | } | 330 | } |
| 343 | local_irq_restore(flags); | ||
| 344 | } | 331 | } |
| 345 | 332 | ||
| 346 | /* | 333 | /* |
| 347 | * Note a context switch. This is a quiescent state for RCU-sched, | 334 | * Note a context switch. This is a quiescent state for RCU-sched, |
| 348 | * and requires special handling for preemptible RCU. | 335 | * and requires special handling for preemptible RCU. |
| 349 | * The caller must have disabled preemption. | 336 | * The caller must have disabled interrupts. |
| 350 | */ | 337 | */ |
| 351 | void rcu_note_context_switch(void) | 338 | void rcu_note_context_switch(void) |
| 352 | { | 339 | { |
| @@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); | |||
| 376 | */ | 363 | */ |
| 377 | void rcu_all_qs(void) | 364 | void rcu_all_qs(void) |
| 378 | { | 365 | { |
| 366 | unsigned long flags; | ||
| 367 | |||
| 379 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 368 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
| 380 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 369 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { |
| 370 | local_irq_save(flags); | ||
| 381 | rcu_momentary_dyntick_idle(); | 371 | rcu_momentary_dyntick_idle(); |
| 372 | local_irq_restore(flags); | ||
| 373 | } | ||
| 382 | this_cpu_inc(rcu_qs_ctr); | 374 | this_cpu_inc(rcu_qs_ctr); |
| 383 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 375 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
| 384 | } | 376 | } |
| @@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) | |||
| 605 | * The caller must have disabled interrupts to prevent races with | 597 | * The caller must have disabled interrupts to prevent races with |
| 606 | * normal callback registry. | 598 | * normal callback registry. |
| 607 | */ | 599 | */ |
| 608 | static int | 600 | static bool |
| 609 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 601 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
| 610 | { | 602 | { |
| 611 | int i; | 603 | int i; |
| 612 | 604 | ||
| 613 | if (rcu_gp_in_progress(rsp)) | 605 | if (rcu_gp_in_progress(rsp)) |
| 614 | return 0; /* No, a grace period is already in progress. */ | 606 | return false; /* No, a grace period is already in progress. */ |
| 615 | if (rcu_future_needs_gp(rsp)) | 607 | if (rcu_future_needs_gp(rsp)) |
| 616 | return 1; /* Yes, a no-CBs CPU needs one. */ | 608 | return true; /* Yes, a no-CBs CPU needs one. */ |
| 617 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 609 | if (!rdp->nxttail[RCU_NEXT_TAIL]) |
| 618 | return 0; /* No, this is a no-CBs (or offline) CPU. */ | 610 | return false; /* No, this is a no-CBs (or offline) CPU. */ |
| 619 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | 611 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) |
| 620 | return 1; /* Yes, this CPU has newly registered callbacks. */ | 612 | return true; /* Yes, CPU has newly registered callbacks. */ |
| 621 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | 613 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) |
| 622 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && | 614 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && |
| 623 | ULONG_CMP_LT(READ_ONCE(rsp->completed), | 615 | ULONG_CMP_LT(READ_ONCE(rsp->completed), |
| 624 | rdp->nxtcompleted[i])) | 616 | rdp->nxtcompleted[i])) |
| 625 | return 1; /* Yes, CBs for future grace period. */ | 617 | return true; /* Yes, CBs for future grace period. */ |
| 626 | return 0; /* No grace period needed. */ | 618 | return false; /* No grace period needed. */ |
| 627 | } | 619 | } |
| 628 | 620 | ||
| 629 | /* | 621 | /* |
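
cpu_needs_another_gp() is converted from an int returning 0/1 into a proper bool predicate. The ULONG_CMP_LT() test it relies on is the kernel's wraparound-tolerant comparison for free-running counters; below is a self-contained sketch of both ideas, with CMP_LT written in the spirit of that macro (an assumption for illustration, not a copy of the kernel header):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Wrap-tolerant "a is before b": the unsigned difference a - b lands
 * in the upper half of the range exactly when a is (modularly) behind b.
 */
#define CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))

/* A bool-returning predicate, as cpu_needs_another_gp() now is. */
static bool needs_grace_period(unsigned long completed, unsigned long needed)
{
        if (CMP_LT(completed, needed))
                return true;            /* requested GP not yet done */
        return false;
}

int main(void)
{
        unsigned long completed = ULONG_MAX - 1;
        unsigned long needed = completed + 4;   /* wraps around to 2 */

        printf("%d\n", needs_grace_period(completed, needed));  /* 1 */
        printf("%d\n", needs_grace_period(needed, completed));  /* 0 */
        return 0;
}
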
| @@ -740,7 +732,7 @@ void rcu_user_enter(void) | |||
| 740 | * | 732 | * |
| 741 | * Exit from an interrupt handler, which might possibly result in entering | 733 | * Exit from an interrupt handler, which might possibly result in entering |
| 742 | * idle mode, in other words, leaving the mode in which read-side critical | 734 | * idle mode, in other words, leaving the mode in which read-side critical |
| 743 | * sections can occur. | 735 | * sections can occur. The caller must have disabled interrupts. |
| 744 | * | 736 | * |
| 745 | * This code assumes that the idle loop never does anything that might | 737 | * This code assumes that the idle loop never does anything that might |
| 746 | * result in unbalanced calls to irq_enter() and irq_exit(). If your | 738 | * result in unbalanced calls to irq_enter() and irq_exit(). If your |
| @@ -753,11 +745,10 @@ void rcu_user_enter(void) | |||
| 753 | */ | 745 | */ |
| 754 | void rcu_irq_exit(void) | 746 | void rcu_irq_exit(void) |
| 755 | { | 747 | { |
| 756 | unsigned long flags; | ||
| 757 | long long oldval; | 748 | long long oldval; |
| 758 | struct rcu_dynticks *rdtp; | 749 | struct rcu_dynticks *rdtp; |
| 759 | 750 | ||
| 760 | local_irq_save(flags); | 751 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); |
| 761 | rdtp = this_cpu_ptr(&rcu_dynticks); | 752 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 762 | oldval = rdtp->dynticks_nesting; | 753 | oldval = rdtp->dynticks_nesting; |
| 763 | rdtp->dynticks_nesting--; | 754 | rdtp->dynticks_nesting--; |
| @@ -768,6 +759,17 @@ void rcu_irq_exit(void) | |||
| 768 | else | 759 | else |
| 769 | rcu_eqs_enter_common(oldval, true); | 760 | rcu_eqs_enter_common(oldval, true); |
| 770 | rcu_sysidle_enter(1); | 761 | rcu_sysidle_enter(1); |
| 762 | } | ||
| 763 | |||
| 764 | /* | ||
| 765 | * Wrapper for rcu_irq_exit() where interrupts are enabled. | ||
| 766 | */ | ||
| 767 | void rcu_irq_exit_irqson(void) | ||
| 768 | { | ||
| 769 | unsigned long flags; | ||
| 770 | |||
| 771 | local_irq_save(flags); | ||
| 772 | rcu_irq_exit(); | ||
| 771 | local_irq_restore(flags); | 773 | local_irq_restore(flags); |
| 772 | } | 774 | } |
| 773 | 775 | ||
| @@ -865,7 +867,7 @@ void rcu_user_exit(void) | |||
| 865 | * | 867 | * |
| 866 | * Enter an interrupt handler, which might possibly result in exiting | 868 | * Enter an interrupt handler, which might possibly result in exiting |
| 867 | * idle mode, in other words, entering the mode in which read-side critical | 869 | * idle mode, in other words, entering the mode in which read-side critical |
| 868 | * sections can occur. | 870 | * sections can occur. The caller must have disabled interrupts. |
| 869 | * | 871 | * |
| 870 | * Note that the Linux kernel is fully capable of entering an interrupt | 872 | * Note that the Linux kernel is fully capable of entering an interrupt |
| 871 | * handler that it never exits, for example when doing upcalls to | 873 | * handler that it never exits, for example when doing upcalls to |
| @@ -881,11 +883,10 @@ void rcu_user_exit(void) | |||
| 881 | */ | 883 | */ |
| 882 | void rcu_irq_enter(void) | 884 | void rcu_irq_enter(void) |
| 883 | { | 885 | { |
| 884 | unsigned long flags; | ||
| 885 | struct rcu_dynticks *rdtp; | 886 | struct rcu_dynticks *rdtp; |
| 886 | long long oldval; | 887 | long long oldval; |
| 887 | 888 | ||
| 888 | local_irq_save(flags); | 889 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); |
| 889 | rdtp = this_cpu_ptr(&rcu_dynticks); | 890 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 890 | oldval = rdtp->dynticks_nesting; | 891 | oldval = rdtp->dynticks_nesting; |
| 891 | rdtp->dynticks_nesting++; | 892 | rdtp->dynticks_nesting++; |
| @@ -896,6 +897,17 @@ void rcu_irq_enter(void) | |||
| 896 | else | 897 | else |
| 897 | rcu_eqs_exit_common(oldval, true); | 898 | rcu_eqs_exit_common(oldval, true); |
| 898 | rcu_sysidle_exit(1); | 899 | rcu_sysidle_exit(1); |
| 900 | } | ||
| 901 | |||
| 902 | /* | ||
| 903 | * Wrapper for rcu_irq_enter() where interrupts are enabled. | ||
| 904 | */ | ||
| 905 | void rcu_irq_enter_irqson(void) | ||
| 906 | { | ||
| 907 | unsigned long flags; | ||
| 908 | |||
| 909 | local_irq_save(flags); | ||
| 910 | rcu_irq_enter(); | ||
| 899 | local_irq_restore(flags); | 911 | local_irq_restore(flags); |
| 900 | } | 912 | } |
| 901 | 913 | ||
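
rcu_irq_exit() and rcu_irq_enter() now assert their irqs-disabled precondition with RCU_LOCKDEP_WARN() instead of enforcing it, and the enforcement moves into the new *_irqson() wrappers for the minority of callers that run with interrupts enabled. A userspace sketch of that split, assuming a simulated interrupt flag; WARN_IF() is a stand-in for the lockdep warning, not a real kernel macro:

#include <stdbool.h>
#include <stdio.h>

static bool irqs_on = true;             /* simulated interrupt state */

/* Debug-only complaint, standing in for RCU_LOCKDEP_WARN(). */
#define WARN_IF(cond, msg)                                      \
        do {                                                    \
                if (cond)                                       \
                        fprintf(stderr, "warning: %s\n", msg);  \
        } while (0)

static void irq_hook(void)
{
        WARN_IF(irqs_on, "irq_hook() invoked with irqs enabled");
        /* ... nesting-counter work that assumes irqs are off ... */
}

/* Wrapper for the few callers that run with irqs enabled. */
static void irq_hook_irqson(void)
{
        bool flags = irqs_on;           /* like local_irq_save(flags) */

        irqs_on = false;
        irq_hook();
        irqs_on = flags;                /* like local_irq_restore(flags) */
}

int main(void)
{
        irq_hook_irqson();              /* silent: wrapper disables first */
        irq_hook();                     /* warns: precondition violated */
        return 0;
}
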
| @@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
| 1187 | } | 1199 | } |
| 1188 | 1200 | ||
| 1189 | /* | 1201 | /* |
| 1202 | * Convert a ->gp_state value to a character string. | ||
| 1203 | */ | ||
| 1204 | static const char *gp_state_getname(short gs) | ||
| 1205 | { | ||
| 1206 | if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) | ||
| 1207 | return "???"; | ||
| 1208 | return gp_state_names[gs]; | ||
| 1209 | } | ||
| 1210 | |||
| 1211 | /* | ||
| 1190 | * Complain about starvation of grace-period kthread. | 1212 | * Complain about starvation of grace-period kthread. |
| 1191 | */ | 1213 | */ |
| 1192 | static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | 1214 | static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) |
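
gp_state_getname() is a bounds-checked table lookup: any ->gp_state value outside the gp_state_names[] array (added to tree.h later in this patch) decodes to "???" rather than indexing off the end, and the starvation message below then prints both the name and the raw value as %s(%d). A sketch of the pattern with an abbreviated, illustrative name table:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Abbreviated, illustrative version of the gp_state_names[] table. */
static const char * const state_names[] = {
        "IDLE", "WAIT_GPS", "DONE_GPS", "WAIT_FQS",
        "DOING_FQS", "CLEANUP", "CLEANED",
};

/* Out-of-range states decode to "???" instead of reading off the end. */
static const char *state_getname(short gs)
{
        if (gs < 0 || gs >= (short)ARRAY_SIZE(state_names))
                return "???";
        return state_names[gs];
}

int main(void)
{
        printf("%s(%d)\n", state_getname(3), 3);        /* WAIT_FQS(3) */
        printf("%s(%d)\n", state_getname(42), 42);      /* ???(42) */
        return 0;
}
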
| @@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | |||
| 1196 | 1218 | ||
| 1197 | j = jiffies; | 1219 | j = jiffies; |
| 1198 | gpa = READ_ONCE(rsp->gp_activity); | 1220 | gpa = READ_ONCE(rsp->gp_activity); |
| 1199 | if (j - gpa > 2 * HZ) | 1221 | if (j - gpa > 2 * HZ) { |
| 1200 | pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", | 1222 | pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", |
| 1201 | rsp->name, j - gpa, | 1223 | rsp->name, j - gpa, |
| 1202 | rsp->gpnum, rsp->completed, | 1224 | rsp->gpnum, rsp->completed, |
| 1203 | rsp->gp_flags, rsp->gp_state, | 1225 | rsp->gp_flags, |
| 1204 | rsp->gp_kthread ? rsp->gp_kthread->state : 0); | 1226 | gp_state_getname(rsp->gp_state), rsp->gp_state, |
| 1227 | rsp->gp_kthread ? rsp->gp_kthread->state : ~0); | ||
| 1228 | if (rsp->gp_kthread) | ||
| 1229 | sched_show_task(rsp->gp_kthread); | ||
| 1230 | } | ||
| 1205 | } | 1231 | } |
| 1206 | 1232 | ||
| 1207 | /* | 1233 | /* |
| @@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | |||
| 1214 | struct rcu_node *rnp; | 1240 | struct rcu_node *rnp; |
| 1215 | 1241 | ||
| 1216 | rcu_for_each_leaf_node(rsp, rnp) { | 1242 | rcu_for_each_leaf_node(rsp, rnp) { |
| 1217 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1243 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1218 | if (rnp->qsmask != 0) { | 1244 | if (rnp->qsmask != 0) { |
| 1219 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 1245 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
| 1220 | if (rnp->qsmask & (1UL << cpu)) | 1246 | if (rnp->qsmask & (1UL << cpu)) |
| @@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
| 1237 | 1263 | ||
| 1238 | /* Only let one CPU complain about others per time interval. */ | 1264 | /* Only let one CPU complain about others per time interval. */ |
| 1239 | 1265 | ||
| 1240 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1266 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1241 | delta = jiffies - READ_ONCE(rsp->jiffies_stall); | 1267 | delta = jiffies - READ_ONCE(rsp->jiffies_stall); |
| 1242 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | 1268 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { |
| 1243 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1269 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| @@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
| 1256 | rsp->name); | 1282 | rsp->name); |
| 1257 | print_cpu_stall_info_begin(); | 1283 | print_cpu_stall_info_begin(); |
| 1258 | rcu_for_each_leaf_node(rsp, rnp) { | 1284 | rcu_for_each_leaf_node(rsp, rnp) { |
| 1259 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1285 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1260 | ndetected += rcu_print_task_stall(rnp); | 1286 | ndetected += rcu_print_task_stall(rnp); |
| 1261 | if (rnp->qsmask != 0) { | 1287 | if (rnp->qsmask != 0) { |
| 1262 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 1288 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
| @@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 1327 | 1353 | ||
| 1328 | rcu_dump_cpu_stacks(rsp); | 1354 | rcu_dump_cpu_stacks(rsp); |
| 1329 | 1355 | ||
| 1330 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1356 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1331 | if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) | 1357 | if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) |
| 1332 | WRITE_ONCE(rsp->jiffies_stall, | 1358 | WRITE_ONCE(rsp->jiffies_stall, |
| 1333 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | 1359 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); |
| @@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 1534 | * hold it, acquire the root rcu_node structure's lock in order to | 1560 | * hold it, acquire the root rcu_node structure's lock in order to |
| 1535 | * start one (if needed). | 1561 | * start one (if needed). |
| 1536 | */ | 1562 | */ |
| 1537 | if (rnp != rnp_root) { | 1563 | if (rnp != rnp_root) |
| 1538 | raw_spin_lock(&rnp_root->lock); | 1564 | raw_spin_lock_rcu_node(rnp_root); |
| 1539 | smp_mb__after_unlock_lock(); | ||
| 1540 | } | ||
| 1541 | 1565 | ||
| 1542 | /* | 1566 | /* |
| 1543 | * Get a new grace-period number. If there really is no grace | 1567 | * Get a new grace-period number. If there really is no grace |
| @@ -1786,11 +1810,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1786 | if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && | 1810 | if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && |
| 1787 | rdp->completed == READ_ONCE(rnp->completed) && | 1811 | rdp->completed == READ_ONCE(rnp->completed) && |
| 1788 | !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ | 1812 | !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ |
| 1789 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ | 1813 | !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ |
| 1790 | local_irq_restore(flags); | 1814 | local_irq_restore(flags); |
| 1791 | return; | 1815 | return; |
| 1792 | } | 1816 | } |
| 1793 | smp_mb__after_unlock_lock(); | ||
| 1794 | needwake = __note_gp_changes(rsp, rnp, rdp); | 1817 | needwake = __note_gp_changes(rsp, rnp, rdp); |
| 1795 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1818 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1796 | if (needwake) | 1819 | if (needwake) |
| @@ -1805,21 +1828,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) | |||
| 1805 | } | 1828 | } |
| 1806 | 1829 | ||
| 1807 | /* | 1830 | /* |
| 1808 | * Initialize a new grace period. Return 0 if no grace period required. | 1831 | * Initialize a new grace period. Return false if no grace period required. |
| 1809 | */ | 1832 | */ |
| 1810 | static int rcu_gp_init(struct rcu_state *rsp) | 1833 | static bool rcu_gp_init(struct rcu_state *rsp) |
| 1811 | { | 1834 | { |
| 1812 | unsigned long oldmask; | 1835 | unsigned long oldmask; |
| 1813 | struct rcu_data *rdp; | 1836 | struct rcu_data *rdp; |
| 1814 | struct rcu_node *rnp = rcu_get_root(rsp); | 1837 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1815 | 1838 | ||
| 1816 | WRITE_ONCE(rsp->gp_activity, jiffies); | 1839 | WRITE_ONCE(rsp->gp_activity, jiffies); |
| 1817 | raw_spin_lock_irq(&rnp->lock); | 1840 | raw_spin_lock_irq_rcu_node(rnp); |
| 1818 | smp_mb__after_unlock_lock(); | ||
| 1819 | if (!READ_ONCE(rsp->gp_flags)) { | 1841 | if (!READ_ONCE(rsp->gp_flags)) { |
| 1820 | /* Spurious wakeup, tell caller to go back to sleep. */ | 1842 | /* Spurious wakeup, tell caller to go back to sleep. */ |
| 1821 | raw_spin_unlock_irq(&rnp->lock); | 1843 | raw_spin_unlock_irq(&rnp->lock); |
| 1822 | return 0; | 1844 | return false; |
| 1823 | } | 1845 | } |
| 1824 | WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ | 1846 | WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ |
| 1825 | 1847 | ||
| @@ -1829,7 +1851,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1829 | * Not supposed to be able to happen. | 1851 | * Not supposed to be able to happen. |
| 1830 | */ | 1852 | */ |
| 1831 | raw_spin_unlock_irq(&rnp->lock); | 1853 | raw_spin_unlock_irq(&rnp->lock); |
| 1832 | return 0; | 1854 | return false; |
| 1833 | } | 1855 | } |
| 1834 | 1856 | ||
| 1835 | /* Advance to a new grace period and initialize state. */ | 1857 | /* Advance to a new grace period and initialize state. */ |
| @@ -1847,8 +1869,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1847 | */ | 1869 | */ |
| 1848 | rcu_for_each_leaf_node(rsp, rnp) { | 1870 | rcu_for_each_leaf_node(rsp, rnp) { |
| 1849 | rcu_gp_slow(rsp, gp_preinit_delay); | 1871 | rcu_gp_slow(rsp, gp_preinit_delay); |
| 1850 | raw_spin_lock_irq(&rnp->lock); | 1872 | raw_spin_lock_irq_rcu_node(rnp); |
| 1851 | smp_mb__after_unlock_lock(); | ||
| 1852 | if (rnp->qsmaskinit == rnp->qsmaskinitnext && | 1873 | if (rnp->qsmaskinit == rnp->qsmaskinitnext && |
| 1853 | !rnp->wait_blkd_tasks) { | 1874 | !rnp->wait_blkd_tasks) { |
| 1854 | /* Nothing to do on this leaf rcu_node structure. */ | 1875 | /* Nothing to do on this leaf rcu_node structure. */ |
| @@ -1904,8 +1925,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1904 | */ | 1925 | */ |
| 1905 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1926 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 1906 | rcu_gp_slow(rsp, gp_init_delay); | 1927 | rcu_gp_slow(rsp, gp_init_delay); |
| 1907 | raw_spin_lock_irq(&rnp->lock); | 1928 | raw_spin_lock_irq_rcu_node(rnp); |
| 1908 | smp_mb__after_unlock_lock(); | ||
| 1909 | rdp = this_cpu_ptr(rsp->rda); | 1929 | rdp = this_cpu_ptr(rsp->rda); |
| 1910 | rcu_preempt_check_blocked_tasks(rnp); | 1930 | rcu_preempt_check_blocked_tasks(rnp); |
| 1911 | rnp->qsmask = rnp->qsmaskinit; | 1931 | rnp->qsmask = rnp->qsmaskinit; |
| @@ -1923,7 +1943,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1923 | WRITE_ONCE(rsp->gp_activity, jiffies); | 1943 | WRITE_ONCE(rsp->gp_activity, jiffies); |
| 1924 | } | 1944 | } |
| 1925 | 1945 | ||
| 1926 | return 1; | 1946 | return true; |
| 1927 | } | 1947 | } |
| 1928 | 1948 | ||
| 1929 | /* | 1949 | /* |
| @@ -1973,8 +1993,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) | |||
| 1973 | } | 1993 | } |
| 1974 | /* Clear flag to prevent immediate re-entry. */ | 1994 | /* Clear flag to prevent immediate re-entry. */ |
| 1975 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1995 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 1976 | raw_spin_lock_irq(&rnp->lock); | 1996 | raw_spin_lock_irq_rcu_node(rnp); |
| 1977 | smp_mb__after_unlock_lock(); | ||
| 1978 | WRITE_ONCE(rsp->gp_flags, | 1997 | WRITE_ONCE(rsp->gp_flags, |
| 1979 | READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); | 1998 | READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); |
| 1980 | raw_spin_unlock_irq(&rnp->lock); | 1999 | raw_spin_unlock_irq(&rnp->lock); |
| @@ -1993,8 +2012,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1993 | struct rcu_node *rnp = rcu_get_root(rsp); | 2012 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1994 | 2013 | ||
| 1995 | WRITE_ONCE(rsp->gp_activity, jiffies); | 2014 | WRITE_ONCE(rsp->gp_activity, jiffies); |
| 1996 | raw_spin_lock_irq(&rnp->lock); | 2015 | raw_spin_lock_irq_rcu_node(rnp); |
| 1997 | smp_mb__after_unlock_lock(); | ||
| 1998 | gp_duration = jiffies - rsp->gp_start; | 2016 | gp_duration = jiffies - rsp->gp_start; |
| 1999 | if (gp_duration > rsp->gp_max) | 2017 | if (gp_duration > rsp->gp_max) |
| 2000 | rsp->gp_max = gp_duration; | 2018 | rsp->gp_max = gp_duration; |
| @@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 2019 | * grace period is recorded in any of the rcu_node structures. | 2037 | * grace period is recorded in any of the rcu_node structures. |
| 2020 | */ | 2038 | */ |
| 2021 | rcu_for_each_node_breadth_first(rsp, rnp) { | 2039 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 2022 | raw_spin_lock_irq(&rnp->lock); | 2040 | raw_spin_lock_irq_rcu_node(rnp); |
| 2023 | smp_mb__after_unlock_lock(); | ||
| 2024 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | 2041 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
| 2025 | WARN_ON_ONCE(rnp->qsmask); | 2042 | WARN_ON_ONCE(rnp->qsmask); |
| 2026 | WRITE_ONCE(rnp->completed, rsp->gpnum); | 2043 | WRITE_ONCE(rnp->completed, rsp->gpnum); |
| @@ -2035,8 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 2035 | rcu_gp_slow(rsp, gp_cleanup_delay); | 2052 | rcu_gp_slow(rsp, gp_cleanup_delay); |
| 2036 | } | 2053 | } |
| 2037 | rnp = rcu_get_root(rsp); | 2054 | rnp = rcu_get_root(rsp); |
| 2038 | raw_spin_lock_irq(&rnp->lock); | 2055 | raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ |
| 2039 | smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ | ||
| 2040 | rcu_nocb_gp_set(rnp, nocb); | 2056 | rcu_nocb_gp_set(rnp, nocb); |
| 2041 | 2057 | ||
| 2042 | /* Declare grace period done. */ | 2058 | /* Declare grace period done. */ |
| @@ -2284,8 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2284 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2300 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2285 | rnp_c = rnp; | 2301 | rnp_c = rnp; |
| 2286 | rnp = rnp->parent; | 2302 | rnp = rnp->parent; |
| 2287 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2303 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 2288 | smp_mb__after_unlock_lock(); | ||
| 2289 | oldmask = rnp_c->qsmask; | 2304 | oldmask = rnp_c->qsmask; |
| 2290 | } | 2305 | } |
| 2291 | 2306 | ||
| @@ -2332,8 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | |||
| 2332 | gps = rnp->gpnum; | 2347 | gps = rnp->gpnum; |
| 2333 | mask = rnp->grpmask; | 2348 | mask = rnp->grpmask; |
| 2334 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2349 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 2335 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | 2350 | raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ |
| 2336 | smp_mb__after_unlock_lock(); | ||
| 2337 | rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); | 2351 | rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); |
| 2338 | } | 2352 | } |
| 2339 | 2353 | ||
| @@ -2355,8 +2369,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2355 | struct rcu_node *rnp; | 2369 | struct rcu_node *rnp; |
| 2356 | 2370 | ||
| 2357 | rnp = rdp->mynode; | 2371 | rnp = rdp->mynode; |
| 2358 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2372 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 2359 | smp_mb__after_unlock_lock(); | ||
| 2360 | if ((rdp->cpu_no_qs.b.norm && | 2373 | if ((rdp->cpu_no_qs.b.norm && |
| 2361 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || | 2374 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || |
| 2362 | rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || | 2375 | rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || |
| @@ -2582,8 +2595,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2582 | rnp = rnp->parent; | 2595 | rnp = rnp->parent; |
| 2583 | if (!rnp) | 2596 | if (!rnp) |
| 2584 | break; | 2597 | break; |
| 2585 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2598 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
| 2586 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | ||
| 2587 | rnp->qsmaskinit &= ~mask; | 2599 | rnp->qsmaskinit &= ~mask; |
| 2588 | rnp->qsmask &= ~mask; | 2600 | rnp->qsmask &= ~mask; |
| 2589 | if (rnp->qsmaskinit) { | 2601 | if (rnp->qsmaskinit) { |
| @@ -2611,8 +2623,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | |||
| 2611 | 2623 | ||
| 2612 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | 2624 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ |
| 2613 | mask = rdp->grpmask; | 2625 | mask = rdp->grpmask; |
| 2614 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2626 | raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ |
| 2615 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
| 2616 | rnp->qsmaskinitnext &= ~mask; | 2627 | rnp->qsmaskinitnext &= ~mask; |
| 2617 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2628 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2618 | } | 2629 | } |
| @@ -2809,8 +2820,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2809 | rcu_for_each_leaf_node(rsp, rnp) { | 2820 | rcu_for_each_leaf_node(rsp, rnp) { |
| 2810 | cond_resched_rcu_qs(); | 2821 | cond_resched_rcu_qs(); |
| 2811 | mask = 0; | 2822 | mask = 0; |
| 2812 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2823 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 2813 | smp_mb__after_unlock_lock(); | ||
| 2814 | if (rnp->qsmask == 0) { | 2824 | if (rnp->qsmask == 0) { |
| 2815 | if (rcu_state_p == &rcu_sched_state || | 2825 | if (rcu_state_p == &rcu_sched_state || |
| 2816 | rsp != rcu_state_p || | 2826 | rsp != rcu_state_p || |
| @@ -2881,8 +2891,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2881 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ | 2891 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ |
| 2882 | 2892 | ||
| 2883 | /* Reached the root of the rcu_node tree, acquire lock. */ | 2893 | /* Reached the root of the rcu_node tree, acquire lock. */ |
| 2884 | raw_spin_lock_irqsave(&rnp_old->lock, flags); | 2894 | raw_spin_lock_irqsave_rcu_node(rnp_old, flags); |
| 2885 | smp_mb__after_unlock_lock(); | ||
| 2886 | raw_spin_unlock(&rnp_old->fqslock); | 2895 | raw_spin_unlock(&rnp_old->fqslock); |
| 2887 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 2896 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 2888 | rsp->n_force_qs_lh++; | 2897 | rsp->n_force_qs_lh++; |
| @@ -2914,7 +2923,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
| 2914 | /* Does this CPU require a not-yet-started grace period? */ | 2923 | /* Does this CPU require a not-yet-started grace period? */ |
| 2915 | local_irq_save(flags); | 2924 | local_irq_save(flags); |
| 2916 | if (cpu_needs_another_gp(rsp, rdp)) { | 2925 | if (cpu_needs_another_gp(rsp, rdp)) { |
| 2917 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ | 2926 | raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ |
| 2918 | needwake = rcu_start_gp(rsp); | 2927 | needwake = rcu_start_gp(rsp); |
| 2919 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 2928 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
| 2920 | if (needwake) | 2929 | if (needwake) |
| @@ -3005,8 +3014,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 3005 | if (!rcu_gp_in_progress(rsp)) { | 3014 | if (!rcu_gp_in_progress(rsp)) { |
| 3006 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 3015 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
| 3007 | 3016 | ||
| 3008 | raw_spin_lock(&rnp_root->lock); | 3017 | raw_spin_lock_rcu_node(rnp_root); |
| 3009 | smp_mb__after_unlock_lock(); | ||
| 3010 | needwake = rcu_start_gp(rsp); | 3018 | needwake = rcu_start_gp(rsp); |
| 3011 | raw_spin_unlock(&rnp_root->lock); | 3019 | raw_spin_unlock(&rnp_root->lock); |
| 3012 | if (needwake) | 3020 | if (needwake) |
| @@ -3365,7 +3373,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) | |||
| 3365 | { | 3373 | { |
| 3366 | unsigned long s; | 3374 | unsigned long s; |
| 3367 | 3375 | ||
| 3368 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
| 3369 | s = (READ_ONCE(*sp) + 3) & ~0x1; | 3376 | s = (READ_ONCE(*sp) + 3) & ~0x1; |
| 3370 | smp_mb(); /* Above access must not bleed into critical section. */ | 3377 | smp_mb(); /* Above access must not bleed into critical section. */ |
| 3371 | return s; | 3378 | return s; |
| @@ -3392,6 +3399,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | |||
| 3392 | } | 3399 | } |
| 3393 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) | 3400 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) |
| 3394 | { | 3401 | { |
| 3402 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
| 3395 | return rcu_seq_snap(&rsp->expedited_sequence); | 3403 | return rcu_seq_snap(&rsp->expedited_sequence); |
| 3396 | } | 3404 | } |
| 3397 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | 3405 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) |
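
The leading smp_mb() moves from rcu_seq_snap() into its expedited wrapper, leaving the generic snapshot helper with only the counter arithmetic: the low bit of the sequence means "grace period in progress", (s + 3) & ~0x1 rounds past any period already underway, and completion is a wrap-safe greater-or-equal test. A single-threaded sketch of the counter scheme only, deliberately omitting all memory ordering; names are illustrative:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))   /* wrap-safe a >= b */

static unsigned long seq;       /* odd value: grace period in progress */

static void gp_start(void) { seq++; }   /* like rcu_exp_gp_seq_start() */
static void gp_end(void)   { seq++; }   /* like rcu_exp_gp_seq_end() */

/*
 * Snapshot the sequence value that, once reached, proves a full grace
 * period has elapsed since the call: skip past any period already in
 * progress, then clear the in-progress bit.
 */
static unsigned long gp_snap(void)
{
        return (seq + 3) & ~0x1UL;
}

static bool gp_done(unsigned long s)
{
        return CMP_GE(seq, s);
}

int main(void)
{
        unsigned long s = gp_snap();    /* seq==0: need seq to reach 2 */

        printf("done? %d\n", gp_done(s));       /* 0: nothing ran yet */
        gp_start();
        printf("done? %d\n", gp_done(s));       /* 0: GP in progress */
        gp_end();
        printf("done? %d\n", gp_done(s));       /* 1: full GP elapsed */
        return 0;
}
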
| @@ -3426,8 +3434,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | |||
| 3426 | * CPUs for the current rcu_node structure up the rcu_node tree. | 3434 | * CPUs for the current rcu_node structure up the rcu_node tree. |
| 3427 | */ | 3435 | */ |
| 3428 | rcu_for_each_leaf_node(rsp, rnp) { | 3436 | rcu_for_each_leaf_node(rsp, rnp) { |
| 3429 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3437 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 3430 | smp_mb__after_unlock_lock(); | ||
| 3431 | if (rnp->expmaskinit == rnp->expmaskinitnext) { | 3438 | if (rnp->expmaskinit == rnp->expmaskinitnext) { |
| 3432 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3439 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3433 | continue; /* No new CPUs, nothing to do. */ | 3440 | continue; /* No new CPUs, nothing to do. */ |
| @@ -3447,8 +3454,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | |||
| 3447 | rnp_up = rnp->parent; | 3454 | rnp_up = rnp->parent; |
| 3448 | done = false; | 3455 | done = false; |
| 3449 | while (rnp_up) { | 3456 | while (rnp_up) { |
| 3450 | raw_spin_lock_irqsave(&rnp_up->lock, flags); | 3457 | raw_spin_lock_irqsave_rcu_node(rnp_up, flags); |
| 3451 | smp_mb__after_unlock_lock(); | ||
| 3452 | if (rnp_up->expmaskinit) | 3458 | if (rnp_up->expmaskinit) |
| 3453 | done = true; | 3459 | done = true; |
| 3454 | rnp_up->expmaskinit |= mask; | 3460 | rnp_up->expmaskinit |= mask; |
| @@ -3472,8 +3478,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | |||
| 3472 | 3478 | ||
| 3473 | sync_exp_reset_tree_hotplug(rsp); | 3479 | sync_exp_reset_tree_hotplug(rsp); |
| 3474 | rcu_for_each_node_breadth_first(rsp, rnp) { | 3480 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 3475 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3481 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 3476 | smp_mb__after_unlock_lock(); | ||
| 3477 | WARN_ON_ONCE(rnp->expmask); | 3482 | WARN_ON_ONCE(rnp->expmask); |
| 3478 | rnp->expmask = rnp->expmaskinit; | 3483 | rnp->expmask = rnp->expmaskinit; |
| 3479 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3484 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| @@ -3531,8 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 3531 | mask = rnp->grpmask; | 3536 | mask = rnp->grpmask; |
| 3532 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 3537 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
| 3533 | rnp = rnp->parent; | 3538 | rnp = rnp->parent; |
| 3534 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 3539 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ |
| 3535 | smp_mb__after_unlock_lock(); | ||
| 3536 | WARN_ON_ONCE(!(rnp->expmask & mask)); | 3540 | WARN_ON_ONCE(!(rnp->expmask & mask)); |
| 3537 | rnp->expmask &= ~mask; | 3541 | rnp->expmask &= ~mask; |
| 3538 | } | 3542 | } |
| @@ -3549,8 +3553,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, | |||
| 3549 | { | 3553 | { |
| 3550 | unsigned long flags; | 3554 | unsigned long flags; |
| 3551 | 3555 | ||
| 3552 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3556 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 3553 | smp_mb__after_unlock_lock(); | ||
| 3554 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); | 3557 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); |
| 3555 | } | 3558 | } |
| 3556 | 3559 | ||
| @@ -3564,8 +3567,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 3564 | { | 3567 | { |
| 3565 | unsigned long flags; | 3568 | unsigned long flags; |
| 3566 | 3569 | ||
| 3567 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3570 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 3568 | smp_mb__after_unlock_lock(); | ||
| 3569 | if (!(rnp->expmask & mask)) { | 3571 | if (!(rnp->expmask & mask)) { |
| 3570 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3572 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3571 | return; | 3573 | return; |
| @@ -3609,7 +3611,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 3609 | */ | 3611 | */ |
| 3610 | static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | 3612 | static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) |
| 3611 | { | 3613 | { |
| 3612 | struct rcu_data *rdp; | 3614 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); |
| 3613 | struct rcu_node *rnp0; | 3615 | struct rcu_node *rnp0; |
| 3614 | struct rcu_node *rnp1 = NULL; | 3616 | struct rcu_node *rnp1 = NULL; |
| 3615 | 3617 | ||
| @@ -3623,7 +3625,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
| 3623 | if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { | 3625 | if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { |
| 3624 | if (mutex_trylock(&rnp0->exp_funnel_mutex)) { | 3626 | if (mutex_trylock(&rnp0->exp_funnel_mutex)) { |
| 3625 | if (sync_exp_work_done(rsp, rnp0, NULL, | 3627 | if (sync_exp_work_done(rsp, rnp0, NULL, |
| 3626 | &rsp->expedited_workdone0, s)) | 3628 | &rdp->expedited_workdone0, s)) |
| 3627 | return NULL; | 3629 | return NULL; |
| 3628 | return rnp0; | 3630 | return rnp0; |
| 3629 | } | 3631 | } |
| @@ -3637,14 +3639,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
| 3637 | * can be inexact, as it is just promoting locality and is not | 3639 | * can be inexact, as it is just promoting locality and is not |
| 3638 | * strictly needed for correctness. | 3640 | * strictly needed for correctness. |
| 3639 | */ | 3641 | */ |
| 3640 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | 3642 | if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) |
| 3641 | if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) | ||
| 3642 | return NULL; | 3643 | return NULL; |
| 3643 | mutex_lock(&rdp->exp_funnel_mutex); | 3644 | mutex_lock(&rdp->exp_funnel_mutex); |
| 3644 | rnp0 = rdp->mynode; | 3645 | rnp0 = rdp->mynode; |
| 3645 | for (; rnp0 != NULL; rnp0 = rnp0->parent) { | 3646 | for (; rnp0 != NULL; rnp0 = rnp0->parent) { |
| 3646 | if (sync_exp_work_done(rsp, rnp1, rdp, | 3647 | if (sync_exp_work_done(rsp, rnp1, rdp, |
| 3647 | &rsp->expedited_workdone2, s)) | 3648 | &rdp->expedited_workdone2, s)) |
| 3648 | return NULL; | 3649 | return NULL; |
| 3649 | mutex_lock(&rnp0->exp_funnel_mutex); | 3650 | mutex_lock(&rnp0->exp_funnel_mutex); |
| 3650 | if (rnp1) | 3651 | if (rnp1) |
| @@ -3654,7 +3655,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
| 3654 | rnp1 = rnp0; | 3655 | rnp1 = rnp0; |
| 3655 | } | 3656 | } |
| 3656 | if (sync_exp_work_done(rsp, rnp1, rdp, | 3657 | if (sync_exp_work_done(rsp, rnp1, rdp, |
| 3657 | &rsp->expedited_workdone3, s)) | 3658 | &rdp->expedited_workdone3, s)) |
| 3658 | return NULL; | 3659 | return NULL; |
| 3659 | return rnp1; | 3660 | return rnp1; |
| 3660 | } | 3661 | } |
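
These exp_funnel_lock() hunks redirect the expedited_workdone* statistics from the global rcu_state structure to the per-CPU rcu_data structure (the matching field moves appear in the tree.h hunks below), so each CPU increments a counter in its own cache line instead of contending on shared atomics. A single-threaded sketch of the layout change; struct and function names are illustrative:

#include <stdio.h>

#define NCPUS 4

struct cpu_data {
        unsigned long workdone;         /* like rdp->expedited_workdone1 */
};

static struct cpu_data per_cpu[NCPUS];

/* Writers touch only their own slot: no cross-CPU cache-line bouncing. */
static void note_work_done(int cpu)
{
        per_cpu[cpu].workdone++;
}

/* The rare statistics reader sums the per-CPU slots. */
static unsigned long total_work_done(void)
{
        unsigned long sum = 0;

        for (int cpu = 0; cpu < NCPUS; cpu++)
                sum += per_cpu[cpu].workdone;
        return sum;
}

int main(void)
{
        note_work_done(0);
        note_work_done(2);
        note_work_done(2);
        printf("total: %lu\n", total_work_done());      /* 3 */
        return 0;
}
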
| @@ -3708,8 +3709,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
| 3708 | 3709 | ||
| 3709 | sync_exp_reset_tree(rsp); | 3710 | sync_exp_reset_tree(rsp); |
| 3710 | rcu_for_each_leaf_node(rsp, rnp) { | 3711 | rcu_for_each_leaf_node(rsp, rnp) { |
| 3711 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3712 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 3712 | smp_mb__after_unlock_lock(); | ||
| 3713 | 3713 | ||
| 3714 | /* Each pass checks a CPU for identity, offline, and idle. */ | 3714 | /* Each pass checks a CPU for identity, offline, and idle. */ |
| 3715 | mask_ofl_test = 0; | 3715 | mask_ofl_test = 0; |
| @@ -3741,24 +3741,22 @@ retry_ipi: | |||
| 3741 | ret = smp_call_function_single(cpu, func, rsp, 0); | 3741 | ret = smp_call_function_single(cpu, func, rsp, 0); |
| 3742 | if (!ret) { | 3742 | if (!ret) { |
| 3743 | mask_ofl_ipi &= ~mask; | 3743 | mask_ofl_ipi &= ~mask; |
| 3744 | } else { | 3744 | continue; |
| 3745 | /* Failed, raced with offline. */ | 3745 | } |
| 3746 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3746 | /* Failed, raced with offline. */ |
| 3747 | if (cpu_online(cpu) && | 3747 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 3748 | (rnp->expmask & mask)) { | 3748 | if (cpu_online(cpu) && |
| 3749 | raw_spin_unlock_irqrestore(&rnp->lock, | 3749 | (rnp->expmask & mask)) { |
| 3750 | flags); | ||
| 3751 | schedule_timeout_uninterruptible(1); | ||
| 3752 | if (cpu_online(cpu) && | ||
| 3753 | (rnp->expmask & mask)) | ||
| 3754 | goto retry_ipi; | ||
| 3755 | raw_spin_lock_irqsave(&rnp->lock, | ||
| 3756 | flags); | ||
| 3757 | } | ||
| 3758 | if (!(rnp->expmask & mask)) | ||
| 3759 | mask_ofl_ipi &= ~mask; | ||
| 3760 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3750 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3751 | schedule_timeout_uninterruptible(1); | ||
| 3752 | if (cpu_online(cpu) && | ||
| 3753 | (rnp->expmask & mask)) | ||
| 3754 | goto retry_ipi; | ||
| 3755 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3761 | } | 3756 | } |
| 3757 | if (!(rnp->expmask & mask)) | ||
| 3758 | mask_ofl_ipi &= ~mask; | ||
| 3759 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 3762 | } | 3760 | } |
| 3763 | /* Report quiescent states for those that went offline. */ | 3761 | /* Report quiescent states for those that went offline. */ |
| 3764 | mask_ofl_test |= mask_ofl_ipi; | 3762 | mask_ofl_test |= mask_ofl_ipi; |
| @@ -3773,6 +3771,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 3773 | unsigned long jiffies_stall; | 3771 | unsigned long jiffies_stall; |
| 3774 | unsigned long jiffies_start; | 3772 | unsigned long jiffies_start; |
| 3775 | unsigned long mask; | 3773 | unsigned long mask; |
| 3774 | int ndetected; | ||
| 3776 | struct rcu_node *rnp; | 3775 | struct rcu_node *rnp; |
| 3777 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 3776 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
| 3778 | int ret; | 3777 | int ret; |
| @@ -3785,7 +3784,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 3785 | rsp->expedited_wq, | 3784 | rsp->expedited_wq, |
| 3786 | sync_rcu_preempt_exp_done(rnp_root), | 3785 | sync_rcu_preempt_exp_done(rnp_root), |
| 3787 | jiffies_stall); | 3786 | jiffies_stall); |
| 3788 | if (ret > 0) | 3787 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) |
| 3789 | return; | 3788 | return; |
| 3790 | if (ret < 0) { | 3789 | if (ret < 0) { |
| 3791 | /* Hit a signal, disable CPU stall warnings. */ | 3790 | /* Hit a signal, disable CPU stall warnings. */ |
| @@ -3795,14 +3794,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 3795 | } | 3794 | } |
| 3796 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | 3795 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", |
| 3797 | rsp->name); | 3796 | rsp->name); |
| 3797 | ndetected = 0; | ||
| 3798 | rcu_for_each_leaf_node(rsp, rnp) { | 3798 | rcu_for_each_leaf_node(rsp, rnp) { |
| 3799 | (void)rcu_print_task_exp_stall(rnp); | 3799 | ndetected += rcu_print_task_exp_stall(rnp); |
| 3800 | mask = 1; | 3800 | mask = 1; |
| 3801 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | 3801 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { |
| 3802 | struct rcu_data *rdp; | 3802 | struct rcu_data *rdp; |
| 3803 | 3803 | ||
| 3804 | if (!(rnp->expmask & mask)) | 3804 | if (!(rnp->expmask & mask)) |
| 3805 | continue; | 3805 | continue; |
| 3806 | ndetected++; | ||
| 3806 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3807 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 3807 | pr_cont(" %d-%c%c%c", cpu, | 3808 | pr_cont(" %d-%c%c%c", cpu, |
| 3808 | "O."[cpu_online(cpu)], | 3809 | "O."[cpu_online(cpu)], |
| @@ -3811,8 +3812,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 3811 | } | 3812 | } |
| 3812 | mask <<= 1; | 3813 | mask <<= 1; |
| 3813 | } | 3814 | } |
| 3814 | pr_cont(" } %lu jiffies s: %lu\n", | 3815 | pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", |
| 3815 | jiffies - jiffies_start, rsp->expedited_sequence); | 3816 | jiffies - jiffies_start, rsp->expedited_sequence, |
| 3817 | rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); | ||
| 3818 | if (!ndetected) { | ||
| 3819 | pr_err("blocking rcu_node structures:"); | ||
| 3820 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 3821 | if (rnp == rnp_root) | ||
| 3822 | continue; /* printed unconditionally */ | ||
| 3823 | if (sync_rcu_preempt_exp_done(rnp)) | ||
| 3824 | continue; | ||
| 3825 | pr_cont(" l=%u:%d-%d:%#lx/%c", | ||
| 3826 | rnp->level, rnp->grplo, rnp->grphi, | ||
| 3827 | rnp->expmask, | ||
| 3828 | ".T"[!!rnp->exp_tasks]); | ||
| 3829 | } | ||
| 3830 | pr_cont("\n"); | ||
| 3831 | } | ||
| 3816 | rcu_for_each_leaf_node(rsp, rnp) { | 3832 | rcu_for_each_leaf_node(rsp, rnp) { |
| 3817 | mask = 1; | 3833 | mask = 1; |
| 3818 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | 3834 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { |
| @@ -3847,6 +3863,16 @@ void synchronize_sched_expedited(void) | |||
| 3847 | struct rcu_node *rnp; | 3863 | struct rcu_node *rnp; |
| 3848 | struct rcu_state *rsp = &rcu_sched_state; | 3864 | struct rcu_state *rsp = &rcu_sched_state; |
| 3849 | 3865 | ||
| 3866 | /* If only one CPU, this is automatically a grace period. */ | ||
| 3867 | if (rcu_blocking_is_gp()) | ||
| 3868 | return; | ||
| 3869 | |||
| 3870 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 3871 | if (rcu_gp_is_normal()) { | ||
| 3872 | wait_rcu_gp(call_rcu_sched); | ||
| 3873 | return; | ||
| 3874 | } | ||
| 3875 | |||
| 3850 | /* Take a snapshot of the sequence number. */ | 3876 | /* Take a snapshot of the sequence number. */ |
| 3851 | s = rcu_exp_gp_seq_snap(rsp); | 3877 | s = rcu_exp_gp_seq_snap(rsp); |
| 3852 | 3878 | ||
| @@ -4135,7 +4161,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | |||
| 4135 | rnp = rnp->parent; | 4161 | rnp = rnp->parent; |
| 4136 | if (rnp == NULL) | 4162 | if (rnp == NULL) |
| 4137 | return; | 4163 | return; |
| 4138 | raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ | 4164 | raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ |
| 4139 | rnp->qsmaskinit |= mask; | 4165 | rnp->qsmaskinit |= mask; |
| 4140 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | 4166 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ |
| 4141 | } | 4167 | } |
| @@ -4152,7 +4178,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 4152 | struct rcu_node *rnp = rcu_get_root(rsp); | 4178 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 4153 | 4179 | ||
| 4154 | /* Set up local state, ensuring consistent view of global state. */ | 4180 | /* Set up local state, ensuring consistent view of global state. */ |
| 4155 | raw_spin_lock_irqsave(&rnp->lock, flags); | 4181 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 4156 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 4182 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
| 4157 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 4183 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 4158 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 4184 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| @@ -4179,7 +4205,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 4179 | struct rcu_node *rnp = rcu_get_root(rsp); | 4205 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 4180 | 4206 | ||
| 4181 | /* Set up local state, ensuring consistent view of global state. */ | 4207 | /* Set up local state, ensuring consistent view of global state. */ |
| 4182 | raw_spin_lock_irqsave(&rnp->lock, flags); | 4208 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 4183 | rdp->qlen_last_fqs_check = 0; | 4209 | rdp->qlen_last_fqs_check = 0; |
| 4184 | rdp->n_force_qs_snap = rsp->n_force_qs; | 4210 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 4185 | rdp->blimit = blimit; | 4211 | rdp->blimit = blimit; |
| @@ -4198,8 +4224,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 4198 | */ | 4224 | */ |
| 4199 | rnp = rdp->mynode; | 4225 | rnp = rdp->mynode; |
| 4200 | mask = rdp->grpmask; | 4226 | mask = rdp->grpmask; |
| 4201 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 4227 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
| 4202 | smp_mb__after_unlock_lock(); | ||
| 4203 | rnp->qsmaskinitnext |= mask; | 4228 | rnp->qsmaskinitnext |= mask; |
| 4204 | rnp->expmaskinitnext |= mask; | 4229 | rnp->expmaskinitnext |= mask; |
| 4205 | if (!rdp->beenonline) | 4230 | if (!rdp->beenonline) |
| @@ -4327,14 +4352,14 @@ static int __init rcu_spawn_gp_kthread(void) | |||
| 4327 | t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); | 4352 | t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); |
| 4328 | BUG_ON(IS_ERR(t)); | 4353 | BUG_ON(IS_ERR(t)); |
| 4329 | rnp = rcu_get_root(rsp); | 4354 | rnp = rcu_get_root(rsp); |
| 4330 | raw_spin_lock_irqsave(&rnp->lock, flags); | 4355 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 4331 | rsp->gp_kthread = t; | 4356 | rsp->gp_kthread = t; |
| 4332 | if (kthread_prio) { | 4357 | if (kthread_prio) { |
| 4333 | sp.sched_priority = kthread_prio; | 4358 | sp.sched_priority = kthread_prio; |
| 4334 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | 4359 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
| 4335 | } | 4360 | } |
| 4336 | wake_up_process(t); | ||
| 4337 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 4361 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 4362 | wake_up_process(t); | ||
| 4338 | } | 4363 | } |
| 4339 | rcu_spawn_nocb_kthreads(); | 4364 | rcu_spawn_nocb_kthreads(); |
| 4340 | rcu_spawn_boost_kthreads(); | 4365 | rcu_spawn_boost_kthreads(); |
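
In rcu_spawn_gp_kthread(), wake_up_process() moves outside the rnp->lock critical section: the kthread pointer must be published under the lock, but the wakeup itself need not be, and the freshly woken grace-period kthread may promptly want that very lock. A userspace sketch of the discipline using a pthread mutex (compile with -pthread; names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;
static void *gp_kthread;                /* like rsp->gp_kthread */

static void wake_up(void *t)
{
        printf("waking thread %p\n", t);
}

/*
 * Publish the thread pointer under the lock, but issue the wakeup only
 * after dropping it: the woken thread may immediately contend for this
 * same lock, so waking inside the critical section only lengthens it.
 */
static void publish_and_wake(void *t)
{
        pthread_mutex_lock(&root_lock);
        gp_kthread = t;                 /* readers expect this under lock */
        pthread_mutex_unlock(&root_lock);
        wake_up(t);                     /* moved past the unlock, as above */
}

int main(void)
{
        int dummy;

        publish_and_wake(&dummy);
        return 0;
}
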
| @@ -4385,12 +4410,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) | |||
| 4385 | /* | 4410 | /* |
| 4386 | * Helper function for rcu_init() that initializes one rcu_state structure. | 4411 | * Helper function for rcu_init() that initializes one rcu_state structure. |
| 4387 | */ | 4412 | */ |
| 4388 | static void __init rcu_init_one(struct rcu_state *rsp, | 4413 | static void __init rcu_init_one(struct rcu_state *rsp) |
| 4389 | struct rcu_data __percpu *rda) | ||
| 4390 | { | 4414 | { |
| 4391 | static const char * const buf[] = RCU_NODE_NAME_INIT; | 4415 | static const char * const buf[] = RCU_NODE_NAME_INIT; |
| 4392 | static const char * const fqs[] = RCU_FQS_NAME_INIT; | 4416 | static const char * const fqs[] = RCU_FQS_NAME_INIT; |
| 4393 | static const char * const exp[] = RCU_EXP_NAME_INIT; | 4417 | static const char * const exp[] = RCU_EXP_NAME_INIT; |
| 4418 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | ||
| 4419 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | ||
| 4420 | static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; | ||
| 4394 | static u8 fl_mask = 0x1; | 4421 | static u8 fl_mask = 0x1; |
| 4395 | 4422 | ||
| 4396 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ | 4423 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ |
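
rcu_init_one() drops its rcu_data parameter, which is reachable through rsp->rda anyway, and the lock_class_key arrays become function-local statics since this function is their only user. The second change is plain scope narrowing: static storage duration is kept, file-scope visibility is not. A tiny sketch of a function-local static, unrelated to lockdep itself:

#include <stdio.h>

/*
 * A static object used by exactly one function can be declared inside
 * it: its lifetime is unchanged, but its name no longer leaks into
 * file scope.
 */
static void init_one(const char *name)
{
        static int ninit;               /* persists across calls */

        ninit++;
        printf("initialized %s (instance #%d)\n", name, ninit);
}

int main(void)
{
        init_one("rcu_bh");
        init_one("rcu_sched");          /* ninit carried over: prints #2 */
        return 0;
}
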
| @@ -4576,8 +4603,8 @@ void __init rcu_init(void) | |||
| 4576 | 4603 | ||
| 4577 | rcu_bootup_announce(); | 4604 | rcu_bootup_announce(); |
| 4578 | rcu_init_geometry(); | 4605 | rcu_init_geometry(); |
| 4579 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 4606 | rcu_init_one(&rcu_bh_state); |
| 4580 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 4607 | rcu_init_one(&rcu_sched_state); |
| 4581 | if (dump_tree) | 4608 | if (dump_tree) |
| 4582 | rcu_dump_rcu_node_tree(&rcu_sched_state); | 4609 | rcu_dump_rcu_node_tree(&rcu_sched_state); |
| 4583 | __rcu_init_preempt(); | 4610 | __rcu_init_preempt(); |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d4dc..83360b4f4352 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -178,6 +178,8 @@ struct rcu_node { | |||
| 178 | /* beginning of each expedited GP. */ | 178 | /* beginning of each expedited GP. */ |
| 179 | unsigned long expmaskinitnext; | 179 | unsigned long expmaskinitnext; |
| 180 | /* Online CPUs for next expedited GP. */ | 180 | /* Online CPUs for next expedited GP. */ |
| 181 | /* Any CPU that has ever been online will */ | ||
| 182 | /* have its bit set. */ | ||
| 181 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 183 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
| 182 | /* Only one bit will be set in this mask. */ | 184 | /* Only one bit will be set in this mask. */ |
| 183 | int grplo; /* lowest-numbered CPU or group here. */ | 185 | int grplo; /* lowest-numbered CPU or group here. */ |
| @@ -384,6 +386,10 @@ struct rcu_data { | |||
| 384 | struct rcu_head oom_head; | 386 | struct rcu_head oom_head; |
| 385 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 387 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 386 | struct mutex exp_funnel_mutex; | 388 | struct mutex exp_funnel_mutex; |
| 389 | atomic_long_t expedited_workdone0; /* # done by others #0. */ | ||
| 390 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | ||
| 391 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | ||
| 392 | atomic_long_t expedited_workdone3; /* # done by others #3. */ | ||
| 387 | 393 | ||
| 388 | /* 7) Callback offloading. */ | 394 | /* 7) Callback offloading. */ |
| 389 | #ifdef CONFIG_RCU_NOCB_CPU | 395 | #ifdef CONFIG_RCU_NOCB_CPU |
| @@ -498,10 +504,6 @@ struct rcu_state { | |||
| 498 | /* End of fields guarded by barrier_mutex. */ | 504 | /* End of fields guarded by barrier_mutex. */ |
| 499 | 505 | ||
| 500 | unsigned long expedited_sequence; /* Take a ticket. */ | 506 | unsigned long expedited_sequence; /* Take a ticket. */ |
| 501 | atomic_long_t expedited_workdone0; /* # done by others #0. */ | ||
| 502 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | ||
| 503 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | ||
| 504 | atomic_long_t expedited_workdone3; /* # done by others #3. */ | ||
| 505 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | 507 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ |
| 506 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ | 508 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ |
| 507 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ | 509 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ |
| @@ -545,6 +547,18 @@ struct rcu_state { | |||
| 545 | #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ | 547 | #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ |
| 546 | #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ | 548 | #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ |
| 547 | 549 | ||
| 550 | #ifndef RCU_TREE_NONCORE | ||
| 551 | static const char * const gp_state_names[] = { | ||
| 552 | "RCU_GP_IDLE", | ||
| 553 | "RCU_GP_WAIT_GPS", | ||
| 554 | "RCU_GP_DONE_GPS", | ||
| 555 | "RCU_GP_WAIT_FQS", | ||
| 556 | "RCU_GP_DOING_FQS", | ||
| 557 | "RCU_GP_CLEANUP", | ||
| 558 | "RCU_GP_CLEANED", | ||
| 559 | }; | ||
| 560 | #endif /* #ifndef RCU_TREE_NONCORE */ | ||
| 561 | |||
| 548 | extern struct list_head rcu_struct_flavors; | 562 | extern struct list_head rcu_struct_flavors; |
| 549 | 563 | ||
| 550 | /* Sequence through rcu_state structures for each RCU flavor. */ | 564 | /* Sequence through rcu_state structures for each RCU flavor. */ |
| @@ -664,3 +678,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | |||
| 664 | #else /* #ifdef CONFIG_PPC */ | 678 | #else /* #ifdef CONFIG_PPC */ |
| 665 | #define smp_mb__after_unlock_lock() do { } while (0) | 679 | #define smp_mb__after_unlock_lock() do { } while (0) |
| 666 | #endif /* #else #ifdef CONFIG_PPC */ | 680 | #endif /* #else #ifdef CONFIG_PPC */ |
| 681 | |||
| 682 | /* | ||
| 683 | * Wrappers for the rcu_node::lock acquire. | ||
| 684 | * | ||
| 685 | * Because the rcu_nodes form a tree, the tree traversal locking will observe | ||
| 686 | * different lock values; this in turn means that an UNLOCK of one level | ||
| 687 | * followed by a LOCK of another level does not imply a full memory barrier, | ||
| 688 | * and, most importantly, transitivity is lost. | ||
| 689 | * | ||
| 690 | * In order to restore full ordering between tree levels, augment the regular | ||
| 691 | * lock acquire functions with smp_mb__after_unlock_lock(). | ||
| 692 | */ | ||
| 693 | static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) | ||
| 694 | { | ||
| 695 | raw_spin_lock(&rnp->lock); | ||
| 696 | smp_mb__after_unlock_lock(); | ||
| 697 | } | ||
| 698 | |||
| 699 | static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) | ||
| 700 | { | ||
| 701 | raw_spin_lock_irq(&rnp->lock); | ||
| 702 | smp_mb__after_unlock_lock(); | ||
| 703 | } | ||
| 704 | |||
| 705 | #define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ | ||
| 706 | do { \ | ||
| 707 | typecheck(unsigned long, flags); \ | ||
| 708 | raw_spin_lock_irqsave(&(rnp)->lock, flags); \ | ||
| 709 | smp_mb__after_unlock_lock(); \ | ||
| 710 | } while (0) | ||
| 711 | |||
| 712 | static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) | ||
| 713 | { | ||
| 714 | bool locked = raw_spin_trylock(&rnp->lock); | ||
| 715 | |||
| 716 | if (locked) | ||
| 717 | smp_mb__after_unlock_lock(); | ||
| 718 | return locked; | ||
| 719 | } | ||
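
These wrappers are the heart of the patch: every open-coded raw_spin_lock(&rnp->lock) + smp_mb__after_unlock_lock() pair in tree.c and tree_plugin.h collapses into a single *_rcu_node() call, so no future call site can acquire a node lock and forget the barrier (which, per the #ifdef just above, is a no-op except on PowerPC). A userspace sketch of the encapsulation, with __sync_synchronize() standing in for the real barrier and all names illustrative (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct tnode {
        pthread_mutex_t lock;
};

/* Stand-in for smp_mb__after_unlock_lock(); GCC/Clang full barrier. */
static void mb_after_unlock_lock(void)
{
        __sync_synchronize();
}

/*
 * Every acquisition of a node's lock goes through this wrapper, so the
 * ordering barrier can never be forgotten when a traversal unlocks one
 * tree level and locks the next.
 */
static void tnode_lock(struct tnode *np)
{
        pthread_mutex_lock(&np->lock);
        mb_after_unlock_lock();
}

static void tnode_unlock(struct tnode *np)
{
        pthread_mutex_unlock(&np->lock);
}

int main(void)
{
        struct tnode n = { .lock = PTHREAD_MUTEX_INITIALIZER };

        tnode_lock(&n);                 /* lock + ordering in one call */
        tnode_unlock(&n);
        return 0;
}
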
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..9467a8b7e756 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */ | |||
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * Check the RCU kernel configuration parameters and print informative | 65 | * Check the RCU kernel configuration parameters and print informative |
| 66 | * messages about anything out of the ordinary. If you like #ifdef, you | 66 | * messages about anything out of the ordinary. |
| 67 | * will love this function. | ||
| 68 | */ | 67 | */ |
| 69 | static void __init rcu_bootup_announce_oddness(void) | 68 | static void __init rcu_bootup_announce_oddness(void) |
| 70 | { | 69 | { |
| @@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void) | |||
| 147 | * the corresponding expedited grace period will also be the end of the | 146 | * the corresponding expedited grace period will also be the end of the |
| 148 | * normal grace period. | 147 | * normal grace period. |
| 149 | */ | 148 | */ |
| 150 | static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, | 149 | static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) |
| 151 | unsigned long flags) __releases(rnp->lock) | 150 | __releases(rnp->lock) /* But leaves rrupts disabled. */ |
| 152 | { | 151 | { |
| 153 | int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + | 152 | int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + |
| 154 | (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + | 153 | (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + |
| @@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 236 | rnp->gp_tasks = &t->rcu_node_entry; | 235 | rnp->gp_tasks = &t->rcu_node_entry; |
| 237 | if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) | 236 | if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) |
| 238 | rnp->exp_tasks = &t->rcu_node_entry; | 237 | rnp->exp_tasks = &t->rcu_node_entry; |
| 239 | raw_spin_unlock(&rnp->lock); | 238 | raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ |
| 240 | 239 | ||
| 241 | /* | 240 | /* |
| 242 | * Report the quiescent state for the expedited GP. This expedited | 241 | * Report the quiescent state for the expedited GP. This expedited |
| @@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 251 | } else { | 250 | } else { |
| 252 | WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); | 251 | WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); |
| 253 | } | 252 | } |
| 254 | local_irq_restore(flags); | ||
| 255 | } | 253 | } |
| 256 | 254 | ||
| 257 | /* | 255 | /* |
| @@ -286,12 +284,11 @@ static void rcu_preempt_qs(void) | |||
| 286 | * predating the current grace period drain, in other words, until | 284 | * predating the current grace period drain, in other words, until |
| 287 | * rnp->gp_tasks becomes NULL. | 285 | * rnp->gp_tasks becomes NULL. |
| 288 | * | 286 | * |
| 289 | * Caller must disable preemption. | 287 | * Caller must disable interrupts. |
| 290 | */ | 288 | */ |
| 291 | static void rcu_preempt_note_context_switch(void) | 289 | static void rcu_preempt_note_context_switch(void) |
| 292 | { | 290 | { |
| 293 | struct task_struct *t = current; | 291 | struct task_struct *t = current; |
| 294 | unsigned long flags; | ||
| 295 | struct rcu_data *rdp; | 292 | struct rcu_data *rdp; |
| 296 | struct rcu_node *rnp; | 293 | struct rcu_node *rnp; |
| 297 | 294 | ||
| @@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void) | |||
| 301 | /* Possibly blocking in an RCU read-side critical section. */ | 298 | /* Possibly blocking in an RCU read-side critical section. */ |
| 302 | rdp = this_cpu_ptr(rcu_state_p->rda); | 299 | rdp = this_cpu_ptr(rcu_state_p->rda); |
| 303 | rnp = rdp->mynode; | 300 | rnp = rdp->mynode; |
| 304 | raw_spin_lock_irqsave(&rnp->lock, flags); | 301 | raw_spin_lock_rcu_node(rnp); |
| 305 | smp_mb__after_unlock_lock(); | ||
| 306 | t->rcu_read_unlock_special.b.blocked = true; | 302 | t->rcu_read_unlock_special.b.blocked = true; |
| 307 | t->rcu_blocked_node = rnp; | 303 | t->rcu_blocked_node = rnp; |
| 308 | 304 | ||
| @@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) | |||
| 318 | (rnp->qsmask & rdp->grpmask) | 314 | (rnp->qsmask & rdp->grpmask) |
| 319 | ? rnp->gpnum | 315 | ? rnp->gpnum |
| 320 | : rnp->gpnum + 1); | 316 | : rnp->gpnum + 1); |
| 321 | rcu_preempt_ctxt_queue(rnp, rdp, flags); | 317 | rcu_preempt_ctxt_queue(rnp, rdp); |
| 322 | } else if (t->rcu_read_lock_nesting < 0 && | 318 | } else if (t->rcu_read_lock_nesting < 0 && |
| 323 | t->rcu_read_unlock_special.s) { | 319 | t->rcu_read_unlock_special.s) { |
| 324 | 320 | ||
| @@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 450 | 446 | ||
| 451 | /* | 447 | /* |
| 452 | * Remove this task from the list it blocked on. The task | 448 | * Remove this task from the list it blocked on. The task |
| 453 | * now remains queued on the rcu_node corresponding to | 449 | * now remains queued on the rcu_node corresponding to the |
| 454 | * the CPU it first blocked on, so the first attempt to | 450 | * CPU it first blocked on, so there is no longer any need |
| 455 | * acquire the task's rcu_node's ->lock will succeed. | 451 | * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. |
| 456 | * Keep the loop and add a WARN_ON() out of sheer paranoia. | ||
| 457 | */ | 452 | */ |
| 458 | for (;;) { | 453 | rnp = t->rcu_blocked_node; |
| 459 | rnp = t->rcu_blocked_node; | 454 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
| 460 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 455 | WARN_ON_ONCE(rnp != t->rcu_blocked_node); |
| 461 | smp_mb__after_unlock_lock(); | ||
| 462 | if (rnp == t->rcu_blocked_node) | ||
| 463 | break; | ||
| 464 | WARN_ON_ONCE(1); | ||
| 465 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 466 | } | ||
| 467 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 456 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); |
| 468 | empty_exp = sync_rcu_preempt_exp_done(rnp); | 457 | empty_exp = sync_rcu_preempt_exp_done(rnp); |
| 469 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 458 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
| @@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | |||
| 527 | unsigned long flags; | 516 | unsigned long flags; |
| 528 | struct task_struct *t; | 517 | struct task_struct *t; |
| 529 | 518 | ||
| 530 | raw_spin_lock_irqsave(&rnp->lock, flags); | 519 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 531 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | 520 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { |
| 532 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 521 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 533 | return; | 522 | return; |
| @@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void) | |||
| 748 | struct rcu_state *rsp = rcu_state_p; | 737 | struct rcu_state *rsp = rcu_state_p; |
| 749 | unsigned long s; | 738 | unsigned long s; |
| 750 | 739 | ||
| 740 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 741 | if (rcu_gp_is_normal()) { | ||
| 742 | wait_rcu_gp(call_rcu); | ||
| 743 | return; | ||
| 744 | } | ||
| 745 | |||
| 751 | s = rcu_exp_gp_seq_snap(rsp); | 746 | s = rcu_exp_gp_seq_snap(rsp); |
| 752 | 747 | ||
| 753 | rnp_unlock = exp_funnel_lock(rsp, s); | 748 | rnp_unlock = exp_funnel_lock(rsp, s); |
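The guard added above makes synchronize_rcu_expedited() honor the new rcu_normal override before taking any expedited-path locks. Because rcu_gp_is_normal() is consulted first, rcupdate.rcu_normal=1 wins over rcupdate.rcu_expedited=1; a hypothetical helper (not part of this patch, and assuming the pre-existing rcu_gp_is_expedited()) makes that precedence explicit:

	/*
	 * Sketch only: decide whether a grace-period request may be
	 * expedited.  The "normal" override is checked first, so it
	 * takes precedence over any expediting request.
	 */
	static bool use_expedited_gp(void)
	{
		if (rcu_gp_is_normal())
			return false;		/* rcupdate.rcu_normal=1 wins. */
		return rcu_gp_is_expedited();	/* Boot-time or rcu_expedited. */
	}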
| @@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); | |||
| 788 | */ | 783 | */ |
| 789 | static void __init __rcu_init_preempt(void) | 784 | static void __init __rcu_init_preempt(void) |
| 790 | { | 785 | { |
| 791 | rcu_init_one(rcu_state_p, rcu_data_p); | 786 | rcu_init_one(rcu_state_p); |
| 792 | } | 787 | } |
| 793 | 788 | ||
| 794 | /* | 789 | /* |
| @@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 989 | READ_ONCE(rnp->boost_tasks) == NULL) | 984 | READ_ONCE(rnp->boost_tasks) == NULL) |
| 990 | return 0; /* Nothing left to boost. */ | 985 | return 0; /* Nothing left to boost. */ |
| 991 | 986 | ||
| 992 | raw_spin_lock_irqsave(&rnp->lock, flags); | 987 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 993 | smp_mb__after_unlock_lock(); | ||
| 994 | 988 | ||
| 995 | /* | 989 | /* |
| 996 | * Recheck under the lock: all tasks in need of boosting | 990 | * Recheck under the lock: all tasks in need of boosting |
| @@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1176 | "rcub/%d", rnp_index); | 1170 | "rcub/%d", rnp_index); |
| 1177 | if (IS_ERR(t)) | 1171 | if (IS_ERR(t)) |
| 1178 | return PTR_ERR(t); | 1172 | return PTR_ERR(t); |
| 1179 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1173 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1180 | smp_mb__after_unlock_lock(); | ||
| 1181 | rnp->boost_kthread_task = t; | 1174 | rnp->boost_kthread_task = t; |
| 1182 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1175 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1183 | sp.sched_priority = kthread_prio; | 1176 | sp.sched_priority = kthread_prio; |
| @@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void) | |||
| 1524 | struct rcu_state *rsp; | 1517 | struct rcu_state *rsp; |
| 1525 | int tne; | 1518 | int tne; |
| 1526 | 1519 | ||
| 1527 | if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) | 1520 | if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || |
| 1521 | rcu_is_nocb_cpu(smp_processor_id())) | ||
| 1528 | return; | 1522 | return; |
| 1529 | 1523 | ||
| 1530 | /* Handle nohz enablement switches conservatively. */ | 1524 | /* Handle nohz enablement switches conservatively. */ |
| @@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void) | |||
| 1538 | if (!tne) | 1532 | if (!tne) |
| 1539 | return; | 1533 | return; |
| 1540 | 1534 | ||
| 1541 | /* If this is a no-CBs CPU, no callbacks, just return. */ | ||
| 1542 | if (rcu_is_nocb_cpu(smp_processor_id())) | ||
| 1543 | return; | ||
| 1544 | |||
| 1545 | /* | 1535 | /* |
| 1546 | * If a non-lazy callback arrived at a CPU having only lazy | 1536 | * If a non-lazy callback arrived at a CPU having only lazy |
| 1547 | * callbacks, invoke RCU core for the side-effect of recalculating | 1537 | * callbacks, invoke RCU core for the side-effect of recalculating |
| @@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void) | |||
| 1567 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | 1557 | if (!*rdp->nxttail[RCU_DONE_TAIL]) |
| 1568 | continue; | 1558 | continue; |
| 1569 | rnp = rdp->mynode; | 1559 | rnp = rdp->mynode; |
| 1570 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1560 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
| 1571 | smp_mb__after_unlock_lock(); | ||
| 1572 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); | 1561 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1573 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1562 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 1574 | if (needwake) | 1563 | if (needwake) |
| @@ -2068,8 +2057,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2068 | bool needwake; | 2057 | bool needwake; |
| 2069 | struct rcu_node *rnp = rdp->mynode; | 2058 | struct rcu_node *rnp = rdp->mynode; |
| 2070 | 2059 | ||
| 2071 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2060 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 2072 | smp_mb__after_unlock_lock(); | ||
| 2073 | needwake = rcu_start_future_gp(rnp, rdp, &c); | 2061 | needwake = rcu_start_future_gp(rnp, rdp, &c); |
| 2074 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2062 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2075 | if (needwake) | 2063 | if (needwake) |
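A recurring pattern in the tree_plugin.h hunks above is the replacement of an open-coded raw_spin_lock()/smp_mb__after_unlock_lock() pair with a single *_rcu_node() acquisition. The real wrappers live in kernel/rcu/tree.h elsewhere in this series; a minimal sketch of their shape, assuming that placement, is:

	/*
	 * Sketch of the rcu_node locking wrappers.  Each acquisition is
	 * immediately followed by smp_mb__after_unlock_lock(), providing
	 * the full-barrier semantics that the converted call sites used
	 * to supply by hand.
	 */
	#define raw_spin_lock_rcu_node(rnp)				\
	do {								\
		raw_spin_lock(&(rnp)->lock);				\
		smp_mb__after_unlock_lock();				\
	} while (0)

	#define raw_spin_lock_irqsave_rcu_node(rnp, flags)		\
	do {								\
		raw_spin_lock_irqsave(&(rnp)->lock, flags);		\
		smp_mb__after_unlock_lock();				\
	} while (0)

Folding the barrier into the wrapper removes a class of bugs in which a call site takes the lock but forgets the barrier.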
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9b5c..1088e64f01ad 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Read-Copy Update tracing for classic implementation | 2 | * Read-Copy Update tracing for hierarchical implementation. |
| 3 | * | 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
| @@ -16,6 +16,7 @@ | |||
| 16 | * http://www.gnu.org/licenses/gpl-2.0.html. | 16 | * http://www.gnu.org/licenses/gpl-2.0.html. |
| 17 | * | 17 | * |
| 18 | * Copyright IBM Corporation, 2008 | 18 | * Copyright IBM Corporation, 2008 |
| 19 | * Author: Paul E. McKenney | ||
| 19 | * | 20 | * |
| 20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | 21 | * Papers: http://www.rdrop.com/users/paulmck/RCU |
| 21 | * | 22 | * |
| @@ -33,9 +34,7 @@ | |||
| 33 | #include <linux/sched.h> | 34 | #include <linux/sched.h> |
| 34 | #include <linux/atomic.h> | 35 | #include <linux/atomic.h> |
| 35 | #include <linux/bitops.h> | 36 | #include <linux/bitops.h> |
| 36 | #include <linux/module.h> | ||
| 37 | #include <linux/completion.h> | 37 | #include <linux/completion.h> |
| 38 | #include <linux/moduleparam.h> | ||
| 39 | #include <linux/percpu.h> | 38 | #include <linux/percpu.h> |
| 40 | #include <linux/notifier.h> | 39 | #include <linux/notifier.h> |
| 41 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
| @@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = { | |||
| 183 | 182 | ||
| 184 | static int show_rcuexp(struct seq_file *m, void *v) | 183 | static int show_rcuexp(struct seq_file *m, void *v) |
| 185 | { | 184 | { |
| 185 | int cpu; | ||
| 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; | 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
| 187 | 187 | struct rcu_data *rdp; | |
| 188 | unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; | ||
| 189 | |||
| 190 | for_each_possible_cpu(cpu) { | ||
| 191 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 192 | s0 += atomic_long_read(&rdp->expedited_workdone0); | ||
| 193 | s1 += atomic_long_read(&rdp->expedited_workdone1); | ||
| 194 | s2 += atomic_long_read(&rdp->expedited_workdone2); | ||
| 195 | s3 += atomic_long_read(&rdp->expedited_workdone3); | ||
| 196 | } | ||
| 188 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", | 197 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", |
| 189 | rsp->expedited_sequence, | 198 | rsp->expedited_sequence, s0, s1, s2, s3, |
| 190 | atomic_long_read(&rsp->expedited_workdone0), | ||
| 191 | atomic_long_read(&rsp->expedited_workdone1), | ||
| 192 | atomic_long_read(&rsp->expedited_workdone2), | ||
| 193 | atomic_long_read(&rsp->expedited_workdone3), | ||
| 194 | atomic_long_read(&rsp->expedited_normal), | 199 | atomic_long_read(&rsp->expedited_normal), |
| 195 | atomic_read(&rsp->expedited_need_qs), | 200 | atomic_read(&rsp->expedited_need_qs), |
| 196 | rsp->expedited_sequence / 2); | 201 | rsp->expedited_sequence / 2); |
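The show_rcuexp() rewrite reflects the expedited statistics having moved from the global rcu_state structure into per-CPU rcu_data, which keeps the expedited fast path off a globally shared cache line; the rarely read debugfs file now pays for this with an O(nr_cpus) summation. A hypothetical single-counter helper showing the same idiom:

	/*
	 * Illustration only: sum one per-CPU expedited counter across
	 * all possible CPUs.  The cold debugfs path does the looping so
	 * that the hot update path can stay CPU-local.
	 */
	static unsigned long sum_expedited_workdone0(struct rcu_state *rsp)
	{
		unsigned long sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += atomic_long_read(
				&per_cpu_ptr(rsp->rda, cpu)->expedited_workdone0);
		return sum;
	}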
| @@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | |||
| 319 | unsigned long gpmax; | 324 | unsigned long gpmax; |
| 320 | struct rcu_node *rnp = &rsp->node[0]; | 325 | struct rcu_node *rnp = &rsp->node[0]; |
| 321 | 326 | ||
| 322 | raw_spin_lock_irqsave(&rnp->lock, flags); | 327 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 323 | completed = READ_ONCE(rsp->completed); | 328 | completed = READ_ONCE(rsp->completed); |
| 324 | gpnum = READ_ONCE(rsp->gpnum); | 329 | gpnum = READ_ONCE(rsp->gpnum); |
| 325 | if (completed == gpnum) | 330 | if (completed == gpnum) |
| @@ -487,16 +492,4 @@ free_out: | |||
| 487 | debugfs_remove_recursive(rcudir); | 492 | debugfs_remove_recursive(rcudir); |
| 488 | return 1; | 493 | return 1; |
| 489 | } | 494 | } |
| 490 | 495 | device_initcall(rcutree_trace_init); | |
| 491 | static void __exit rcutree_trace_cleanup(void) | ||
| 492 | { | ||
| 493 | debugfs_remove_recursive(rcudir); | ||
| 494 | } | ||
| 495 | |||
| 496 | |||
| 497 | module_init(rcutree_trace_init); | ||
| 498 | module_exit(rcutree_trace_cleanup); | ||
| 499 | |||
| 500 | MODULE_AUTHOR("Paul E. McKenney"); | ||
| 501 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); | ||
| 502 | MODULE_LICENSE("GPL"); | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..76b94e19430b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate"); | |||
| 60 | #endif | 60 | #endif |
| 61 | #define MODULE_PARAM_PREFIX "rcupdate." | 61 | #define MODULE_PARAM_PREFIX "rcupdate." |
| 62 | 62 | ||
| 63 | #ifndef CONFIG_TINY_RCU | ||
| 63 | module_param(rcu_expedited, int, 0); | 64 | module_param(rcu_expedited, int, 0); |
| 65 | module_param(rcu_normal, int, 0); | ||
| 66 | static int rcu_normal_after_boot; | ||
| 67 | module_param(rcu_normal_after_boot, int, 0); | ||
| 68 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 64 | 69 | ||
| 65 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) | 70 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) |
| 66 | /** | 71 | /** |
| @@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); | |||
| 113 | 118 | ||
| 114 | #ifndef CONFIG_TINY_RCU | 119 | #ifndef CONFIG_TINY_RCU |
| 115 | 120 | ||
| 121 | /* | ||
| 122 | * Should expedited grace-period primitives always fall back to their | ||
| 123 | * non-expedited counterparts? Intended for use within RCU. Note | ||
| 124 | * that if the user specifies both rcu_expedited and rcu_normal, then | ||
| 125 | * rcu_normal wins. | ||
| 126 | */ | ||
| 127 | bool rcu_gp_is_normal(void) | ||
| 128 | { | ||
| 129 | return READ_ONCE(rcu_normal); | ||
| 130 | } | ||
| 131 | |||
| 116 | static atomic_t rcu_expedited_nesting = | 132 | static atomic_t rcu_expedited_nesting = |
| 117 | ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); | 133 | ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); |
| 118 | 134 | ||
| @@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void) | |||
| 157 | } | 173 | } |
| 158 | EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); | 174 | EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); |
| 159 | 175 | ||
| 160 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 161 | |||
| 162 | /* | 176 | /* |
| 163 | * Inform RCU of the end of the in-kernel boot sequence. | 177 | * Inform RCU of the end of the in-kernel boot sequence. |
| 164 | */ | 178 | */ |
| @@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void) | |||
| 166 | { | 180 | { |
| 167 | if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) | 181 | if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) |
| 168 | rcu_unexpedite_gp(); | 182 | rcu_unexpedite_gp(); |
| 183 | if (rcu_normal_after_boot) | ||
| 184 | WRITE_ONCE(rcu_normal, 1); | ||
| 169 | } | 185 | } |
| 170 | 186 | ||
| 187 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 188 | |||
| 171 | #ifdef CONFIG_PREEMPT_RCU | 189 | #ifdef CONFIG_PREEMPT_RCU |
| 172 | 190 | ||
| 173 | /* | 191 | /* |
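Taken together, the update.c hunks add two boot-time knobs under the "rcupdate." module-parameter prefix, with rcu_normal_after_boot deferring the override until rcu_end_inkernel_boot() runs. Illustrative kernel command lines (parameter names come from the hunks above; the values are examples):

	# Force every grace period onto the normal path from boot onward:
	rcupdate.rcu_normal=1

	# Keep boot-time expediting (CONFIG_RCU_EXPEDITE_BOOT), then switch
	# to normal-only grace periods once the in-kernel boot completes:
	rcupdate.rcu_normal_after_boot=1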
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 732e993b564b..1ef0d7aeab47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -3109,7 +3109,6 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3109 | 3109 | ||
| 3110 | cpu = smp_processor_id(); | 3110 | cpu = smp_processor_id(); |
| 3111 | rq = cpu_rq(cpu); | 3111 | rq = cpu_rq(cpu); |
| 3112 | rcu_note_context_switch(); | ||
| 3113 | prev = rq->curr; | 3112 | prev = rq->curr; |
| 3114 | 3113 | ||
| 3115 | /* | 3114 | /* |
| @@ -3128,13 +3127,16 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3128 | if (sched_feat(HRTICK)) | 3127 | if (sched_feat(HRTICK)) |
| 3129 | hrtick_clear(rq); | 3128 | hrtick_clear(rq); |
| 3130 | 3129 | ||
| 3130 | local_irq_disable(); | ||
| 3131 | rcu_note_context_switch(); | ||
| 3132 | |||
| 3131 | /* | 3133 | /* |
| 3132 | * Make sure that signal_pending_state()->signal_pending() below | 3134 | * Make sure that signal_pending_state()->signal_pending() below |
| 3133 | * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) | 3135 | * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) |
| 3134 | * done by the caller to avoid the race with signal_wake_up(). | 3136 | * done by the caller to avoid the race with signal_wake_up(). |
| 3135 | */ | 3137 | */ |
| 3136 | smp_mb__before_spinlock(); | 3138 | smp_mb__before_spinlock(); |
| 3137 | raw_spin_lock_irq(&rq->lock); | 3139 | raw_spin_lock(&rq->lock); |
| 3138 | lockdep_pin_lock(&rq->lock); | 3140 | lockdep_pin_lock(&rq->lock); |
| 3139 | 3141 | ||
| 3140 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | 3142 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ |
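The __schedule() hunk is behavior-preserving with respect to rq->lock: raw_spin_lock_irq() is split into its two halves so that rcu_note_context_switch() runs in the window where interrupts are already disabled but no scheduler lock is yet held, matching the "Caller must disable interrupts" requirement added in tree_plugin.h above. A condensed sketch of the resulting sequence (hypothetical function name, surrounding logic elided):

	static void illustrate_schedule_locking(struct rq *rq)
	{
		local_irq_disable();		/* first half of the old _irq lock */
		rcu_note_context_switch();	/* irqs off, rq->lock not held */
		smp_mb__before_spinlock();
		raw_spin_lock(&rq->lock);	/* second half */
		/* ... pick and switch to the next task ... */
		raw_spin_unlock_irq(&rq->lock);
	}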
diff --git a/lib/list_debug.c b/lib/list_debug.c index c24c2f7e296f..3859bf63561c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c | |||
| @@ -37,7 +37,7 @@ void __list_add(struct list_head *new, | |||
| 37 | next->prev = new; | 37 | next->prev = new; |
| 38 | new->next = next; | 38 | new->next = next; |
| 39 | new->prev = prev; | 39 | new->prev = prev; |
| 40 | prev->next = new; | 40 | WRITE_ONCE(prev->next, new); |
| 41 | } | 41 | } |
| 42 | EXPORT_SYMBOL(__list_add); | 42 | EXPORT_SYMBOL(__list_add); |
| 43 | 43 | ||
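The one-line list_debug.c change matters to code that inspects ->next without holding the writers' lock, for example a lockless emptiness test: WRITE_ONCE() keeps the compiler from tearing or fusing the store that publishes the new entry, so a racing reader sees either the old or the new pointer, never a mix of the two. A minimal sketch of such a reader (hypothetical function, illustration only):

	/*
	 * Lockless peek at ->next, pairing with the WRITE_ONCE() in
	 * __list_add() above.
	 */
	static bool foo_list_nonempty(struct list_head *head)
	{
		return READ_ONCE(head->next) != head;
	}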
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5236e073919d..0f80eefb0bfd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | |||
| @@ -38,8 +38,6 @@ | |||
| 38 | # | 38 | # |
| 39 | # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 39 | # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
| 40 | 40 | ||
| 41 | grace=120 | ||
| 42 | |||
| 43 | T=/tmp/kvm-test-1-run.sh.$$ | 41 | T=/tmp/kvm-test-1-run.sh.$$ |
| 44 | trap 'rm -rf $T' 0 | 42 | trap 'rm -rf $T' 0 |
| 45 | touch $T | 43 | touch $T |
| @@ -152,7 +150,7 @@ fi | |||
| 152 | qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" | 150 | qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" |
| 153 | 151 | ||
| 154 | # Generate architecture-specific and interaction-specific qemu arguments | 152 | # Generate architecture-specific and interaction-specific qemu arguments |
| 155 | qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`" | 153 | qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" |
| 156 | 154 | ||
| 157 | # Generate qemu -append arguments | 155 | # Generate qemu -append arguments |
| 158 | qemu_append="`identify_qemu_append "$QEMU"`" | 156 | qemu_append="`identify_qemu_append "$QEMU"`" |
| @@ -168,7 +166,7 @@ then | |||
| 168 | touch $resdir/buildonly | 166 | touch $resdir/buildonly |
| 169 | exit 0 | 167 | exit 0 |
| 170 | fi | 168 | fi |
| 171 | echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log | 169 | echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log |
| 172 | echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd | 170 | echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd |
| 173 | ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & | 171 | ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & |
| 174 | qemu_pid=$! | 172 | qemu_pid=$! |
| @@ -214,7 +212,7 @@ then | |||
| 214 | else | 212 | else |
| 215 | break | 213 | break |
| 216 | fi | 214 | fi |
| 217 | if test $kruntime -ge $((seconds + grace)) | 215 | if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) |
| 218 | then | 216 | then |
| 219 | echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 | 217 | echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 |
| 220 | kill -KILL $qemu_pid | 218 | kill -KILL $qemu_pid |
| @@ -224,6 +222,5 @@ then | |||
| 224 | done | 222 | done |
| 225 | fi | 223 | fi |
| 226 | 224 | ||
| 227 | cp $builddir/console.log $resdir | ||
| 228 | parse-torture.sh $resdir/console.log $title | 225 | parse-torture.sh $resdir/console.log $title |
| 229 | parse-console.sh $resdir/console.log $title | 226 | parse-console.sh $resdir/console.log $title |
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index f6483609ebc2..4a431767f77a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh | |||
| @@ -42,6 +42,7 @@ TORTURE_DEFCONFIG=defconfig | |||
| 42 | TORTURE_BOOT_IMAGE="" | 42 | TORTURE_BOOT_IMAGE="" |
| 43 | TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD | 43 | TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD |
| 44 | TORTURE_KMAKE_ARG="" | 44 | TORTURE_KMAKE_ARG="" |
| 45 | TORTURE_SHUTDOWN_GRACE=180 | ||
| 45 | TORTURE_SUITE=rcu | 46 | TORTURE_SUITE=rcu |
| 46 | resdir="" | 47 | resdir="" |
| 47 | configs="" | 48 | configs="" |
| @@ -149,6 +150,11 @@ do | |||
| 149 | resdir=$2 | 150 | resdir=$2 |
| 150 | shift | 151 | shift |
| 151 | ;; | 152 | ;; |
| 153 | --shutdown-grace) | ||
| 154 | checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error' | ||
| 155 | TORTURE_SHUTDOWN_GRACE=$2 | ||
| 156 | shift | ||
| 157 | ;; | ||
| 152 | --torture) | 158 | --torture) |
| 153 | checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' | 159 | checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' |
| 154 | TORTURE_SUITE=$2 | 160 | TORTURE_SUITE=$2 |
| @@ -266,6 +272,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG | |||
| 266 | TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD | 272 | TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD |
| 267 | TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE | 273 | TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE |
| 268 | TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC | 274 | TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC |
| 275 | TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE | ||
| 269 | TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE | 276 | TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE |
| 270 | if ! test -e $resdir | 277 | if ! test -e $resdir |
| 271 | then | 278 | then |
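The new --shutdown-grace option replaces the hard-coded grace=120 removed from kvm-test-1-run.sh above, and the default grows to 180 seconds here. An illustrative invocation (the other flags already exist in kvm.sh; the values are examples):

	tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 8 --duration 30 \
		--configs "TREE01 TREE02" --shutdown-grace 300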
| @@ -307,10 +314,10 @@ awk < $T/cfgcpu.pack \ | |||
| 307 | } | 314 | } |
| 308 | 315 | ||
| 309 | # Dump out the scripting required to run one test batch. | 316 | # Dump out the scripting required to run one test batch. |
| 310 | function dump(first, pastlast) | 317 | function dump(first, pastlast, batchnum) |
| 311 | { | 318 | { |
| 312 | print "echo ----Start batch: `date`"; | 319 | print "echo ----Start batch " batchnum ": `date`"; |
| 313 | print "echo ----Start batch: `date` >> " rd "/log"; | 320 | print "echo ----Start batch " batchnum ": `date` >> " rd "/log"; |
| 314 | jn=1 | 321 | jn=1 |
| 315 | for (j = first; j < pastlast; j++) { | 322 | for (j = first; j < pastlast; j++) { |
| 316 | builddir=KVM "/b" jn | 323 | builddir=KVM "/b" jn |
| @@ -371,25 +378,28 @@ END { | |||
| 371 | njobs = i; | 378 | njobs = i; |
| 372 | nc = ncpus; | 379 | nc = ncpus; |
| 373 | first = 0; | 380 | first = 0; |
| 381 | batchnum = 1; | ||
| 374 | 382 | ||
| 375 | # Each pass through the following loop considers one test. | 383 | # Each pass through the following loop considers one test. |
| 376 | for (i = 0; i < njobs; i++) { | 384 | for (i = 0; i < njobs; i++) { |
| 377 | if (ncpus == 0) { | 385 | if (ncpus == 0) { |
| 378 | # Sequential test specified, each test its own batch. | 386 | # Sequential test specified, each test its own batch. |
| 379 | dump(i, i + 1); | 387 | dump(i, i + 1, batchnum); |
| 380 | first = i; | 388 | first = i; |
| 389 | batchnum++; | ||
| 381 | } else if (nc < cpus[i] && i != 0) { | 390 | } else if (nc < cpus[i] && i != 0) { |
| 382 | # Out of CPUs, dump out a batch. | 391 | # Out of CPUs, dump out a batch. |
| 383 | dump(first, i); | 392 | dump(first, i, batchnum); |
| 384 | first = i; | 393 | first = i; |
| 385 | nc = ncpus; | 394 | nc = ncpus; |
| 395 | batchnum++; | ||
| 386 | } | 396 | } |
| 387 | # Account for the CPUs needed by the current test. | 397 | # Account for the CPUs needed by the current test. |
| 388 | nc -= cpus[i]; | 398 | nc -= cpus[i]; |
| 389 | } | 399 | } |
| 390 | # Dump the last batch. | 400 | # Dump the last batch. |
| 391 | if (ncpus != 0) | 401 | if (ncpus != 0) |
| 392 | dump(first, i); | 402 | dump(first, i, batchnum); |
| 393 | }' >> $T/script | 403 | }' >> $T/script |
| 394 | 404 | ||
| 395 | cat << ___EOF___ >> $T/script | 405 | cat << ___EOF___ >> $T/script |
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index d8f35cf116be..844787a0d7be 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh | |||
| @@ -24,9 +24,6 @@ | |||
| 24 | # | 24 | # |
| 25 | # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 25 | # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
| 26 | 26 | ||
| 27 | T=/tmp/abat-chk-badness.sh.$$ | ||
| 28 | trap 'rm -f $T' 0 | ||
| 29 | |||
| 30 | file="$1" | 27 | file="$1" |
| 31 | title="$2" | 28 | title="$2" |
| 32 | 29 | ||
| @@ -36,9 +33,41 @@ if grep -Pq '\x00' < $file | |||
| 36 | then | 33 | then |
| 37 | print_warning Console output contains nul bytes, old qemu still running? | 34 | print_warning Console output contains nul bytes, old qemu still running? |
| 38 | fi | 35 | fi |
| 39 | egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T | 36 | egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags |
| 40 | if test -s $T | 37 | if test -s $1.diags |
| 41 | then | 38 | then |
| 42 | print_warning Assertion failure in $file $title | 39 | print_warning Assertion failure in $file $title |
| 43 | cat $T | 40 | # cat $1.diags |
| 41 | summary="" | ||
| 42 | n_badness=`grep -c Badness $1` | ||
| 43 | if test "$n_badness" -ne 0 | ||
| 44 | then | ||
| 45 | summary="$summary Badness: $n_badness" | ||
| 46 | fi | ||
| 47 | n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'` | ||
| 48 | if test "$n_warn" -ne 0 | ||
| 49 | then | ||
| 50 | summary="$summary Warnings: $n_warn" | ||
| 51 | fi | ||
| 52 | n_bugs=`egrep -c 'BUG|Oops:' $1` | ||
| 53 | if test "$n_bugs" -ne 0 | ||
| 54 | then | ||
| 55 | summary="$summary Bugs: $n_bugs" | ||
| 56 | fi | ||
| 57 | n_calltrace=`grep -c 'Call Trace:' $1` | ||
| 58 | if test "$n_calltrace" -ne 0 | ||
| 59 | then | ||
| 60 | summary="$summary Call Traces: $n_calltrace" | ||
| 61 | fi | ||
| 62 | n_lockdep=`grep -c =========== $1` | ||
| 63 | if test "$n_lockdep" -ne 0 | ||
| 64 | then | ||
| 65 | summary="$summary lockdep: $n_lockdep" | ||
| 66 | fi | ||
| 67 | n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` | ||
| 68 | if test "$n_stalls" -ne 0 | ||
| 69 | then | ||
| 70 | summary="$summary Stalls: $n_stalls" | ||
| 71 | fi | ||
| 72 | print_warning Summary: $summary | ||
| 44 | fi | 73 | fi |
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt index 9ef33a743b73..24396ae8355b 100644 --- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt +++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt | |||
| @@ -20,7 +20,6 @@ CONFIG_PROVE_RCU | |||
| 20 | 20 | ||
| 21 | CONFIG_NO_HZ_FULL_SYSIDLE | 21 | CONFIG_NO_HZ_FULL_SYSIDLE |
| 22 | CONFIG_RCU_NOCB_CPU | 22 | CONFIG_RCU_NOCB_CPU |
| 23 | CONFIG_RCU_USER_QS | ||
| 24 | 23 | ||
| 25 | Meaningless for TINY_RCU. | 24 | Meaningless for TINY_RCU. |
| 26 | 25 | ||
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 657f3a035488..4e2b1893d40d 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt | |||
| @@ -72,10 +72,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE | |||
| 72 | 72 | ||
| 73 | Always used in KVM testing. | 73 | Always used in KVM testing. |
| 74 | 74 | ||
| 75 | CONFIG_RCU_USER_QS | ||
| 76 | |||
| 77 | Redundant with CONFIG_NO_HZ_FULL. | ||
| 78 | |||
| 79 | CONFIG_PREEMPT_RCU | 75 | CONFIG_PREEMPT_RCU |
| 80 | CONFIG_TREE_RCU | 76 | CONFIG_TREE_RCU |
| 81 | 77 | ||
