aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg588
-rw-r--r--Documentation/blockdev/drbd/DRBD-data-packets.svg459
-rw-r--r--Documentation/blockdev/drbd/README.txt16
-rw-r--r--Documentation/blockdev/drbd/conn-states-8.dot18
-rw-r--r--Documentation/blockdev/drbd/disk-states-8.dot16
-rw-r--r--Documentation/blockdev/drbd/drbd-connection-state-overview.dot85
-rw-r--r--Documentation/blockdev/drbd/node-states-8.dot14
-rw-r--r--MAINTAINERS13
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/drbd/Kconfig82
-rw-r--r--drivers/block/drbd/Makefile8
-rw-r--r--drivers/block/drbd/drbd_actlog.c1484
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1327
-rw-r--r--drivers/block/drbd/drbd_int.h2258
-rw-r--r--drivers/block/drbd/drbd_main.c3735
-rw-r--r--drivers/block/drbd/drbd_nl.c2365
-rw-r--r--drivers/block/drbd/drbd_proc.c266
-rw-r--r--drivers/block/drbd/drbd_receiver.c4456
-rw-r--r--drivers/block/drbd/drbd_req.c1132
-rw-r--r--drivers/block/drbd/drbd_req.h327
-rw-r--r--drivers/block/drbd/drbd_strings.c113
-rw-r--r--drivers/block/drbd/drbd_tracing.c752
-rw-r--r--drivers/block/drbd/drbd_tracing.h87
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c1529
-rw-r--r--drivers/block/drbd/drbd_wrappers.h91
-rw-r--r--include/linux/drbd.h349
-rw-r--r--include/linux/drbd_limits.h137
-rw-r--r--include/linux/drbd_nl.h137
-rw-r--r--include/linux/drbd_tag_magic.h83
-rw-r--r--include/linux/lru_cache.h294
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile2
-rw-r--r--lib/lru_cache.c560
35 files changed, 23140 insertions, 0 deletions
diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
new file mode 100644
index 000000000000..f87cfa0dc2fb
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
@@ -0,0 +1,588 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3<svg
4 xmlns:svg="http://www.w3.org/2000/svg"
5 xmlns="http://www.w3.org/2000/svg"
6 version="1.0"
7 width="210mm"
8 height="297mm"
9 viewBox="0 0 21000 29700"
10 id="svg2"
11 style="fill-rule:evenodd">
12 <defs
13 id="defs4" />
14 <g
15 id="Default"
16 style="visibility:visible">
17 <desc
18 id="desc180">Master slide</desc>
19 </g>
20 <path
21 d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
22 id="path193"
23 style="fill:#008000;visibility:visible" />
24 <path
25 d="M 11999,7801 L 11999,8361"
26 id="path197"
27 style="fill:none;stroke:#008000;visibility:visible" />
28 <path
29 d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
30 id="path209"
31 style="fill:#008000;visibility:visible" />
32 <path
33 d="M 7999,9601 L 7999,10161"
34 id="path213"
35 style="fill:none;stroke:#008000;visibility:visible" />
36 <path
37 d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
38 id="path225"
39 style="fill:#008000;visibility:visible" />
40 <path
41 d="M 7999,7001 L 11764,7754"
42 id="path229"
43 style="fill:none;stroke:#008000;visibility:visible" />
44 <g
45 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
46 id="g245"
47 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
48 <text
49 id="text247">
50 <tspan
51 x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
52 y="9284"
53 id="tspan249">RSDataReply</tspan>
54 </text>
55 </g>
56 <path
57 d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
58 id="path259"
59 style="fill:#008000;visibility:visible" />
60 <path
61 d="M 11999,9001 L 8236,9565"
62 id="path263"
63 style="fill:none;stroke:#008000;visibility:visible" />
64 <g
65 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
66 id="g279"
67 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
68 <text
69 id="text281">
70 <tspan
71 x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
72 y="7023"
73 id="tspan283">CsumRSRequest</tspan>
74 </text>
75 </g>
76 <text
77 id="text297"
78 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
79 <tspan
80 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
81 y="5707"
82 id="tspan299">w_make_resync_request()</tspan>
83 </text>
84 <text
85 id="text313"
86 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
87 <tspan
88 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
89 y="7806"
90 id="tspan315">receive_DataRequest()</tspan>
91 </text>
92 <text
93 id="text329"
94 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
95 <tspan
96 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
97 y="8606"
98 id="tspan331">drbd_endio_read_sec()</tspan>
99 </text>
100 <text
101 id="text345"
102 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
103 <tspan
104 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
105 y="9007"
106 id="tspan347">w_e_end_csum_rs_req()</tspan>
107 </text>
108 <text
109 id="text361"
110 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
111 <tspan
112 x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
113 y="9507"
114 id="tspan363">receive_RSDataReply()</tspan>
115 </text>
116 <text
117 id="text377"
118 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
119 <tspan
120 x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
121 y="10407"
122 id="tspan379">drbd_endio_write_sec()</tspan>
123 </text>
124 <text
125 id="text393"
126 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
127 <tspan
128 x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
129 y="10907"
130 id="tspan395">e_end_resync_block()</tspan>
131 </text>
132 <path
133 d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
134 id="path405"
135 style="fill:#000080;visibility:visible" />
136 <path
137 d="M 7999,10801 L 11764,11554"
138 id="path409"
139 style="fill:none;stroke:#000080;visibility:visible" />
140 <g
141 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
142 id="g425"
143 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
144 <text
145 id="text427">
146 <tspan
147 x="9320 9621 9726 9798 9887 10065 10277 10438"
148 y="10943"
149 id="tspan429">WriteAck</tspan>
150 </text>
151 </g>
152 <text
153 id="text443"
154 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
155 <tspan
156 x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
157 y="11559"
158 id="tspan445">got_BlockAck()</tspan>
159 </text>
160 <text
161 id="text459"
162 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
163 <tspan
164 x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
165 y="4877"
166 id="tspan461">Checksum based Resync, case not in sync</tspan>
167 </text>
168 <text
169 id="text475"
170 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
171 <tspan
172 x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
173 y="2806"
174 id="tspan477">DRBD-8.3 data flow</tspan>
175 </text>
176 <text
177 id="text491"
178 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
179 <tspan
180 x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
181 y="7005"
182 id="tspan493">w_e_send_csum()</tspan>
183 </text>
184 <path
185 d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
186 id="path503"
187 style="fill:#008000;visibility:visible" />
188 <path
189 d="M 11999,16801 L 11999,17361"
190 id="path507"
191 style="fill:none;stroke:#008000;visibility:visible" />
192 <path
193 d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
194 id="path519"
195 style="fill:#008000;visibility:visible" />
196 <path
197 d="M 7999,16001 L 11764,16754"
198 id="path523"
199 style="fill:none;stroke:#008000;visibility:visible" />
200 <g
201 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
202 id="g539"
203 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
204 <text
205 id="text541">
206 <tspan
207 x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
208 y="18265"
209 id="tspan543">RSIsInSync</tspan>
210 </text>
211 </g>
212 <path
213 d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
214 id="path553"
215 style="fill:#000080;visibility:visible" />
216 <path
217 d="M 11999,18001 L 8236,18565"
218 id="path557"
219 style="fill:none;stroke:#000080;visibility:visible" />
220 <g
221 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
222 id="g573"
223 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
224 <text
225 id="text575">
226 <tspan
227 x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
228 y="16023"
229 id="tspan577">CsumRSRequest</tspan>
230 </text>
231 </g>
232 <text
233 id="text591"
234 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
235 <tspan
236 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
237 y="16806"
238 id="tspan593">receive_DataRequest()</tspan>
239 </text>
240 <text
241 id="text607"
242 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
243 <tspan
244 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
245 y="17606"
246 id="tspan609">drbd_endio_read_sec()</tspan>
247 </text>
248 <text
249 id="text623"
250 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
251 <tspan
252 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
253 y="18007"
254 id="tspan625">w_e_end_csum_rs_req()</tspan>
255 </text>
256 <text
257 id="text639"
258 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
259 <tspan
260 x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
261 y="18507"
262 id="tspan641">got_IsInSync()</tspan>
263 </text>
264 <text
265 id="text655"
266 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
267 <tspan
268 x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
269 y="13877"
270 id="tspan657">Checksum based Resync, case in sync</tspan>
271 </text>
272 <path
273 d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
274 id="path667"
275 style="fill:#008000;visibility:visible" />
276 <path
277 d="M 12000,23801 L 12000,24361"
278 id="path671"
279 style="fill:none;stroke:#008000;visibility:visible" />
280 <path
281 d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
282 id="path683"
283 style="fill:#008000;visibility:visible" />
284 <path
285 d="M 8000,25601 L 8000,26161"
286 id="path687"
287 style="fill:none;stroke:#008000;visibility:visible" />
288 <path
289 d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
290 id="path699"
291 style="fill:#008000;visibility:visible" />
292 <path
293 d="M 8000,23001 L 11765,23754"
294 id="path703"
295 style="fill:none;stroke:#008000;visibility:visible" />
296 <g
297 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
298 id="g719"
299 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
300 <text
301 id="text721">
302 <tspan
303 x="9464 9710 9921 10150 10328 10505 10577"
304 y="25236"
305 id="tspan723">OVReply</tspan>
306 </text>
307 </g>
308 <path
309 d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
310 id="path733"
311 style="fill:#008000;visibility:visible" />
312 <path
313 d="M 12000,25001 L 8237,25565"
314 id="path737"
315 style="fill:none;stroke:#008000;visibility:visible" />
316 <g
317 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
318 id="g753"
319 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
320 <text
321 id="text755">
322 <tspan
323 x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
324 y="23106"
325 id="tspan757">OVRequest</tspan>
326 </text>
327 </g>
328 <text
329 id="text771"
330 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
331 <tspan
332 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
333 y="23806"
334 id="tspan773">receive_OVRequest()</tspan>
335 </text>
336 <text
337 id="text787"
338 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
339 <tspan
340 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
341 y="24606"
342 id="tspan789">drbd_endio_read_sec()</tspan>
343 </text>
344 <text
345 id="text803"
346 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
347 <tspan
348 x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
349 y="25007"
350 id="tspan805">w_e_end_ov_req()</tspan>
351 </text>
352 <text
353 id="text819"
354 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
355 <tspan
356 x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
357 y="25507"
358 id="tspan821">receive_OVReply()</tspan>
359 </text>
360 <text
361 id="text835"
362 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
363 <tspan
364 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
365 y="26407"
366 id="tspan837">drbd_endio_read_sec()</tspan>
367 </text>
368 <text
369 id="text851"
370 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
371 <tspan
372 x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
373 y="26907"
374 id="tspan853">w_e_end_ov_reply()</tspan>
375 </text>
376 <path
377 d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
378 id="path863"
379 style="fill:#000080;visibility:visible" />
380 <path
381 d="M 8000,26801 L 11765,27554"
382 id="path867"
383 style="fill:none;stroke:#000080;visibility:visible" />
384 <g
385 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
386 id="g883"
387 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
388 <text
389 id="text885">
390 <tspan
391 x="9279 9525 9736 9965 10143 10303 10481 10553"
392 y="26935"
393 id="tspan887">OVResult</tspan>
394 </text>
395 </g>
396 <text
397 id="text901"
398 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
399 <tspan
400 x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
401 y="27559"
402 id="tspan903">got_OVResult()</tspan>
403 </text>
404 <text
405 id="text917"
406 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
407 <tspan
408 x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
409 y="21877"
410 id="tspan919">Online verify</tspan>
411 </text>
412 <text
413 id="text933"
414 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
415 <tspan
416 x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
417 y="23005"
418 id="tspan935">w_make_ov_request()</tspan>
419 </text>
420 <path
421 d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
422 id="path945"
423 style="fill:#008000;visibility:visible" />
424 <path
425 d="M 8000,5700 L 8000,6260"
426 id="path949"
427 style="fill:none;stroke:#008000;visibility:visible" />
428 <path
429 d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
430 id="path961"
431 style="fill:none;stroke:#000000;visibility:visible" />
432 <path
433 d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
434 id="path973"
435 style="fill:none;stroke:#000000;visibility:visible" />
436 <path
437 d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
438 id="path985"
439 style="fill:none;stroke:#000000;visibility:visible" />
440 <text
441 id="text1001"
442 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
443 <tspan
444 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
445 y="6506"
446 id="tspan1003">drbd_endio_read_sec()</tspan>
447 </text>
448 <text
449 id="text1017"
450 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
451 <tspan
452 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
453 y="14708"
454 id="tspan1019">w_make_resync_request()</tspan>
455 </text>
456 <text
457 id="text1033"
458 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
459 <tspan
460 x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
461 y="16006"
462 id="tspan1035">w_e_send_csum()</tspan>
463 </text>
464 <path
465 d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
466 id="path1045"
467 style="fill:#008000;visibility:visible" />
468 <path
469 d="M 8000,14701 L 8000,15261"
470 id="path1049"
471 style="fill:none;stroke:#008000;visibility:visible" />
472 <text
473 id="text1065"
474 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
475 <tspan
476 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
477 y="15507"
478 id="tspan1067">drbd_endio_read_sec()</tspan>
479 </text>
480 <path
481 d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
482 id="path1077"
483 style="fill:none;stroke:#000000;visibility:visible" />
484 <path
485 d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
486 id="path1089"
487 style="fill:none;stroke:#000000;visibility:visible" />
488 <path
489 d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
490 id="path1101"
491 style="fill:none;stroke:#000000;visibility:visible" />
492 <text
493 id="text1117"
494 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
495 <tspan
496 x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
497 y="5402"
498 id="tspan1119">rs_begin_io()</tspan>
499 </text>
500 <text
501 id="text1133"
502 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
503 <tspan
504 x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
505 y="14402"
506 id="tspan1135">rs_begin_io()</tspan>
507 </text>
508 <text
509 id="text1149"
510 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
511 <tspan
512 x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
513 y="22602"
514 id="tspan1151">rs_begin_io()</tspan>
515 </text>
516 <text
517 id="text1165"
518 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
519 <tspan
520 x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
521 y="11302"
522 id="tspan1167">rs_complete_io()</tspan>
523 </text>
524 <text
525 id="text1181"
526 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
527 <tspan
528 x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
529 y="18931"
530 id="tspan1183">rs_complete_io()</tspan>
531 </text>
532 <text
533 id="text1197"
534 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
535 <tspan
536 x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
537 y="27231"
538 id="tspan1199">rs_complete_io()</tspan>
539 </text>
540 <text
541 id="text1213"
542 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
543 <tspan
544 x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
545 y="7402"
546 id="tspan1215">rs_begin_io()</tspan>
547 </text>
548 <text
549 id="text1229"
550 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
551 <tspan
552 x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
553 y="16331"
554 id="tspan1231">rs_begin_io()</tspan>
555 </text>
556 <text
557 id="text1245"
558 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
559 <tspan
560 x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
561 y="23302"
562 id="tspan1247">rs_begin_io()</tspan>
563 </text>
564 <text
565 id="text1261"
566 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
567 <tspan
568 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
569 y="9302"
570 id="tspan1263">rs_complete_io()</tspan>
571 </text>
572 <text
573 id="text1277"
574 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
575 <tspan
576 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
577 y="18331"
578 id="tspan1279">rs_complete_io()</tspan>
579 </text>
580 <text
581 id="text1293"
582 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
583 <tspan
584 x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
585 y="25302"
586 id="tspan1295">rs_complete_io()</tspan>
587 </text>
588</svg>
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg
new file mode 100644
index 000000000000..48a1e2165fec
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-data-packets.svg
@@ -0,0 +1,459 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3<svg
4 xmlns:svg="http://www.w3.org/2000/svg"
5 xmlns="http://www.w3.org/2000/svg"
6 version="1.0"
7 width="210mm"
8 height="297mm"
9 viewBox="0 0 21000 29700"
10 id="svg2"
11 style="fill-rule:evenodd">
12 <defs
13 id="defs4" />
14 <g
15 id="Default"
16 style="visibility:visible">
17 <desc
18 id="desc176">Master slide</desc>
19 </g>
20 <path
21 d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
22 id="path189"
23 style="fill:#008000;visibility:visible" />
24 <path
25 d="M 11999,18801 L 11999,19361"
26 id="path193"
27 style="fill:none;stroke:#008000;visibility:visible" />
28 <path
29 d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
30 id="path205"
31 style="fill:#008000;visibility:visible" />
32 <path
33 d="M 7999,20601 L 7999,21161"
34 id="path209"
35 style="fill:none;stroke:#008000;visibility:visible" />
36 <path
37 d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
38 id="path221"
39 style="fill:#008000;visibility:visible" />
40 <path
41 d="M 7999,18001 L 11764,18754"
42 id="path225"
43 style="fill:none;stroke:#008000;visibility:visible" />
44 <text
45 x="-3023.845"
46 y="1106.8124"
47 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
48 id="text243"
49 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
50 <tspan
51 x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
52 y="21390.812"
53 id="tspan245">RSDataReply</tspan>
54 </text>
55 <path
56 d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
57 id="path255"
58 style="fill:#008000;visibility:visible" />
59 <path
60 d="M 11999,20001 L 8236,20565"
61 id="path259"
62 style="fill:none;stroke:#008000;visibility:visible" />
63 <text
64 x="3502.5356"
65 y="-2184.6621"
66 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
67 id="text277"
68 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
69 <tspan
70 x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
71 y="15854.338"
72 id="tspan279">RSDataRequest</tspan>
73 </text>
74 <text
75 id="text293"
76 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
77 <tspan
78 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
79 y="17807"
80 id="tspan295">w_make_resync_request()</tspan>
81 </text>
82 <text
83 id="text309"
84 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
85 <tspan
86 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
87 y="18806"
88 id="tspan311">receive_DataRequest()</tspan>
89 </text>
90 <text
91 id="text325"
92 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
93 <tspan
94 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
95 y="19606"
96 id="tspan327">drbd_endio_read_sec()</tspan>
97 </text>
98 <text
99 id="text341"
100 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
101 <tspan
102 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
103 y="20007"
104 id="tspan343">w_e_end_rsdata_req()</tspan>
105 </text>
106 <text
107 id="text357"
108 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
109 <tspan
110 x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
111 y="20507"
112 id="tspan359">receive_RSDataReply()</tspan>
113 </text>
114 <text
115 id="text373"
116 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
117 <tspan
118 x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
119 y="21407"
120 id="tspan375">drbd_endio_write_sec()</tspan>
121 </text>
122 <text
123 id="text389"
124 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
125 <tspan
126 x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
127 y="21907"
128 id="tspan391">e_end_resync_block()</tspan>
129 </text>
130 <path
131 d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
132 id="path401"
133 style="fill:#000080;visibility:visible" />
134 <path
135 d="M 7999,21801 L 11764,22554"
136 id="path405"
137 style="fill:none;stroke:#000080;visibility:visible" />
138 <text
139 x="4290.3008"
140 y="-2369.6162"
141 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
142 id="text423"
143 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
144 <tspan
145 x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
146 y="19573.385"
147 id="tspan425">WriteAck</tspan>
148 </text>
149 <text
150 id="text439"
151 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
152 <tspan
153 x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
154 y="22559"
155 id="tspan441">got_BlockAck()</tspan>
156 </text>
157 <text
158 id="text455"
159 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
160 <tspan
161 x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
162 y="16877"
163 id="tspan457">Resync blocks, 4-32K</tspan>
164 </text>
165 <path
166 d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
167 id="path467"
168 style="fill:#008000;visibility:visible" />
169 <path
170 d="M 12000,6801 L 12000,7361"
171 id="path471"
172 style="fill:none;stroke:#008000;visibility:visible" />
173 <path
174 d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
175 id="path483"
176 style="fill:#008000;visibility:visible" />
177 <path
178 d="M 8000,6001 L 11765,6754"
179 id="path487"
180 style="fill:none;stroke:#008000;visibility:visible" />
181 <text
182 x="-1288.1796"
183 y="1279.7666"
184 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
185 id="text505"
186 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
187 <tspan
188 x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
189 y="9516.7666"
190 id="tspan507">WriteAck</tspan>
191 </text>
192 <path
193 d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
194 id="path517"
195 style="fill:#000080;visibility:visible" />
196 <path
197 d="M 12000,8001 L 8237,8565"
198 id="path521"
199 style="fill:none;stroke:#000080;visibility:visible" />
200 <text
201 x="1065.6655"
202 y="-2097.7664"
203 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
204 id="text539"
205 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
206 <tspan
207 x="10682.666 10911.666 11088.666 11177.666"
208 y="4107.2339"
209 id="tspan541">Data</tspan>
210 </text>
211 <text
212 id="text555"
213 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
214 <tspan
215 x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
216 y="5505"
217 id="tspan557">drbd_make_request()</tspan>
218 </text>
219 <text
220 id="text571"
221 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
222 <tspan
223 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
224 y="6806"
225 id="tspan573">receive_Data()</tspan>
226 </text>
227 <text
228 id="text587"
229 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
230 <tspan
231 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
232 y="7606"
233 id="tspan589">drbd_endio_write_sec()</tspan>
234 </text>
235 <text
236 id="text603"
237 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
238 <tspan
239 x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
240 y="8007"
241 id="tspan605">e_end_block()</tspan>
242 </text>
243 <text
244 id="text619"
245 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
246 <tspan
247 x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
248 y="8606"
249 id="tspan621">got_BlockAck()</tspan>
250 </text>
251 <text
252 id="text635"
253 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
254 <tspan
255 x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
256 y="4877"
257 id="tspan637">Regular mirrored write, 512-32K</tspan>
258 </text>
259 <text
260 id="text651"
261 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
262 <tspan
263 x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
264 y="6003"
265 id="tspan653">w_send_dblock()</tspan>
266 </text>
267 <path
268 d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
269 id="path663"
270 style="fill:#008000;visibility:visible" />
271 <path
272 d="M 8000,6000 L 8000,6560"
273 id="path667"
274 style="fill:none;stroke:#008000;visibility:visible" />
275 <text
276 id="text683"
277 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
278 <tspan
279 x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
280 y="6905"
281 id="tspan685">drbd_endio_write_pri()</tspan>
282 </text>
283 <path
284 d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
285 id="path695"
286 style="fill:#008000;visibility:visible" />
287 <path
288 d="M 12000,12802 L 12000,13362"
289 id="path699"
290 style="fill:none;stroke:#008000;visibility:visible" />
291 <path
292 d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
293 id="path711"
294 style="fill:#008000;visibility:visible" />
295 <path
296 d="M 8000,12002 L 11765,12755"
297 id="path715"
298 style="fill:none;stroke:#008000;visibility:visible" />
299 <text
300 x="-2155.5266"
301 y="1201.5964"
302 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
303 id="text733"
304 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
305 <tspan
306 x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
307 y="15454.597"
308 id="tspan735">DataReply</tspan>
309 </text>
310 <path
311 d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
312 id="path745"
313 style="fill:#008000;visibility:visible" />
314 <path
315 d="M 12000,14002 L 8237,14566"
316 id="path749"
317 style="fill:none;stroke:#008000;visibility:visible" />
318 <text
319 x="2280.3804"
320 y="-2103.2141"
321 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
322 id="text767"
323 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
324 <tspan
325 x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
326 y="9981.7861"
327 id="tspan769">DataRequest</tspan>
328 </text>
329 <text
330 id="text783"
331 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
332 <tspan
333 x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
334 y="11506"
335 id="tspan785">drbd_make_request()</tspan>
336 </text>
337 <text
338 id="text799"
339 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
340 <tspan
341 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
342 y="12807"
343 id="tspan801">receive_DataRequest()</tspan>
344 </text>
345 <text
346 id="text815"
347 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
348 <tspan
349 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
350 y="13607"
351 id="tspan817">drbd_endio_read_sec()</tspan>
352 </text>
353 <text
354 id="text831"
355 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
356 <tspan
357 x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
358 y="14008"
359 id="tspan833">w_e_end_data_req()</tspan>
360 </text>
361 <g
362 id="g835"
363 style="visibility:visible">
364 <desc
365 id="desc837">Drawing</desc>
366 <text
367 id="text847"
368 style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
369 <tspan
370 x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
371 y="14607"
372 id="tspan849">receive_DataReply()</tspan>
373 </text>
374 </g>
375 <text
376 id="text863"
377 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
378 <tspan
379 x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
380 y="10878"
381 id="tspan865">Diskless read, 512-32K</tspan>
382 </text>
383 <text
384 id="text879"
385 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
386 <tspan
387 x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
388 y="12004"
389 id="tspan881">w_send_read_req()</tspan>
390 </text>
391 <text
392 id="text895"
393 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
394 <tspan
395 x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
396 y="2806"
397 id="tspan897">DRBD 8 data flow</tspan>
398 </text>
399 <path
400 d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
401 id="path907"
402 style="fill:none;stroke:#000000;visibility:visible" />
403 <path
404 d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
405 id="path919"
406 style="fill:none;stroke:#000000;visibility:visible" />
407 <path
408 d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
409 id="path931"
410 style="fill:none;stroke:#000000;visibility:visible" />
411 <text
412 id="text947"
413 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
414 <tspan
415 x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
416 y="5202"
417 id="tspan949">al_begin_io()</tspan>
418 </text>
419 <text
420 id="text963"
421 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
422 <tspan
423 x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
424 y="7331"
425 id="tspan965">al_complete_io()</tspan>
426 </text>
427 <text
428 id="text979"
429 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
430 <tspan
431 x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
432 y="17431"
433 id="tspan981">rs_begin_io()</tspan>
434 </text>
435 <text
436 id="text995"
437 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
438 <tspan
439 x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
440 y="22331"
441 id="tspan997">rs_complete_io()</tspan>
442 </text>
443 <text
444 id="text1011"
445 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
446 <tspan
447 x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
448 y="18402"
449 id="tspan1013">rs_begin_io()</tspan>
450 </text>
451 <text
452 id="text1027"
453 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
454 <tspan
455 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
456 y="20331"
457 id="tspan1029">rs_complete_io()</tspan>
458 </text>
459</svg>
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
new file mode 100644
index 000000000000..627b0a1bf35e
--- /dev/null
+++ b/Documentation/blockdev/drbd/README.txt
@@ -0,0 +1,16 @@
1Description
2
3 DRBD is a shared-nothing, synchronously replicated block device. It
4 is designed to serve as a building block for high availability
5 clusters and in this context, is a "drop-in" replacement for shared
6 storage. Simplistically, you could see it as a network RAID 1.
7
8 Please visit http://www.drbd.org to find out more.
9
10The files included here are intended to help in understanding the implementation
11
12DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
13 relate some of the functions to the data packets they send or handle.
14
15conn-states-8.dot, disk-states-8.dot, node-states-8.dot
16 The sub graphs of DRBD's state transitions
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot
new file mode 100644
index 000000000000..025e8cf5e64a
--- /dev/null
+++ b/Documentation/blockdev/drbd/conn-states-8.dot
@@ -0,0 +1,18 @@
1digraph conn_states {
2 StandAlone -> WFConnection [ label = "ioctl_set_net()" ]
3 WFConnection -> Unconnected [ label = "unable to bind()" ]
4 WFConnection -> WFReportParams [ label = "in connect() after accept" ]
5 WFReportParams -> StandAlone [ label = "checks in receive_param()" ]
6 WFReportParams -> Connected [ label = "in receive_param()" ]
7 WFReportParams -> WFBitMapS [ label = "sync_handshake()" ]
8 WFReportParams -> WFBitMapT [ label = "sync_handshake()" ]
9 WFBitMapS -> SyncSource [ label = "receive_bitmap()" ]
10 WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ]
11 SyncSource -> Connected
12 SyncTarget -> Connected
13 SyncSource -> PausedSyncS
14 SyncTarget -> PausedSyncT
15 PausedSyncS -> SyncSource
16 PausedSyncT -> SyncTarget
17 Connected -> WFConnection [ label = "* on network error" ]
18}
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot
new file mode 100644
index 000000000000..d06cfb46fb98
--- /dev/null
+++ b/Documentation/blockdev/drbd/disk-states-8.dot
@@ -0,0 +1,16 @@
1digraph disk_states {
2 Diskless -> Inconsistent [ label = "ioctl_set_disk()" ]
3 Diskless -> Consistent [ label = "ioctl_set_disk()" ]
4 Diskless -> Outdated [ label = "ioctl_set_disk()" ]
5 Consistent -> Outdated [ label = "receive_param()" ]
6 Consistent -> UpToDate [ label = "receive_param()" ]
7 Consistent -> Inconsistent [ label = "start resync" ]
8 Outdated -> Inconsistent [ label = "start resync" ]
9 UpToDate -> Inconsistent [ label = "ioctl_replicate" ]
10 Inconsistent -> UpToDate [ label = "resync completed" ]
11 Consistent -> Failed [ label = "io completion error" ]
12 Outdated -> Failed [ label = "io completion error" ]
13 UpToDate -> Failed [ label = "io completion error" ]
14 Inconsistent -> Failed [ label = "io completion error" ]
15 Failed -> Diskless [ label = "sending notify to peer" ]
16}
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
new file mode 100644
index 000000000000..6d9cf0a7b11d
--- /dev/null
+++ b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
@@ -0,0 +1,85 @@
1// vim: set sw=2 sts=2 :
2digraph {
3 rankdir=BT
4 bgcolor=white
5
6 node [shape=plaintext]
7 node [fontcolor=black]
8
9 StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]
10
11 node [fontcolor=lightgray]
12
13 Unconnected [ label=Unconnected ]
14
15 CommTrouble [ shape=record,
16 label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
17
18 node [fontcolor=gray]
19
20 subgraph cluster_try_connect {
21 label="try to connect, handshake"
22 rank=max
23 WFConnection [ label=WFConnection ]
24 WFReportParams [ label=WFReportParams ]
25 }
26
27 TearDown [ label=TearDown ]
28
29 Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
30
31 node [fontcolor=lightblue]
32
33 StartingSyncS [ label=StartingSyncS ]
34 StartingSyncT [ label=StartingSyncT ]
35
36 subgraph cluster_bitmap_exchange {
37 node [fontcolor=red]
38 fontcolor=red
39 label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
40
41 WFBitMapT [ label=WFBitMapT ]
42 WFSyncUUID [ label=WFSyncUUID ]
43 WFBitMapS [ label=WFBitMapS ]
44 }
45
46 node [fontcolor=blue]
47
48 cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
49
50 node [shape=box,fontcolor=black]
51
52 // drbdadm [label="drbdadm connect"]
53 // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
54 // comm_error [label="communication trouble"]
55
56 //
57 // edges
58 // --------------------------------------
59
60 StandAlone -> Unconnected [ label="drbdadm connect" ]
61 Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
62 Unconnected -> WFConnection [ label="receiver thread is started" ]
63 WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]
64
65 WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
66 WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
67
68 WFReportParams -> WFBitMapS
69 WFReportParams -> WFBitMapT
70 WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
71
72 WFBitMapS -> cluster_resync:S
73 WFSyncUUID -> cluster_resync:T
74
75 edge [color=green]
76 cluster_resync:any -> Connected [ label="resync done",fontcolor=green ]
77
78 edge [color=red]
79 WFReportParams -> CommTrouble
80 Connected -> CommTrouble
81 cluster_resync:any -> CommTrouble
82 edge [color=black]
83 CommTrouble -> Unconnected [label="receiver thread is stopped" ]
84
85}
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot
new file mode 100644
index 000000000000..4a2b00c23547
--- /dev/null
+++ b/Documentation/blockdev/drbd/node-states-8.dot
@@ -0,0 +1,14 @@
1digraph node_states {
2 Secondary -> Primary [ label = "ioctl_set_state()" ]
3 Primary -> Secondary [ label = "ioctl_set_state()" ]
4}
5
6digraph peer_states {
7 Secondary -> Primary [ label = "recv state packet" ]
8 Primary -> Secondary [ label = "recv state packet" ]
9 Primary -> Unknown [ label = "connection lost" ]
10 Secondary -> Unknown [ label = "connection lost" ]
11 Unknown -> Primary [ label = "connected" ]
12 Unknown -> Secondary [ label = "connected" ]
13}
14
diff --git a/MAINTAINERS b/MAINTAINERS
index c450f3abb8c9..ea56bd7a6cba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1758,6 +1758,19 @@ S: Maintained
1758F: drivers/scsi/dpt* 1758F: drivers/scsi/dpt*
1759F: drivers/scsi/dpt/ 1759F: drivers/scsi/dpt/
1760 1760
1761DRBD DRIVER
1762P: Philipp Reisner
1763P: Lars Ellenberg
1764M: drbd-dev@lists.linbit.com
1765L: drbd-user@lists.linbit.com
1766W: http://www.drbd.org
1767T: git git://git.drbd.org/linux-2.6-drbd.git drbd
1768T: git git://git.drbd.org/drbd-8.3.git
1769S: Supported
1770F: drivers/block/drbd/
1771F: lib/lru_cache.c
1772F: Documentation/blockdev/drbd/
1773
1761DRIVER CORE, KOBJECTS, AND SYSFS 1774DRIVER CORE, KOBJECTS, AND SYSFS
1762M: Greg Kroah-Hartman <gregkh@suse.de> 1775M: Greg Kroah-Hartman <gregkh@suse.de>
1763T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ 1776T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
271 instead, which can be configured to be on-disk compatible with the 271 instead, which can be configured to be on-disk compatible with the
272 cryptoloop device. 272 cryptoloop device.
273 273
274source "drivers/block/drbd/Kconfig"
275
274config BLK_DEV_NBD 276config BLK_DEV_NBD
275 tristate "Network block device support" 277 tristate "Network block device support"
276 depends on NET 278 depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
36obj-$(CONFIG_BLK_DEV_HD) += hd.o 36obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
39 40
40swim_mod-objs := swim.o swim_asm.o 41swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000000..4e6f90f487c2
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,82 @@
1#
2# DRBD device driver configuration
3#
4
5comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
6 depends on !PROC_FS || !INET || !CONNECTOR
7
8config BLK_DEV_DRBD
9 tristate "DRBD Distributed Replicated Block Device support"
10 depends on PROC_FS && INET && CONNECTOR
11 select LRU_CACHE
12 default n
13 help
14
15 NOTE: In order to authenticate connections you have to select
16 CRYPTO_HMAC and a hash function as well.
17
18 DRBD is a shared-nothing, synchronously replicated block device. It
19 is designed to serve as a building block for high availability
20 clusters and in this context, is a "drop-in" replacement for shared
21 storage. Simplistically, you could see it as a network RAID 1.
22
23 Each minor device has a role, which can be 'primary' or 'secondary'.
24 On the node with the primary device the application is supposed to
25 run and to access the device (/dev/drbdX). Every write is sent to
26 the local 'lower level block device' and, across the network, to the
27 node with the device in 'secondary' state. The secondary device
28 simply writes the data to its lower level block device.
29
30 DRBD can also be used in dual-Primary mode (device writable on both
31 nodes), which means it can exhibit shared disk semantics in a
32 shared-nothing cluster. Needless to say, on top of dual-Primary
33 DRBD utilizing a cluster file system is necessary to maintain
34 cache coherency.
35
36 For automatic failover you need a cluster manager (e.g. heartbeat).
37 See also: http://www.drbd.org/, http://www.linux-ha.org
38
39 If unsure, say N.
40
41config DRBD_TRACE
42 tristate "DRBD tracing"
43 depends on BLK_DEV_DRBD
44 select TRACEPOINTS
45 default n
46 help
47
48 Say Y here if you want to be able to trace various events in DRBD.
49
50 If unsure, say N.
51
52config DRBD_FAULT_INJECTION
53 bool "DRBD fault injection"
54 depends on BLK_DEV_DRBD
55 help
56
57 Say Y here if you want to simulate IO errors, in order to test DRBD's
58 behavior.
59
60 The actual simulation of IO errors is done by writing 3 values to
61 /sys/module/drbd/parameters/
62
63 enable_faults: bitmask of...
64 1 meta data write
65 2 meta data read
66 4 resync data write
67 8 resync data read
68 16 data write
69 32 data read
70 64 read ahead
71 128 kmalloc of bitmap
72 256 allocation of EE (epoch_entries)
73
74 fault_devs: bitmask of minor numbers
75 fault_rate: frequency in percent
76
77 Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
78 echo 16 > /sys/module/drbd/parameters/enable_faults
79 echo 1 > /sys/module/drbd/parameters/fault_devs
80 echo 5 > /sys/module/drbd/parameters/fault_rate
81
82 If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000000..7d86ef8a8b40
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
1drbd-y := drbd_bitmap.o drbd_proc.o
2drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
3drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
4
5drbd_trace-y := drbd_tracing.o
6
7obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
8obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000000..74b4835d3107
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1484 @@
1/*
2 drbd_actlog.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/slab.h>
27#include <linux/drbd.h>
28#include "drbd_int.h"
29#include "drbd_tracing.h"
30#include "drbd_wrappers.h"
31
32/* We maintain a trivial check sum in our on disk activity log.
33 * With that we can ensure correct operation even when the storage
34 * device might do a partial (last) sector write while loosing power.
35 */
36struct __packed al_transaction {
37 u32 magic;
38 u32 tr_number;
39 struct __packed {
40 u32 pos;
41 u32 extent; } updates[1 + AL_EXTENTS_PT];
42 u32 xor_sum;
43};
44
45struct update_odbm_work {
46 struct drbd_work w;
47 unsigned int enr;
48};
49
50struct update_al_work {
51 struct drbd_work w;
52 struct lc_element *al_ext;
53 struct completion event;
54 unsigned int enr;
55 /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
56 unsigned int old_enr;
57};
58
59struct drbd_atodb_wait {
60 atomic_t count;
61 struct completion io_done;
62 struct drbd_conf *mdev;
63 int error;
64};
65
66
67int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
68
/*
 * Variadic front end for the resync tracepoint.
 *
 * The actual tracepoint needs to have a constant number of known
 * arguments, so the variable arguments are wrapped into a va_list here
 * and handed on to trace__drbd_resync().
 */
void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	trace__drbd_resync(mdev, level, fmt, args);
	va_end(args);
}
79
80static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
81 struct drbd_backing_dev *bdev,
82 struct page *page, sector_t sector,
83 int rw, int size)
84{
85 struct bio *bio;
86 struct drbd_md_io md_io;
87 int ok;
88
89 md_io.mdev = mdev;
90 init_completion(&md_io.event);
91 md_io.error = 0;
92
93 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
94 rw |= (1 << BIO_RW_BARRIER);
95 rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
96
97 retry:
98 bio = bio_alloc(GFP_NOIO, 1);
99 bio->bi_bdev = bdev->md_bdev;
100 bio->bi_sector = sector;
101 ok = (bio_add_page(bio, page, size, 0) == size);
102 if (!ok)
103 goto out;
104 bio->bi_private = &md_io;
105 bio->bi_end_io = drbd_md_io_complete;
106 bio->bi_rw = rw;
107
108 trace_drbd_bio(mdev, "Md", bio, 0, NULL);
109
110 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
111 bio_endio(bio, -EIO);
112 else
113 submit_bio(rw, bio);
114 wait_for_completion(&md_io.event);
115 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
116
117 /* check for unsupported barrier op.
118 * would rather check on EOPNOTSUPP, but that is not reliable.
119 * don't try again for ANY return value != 0 */
120 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
121 /* Try again with no barrier */
122 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
123 set_bit(MD_NO_BARRIER, &mdev->flags);
124 rw &= ~(1 << BIO_RW_BARRIER);
125 bio_put(bio);
126 goto retry;
127 }
128 out:
129 bio_put(bio);
130 return ok;
131}
132
133int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
134 sector_t sector, int rw)
135{
136 int logical_block_size, mask, ok;
137 int offset = 0;
138 struct page *iop = mdev->md_io_page;
139
140 D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
141
142 BUG_ON(!bdev->md_bdev);
143
144 logical_block_size = bdev_logical_block_size(bdev->md_bdev);
145 if (logical_block_size == 0)
146 logical_block_size = MD_SECTOR_SIZE;
147
148 /* in case logical_block_size != 512 [ s390 only? ] */
149 if (logical_block_size != MD_SECTOR_SIZE) {
150 mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
151 D_ASSERT(mask == 1 || mask == 3 || mask == 7);
152 D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
153 offset = sector & mask;
154 sector = sector & ~mask;
155 iop = mdev->md_io_tmpp;
156
157 if (rw & WRITE) {
158 /* these are GFP_KERNEL pages, pre-allocated
159 * on device initialization */
160 void *p = page_address(mdev->md_io_page);
161 void *hp = page_address(mdev->md_io_tmpp);
162
163 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
164 READ, logical_block_size);
165
166 if (unlikely(!ok)) {
167 dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
168 "READ [logical_block_size!=512]) failed!\n",
169 (unsigned long long)sector);
170 return 0;
171 }
172
173 memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
174 }
175 }
176
177 if (sector < drbd_md_first_sector(bdev) ||
178 sector > drbd_md_last_sector(bdev))
179 dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
180 current->comm, current->pid, __func__,
181 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
182
183 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
184 if (unlikely(!ok)) {
185 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
186 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
187 return 0;
188 }
189
190 if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
191 void *p = page_address(mdev->md_io_page);
192 void *hp = page_address(mdev->md_io_tmpp);
193
194 memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
195 }
196
197 return ok;
198}
199
200static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
201{
202 struct lc_element *al_ext;
203 struct lc_element *tmp;
204 unsigned long al_flags = 0;
205
206 spin_lock_irq(&mdev->al_lock);
207 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
208 if (unlikely(tmp != NULL)) {
209 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
210 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
211 spin_unlock_irq(&mdev->al_lock);
212 return NULL;
213 }
214 }
215 al_ext = lc_get(mdev->act_log, enr);
216 al_flags = mdev->act_log->flags;
217 spin_unlock_irq(&mdev->al_lock);
218
219 /*
220 if (!al_ext) {
221 if (al_flags & LC_STARVING)
222 dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
223 if (al_flags & LC_DIRTY)
224 dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
225 }
226 */
227
228 return al_ext;
229}
230
231void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
232{
233 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
234 struct lc_element *al_ext;
235 struct update_al_work al_work;
236
237 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
238
239 trace_drbd_actlog(mdev, sector, "al_begin_io");
240
241 wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
242
243 if (al_ext->lc_number != enr) {
244 /* drbd_al_write_transaction(mdev,al_ext,enr);
245 * recurses into generic_make_request(), which
246 * disallows recursion, bios being serialized on the
247 * current->bio_tail list now.
248 * we have to delegate updates to the activity log
249 * to the worker thread. */
250 init_completion(&al_work.event);
251 al_work.al_ext = al_ext;
252 al_work.enr = enr;
253 al_work.old_enr = al_ext->lc_number;
254 al_work.w.cb = w_al_write_transaction;
255 drbd_queue_work_front(&mdev->data.work, &al_work.w);
256 wait_for_completion(&al_work.event);
257
258 mdev->al_writ_cnt++;
259
260 spin_lock_irq(&mdev->al_lock);
261 lc_changed(mdev->act_log, al_ext);
262 spin_unlock_irq(&mdev->al_lock);
263 wake_up(&mdev->al_wait);
264 }
265}
266
267void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
268{
269 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
270 struct lc_element *extent;
271 unsigned long flags;
272
273 trace_drbd_actlog(mdev, sector, "al_complete_io");
274
275 spin_lock_irqsave(&mdev->al_lock, flags);
276
277 extent = lc_find(mdev->act_log, enr);
278
279 if (!extent) {
280 spin_unlock_irqrestore(&mdev->al_lock, flags);
281 dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
282 return;
283 }
284
285 if (lc_put(mdev->act_log, extent) == 0)
286 wake_up(&mdev->al_wait);
287
288 spin_unlock_irqrestore(&mdev->al_lock, flags);
289}
290
291int
292w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
293{
294 struct update_al_work *aw = container_of(w, struct update_al_work, w);
295 struct lc_element *updated = aw->al_ext;
296 const unsigned int new_enr = aw->enr;
297 const unsigned int evicted = aw->old_enr;
298 struct al_transaction *buffer;
299 sector_t sector;
300 int i, n, mx;
301 unsigned int extent_nr;
302 u32 xor_sum = 0;
303
304 if (!get_ldev(mdev)) {
305 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
306 complete(&((struct update_al_work *)w)->event);
307 return 1;
308 }
309 /* do we have to do a bitmap write, first?
310 * TODO reduce maximum latency:
311 * submit both bios, then wait for both,
312 * instead of doing two synchronous sector writes. */
313 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
314 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
315
316 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
317 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
318
319 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
320 buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
321
322 n = lc_index_of(mdev->act_log, updated);
323
324 buffer->updates[0].pos = cpu_to_be32(n);
325 buffer->updates[0].extent = cpu_to_be32(new_enr);
326
327 xor_sum ^= new_enr;
328
329 mx = min_t(int, AL_EXTENTS_PT,
330 mdev->act_log->nr_elements - mdev->al_tr_cycle);
331 for (i = 0; i < mx; i++) {
332 unsigned idx = mdev->al_tr_cycle + i;
333 extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
334 buffer->updates[i+1].pos = cpu_to_be32(idx);
335 buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
336 xor_sum ^= extent_nr;
337 }
338 for (; i < AL_EXTENTS_PT; i++) {
339 buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
340 buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
341 xor_sum ^= LC_FREE;
342 }
343 mdev->al_tr_cycle += AL_EXTENTS_PT;
344 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
345 mdev->al_tr_cycle = 0;
346
347 buffer->xor_sum = cpu_to_be32(xor_sum);
348
349 sector = mdev->ldev->md.md_offset
350 + mdev->ldev->md.al_offset + mdev->al_tr_pos;
351
352 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
353 drbd_chk_io_error(mdev, 1, TRUE);
354
355 if (++mdev->al_tr_pos >
356 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
357 mdev->al_tr_pos = 0;
358
359 D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
360 mdev->al_tr_number++;
361
362 mutex_unlock(&mdev->md_io_mutex);
363
364 complete(&((struct update_al_work *)w)->event);
365 put_ldev(mdev);
366
367 return 1;
368}
369
370/**
371 * drbd_al_read_tr() - Read a single transaction from the on disk activity log
372 * @mdev: DRBD device.
373 * @bdev: Block device to read form.
374 * @b: pointer to an al_transaction.
375 * @index: On disk slot of the transaction to read.
376 *
377 * Returns -1 on IO error, 0 on checksum error and 1 upon success.
378 */
379static int drbd_al_read_tr(struct drbd_conf *mdev,
380 struct drbd_backing_dev *bdev,
381 struct al_transaction *b,
382 int index)
383{
384 sector_t sector;
385 int rv, i;
386 u32 xor_sum = 0;
387
388 sector = bdev->md.md_offset + bdev->md.al_offset + index;
389
390 /* Dont process error normally,
391 * as this is done before disk is attached! */
392 if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
393 return -1;
394
395 rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
396
397 for (i = 0; i < AL_EXTENTS_PT + 1; i++)
398 xor_sum ^= be32_to_cpu(b->updates[i].extent);
399 rv &= (xor_sum == be32_to_cpu(b->xor_sum));
400
401 return rv;
402}
403
404/**
405 * drbd_al_read_log() - Restores the activity log from its on disk representation.
406 * @mdev: DRBD device.
407 * @bdev: Block device to read from.
408 *
 * First pass: every on disk slot 0..mx is read and validated; the slots that
 * hold the lowest and the highest transaction number are remembered as
 * "from" and "to".
 * Second pass: the transactions from "from" to "to" (wrapping to slot 0 after
 * slot mx) are replayed into mdev->act_log with lc_set().
 * Finally al_tr_number and al_tr_pos are set so that the next transaction
 * written continues after the newest one found.
 *
409 * Returns 1 on success (also when no valid transaction was found at all),
 * returns 0 when reading the log failed due to IO errors.
410 */
411int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
412{
413 struct al_transaction *buffer;
414 int i;
415 int rv;
416 int mx;
417 int active_extents = 0;
418 int transactions = 0;
419 int found_valid = 0;
420 int from = 0;
421 int to = 0;
422 u32 from_tnr = 0;
423 u32 to_tnr = 0;
424 u32 cnr;
425
 /* highest slot index used; slots 0..mx are scanned below */
426 mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
427
428 /* lock out all other meta data io for now,
429 * and make sure the page is mapped.
430 */
431 mutex_lock(&mdev->md_io_mutex);
432 buffer = page_address(mdev->md_io_page);
433
434 /* Find the valid transaction in the log */
435 for (i = 0; i <= mx; i++) {
436 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
 /* rv == 0: magic or checksum mismatch, ignore this slot */
437 if (rv == 0)
438 continue;
439 if (rv == -1) {
440 mutex_unlock(&mdev->md_io_mutex);
441 return 0;
442 }
443 cnr = be32_to_cpu(buffer->tr_number);
444
 /* the first valid slot initializes both ends of the range */
445 if (++found_valid == 1) {
446 from = i;
447 to = i;
448 from_tnr = cnr;
449 to_tnr = cnr;
450 continue;
451 }
 /* The transaction numbers are compared via their signed difference;
  * presumably so that the ordering survives a wrap of tr_number. */
452 if ((int)cnr - (int)from_tnr < 0) {
453 D_ASSERT(from_tnr - cnr + i - from == mx+1);
454 from = i;
455 from_tnr = cnr;
456 }
457 if ((int)cnr - (int)to_tnr > 0) {
458 D_ASSERT(cnr - to_tnr == i - to);
459 to = i;
460 to_tnr = cnr;
461 }
462 }
463
464 if (!found_valid) {
465 dev_warn(DEV, "No usable activity log found.\n");
466 mutex_unlock(&mdev->md_io_mutex);
467 return 1;
468 }
469
470 /* Read the valid transactions.
471 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
472 i = from;
473 while (1) {
474 int j, pos;
475 unsigned int extent_nr;
476 unsigned int trn;
477
478 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
 /* a slot that does not validate in this pass is skipped, not replayed */
479 ERR_IF(rv == 0) goto cancel;
480 if (rv == -1) {
481 mutex_unlock(&mdev->md_io_mutex);
482 return 0;
483 }
484
 /* NOTE(review): trn is assigned but not used further in this function */
485 trn = be32_to_cpu(buffer->tr_number);
486
487 spin_lock_irq(&mdev->al_lock);
488
489 /* This loop runs backwards because in the cyclic
490 elements there might be an old version of the
491 updated element (in slot 0). So the element in slot 0
492 can overwrite old versions. */
493 for (j = AL_EXTENTS_PT; j >= 0; j--) {
494 pos = be32_to_cpu(buffer->updates[j].pos);
495 extent_nr = be32_to_cpu(buffer->updates[j].extent);
496
497 if (extent_nr == LC_FREE)
498 continue;
499
500 lc_set(mdev->act_log, extent_nr, pos);
501 active_extents++;
502 }
503 spin_unlock_irq(&mdev->al_lock);
504
505 transactions++;
506
507cancel:
 /* advance to the next slot, wrapping around after slot mx,
  * until the newest transaction ("to") has been handled */
508 if (i == to)
509 break;
510 i++;
511 if (i > mx)
512 i = 0;
513 }
514
 /* continue numbering and slot position right after the newest transaction */
515 mdev->al_tr_number = to_tnr+1;
516 mdev->al_tr_pos = to;
517 if (++mdev->al_tr_pos >
518 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
519 mdev->al_tr_pos = 0;
520
521 /* ok, we are done with it */
522 mutex_unlock(&mdev->md_io_mutex);
523
524 dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
525 transactions, active_extents);
526
527 return 1;
528}
529
530static void atodb_endio(struct bio *bio, int error)
531{
532 struct drbd_atodb_wait *wc = bio->bi_private;
533 struct drbd_conf *mdev = wc->mdev;
534 struct page *page;
535 int uptodate = bio_flagged(bio, BIO_UPTODATE);
536
537 /* strange behavior of some lower level drivers...
538 * fail the request by clearing the uptodate flag,
539 * but do not return any error?! */
540 if (!error && !uptodate)
541 error = -EIO;
542
543 drbd_chk_io_error(mdev, error, TRUE);
544 if (error && wc->error == 0)
545 wc->error = error;
546
547 if (atomic_dec_and_test(&wc->count))
548 complete(&wc->io_done);
549
550 page = bio->bi_io_vec[0].bv_page;
551 put_page(page);
552 bio_put(bio);
553 mdev->bm_writ_cnt++;
554 put_ldev(mdev);
555}
556
557#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
/* S2W: presumably "sector to words" -- below it converts an on disk bitmap
 * sector number into an offset counted in bitmap words (it is compared
 * against drbd_bm_words()). */
558/* activity log to on disk bitmap -- prepare bio unless that sector
559 * is already covered by previously prepared bios.
 *
 * @mdev: DRBD device.
 * @bios: array of prepared bios, initialized to all NULL, filled front to back.
 * @enr:  number of the bitmap sector, relative to md_offset + bm_offset.
 * @wc:   shared wait context; becomes bi_private of the new bio.
 *
 * Consecutive bios share a page: each new bio uses the MD_SECTOR_SIZE bytes
 * following the segment of the previously prepared bio, and a fresh page is
 * allocated only when the previous page is used up.
 *
 * On success wc->count and mdev->local_cnt are both incremented once for the
 * new bio (atodb_endio() undoes that).
 *
 * Returns 0 if the sector is already covered or a bio was prepared,
 * -ENOMEM if bio or page allocation failed, -EINVAL if bio_add_page() failed.
 */
560static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
561 struct bio **bios,
562 unsigned int enr,
563 struct drbd_atodb_wait *wc) __must_hold(local)
564{
565 struct bio *bio;
566 struct page *page;
567 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
568 + mdev->ldev->md.bm_offset;
 /* PAGE_SIZE means "no room left", which forces a page allocation
  * for the very first bio (i == 0) */
569 unsigned int page_offset = PAGE_SIZE;
570 int offset;
571 int i = 0;
572 int err = -ENOMEM;
573
574 /* Check if that enr is already covered by an already created bio.
575 * Caution, bios[] is not NULL terminated,
576 * but only initialized to all NULL.
577 * For completely scattered activity log,
578 * the last invocation iterates over all bios,
579 * and finds the last NULL entry.
580 */
581 while ((bio = bios[i])) {
582 if (bio->bi_sector == on_disk_sector)
583 return 0;
584 i++;
585 }
586 /* bios[i] == NULL, the next not yet used slot */
587
588 /* GFP_KERNEL, we are not in the write-out path */
589 bio = bio_alloc(GFP_KERNEL, 1);
590 if (bio == NULL)
591 return -ENOMEM;
592
 /* continue in the page of the previous bio, right behind its segment */
593 if (i > 0) {
594 const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
595 page_offset = prev_bv->bv_offset + prev_bv->bv_len;
596 page = prev_bv->bv_page;
597 }
598 if (page_offset == PAGE_SIZE) {
 /* NOTE(review): only the __GFP_HIGHMEM modifier is passed here, unlike
  * the GFP_KERNEL used for bio_alloc() above -- confirm this is intended */
599 page = alloc_page(__GFP_HIGHMEM);
600 if (page == NULL)
601 goto out_bio_put;
602 page_offset = 0;
603 } else {
 /* shared page: every bio holds its own reference, dropped in atodb_endio() */
604 get_page(page);
605 }
606
 /* copy the bitmap words of this sector (at most S2W(1), less at the end
  * of the bitmap) into the page */
607 offset = S2W(enr);
608 drbd_bm_get_lel(mdev, offset,
609 min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
610 kmap(page) + page_offset);
611 kunmap(page);
612
613 bio->bi_private = wc;
614 bio->bi_end_io = atodb_endio;
615 bio->bi_bdev = mdev->ldev->md_bdev;
616 bio->bi_sector = on_disk_sector;
617
618 if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
619 goto out_put_page;
620
621 atomic_inc(&wc->count);
622 /* we already know that we may do this...
623 * get_ldev_if_state(mdev,D_ATTACHING);
624 * just get the extra reference, so that the local_cnt reflects
625 * the number of pending IO requests DRBD at its backing device.
626 */
627 atomic_inc(&mdev->local_cnt);
628
629 bios[i] = bio;
630
631 return 0;
632
633out_put_page:
634 err = -EINVAL;
635 put_page(page);
636out_bio_put:
637 bio_put(bio);
638 return err;
639}
640
641/**
642 * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
643 * @mdev: DRBD device.
644 *
645 * Called when we detach (unconfigure) local storage,
646 * or when we go from R_PRIMARY to R_SECONDARY role.
647 */
648void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
649{
650 int i, nr_elements;
651 unsigned int enr;
652 struct bio **bios;
653 struct drbd_atodb_wait wc;
654
655 ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
656 return; /* sorry, I don't have any act_log etc... */
657
658 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
659
660 nr_elements = mdev->act_log->nr_elements;
661
662 /* GFP_KERNEL, we are not in anyone's write-out path */
663 bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
664 if (!bios)
665 goto submit_one_by_one;
666
667 atomic_set(&wc.count, 0);
668 init_completion(&wc.io_done);
669 wc.mdev = mdev;
670 wc.error = 0;
671
672 for (i = 0; i < nr_elements; i++) {
673 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
674 if (enr == LC_FREE)
675 continue;
676 /* next statement also does atomic_inc wc.count and local_cnt */
677 if (atodb_prepare_unless_covered(mdev, bios,
678 enr/AL_EXT_PER_BM_SECT,
679 &wc))
680 goto free_bios_submit_one_by_one;
681 }
682
683 /* unnecessary optimization? */
684 lc_unlock(mdev->act_log);
685 wake_up(&mdev->al_wait);
686
687 /* all prepared, submit them */
688 for (i = 0; i < nr_elements; i++) {
689 if (bios[i] == NULL)
690 break;
691 if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
692 bios[i]->bi_rw = WRITE;
693 bio_endio(bios[i], -EIO);
694 } else {
695 submit_bio(WRITE, bios[i]);
696 }
697 }
698
699 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
700
701 /* always (try to) flush bitmap to stable storage */
702 drbd_md_flush(mdev);
703
704 /* In case we did not submit a single IO do not wait for
705 * them to complete. ( Because we would wait forever here. )
706 *
707 * In case we had IOs and they are already complete, there
708 * is not point in waiting anyways.
709 * Therefore this if () ... */
710 if (atomic_read(&wc.count))
711 wait_for_completion(&wc.io_done);
712
713 put_ldev(mdev);
714
715 kfree(bios);
716 return;
717
718 free_bios_submit_one_by_one:
719 /* free everything by calling the endio callback directly. */
720 for (i = 0; i < nr_elements && bios[i]; i++)
721 bio_endio(bios[i], 0);
722
723 kfree(bios);
724
725 submit_one_by_one:
726 dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
727
728 for (i = 0; i < mdev->act_log->nr_elements; i++) {
729 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
730 if (enr == LC_FREE)
731 continue;
732 /* Really slow: if we have al-extents 16..19 active,
733 * sector 4 will be written four times! Synchronous! */
734 drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
735 }
736
737 lc_unlock(mdev->act_log);
738 wake_up(&mdev->al_wait);
739 put_ldev(mdev);
740}
741
742/**
743 * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
744 * @mdev: DRBD device.
745 */
746void drbd_al_apply_to_bm(struct drbd_conf *mdev)
747{
748 unsigned int enr;
749 unsigned long add = 0;
750 char ppb[10];
751 int i;
752
753 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
754
755 for (i = 0; i < mdev->act_log->nr_elements; i++) {
756 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
757 if (enr == LC_FREE)
758 continue;
759 add += drbd_bm_ALe_set_all(mdev, enr);
760 }
761
762 lc_unlock(mdev->act_log);
763 wake_up(&mdev->al_wait);
764
765 dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
766 ppsize(ppb, Bit2KB(add)));
767}
768
769static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
770{
771 int rv;
772
773 spin_lock_irq(&mdev->al_lock);
774 rv = (al_ext->refcnt == 0);
775 if (likely(rv))
776 lc_del(mdev->act_log, al_ext);
777 spin_unlock_irq(&mdev->al_lock);
778
779 return rv;
780}
781
782/**
783 * drbd_al_shrink() - Removes all active extents form the activity log
784 * @mdev: DRBD device.
785 *
786 * Removes all active extents form the activity log, waiting until
787 * the reference count of each entry dropped to 0 first, of course.
788 *
789 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
790 */
791void drbd_al_shrink(struct drbd_conf *mdev)
792{
793 struct lc_element *al_ext;
794 int i;
795
796 D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
797
798 for (i = 0; i < mdev->act_log->nr_elements; i++) {
799 al_ext = lc_element_by_index(mdev->act_log, i);
800 if (al_ext->lc_number == LC_FREE)
801 continue;
802 wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
803 }
804
805 wake_up(&mdev->al_wait);
806}
807
808static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
809{
810 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
811
812 if (!get_ldev(mdev)) {
813 if (__ratelimit(&drbd_ratelimit_state))
814 dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
815 kfree(udw);
816 return 1;
817 }
818
819 drbd_bm_write_sect(mdev, udw->enr);
820 put_ldev(mdev);
821
822 kfree(udw);
823
824 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
825 switch (mdev->state.conn) {
826 case C_SYNC_SOURCE: case C_SYNC_TARGET:
827 case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
828 drbd_resync_finished(mdev);
829 default:
830 /* nothing to do */
831 break;
832 }
833 }
834 drbd_bcast_sync_progress(mdev);
835
836 return 1;
837}
838
839
840/* ATTENTION. The AL's extents are 4MB each, while the extents in the
841 * resync LRU-cache are 16MB each.
842 * The caller of this function has to hold an get_ldev() reference.
 * Both callers in this file also hold mdev->al_lock while calling.
843 *
 * Accounts @count bitmap bits of the resync extent containing @sector
 * either as done (@success != 0: rs_left -= count) or as failed
 * (@success == 0: rs_failed += count). Once rs_left equals rs_failed,
 * a w_update_odbm work item is queued to write that part of the bitmap;
 * if the work item cannot be allocated, WRITE_BM_AFTER_RESYNC is set instead.
 *
844 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
845 */
846static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
847 int count, int success)
848{
849 struct lc_element *e;
850 struct update_odbm_work *udw;
851
852 unsigned int enr;
853
854 D_ASSERT(atomic_read(&mdev->local_cnt));
855
856 /* I simply assume that a sector/size pair never crosses
857 * a 16 MB extent border. (Currently this is true...) */
858 enr = BM_SECT_TO_EXT(sector);
859
860 e = lc_get(mdev->resync, enr);
861 if (e) {
862 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
 /* lc_number == enr: the element already tracks this extent */
863 if (ext->lce.lc_number == enr) {
864 if (success)
865 ext->rs_left -= count;
866 else
867 ext->rs_failed += count;
 /* inconsistent accounting: log, drop our reference and force a disconnect */
868 if (ext->rs_left < ext->rs_failed) {
869 dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
870 "rs_failed=%d count=%d\n",
871 (unsigned long long)sector,
872 ext->lce.lc_number, ext->rs_left,
873 ext->rs_failed, count);
874 dump_stack();
875
876 lc_put(mdev->resync, &ext->lce);
877 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
878 return;
879 }
880 } else {
881 /* Normally this element should be in the cache,
882 * since drbd_rs_begin_io() pulled it already in.
883 *
884 * But maybe an application write finished, and we set
885 * something outside the resync lru_cache in sync.
886 */
 /* the element is re-labelled to enr below (lc_changed), so its
  * counters are re-initialized from the bitmap */
887 int rs_left = drbd_bm_e_weight(mdev, enr);
888 if (ext->flags != 0) {
889 dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
890 " -> %d[%u;00]\n",
891 ext->lce.lc_number, ext->rs_left,
892 ext->flags, enr, rs_left);
893 ext->flags = 0;
894 }
895 if (ext->rs_failed) {
896 dev_warn(DEV, "Kicking resync_lru element enr=%u "
897 "out with rs_failed=%d\n",
898 ext->lce.lc_number, ext->rs_failed);
899 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
900 }
901 ext->rs_left = rs_left;
902 ext->rs_failed = success ? 0 : count;
903 lc_changed(mdev->resync, &ext->lce);
904 }
905 lc_put(mdev->resync, &ext->lce);
906 /* no race, we are within the al_lock! */
907
 /* nothing left to resync in this extent apart from the failed bits:
  * schedule the on disk bitmap update */
908 if (ext->rs_left == ext->rs_failed) {
909 ext->rs_failed = 0;
910
 /* GFP_ATOMIC: allocation must not sleep here (callers hold al_lock) */
911 udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
912 if (udw) {
913 udw->enr = ext->lce.lc_number;
914 udw->w.cb = w_update_odbm;
915 drbd_queue_work_front(&mdev->data.work, &udw->w);
916 } else {
917 dev_warn(DEV, "Could not kmalloc an udw\n");
918 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
919 }
920 }
921 } else {
922 dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
923 mdev->resync_locked,
924 mdev->resync->nr_elements,
925 mdev->resync->flags);
926 }
927}
928
929/* clear the bit corresponding to the piece of storage in question:
930 * size byte of data starting from sector. Only clears the bits of the affected
931 * one or more _aligned_ BM_BLOCK_SIZE blocks.
932 *
933 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
934 *
 * @mdev:   DRBD device.
 * @sector: first sector of the range.
 * @size:   length of the range in bytes; must be a positive multiple of 512
 *          and at most DRBD_MAX_SEGMENT_SIZE, otherwise the call is rejected.
 * @file, @line: not used in the function body.
935 */
936void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
937 const char *file, const unsigned int line)
938{
939 /* Is called from worker and receiver context _only_ */
940 unsigned long sbnr, ebnr, lbnr;
941 unsigned long count = 0;
942 sector_t esector, nr_sectors;
943 int wake_up = 0;
944 unsigned long flags;
945
946 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
947 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
948 (unsigned long long)sector, size);
949 return;
950 }
951 nr_sectors = drbd_get_capacity(mdev->this_bdev);
 /* last sector of the range (inclusive) */
952 esector = sector + (size >> 9) - 1;
953
954 ERR_IF(sector >= nr_sectors) return;
955 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
956
 /* bit number of the last sector of the device */
957 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
958
959 /* we clear it (in sync).
960 * round up start sector, round down end sector. we make sure we only
961 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
962 if (unlikely(esector < BM_SECT_PER_BIT-1))
963 return;
 /* a range reaching the very end of the device includes the last bit,
  * even if it does not cover a full block */
964 if (unlikely(esector == (nr_sectors-1)))
965 ebnr = lbnr;
966 else
967 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
968 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
969
970 trace_drbd_resync(mdev, TRACE_LVL_METRICS,
971 "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
972 (unsigned long long)sector, size, sbnr, ebnr);
973
 /* the range does not contain a single complete block */
974 if (sbnr > ebnr)
975 return;
976
977 /*
978 * ok, (capacity & 7) != 0 sometimes, but who cares...
979 * we count rs_{total,left} in bits, not sectors.
980 */
981 spin_lock_irqsave(&mdev->al_lock, flags);
982 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
983 if (count) {
984 /* we need the lock for drbd_try_clear_on_disk_bm */
 /* refresh the resync progress mark at most every 10 seconds */
985 if (jiffies - mdev->rs_mark_time > HZ*10) {
986 /* should be rolling marks,
987 * but we estimate only anyways. */
988 if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
989 mdev->state.conn != C_PAUSED_SYNC_T &&
990 mdev->state.conn != C_PAUSED_SYNC_S) {
991 mdev->rs_mark_time = jiffies;
992 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
993 }
994 }
995 if (get_ldev(mdev)) {
996 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
997 put_ldev(mdev);
998 }
999 /* just wake_up unconditional now, various lc_chaged(),
1000 * lc_put() in drbd_try_clear_on_disk_bm(). */
1001 wake_up = 1;
1002 }
1003 spin_unlock_irqrestore(&mdev->al_lock, flags);
 /* the wake up itself is done after al_lock has been released */
1004 if (wake_up)
1005 wake_up(&mdev->al_wait);
1006}
1007
1008/*
1009 * this is intended to set one request worth of data out of sync.
1010 * affects at least 1 bit,
1011 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
1012 *
1013 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
1014 * so this can be _any_ process.
1015 */
1016void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
1017 const char *file, const unsigned int line)
1018{
1019 unsigned long sbnr, ebnr, lbnr, flags;
1020 sector_t esector, nr_sectors;
1021 unsigned int enr, count;
1022 struct lc_element *e;
1023
1024 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1025 dev_err(DEV, "sector: %llus, size: %d\n",
1026 (unsigned long long)sector, size);
1027 return;
1028 }
1029
1030 if (!get_ldev(mdev))
1031 return; /* no disk, no metadata, no bitmap to set bits in */
1032
1033 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1034 esector = sector + (size >> 9) - 1;
1035
1036 ERR_IF(sector >= nr_sectors)
1037 goto out;
1038 ERR_IF(esector >= nr_sectors)
1039 esector = (nr_sectors-1);
1040
1041 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1042
1043 /* we set it out of sync,
1044 * we do not need to round anything here */
1045 sbnr = BM_SECT_TO_BIT(sector);
1046 ebnr = BM_SECT_TO_BIT(esector);
1047
1048 trace_drbd_resync(mdev, TRACE_LVL_METRICS,
1049 "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
1050 (unsigned long long)sector, size, sbnr, ebnr);
1051
1052 /* ok, (capacity & 7) != 0 sometimes, but who cares...
1053 * we count rs_{total,left} in bits, not sectors. */
1054 spin_lock_irqsave(&mdev->al_lock, flags);
1055 count = drbd_bm_set_bits(mdev, sbnr, ebnr);
1056
1057 enr = BM_SECT_TO_EXT(sector);
1058 e = lc_find(mdev->resync, enr);
1059 if (e)
1060 lc_entry(e, struct bm_extent, lce)->rs_left += count;
1061 spin_unlock_irqrestore(&mdev->al_lock, flags);
1062
1063out:
1064 put_ldev(mdev);
1065}
1066
/* Get a reference on the resync LRU element for extent enr and mark it
 * BME_NO_WRITES.
 *
 * Returns NULL without trying if more than half of the resync LRU elements
 * are already locked, or if lc_get() does not hand out an element.
 * If the element returned by lc_get() was labelled with a different extent
 * number, its rs_left/rs_failed counters are re-initialized and the change
 * is committed with lc_changed().
 * Used as the wait_event_interruptible() condition in drbd_rs_begin_io().
 */
1067static
1068struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
1069{
1070 struct lc_element *e;
1071 struct bm_extent *bm_ext;
1072 int wakeup = 0;
1073 unsigned long rs_flags;
1074
1075 spin_lock_irq(&mdev->al_lock);
1076 if (mdev->resync_locked > mdev->resync->nr_elements/2) {
1077 spin_unlock_irq(&mdev->al_lock);
1078 return NULL;
1079 }
1080 e = lc_get(mdev->resync, enr);
1081 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1082 if (bm_ext) {
1083 if (bm_ext->lce.lc_number != enr) {
1084 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1085 bm_ext->rs_failed = 0;
1086 lc_changed(mdev->resync, &bm_ext->lce);
1087 wakeup = 1;
1088 }
 /* first reference on this element: count it as locked */
1089 if (bm_ext->lce.refcnt == 1)
1090 mdev->resync_locked++;
1091 set_bit(BME_NO_WRITES, &bm_ext->flags);
1092 }
 /* snapshot the LRU flags under the lock; they are evaluated after unlocking */
1093 rs_flags = mdev->resync->flags;
1094 spin_unlock_irq(&mdev->al_lock);
1095 if (wakeup)
1096 wake_up(&mdev->al_wait);
1097
1098 if (!bm_ext) {
1099 if (rs_flags & LC_STARVING)
1100 dev_warn(DEV, "Have to wait for element"
1101 " (resync LRU too small?)\n");
1102 BUG_ON(rs_flags & LC_DIRTY);
1103 }
1104
1105 return bm_ext;
1106}
1107
1108static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
1109{
1110 struct lc_element *al_ext;
1111 int rv = 0;
1112
1113 spin_lock_irq(&mdev->al_lock);
1114 if (unlikely(enr == mdev->act_log->new_number))
1115 rv = 1;
1116 else {
1117 al_ext = lc_find(mdev->act_log, enr);
1118 if (al_ext) {
1119 if (al_ext->refcnt)
1120 rv = 1;
1121 }
1122 }
1123 spin_unlock_irq(&mdev->al_lock);
1124
1125 /*
1126 if (unlikely(rv)) {
1127 dev_info(DEV, "Delaying sync read until app's write is done\n");
1128 }
1129 */
1130 return rv;
1131}
1132
1133/**
1134 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
1135 * @mdev: DRBD device.
1136 * @sector: The sector number.
1137 *
 * First waits until _bme_get() hands out the resync extent (which marks it
 * BME_NO_WRITES), then waits until none of the AL_EXT_PER_BM_SECT activity
 * log extents covered by it is in use any more, and only then sets BME_LOCKED.
 *
1138 * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
1139 */
1140int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1141{
1142 unsigned int enr = BM_SECT_TO_EXT(sector);
1143 struct bm_extent *bm_ext;
1144 int i, sig;
1145
1146 trace_drbd_resync(mdev, TRACE_LVL_ALL,
1147 "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n",
1148 (unsigned long long)sector, enr);
1149
1150 sig = wait_event_interruptible(mdev->al_wait,
1151 (bm_ext = _bme_get(mdev, enr)));
1152 if (sig)
1153 return 0;
1154
 /* already locked, nothing more to wait for */
1155 if (test_bit(BME_LOCKED, &bm_ext->flags))
1156 return 1;
1157
1158 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1159 sig = wait_event_interruptible(mdev->al_wait,
1160 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
 /* interrupted: give back the reference taken by _bme_get(); if it was
  * the last one, also undo BME_NO_WRITES and the resync_locked count */
1161 if (sig) {
1162 spin_lock_irq(&mdev->al_lock);
1163 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1164 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1165 mdev->resync_locked--;
1166 wake_up(&mdev->al_wait);
1167 }
1168 spin_unlock_irq(&mdev->al_lock);
1169 return 0;
1170 }
1171 }
1172
1173 set_bit(BME_LOCKED, &bm_ext->flags);
1174
1175 return 1;
1176}
1177
1178/**
1179 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
1180 * @mdev: DRBD device.
1181 * @sector: The sector number.
1182 *
1183 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
1184 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
1185 * if there is still application IO going on in this area.
 *
 * -EAGAIN is also returned when too many resync extents are locked already
 * or when lc_get() does not hand out an element.
 * When -EAGAIN is returned while a reference on the extent is held, the
 * extent number is remembered in mdev->resync_wenr, so that a later call
 * can either reuse that reference (same extent) or drop it (other extent).
1186 */
1187int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1188{
1189 unsigned int enr = BM_SECT_TO_EXT(sector);
 /* first activity log extent covered by resync extent enr */
1190 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
1191 struct lc_element *e;
1192 struct bm_extent *bm_ext;
1193 int i;
1194
1195 trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n",
1196 (unsigned long long)sector);
1197
1198 spin_lock_irq(&mdev->al_lock);
1199 if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
1200 /* in case you have very heavy scattered io, it may
1201 * stall the syncer undefined if we give up the ref count
1202 * when we try again and requeue.
1203 *
1204 * if we don't give up the refcount, but the next time
1205 * we are scheduled this extent has been "synced" by new
1206 * application writes, we'd miss the lc_put on the
1207 * extent we keep the refcount on.
1208 * so we remembered which extent we had to try again, and
1209 * if the next requested one is something else, we do
1210 * the lc_put here...
1211 * we also have to wake_up
1212 */
1213
1214 trace_drbd_resync(mdev, TRACE_LVL_ALL,
1215 "dropping %u, apparently got 'synced' by application io\n",
1216 mdev->resync_wenr);
1217
1218 e = lc_find(mdev->resync, mdev->resync_wenr);
1219 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1220 if (bm_ext) {
1221 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1222 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1223 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1224 mdev->resync_wenr = LC_FREE;
1225 if (lc_put(mdev->resync, &bm_ext->lce) == 0)
1226 mdev->resync_locked--;
1227 wake_up(&mdev->al_wait);
1228 } else {
1229 dev_alert(DEV, "LOGIC BUG\n");
1230 }
1231 }
1232 /* TRY. */
1233 e = lc_try_get(mdev->resync, enr);
1234 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1235 if (bm_ext) {
1236 if (test_bit(BME_LOCKED, &bm_ext->flags))
1237 goto proceed;
 /* BME_NO_WRITES newly set by us: this extent now counts as locked */
1238 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
1239 mdev->resync_locked++;
1240 } else {
1241 /* we did set the BME_NO_WRITES,
1242 * but then could not set BME_LOCKED,
1243 * so we tried again.
1244 * drop the extra reference. */
1245 trace_drbd_resync(mdev, TRACE_LVL_ALL,
1246 "dropping extra reference on %u\n", enr);
1247
1248 bm_ext->lce.refcnt--;
1249 D_ASSERT(bm_ext->lce.refcnt > 0);
1250 }
1251 goto check_al;
1252 } else {
1253 /* do we rather want to try later? */
1254 if (mdev->resync_locked > mdev->resync->nr_elements-3) {
1255 trace_drbd_resync(mdev, TRACE_LVL_ALL,
1256 "resync_locked = %u!\n", mdev->resync_locked);
1257
1258 goto try_again;
1259 }
1260 /* Do or do not. There is no try. -- Yoda */
1261 e = lc_get(mdev->resync, enr);
1262 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1263 if (!bm_ext) {
1264 const unsigned long rs_flags = mdev->resync->flags;
1265 if (rs_flags & LC_STARVING)
1266 dev_warn(DEV, "Have to wait for element"
1267 " (resync LRU too small?)\n");
1268 BUG_ON(rs_flags & LC_DIRTY);
1269 goto try_again;
1270 }
 /* the element was labelled with a different extent: re-initialize
  * its counters for enr and commit the change */
1271 if (bm_ext->lce.lc_number != enr) {
1272 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1273 bm_ext->rs_failed = 0;
1274 lc_changed(mdev->resync, &bm_ext->lce);
1275 wake_up(&mdev->al_wait);
1276 D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
1277 }
1278 set_bit(BME_NO_WRITES, &bm_ext->flags);
1279 D_ASSERT(bm_ext->lce.refcnt == 1);
1280 mdev->resync_locked++;
1281 goto check_al;
1282 }
1283check_al:
1284 trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr);
1285
 /* any covered activity log extent still in use means: retry later */
1286 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1287 if (unlikely(al_enr+i == mdev->act_log->new_number))
1288 goto try_again;
1289 if (lc_is_used(mdev->act_log, al_enr+i))
1290 goto try_again;
1291 }
1292 set_bit(BME_LOCKED, &bm_ext->flags);
1293proceed:
1294 mdev->resync_wenr = LC_FREE;
1295 spin_unlock_irq(&mdev->al_lock);
1296 return 0;
1297
1298try_again:
1299 trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr);
 /* remember the extent only if we actually hold a reference on it */
1300 if (bm_ext)
1301 mdev->resync_wenr = enr;
1302 spin_unlock_irq(&mdev->al_lock);
1303 return -EAGAIN;
1304}
1305
1306void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
1307{
1308 unsigned int enr = BM_SECT_TO_EXT(sector);
1309 struct lc_element *e;
1310 struct bm_extent *bm_ext;
1311 unsigned long flags;
1312
1313 trace_drbd_resync(mdev, TRACE_LVL_ALL,
1314 "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n",
1315 (long long)sector, enr);
1316
1317 spin_lock_irqsave(&mdev->al_lock, flags);
1318 e = lc_find(mdev->resync, enr);
1319 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1320 if (!bm_ext) {
1321 spin_unlock_irqrestore(&mdev->al_lock, flags);
1322 if (__ratelimit(&drbd_ratelimit_state))
1323 dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
1324 return;
1325 }
1326
1327 if (bm_ext->lce.refcnt == 0) {
1328 spin_unlock_irqrestore(&mdev->al_lock, flags);
1329 dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
1330 "but refcnt is 0!?\n",
1331 (unsigned long long)sector, enr);
1332 return;
1333 }
1334
1335 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1336 clear_bit(BME_LOCKED, &bm_ext->flags);
1337 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1338 mdev->resync_locked--;
1339 wake_up(&mdev->al_wait);
1340 }
1341
1342 spin_unlock_irqrestore(&mdev->al_lock, flags);
1343}
1344
1345/**
1346 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
1347 * @mdev: DRBD device.
1348 */
1349void drbd_rs_cancel_all(struct drbd_conf *mdev)
1350{
1351 trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n");
1352
1353 spin_lock_irq(&mdev->al_lock);
1354
1355 if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
1356 lc_reset(mdev->resync);
1357 put_ldev(mdev);
1358 }
1359 mdev->resync_locked = 0;
1360 mdev->resync_wenr = LC_FREE;
1361 spin_unlock_irq(&mdev->al_lock);
1362 wake_up(&mdev->al_wait);
1363}
1364
1365/**
1366 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
1367 * @mdev: DRBD device.
1368 *
1369 * Returns 0 upon success, -EAGAIN if at least one reference count was
1370 * not zero.
1371 */
1372int drbd_rs_del_all(struct drbd_conf *mdev)
1373{
1374 struct lc_element *e;
1375 struct bm_extent *bm_ext;
1376 int i;
1377
1378 trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n");
1379
1380 spin_lock_irq(&mdev->al_lock);
1381
1382 if (get_ldev_if_state(mdev, D_FAILED)) {
1383 /* ok, ->resync is there. */
1384 for (i = 0; i < mdev->resync->nr_elements; i++) {
1385 e = lc_element_by_index(mdev->resync, i);
1386 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1387 if (bm_ext->lce.lc_number == LC_FREE)
1388 continue;
1389 if (bm_ext->lce.lc_number == mdev->resync_wenr) {
1390 dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
1391 " got 'synced' by application io\n",
1392 mdev->resync_wenr);
1393 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1394 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1395 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1396 mdev->resync_wenr = LC_FREE;
1397 lc_put(mdev->resync, &bm_ext->lce);
1398 }
1399 if (bm_ext->lce.refcnt != 0) {
1400 dev_info(DEV, "Retrying drbd_rs_del_all() later. "
1401 "refcnt=%d\n", bm_ext->lce.refcnt);
1402 put_ldev(mdev);
1403 spin_unlock_irq(&mdev->al_lock);
1404 return -EAGAIN;
1405 }
1406 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1407 D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
1408 lc_del(mdev->resync, &bm_ext->lce);
1409 }
1410 D_ASSERT(mdev->resync->used == 0);
1411 put_ldev(mdev);
1412 }
1413 spin_unlock_irq(&mdev->al_lock);
1414
1415 return 0;
1416}
1417
1418/**
1419 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
1420 * @mdev: DRBD device.
1421 * @sector: The sector number.
1422 * @size: Size of failed IO operation, in byte.
 *
 * Counts the bitmap bits still set within the fully covered, aligned
 * BM_BLOCK_SIZE blocks of the range, adds that count to mdev->rs_failed and
 * accounts it as failed on the resync extent via drbd_try_clear_on_disk_bm().
 * @size must be a positive multiple of 512 and at most DRBD_MAX_SEGMENT_SIZE,
 * otherwise the call is rejected with an error message.
1423 */
1424void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1425{
1426 /* Is called from worker and receiver context _only_ */
1427 unsigned long sbnr, ebnr, lbnr;
1428 unsigned long count;
1429 sector_t esector, nr_sectors;
1430 int wake_up = 0;
1431
1432 trace_drbd_resync(mdev, TRACE_LVL_SUMMARY,
1433 "drbd_rs_failed_io: sector=%llus, size=%u\n",
1434 (unsigned long long)sector, size);
1435
1436 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1437 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1438 (unsigned long long)sector, size);
1439 return;
1440 }
1441 nr_sectors = drbd_get_capacity(mdev->this_bdev);
 /* last sector of the range (inclusive) */
1442 esector = sector + (size >> 9) - 1;
1443
1444 ERR_IF(sector >= nr_sectors) return;
1445 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
1446
 /* bit number of the last sector of the device */
1447 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1448
1449 /*
1450 * round up start sector, round down end sector. we make sure we only
1451 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
1452 if (unlikely(esector < BM_SECT_PER_BIT-1))
1453 return;
1454 if (unlikely(esector == (nr_sectors-1)))
1455 ebnr = lbnr;
1456 else
1457 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
1458 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
1459
 /* the range does not contain a single complete block */
1460 if (sbnr > ebnr)
1461 return;
1462
1463 /*
1464 * ok, (capacity & 7) != 0 sometimes, but who cares...
1465 * we count rs_{total,left} in bits, not sectors.
1466 */
1467 spin_lock_irq(&mdev->al_lock);
 /* bits are only counted here, not cleared: the blocks stay out of sync */
1468 count = drbd_bm_count_bits(mdev, sbnr, ebnr);
1469 if (count) {
1470 mdev->rs_failed += count;
1471
1472 if (get_ldev(mdev)) {
1473 drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
1474 put_ldev(mdev);
1475 }
1476
1477 /* just wake_up unconditional now, various lc_chaged(),
1478 * lc_put() in drbd_try_clear_on_disk_bm(). */
1479 wake_up = 1;
1480 }
1481 spin_unlock_irq(&mdev->al_lock);
 /* the wake up itself is done after al_lock has been released */
1482 if (wake_up)
1483 wake_up(&mdev->al_wait);
1484}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000000..b61057e77882
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1327 @@
1/*
2 drbd_bitmap.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/bitops.h>
26#include <linux/vmalloc.h>
27#include <linux/string.h>
28#include <linux/drbd.h>
29#include <asm/kmap_types.h>
30#include "drbd_int.h"
31
32/* OPAQUE outside this file!
33 * interface defined in drbd_int.h
34
35 * convention:
36 * function name drbd_bm_... => used elsewhere, "public".
37 * function name bm_... => internal to implementation, "private".
38
39 * Note that since find_first_bit returns int, at the current granularity of
40 * the bitmap (4KB per bit), this implementation "only" supports up to
41 * 1<<(32+12) == 16 TB...
42 */
43
44/*
45 * NOTE
46 * Access to the *bm_pages is protected by bm_lock.
47 * It is safe to read the other members within the lock.
48 *
49 * drbd_bm_set_bits is called from bio_endio callbacks,
50 * We may be called with irq already disabled,
51 * so we need spin_lock_irqsave().
52 * And we need the kmap_atomic.
53 */
54struct drbd_bitmap {
55 struct page **bm_pages;
56 spinlock_t bm_lock;
57 /* WARNING unsigned long bm_*:
58 * 32bit number of bit offset is just enough for 512 MB bitmap.
59 * it will blow up if we make the bitmap bigger...
60 * not that it makes much sense to have a bitmap that large,
61 * rather change the granularity to 16k or 64k or something.
62 * (that implies other problems, however...)
63 */
64 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
65 unsigned long bm_bits;
66 size_t bm_words;
67 size_t bm_number_of_pages;
68 sector_t bm_dev_capacity;
69 struct semaphore bm_change; /* serializes resize operations */
70
71 atomic_t bm_async_io;
72 wait_queue_head_t bm_io_wait;
73
74 unsigned long bm_flags;
75
76 /* debugging aid, in case we are still racy somewhere */
77 char *bm_why;
78 struct task_struct *bm_task;
79};
80
/* bit numbers used with the bitops on drbd_bitmap.bm_flags */
enum {
	BM_LOCKED	= 0,	/* set by drbd_bm_lock(), cleared by drbd_bm_unlock() */
	BM_MD_IO_ERROR	= 1,	/* a bitmap bio completed with an error */
	BM_P_VMALLOCED	= 2,	/* bm_pages array was obtained from vmalloc() */
};
85
86static int bm_is_locked(struct drbd_bitmap *b)
87{
88 return test_bit(BM_LOCKED, &b->bm_flags);
89}
90
91#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
92static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
93{
94 struct drbd_bitmap *b = mdev->bitmap;
95 if (!__ratelimit(&drbd_ratelimit_state))
96 return;
97 dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
98 current == mdev->receiver.task ? "receiver" :
99 current == mdev->asender.task ? "asender" :
100 current == mdev->worker.task ? "worker" : current->comm,
101 func, b->bm_why ?: "?",
102 b->bm_task == mdev->receiver.task ? "receiver" :
103 b->bm_task == mdev->asender.task ? "asender" :
104 b->bm_task == mdev->worker.task ? "worker" : "?");
105}
106
107void drbd_bm_lock(struct drbd_conf *mdev, char *why)
108{
109 struct drbd_bitmap *b = mdev->bitmap;
110 int trylock_failed;
111
112 if (!b) {
113 dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
114 return;
115 }
116
117 trylock_failed = down_trylock(&b->bm_change);
118
119 if (trylock_failed) {
120 dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
121 current == mdev->receiver.task ? "receiver" :
122 current == mdev->asender.task ? "asender" :
123 current == mdev->worker.task ? "worker" : current->comm,
124 why, b->bm_why ?: "?",
125 b->bm_task == mdev->receiver.task ? "receiver" :
126 b->bm_task == mdev->asender.task ? "asender" :
127 b->bm_task == mdev->worker.task ? "worker" : "?");
128 down(&b->bm_change);
129 }
130 if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
131 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
132
133 b->bm_why = why;
134 b->bm_task = current;
135}
136
137void drbd_bm_unlock(struct drbd_conf *mdev)
138{
139 struct drbd_bitmap *b = mdev->bitmap;
140 if (!b) {
141 dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
142 return;
143 }
144
145 if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
146 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
147
148 b->bm_why = NULL;
149 b->bm_task = NULL;
150 up(&b->bm_change);
151}
152
153/* word offset to long pointer */
154static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
155{
156 struct page *page;
157 unsigned long page_nr;
158
159 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
160 page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
161 BUG_ON(page_nr >= b->bm_number_of_pages);
162 page = b->bm_pages[page_nr];
163
164 return (unsigned long *) kmap_atomic(page, km);
165}
166
167static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
168{
169 return __bm_map_paddr(b, offset, KM_IRQ1);
170}
171
172static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
173{
174 kunmap_atomic(p_addr, km);
175};
176
177static void bm_unmap(unsigned long *p_addr)
178{
179 return __bm_unmap(p_addr, KM_IRQ1);
180}
181
/* Long words per page */
#define LWPP (PAGE_SIZE/sizeof(long))

/* long word offset of _bitmap_ sector */
#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))

/*
 * Modulo Longs Per Page: turn a word offset counted from the start of the
 * bitmap into the word number _within_its_page_.
 * Written as an explicit mask instead of
 *	(X) % (PAGE_SIZE/sizeof(long))
 * because Philipp suspects gcc might not optimize the % into & (... - 1).
 */
#define MLPP(X) ((X) & (LWPP-1))
194
195/*
196 * actually most functions herein should take a struct drbd_bitmap*, not a
197 * struct drbd_conf*, but for the debug macros I like to have the mdev around
198 * to be able to report device specific.
199 */
200
201static void bm_free_pages(struct page **pages, unsigned long number)
202{
203 unsigned long i;
204 if (!pages)
205 return;
206
207 for (i = 0; i < number; i++) {
208 if (!pages[i]) {
209 printk(KERN_ALERT "drbd: bm_free_pages tried to free "
210 "a NULL pointer; i=%lu n=%lu\n",
211 i, number);
212 continue;
213 }
214 __free_page(pages[i]);
215 pages[i] = NULL;
216 }
217}
218
/* Release @ptr with vfree() when @v is non-zero, with kfree() otherwise. */
static void bm_vk_free(void *ptr, int v)
{
	if (!v) {
		kfree(ptr);
		return;
	}
	vfree(ptr);
}
226
227/*
228 * "have" and "want" are NUMBER OF PAGES.
229 */
230static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
231{
232 struct page **old_pages = b->bm_pages;
233 struct page **new_pages, *page;
234 unsigned int i, bytes, vmalloced = 0;
235 unsigned long have = b->bm_number_of_pages;
236
237 BUG_ON(have == 0 && old_pages != NULL);
238 BUG_ON(have != 0 && old_pages == NULL);
239
240 if (have == want)
241 return old_pages;
242
243 /* Trying kmalloc first, falling back to vmalloc.
244 * GFP_KERNEL is ok, as this is done when a lower level disk is
245 * "attached" to the drbd. Context is receiver thread or cqueue
246 * thread. As we have no disk yet, we are not in the IO path,
247 * not even the IO path of the peer. */
248 bytes = sizeof(struct page *)*want;
249 new_pages = kmalloc(bytes, GFP_KERNEL);
250 if (!new_pages) {
251 new_pages = vmalloc(bytes);
252 if (!new_pages)
253 return NULL;
254 vmalloced = 1;
255 }
256
257 memset(new_pages, 0, bytes);
258 if (want >= have) {
259 for (i = 0; i < have; i++)
260 new_pages[i] = old_pages[i];
261 for (; i < want; i++) {
262 page = alloc_page(GFP_HIGHUSER);
263 if (!page) {
264 bm_free_pages(new_pages + have, i - have);
265 bm_vk_free(new_pages, vmalloced);
266 return NULL;
267 }
268 new_pages[i] = page;
269 }
270 } else {
271 for (i = 0; i < want; i++)
272 new_pages[i] = old_pages[i];
273 /* NOT HERE, we are outside the spinlock!
274 bm_free_pages(old_pages + want, have - want);
275 */
276 }
277
278 if (vmalloced)
279 set_bit(BM_P_VMALLOCED, &b->bm_flags);
280 else
281 clear_bit(BM_P_VMALLOCED, &b->bm_flags);
282
283 return new_pages;
284}
285
286/*
287 * called on driver init only. TODO call when a device is created.
288 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
289 */
290int drbd_bm_init(struct drbd_conf *mdev)
291{
292 struct drbd_bitmap *b = mdev->bitmap;
293 WARN_ON(b != NULL);
294 b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
295 if (!b)
296 return -ENOMEM;
297 spin_lock_init(&b->bm_lock);
298 init_MUTEX(&b->bm_change);
299 init_waitqueue_head(&b->bm_io_wait);
300
301 mdev->bitmap = b;
302
303 return 0;
304}
305
306sector_t drbd_bm_capacity(struct drbd_conf *mdev)
307{
308 ERR_IF(!mdev->bitmap) return 0;
309 return mdev->bitmap->bm_dev_capacity;
310}
311
312/* called on driver unload. TODO: call when a device is destroyed.
313 */
314void drbd_bm_cleanup(struct drbd_conf *mdev)
315{
316 ERR_IF (!mdev->bitmap) return;
317 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
318 bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
319 kfree(mdev->bitmap);
320 mdev->bitmap = NULL;
321}
322
323/*
324 * since (b->bm_bits % BITS_PER_LONG) != 0,
325 * this masks out the remaining bits.
326 * Returns the number of bits cleared.
327 */
328static int bm_clear_surplus(struct drbd_bitmap *b)
329{
330 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
331 size_t w = b->bm_bits >> LN2_BPL;
332 int cleared = 0;
333 unsigned long *p_addr, *bm;
334
335 p_addr = bm_map_paddr(b, w);
336 bm = p_addr + MLPP(w);
337 if (w < b->bm_words) {
338 cleared = hweight_long(*bm & ~mask);
339 *bm &= mask;
340 w++; bm++;
341 }
342
343 if (w < b->bm_words) {
344 cleared += hweight_long(*bm);
345 *bm = 0;
346 }
347 bm_unmap(p_addr);
348 return cleared;
349}
350
351static void bm_set_surplus(struct drbd_bitmap *b)
352{
353 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
354 size_t w = b->bm_bits >> LN2_BPL;
355 unsigned long *p_addr, *bm;
356
357 p_addr = bm_map_paddr(b, w);
358 bm = p_addr + MLPP(w);
359 if (w < b->bm_words) {
360 *bm |= ~mask;
361 bm++; w++;
362 }
363
364 if (w < b->bm_words) {
365 *bm = ~(0UL);
366 }
367 bm_unmap(p_addr);
368}
369
370static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
371{
372 unsigned long *p_addr, *bm, offset = 0;
373 unsigned long bits = 0;
374 unsigned long i, do_now;
375
376 while (offset < b->bm_words) {
377 i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
378 p_addr = __bm_map_paddr(b, offset, KM_USER0);
379 bm = p_addr + MLPP(offset);
380 while (i--) {
381#ifndef __LITTLE_ENDIAN
382 if (swap_endian)
383 *bm = lel_to_cpu(*bm);
384#endif
385 bits += hweight_long(*bm++);
386 }
387 __bm_unmap(p_addr, KM_USER0);
388 offset += do_now;
389 cond_resched();
390 }
391
392 return bits;
393}
394
/* Count the set bits, leaving the words as they are. */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
	const int swap_endian = 0;

	return __bm_count_bits(b, swap_endian);
}
399
/* Count the set bits, requesting the in-place endian conversion. */
static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
	const int swap_endian = 1;

	return __bm_count_bits(b, swap_endian);
}
404
405/* offset and len in long words.*/
406static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
407{
408 unsigned long *p_addr, *bm;
409 size_t do_now, end;
410
411#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
412
413 end = offset + len;
414
415 if (end > b->bm_words) {
416 printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
417 return;
418 }
419
420 while (offset < end) {
421 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
422 p_addr = bm_map_paddr(b, offset);
423 bm = p_addr + MLPP(offset);
424 if (bm+do_now > p_addr + LWPP) {
425 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
426 p_addr, bm, (int)do_now);
427 break; /* breaks to after catch_oob_access_end() only! */
428 }
429 memset(bm, c, do_now * sizeof(long));
430 bm_unmap(p_addr);
431 offset += do_now;
432 }
433}
434
435/*
436 * make sure the bitmap has enough room for the attached storage,
437 * if necessary, resize.
438 * called whenever we may have changed the device size.
439 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
440 * In case this is actually a resize, we copy the old bitmap into the new one.
441 * Otherwise, the bitmap is initialized to all bits set.
442 */
443int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
444{
445 struct drbd_bitmap *b = mdev->bitmap;
446 unsigned long bits, words, owords, obits, *p_addr, *bm;
447 unsigned long want, have, onpages; /* number of pages */
448 struct page **npages, **opages = NULL;
449 int err = 0, growing;
450 int opages_vmalloced;
451
452 ERR_IF(!b) return -ENOMEM;
453
454 drbd_bm_lock(mdev, "resize");
455
456 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
457 (unsigned long long)capacity);
458
459 if (capacity == b->bm_dev_capacity)
460 goto out;
461
462 opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
463
464 if (capacity == 0) {
465 spin_lock_irq(&b->bm_lock);
466 opages = b->bm_pages;
467 onpages = b->bm_number_of_pages;
468 owords = b->bm_words;
469 b->bm_pages = NULL;
470 b->bm_number_of_pages =
471 b->bm_set =
472 b->bm_bits =
473 b->bm_words =
474 b->bm_dev_capacity = 0;
475 spin_unlock_irq(&b->bm_lock);
476 bm_free_pages(opages, onpages);
477 bm_vk_free(opages, opages_vmalloced);
478 goto out;
479 }
480 bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
481
482 /* if we would use
483 words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
484 a 32bit host could present the wrong number of words
485 to a 64bit host.
486 */
487 words = ALIGN(bits, 64) >> LN2_BPL;
488
489 if (get_ldev(mdev)) {
490 D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
491 put_ldev(mdev);
492 }
493
494 /* one extra long to catch off by one errors */
495 want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
496 have = b->bm_number_of_pages;
497 if (want == have) {
498 D_ASSERT(b->bm_pages != NULL);
499 npages = b->bm_pages;
500 } else {
501 if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
502 npages = NULL;
503 else
504 npages = bm_realloc_pages(b, want);
505 }
506
507 if (!npages) {
508 err = -ENOMEM;
509 goto out;
510 }
511
512 spin_lock_irq(&b->bm_lock);
513 opages = b->bm_pages;
514 owords = b->bm_words;
515 obits = b->bm_bits;
516
517 growing = bits > obits;
518 if (opages)
519 bm_set_surplus(b);
520
521 b->bm_pages = npages;
522 b->bm_number_of_pages = want;
523 b->bm_bits = bits;
524 b->bm_words = words;
525 b->bm_dev_capacity = capacity;
526
527 if (growing) {
528 bm_memset(b, owords, 0xff, words-owords);
529 b->bm_set += bits - obits;
530 }
531
532 if (want < have) {
533 /* implicit: (opages != NULL) && (opages != npages) */
534 bm_free_pages(opages + want, have - want);
535 }
536
537 p_addr = bm_map_paddr(b, words);
538 bm = p_addr + MLPP(words);
539 *bm = DRBD_MAGIC;
540 bm_unmap(p_addr);
541
542 (void)bm_clear_surplus(b);
543
544 spin_unlock_irq(&b->bm_lock);
545 if (opages != npages)
546 bm_vk_free(opages, opages_vmalloced);
547 if (!growing)
548 b->bm_set = bm_count_bits(b);
549 dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
550
551 out:
552 drbd_bm_unlock(mdev);
553 return err;
554}
555
556/* inherently racy:
557 * if not protected by other means, return value may be out of date when
558 * leaving this function...
559 * we still need to lock it, since it is important that this returns
560 * bm_set == 0 precisely.
561 *
562 * maybe bm_set should be atomic_t ?
563 */
564static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
565{
566 struct drbd_bitmap *b = mdev->bitmap;
567 unsigned long s;
568 unsigned long flags;
569
570 ERR_IF(!b) return 0;
571 ERR_IF(!b->bm_pages) return 0;
572
573 spin_lock_irqsave(&b->bm_lock, flags);
574 s = b->bm_set;
575 spin_unlock_irqrestore(&b->bm_lock, flags);
576
577 return s;
578}
579
580unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
581{
582 unsigned long s;
583 /* if I don't have a disk, I don't know about out-of-sync status */
584 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
585 return 0;
586 s = _drbd_bm_total_weight(mdev);
587 put_ldev(mdev);
588 return s;
589}
590
591size_t drbd_bm_words(struct drbd_conf *mdev)
592{
593 struct drbd_bitmap *b = mdev->bitmap;
594 ERR_IF(!b) return 0;
595 ERR_IF(!b->bm_pages) return 0;
596
597 return b->bm_words;
598}
599
600unsigned long drbd_bm_bits(struct drbd_conf *mdev)
601{
602 struct drbd_bitmap *b = mdev->bitmap;
603 ERR_IF(!b) return 0;
604
605 return b->bm_bits;
606}
607
608/* merge number words from buffer into the bitmap starting at offset.
609 * buffer[i] is expected to be little endian unsigned long.
610 * bitmap must be locked by drbd_bm_lock.
611 * currently only used from receive_bitmap.
612 */
613void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
614 unsigned long *buffer)
615{
616 struct drbd_bitmap *b = mdev->bitmap;
617 unsigned long *p_addr, *bm;
618 unsigned long word, bits;
619 size_t end, do_now;
620
621 end = offset + number;
622
623 ERR_IF(!b) return;
624 ERR_IF(!b->bm_pages) return;
625 if (number == 0)
626 return;
627 WARN_ON(offset >= b->bm_words);
628 WARN_ON(end > b->bm_words);
629
630 spin_lock_irq(&b->bm_lock);
631 while (offset < end) {
632 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
633 p_addr = bm_map_paddr(b, offset);
634 bm = p_addr + MLPP(offset);
635 offset += do_now;
636 while (do_now--) {
637 bits = hweight_long(*bm);
638 word = *bm | lel_to_cpu(*buffer++);
639 *bm++ = word;
640 b->bm_set += hweight_long(word) - bits;
641 }
642 bm_unmap(p_addr);
643 }
644 /* with 32bit <-> 64bit cross-platform connect
645 * this is only correct for current usage,
646 * where we _know_ that we are 64 bit aligned,
647 * and know that this function is used in this way, too...
648 */
649 if (end == b->bm_words)
650 b->bm_set -= bm_clear_surplus(b);
651
652 spin_unlock_irq(&b->bm_lock);
653}
654
655/* copy number words from the bitmap starting at offset into the buffer.
656 * buffer[i] will be little endian unsigned long.
657 */
658void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
659 unsigned long *buffer)
660{
661 struct drbd_bitmap *b = mdev->bitmap;
662 unsigned long *p_addr, *bm;
663 size_t end, do_now;
664
665 end = offset + number;
666
667 ERR_IF(!b) return;
668 ERR_IF(!b->bm_pages) return;
669
670 spin_lock_irq(&b->bm_lock);
671 if ((offset >= b->bm_words) ||
672 (end > b->bm_words) ||
673 (number <= 0))
674 dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
675 (unsigned long) offset,
676 (unsigned long) number,
677 (unsigned long) b->bm_words);
678 else {
679 while (offset < end) {
680 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
681 p_addr = bm_map_paddr(b, offset);
682 bm = p_addr + MLPP(offset);
683 offset += do_now;
684 while (do_now--)
685 *buffer++ = cpu_to_lel(*bm++);
686 bm_unmap(p_addr);
687 }
688 }
689 spin_unlock_irq(&b->bm_lock);
690}
691
692/* set all bits in the bitmap */
693void drbd_bm_set_all(struct drbd_conf *mdev)
694{
695 struct drbd_bitmap *b = mdev->bitmap;
696 ERR_IF(!b) return;
697 ERR_IF(!b->bm_pages) return;
698
699 spin_lock_irq(&b->bm_lock);
700 bm_memset(b, 0, 0xff, b->bm_words);
701 (void)bm_clear_surplus(b);
702 b->bm_set = b->bm_bits;
703 spin_unlock_irq(&b->bm_lock);
704}
705
706/* clear all bits in the bitmap */
707void drbd_bm_clear_all(struct drbd_conf *mdev)
708{
709 struct drbd_bitmap *b = mdev->bitmap;
710 ERR_IF(!b) return;
711 ERR_IF(!b->bm_pages) return;
712
713 spin_lock_irq(&b->bm_lock);
714 bm_memset(b, 0, 0, b->bm_words);
715 b->bm_set = 0;
716 spin_unlock_irq(&b->bm_lock);
717}
718
719static void bm_async_io_complete(struct bio *bio, int error)
720{
721 struct drbd_bitmap *b = bio->bi_private;
722 int uptodate = bio_flagged(bio, BIO_UPTODATE);
723
724
725 /* strange behavior of some lower level drivers...
726 * fail the request by clearing the uptodate flag,
727 * but do not return any error?!
728 * do we want to WARN() on this? */
729 if (!error && !uptodate)
730 error = -EIO;
731
732 if (error) {
733 /* doh. what now?
734 * for now, set all bits, and flag MD_IO_ERROR */
735 __set_bit(BM_MD_IO_ERROR, &b->bm_flags);
736 }
737 if (atomic_dec_and_test(&b->bm_async_io))
738 wake_up(&b->bm_io_wait);
739
740 bio_put(bio);
741}
742
743static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
744{
745 /* we are process context. we always get a bio */
746 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
747 unsigned int len;
748 sector_t on_disk_sector =
749 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
750 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
751
752 /* this might happen with very small
753 * flexible external meta data device */
754 len = min_t(unsigned int, PAGE_SIZE,
755 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
756
757 bio->bi_bdev = mdev->ldev->md_bdev;
758 bio->bi_sector = on_disk_sector;
759 bio_add_page(bio, b->bm_pages[page_nr], len, 0);
760 bio->bi_private = b;
761 bio->bi_end_io = bm_async_io_complete;
762
763 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
764 bio->bi_rw |= rw;
765 bio_endio(bio, -EIO);
766 } else {
767 submit_bio(rw, bio);
768 }
769}
770
771# if defined(__LITTLE_ENDIAN)
772 /* nothing to do, on disk == in memory */
773# define bm_cpu_to_lel(x) ((void)0)
774# else
775void bm_cpu_to_lel(struct drbd_bitmap *b)
776{
777 /* need to cpu_to_lel all the pages ...
778 * this may be optimized by using
779 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
780 * the following is still not optimal, but better than nothing */
781 unsigned int i;
782 unsigned long *p_addr, *bm;
783 if (b->bm_set == 0) {
784 /* no page at all; avoid swap if all is 0 */
785 i = b->bm_number_of_pages;
786 } else if (b->bm_set == b->bm_bits) {
787 /* only the last page */
788 i = b->bm_number_of_pages - 1;
789 } else {
790 /* all pages */
791 i = 0;
792 }
793 for (; i < b->bm_number_of_pages; i++) {
794 p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
795 for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
796 *bm = cpu_to_lel(*bm);
797 kunmap_atomic(p_addr, KM_USER0);
798 }
799}
800# endif
801/* lel_to_cpu == cpu_to_lel */
802# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
803
804/*
805 * bm_rw: read/write the whole bitmap from/to its on disk location.
806 */
807static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
808{
809 struct drbd_bitmap *b = mdev->bitmap;
810 /* sector_t sector; */
811 int bm_words, num_pages, i;
812 unsigned long now;
813 char ppb[10];
814 int err = 0;
815
816 WARN_ON(!bm_is_locked(b));
817
818 /* no spinlock here, the drbd_bm_lock should be enough! */
819
820 bm_words = drbd_bm_words(mdev);
821 num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
822
823 /* on disk bitmap is little endian */
824 if (rw == WRITE)
825 bm_cpu_to_lel(b);
826
827 now = jiffies;
828 atomic_set(&b->bm_async_io, num_pages);
829 __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
830
831 /* let the layers below us try to merge these bios... */
832 for (i = 0; i < num_pages; i++)
833 bm_page_io_async(mdev, b, i, rw);
834
835 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
836 wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
837
838 if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
839 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
840 drbd_chk_io_error(mdev, 1, TRUE);
841 err = -EIO;
842 }
843
844 now = jiffies;
845 if (rw == WRITE) {
846 /* swap back endianness */
847 bm_lel_to_cpu(b);
848 /* flush bitmap to stable storage */
849 drbd_md_flush(mdev);
850 } else /* rw == READ */ {
851 /* just read, if necessary adjust endianness */
852 b->bm_set = bm_count_bits_swap_endian(b);
853 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
854 jiffies - now);
855 }
856 now = b->bm_set;
857
858 dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
859 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
860
861 return err;
862}
863
864/**
865 * drbd_bm_read() - Read the whole bitmap from its on disk location.
866 * @mdev: DRBD device.
867 */
868int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
869{
870 return bm_rw(mdev, READ);
871}
872
873/**
874 * drbd_bm_write() - Write the whole bitmap to its on disk location.
875 * @mdev: DRBD device.
876 */
877int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
878{
879 return bm_rw(mdev, WRITE);
880}
881
882/**
883 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
884 * @mdev: DRBD device.
885 * @enr: Extent number in the resync lru (happens to be sector offset)
886 *
887 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
888 * by a single sector write. Therefore enr == sector offset from the
889 * start of the bitmap.
890 */
891int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
892{
893 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
894 + mdev->ldev->md.bm_offset;
895 int bm_words, num_words, offset;
896 int err = 0;
897
898 mutex_lock(&mdev->md_io_mutex);
899 bm_words = drbd_bm_words(mdev);
900 offset = S2W(enr); /* word offset into bitmap */
901 num_words = min(S2W(1), bm_words - offset);
902 if (num_words < S2W(1))
903 memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
904 drbd_bm_get_lel(mdev, offset, num_words,
905 page_address(mdev->md_io_page));
906 if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
907 int i;
908 err = -EIO;
909 dev_err(DEV, "IO ERROR writing bitmap sector %lu "
910 "(meta-disk sector %llus)\n",
911 enr, (unsigned long long)on_disk_sector);
912 drbd_chk_io_error(mdev, 1, TRUE);
913 for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
914 drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
915 }
916 mdev->bm_writ_cnt++;
917 mutex_unlock(&mdev->md_io_mutex);
918 return err;
919}
920
921/* NOTE
922 * find_first_bit returns int, we return unsigned long.
923 * should not make much difference anyways, but ...
924 *
925 * this returns a bit number, NOT a sector!
926 */
927#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
928static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
929 const int find_zero_bit, const enum km_type km)
930{
931 struct drbd_bitmap *b = mdev->bitmap;
932 unsigned long i = -1UL;
933 unsigned long *p_addr;
934 unsigned long bit_offset; /* bit offset of the mapped page. */
935
936 if (bm_fo > b->bm_bits) {
937 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
938 } else {
939 while (bm_fo < b->bm_bits) {
940 unsigned long offset;
941 bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
942 offset = bit_offset >> LN2_BPL; /* word offset of the page */
943 p_addr = __bm_map_paddr(b, offset, km);
944
945 if (find_zero_bit)
946 i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
947 else
948 i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
949
950 __bm_unmap(p_addr, km);
951 if (i < PAGE_SIZE*8) {
952 i = bit_offset + i;
953 if (i >= b->bm_bits)
954 break;
955 goto found;
956 }
957 bm_fo = bit_offset + PAGE_SIZE*8;
958 }
959 i = -1UL;
960 }
961 found:
962 return i;
963}
964
965static unsigned long bm_find_next(struct drbd_conf *mdev,
966 unsigned long bm_fo, const int find_zero_bit)
967{
968 struct drbd_bitmap *b = mdev->bitmap;
969 unsigned long i = -1UL;
970
971 ERR_IF(!b) return i;
972 ERR_IF(!b->bm_pages) return i;
973
974 spin_lock_irq(&b->bm_lock);
975 if (bm_is_locked(b))
976 bm_print_lock_info(mdev);
977
978 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
979
980 spin_unlock_irq(&b->bm_lock);
981 return i;
982}
983
/* Next set bit at or after bit number @bm_fo, or -1UL; takes bm_lock. */
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	const int find_zero_bit = 0;

	return bm_find_next(mdev, bm_fo, find_zero_bit);
}
988
#if 0
/* Next clear bit at or after bit number @bm_fo; compiled out,
 * not yet needed for anything. */
unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
	const int find_zero_bit = 1;

	return bm_find_next(mdev, bm_fo, find_zero_bit);
}
#endif
996
997/* does not spin_lock_irqsave.
998 * you must take drbd_bm_lock() first */
999unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1000{
1001 /* WARN_ON(!bm_is_locked(mdev)); */
1002 return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1003}
1004
1005unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1006{
1007 /* WARN_ON(!bm_is_locked(mdev)); */
1008 return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1009}
1010
1011/* returns number of bits actually changed.
1012 * for val != 0, we change 0 -> 1, return code positive
1013 * for val == 0, we change 1 -> 0, return code negative
1014 * wants bitnr, not sector.
1015 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1016 * Must hold bitmap lock already. */
1017int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1018 unsigned long e, int val, const enum km_type km)
1019{
1020 struct drbd_bitmap *b = mdev->bitmap;
1021 unsigned long *p_addr = NULL;
1022 unsigned long bitnr;
1023 unsigned long last_page_nr = -1UL;
1024 int c = 0;
1025
1026 if (e >= b->bm_bits) {
1027 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1028 s, e, b->bm_bits);
1029 e = b->bm_bits ? b->bm_bits -1 : 0;
1030 }
1031 for (bitnr = s; bitnr <= e; bitnr++) {
1032 unsigned long offset = bitnr>>LN2_BPL;
1033 unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
1034 if (page_nr != last_page_nr) {
1035 if (p_addr)
1036 __bm_unmap(p_addr, km);
1037 p_addr = __bm_map_paddr(b, offset, km);
1038 last_page_nr = page_nr;
1039 }
1040 if (val)
1041 c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
1042 else
1043 c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
1044 }
1045 if (p_addr)
1046 __bm_unmap(p_addr, km);
1047 b->bm_set += c;
1048 return c;
1049}
1050
1051/* returns number of bits actually changed.
1052 * for val != 0, we change 0 -> 1, return code positive
1053 * for val == 0, we change 1 -> 0, return code negative
1054 * wants bitnr, not sector */
1055int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1056 const unsigned long e, int val)
1057{
1058 unsigned long flags;
1059 struct drbd_bitmap *b = mdev->bitmap;
1060 int c = 0;
1061
1062 ERR_IF(!b) return 1;
1063 ERR_IF(!b->bm_pages) return 0;
1064
1065 spin_lock_irqsave(&b->bm_lock, flags);
1066 if (bm_is_locked(b))
1067 bm_print_lock_info(mdev);
1068
1069 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
1070
1071 spin_unlock_irqrestore(&b->bm_lock, flags);
1072 return c;
1073}
1074
/* Set bits @s to @e inclusive; returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	const int newly_set = bm_change_bits_to(mdev, s, e, 1);

	return newly_set;
}
1080
/* Clear bits @s to @e inclusive; returns number of bits changed 1 -> 0 */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	/* the helper reports cleared bits as a negative count */
	const int delta = bm_change_bits_to(mdev, s, e, 0);

	return -delta;
}
1086
1087/* sets all bits in full words,
1088 * from first_word up to, but not including, last_word */
1089static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1090 int page_nr, int first_word, int last_word)
1091{
1092 int i;
1093 int bits;
1094 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
1095 for (i = first_word; i < last_word; i++) {
1096 bits = hweight_long(paddr[i]);
1097 paddr[i] = ~0UL;
1098 b->bm_set += BITS_PER_LONG - bits;
1099 }
1100 kunmap_atomic(paddr, KM_USER0);
1101}
1102
1103/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
1104 * You must first drbd_bm_lock().
1105 * Can be called to set the whole bitmap in one go.
1106 * Sets bits from s to e _inclusive_. */
1107void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1108{
1109 /* First set_bit from the first bit (s)
1110 * up to the next long boundary (sl),
1111 * then assign full words up to the last long boundary (el),
1112 * then set_bit up to and including the last bit (e).
1113 *
1114 * Do not use memset, because we must account for changes,
1115 * so we need to loop over the words with hweight() anyways.
1116 */
1117 unsigned long sl = ALIGN(s,BITS_PER_LONG);
1118 unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1119 int first_page;
1120 int last_page;
1121 int page_nr;
1122 int first_word;
1123 int last_word;
1124
1125 if (e - s <= 3*BITS_PER_LONG) {
1126 /* don't bother; el and sl may even be wrong. */
1127 __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
1128 return;
1129 }
1130
1131 /* difference is large enough that we can trust sl and el */
1132
1133 /* bits filling the current long */
1134 if (sl)
1135 __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
1136
1137 first_page = sl >> (3 + PAGE_SHIFT);
1138 last_page = el >> (3 + PAGE_SHIFT);
1139
1140 /* MLPP: modulo longs per page */
1141 /* LWPP: long words per page */
1142 first_word = MLPP(sl >> LN2_BPL);
1143 last_word = LWPP;
1144
1145 /* first and full pages, unless first page == last page */
1146 for (page_nr = first_page; page_nr < last_page; page_nr++) {
1147 bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1148 cond_resched();
1149 first_word = 0;
1150 }
1151
1152 /* last page (respectively only page, for first page == last page) */
1153 last_word = MLPP(el >> LN2_BPL);
1154 bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
1155
1156 /* possibly trailing bits.
1157 * example: (e & 63) == 63, el will be e+1.
1158 * if that even was the very last bit,
1159 * it would trigger an assert in __bm_change_bits_to()
1160 */
1161 if (el <= e)
1162 __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
1163}
1164
1165/* returns bit state
1166 * wants bitnr, NOT sector.
1167 * inherently racy... area needs to be locked by means of {al,rs}_lru
1168 * 1 ... bit set
1169 * 0 ... bit not set
1170 * -1 ... first out of bounds access, stop testing for bits!
1171 */
1172int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1173{
1174 unsigned long flags;
1175 struct drbd_bitmap *b = mdev->bitmap;
1176 unsigned long *p_addr;
1177 int i;
1178
1179 ERR_IF(!b) return 0;
1180 ERR_IF(!b->bm_pages) return 0;
1181
1182 spin_lock_irqsave(&b->bm_lock, flags);
1183 if (bm_is_locked(b))
1184 bm_print_lock_info(mdev);
1185 if (bitnr < b->bm_bits) {
1186 unsigned long offset = bitnr>>LN2_BPL;
1187 p_addr = bm_map_paddr(b, offset);
1188 i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
1189 bm_unmap(p_addr);
1190 } else if (bitnr == b->bm_bits) {
1191 i = -1;
1192 } else { /* (bitnr > b->bm_bits) */
1193 dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1194 i = 0;
1195 }
1196
1197 spin_unlock_irqrestore(&b->bm_lock, flags);
1198 return i;
1199}
1200
1201/* returns number of bits set in the range [s, e] */
1202int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1203{
1204 unsigned long flags;
1205 struct drbd_bitmap *b = mdev->bitmap;
1206 unsigned long *p_addr = NULL, page_nr = -1;
1207 unsigned long bitnr;
1208 int c = 0;
1209 size_t w;
1210
1211 /* If this is called without a bitmap, that is a bug. But just to be
1212 * robust in case we screwed up elsewhere, in that case pretend there
1213 * was one dirty bit in the requested area, so we won't try to do a
1214 * local read there (no bitmap probably implies no disk) */
1215 ERR_IF(!b) return 1;
1216 ERR_IF(!b->bm_pages) return 1;
1217
1218 spin_lock_irqsave(&b->bm_lock, flags);
1219 if (bm_is_locked(b))
1220 bm_print_lock_info(mdev);
1221 for (bitnr = s; bitnr <= e; bitnr++) {
1222 w = bitnr >> LN2_BPL;
1223 if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
1224 page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
1225 if (p_addr)
1226 bm_unmap(p_addr);
1227 p_addr = bm_map_paddr(b, w);
1228 }
1229 ERR_IF (bitnr >= b->bm_bits) {
1230 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1231 } else {
1232 c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1233 }
1234 }
1235 if (p_addr)
1236 bm_unmap(p_addr);
1237 spin_unlock_irqrestore(&b->bm_lock, flags);
1238 return c;
1239}
1240
1241
1242/* inherently racy...
1243 * return value may be already out-of-date when this function returns.
1244 * but the general usage is that this is only use during a cstate when bits are
1245 * only cleared, not set, and typically only care for the case when the return
1246 * value is zero, or we already "locked" this "bitmap extent" by other means.
1247 *
1248 * enr is bm-extent number, since we chose to name one sector (512 bytes)
1249 * worth of the bitmap a "bitmap extent".
1250 *
1251 * TODO
1252 * I think since we use it like a reference count, we should use the real
1253 * reference count of some bitmap extent element from some lru instead...
1254 *
1255 */
1256int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1257{
1258 struct drbd_bitmap *b = mdev->bitmap;
1259 int count, s, e;
1260 unsigned long flags;
1261 unsigned long *p_addr, *bm;
1262
1263 ERR_IF(!b) return 0;
1264 ERR_IF(!b->bm_pages) return 0;
1265
1266 spin_lock_irqsave(&b->bm_lock, flags);
1267 if (bm_is_locked(b))
1268 bm_print_lock_info(mdev);
1269
1270 s = S2W(enr);
1271 e = min((size_t)S2W(enr+1), b->bm_words);
1272 count = 0;
1273 if (s < b->bm_words) {
1274 int n = e-s;
1275 p_addr = bm_map_paddr(b, s);
1276 bm = p_addr + MLPP(s);
1277 while (n--)
1278 count += hweight_long(*bm++);
1279 bm_unmap(p_addr);
1280 } else {
1281 dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1282 }
1283 spin_unlock_irqrestore(&b->bm_lock, flags);
1284 return count;
1285}
1286
1287/* set all bits covered by the AL-extent al_enr */
1288unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1289{
1290 struct drbd_bitmap *b = mdev->bitmap;
1291 unsigned long *p_addr, *bm;
1292 unsigned long weight;
1293 int count, s, e, i, do_now;
1294 ERR_IF(!b) return 0;
1295 ERR_IF(!b->bm_pages) return 0;
1296
1297 spin_lock_irq(&b->bm_lock);
1298 if (bm_is_locked(b))
1299 bm_print_lock_info(mdev);
1300 weight = b->bm_set;
1301
1302 s = al_enr * BM_WORDS_PER_AL_EXT;
1303 e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
1304 /* assert that s and e are on the same page */
1305 D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
1306 == s >> (PAGE_SHIFT - LN2_BPL + 3));
1307 count = 0;
1308 if (s < b->bm_words) {
1309 i = do_now = e-s;
1310 p_addr = bm_map_paddr(b, s);
1311 bm = p_addr + MLPP(s);
1312 while (i--) {
1313 count += hweight_long(*bm);
1314 *bm = -1UL;
1315 bm++;
1316 }
1317 bm_unmap(p_addr);
1318 b->bm_set += do_now*BITS_PER_LONG - count;
1319 if (e == b->bm_words)
1320 b->bm_set -= bm_clear_surplus(b);
1321 } else {
1322 dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
1323 }
1324 weight = b->bm_set - weight;
1325 spin_unlock_irq(&b->bm_lock);
1326 return weight;
1327}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000000..8da602e010bb
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2258 @@
1/*
2 drbd_int.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#ifndef _DRBD_INT_H
27#define _DRBD_INT_H
28
29#include <linux/compiler.h>
30#include <linux/types.h>
31#include <linux/version.h>
32#include <linux/list.h>
33#include <linux/sched.h>
34#include <linux/bitops.h>
35#include <linux/slab.h>
36#include <linux/crypto.h>
37#include <linux/tcp.h>
38#include <linux/mutex.h>
39#include <linux/major.h>
40#include <linux/blkdev.h>
41#include <linux/genhd.h>
42#include <net/tcp.h>
43#include <linux/lru_cache.h>
44
45#ifdef __CHECKER__
46# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
47# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
48# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
49# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
50#else
51# define __protected_by(x)
52# define __protected_read_by(x)
53# define __protected_write_by(x)
54# define __must_hold(x)
55#endif
56
57#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
58
59/* module parameter, defined in drbd_main.c */
60extern unsigned int minor_count;
61extern int disable_sendpage;
62extern int allow_oos;
63extern unsigned int cn_idx;
64
65#ifdef CONFIG_DRBD_FAULT_INJECTION
66extern int enable_faults;
67extern int fault_rate;
68extern int fault_devs;
69#endif
70
71extern char usermode_helper[];
72
73
74#ifndef TRUE
75#define TRUE 1
76#endif
77#ifndef FALSE
78#define FALSE 0
79#endif
80
81/* I don't remember why XCPU ...
82 * This is used to wake the asender,
83 * and to interrupt the sending task
84 * on disconnect.
85 */
86#define DRBD_SIG SIGXCPU
87
88/* This is used to stop/restart our threads.
89 * Cannot use SIGTERM nor SIGKILL, since these
90 * are sent out by init on runlevel changes
91 * I choose SIGHUP for now.
92 */
93#define DRBD_SIGKILL SIGHUP
94
95/* All EEs on the free list should have ID_VACANT (== 0)
96 * freshly allocated EEs get !ID_VACANT (== 1)
97 * so if it says "cannot dereference null pointer at address 0x00000001",
98 * it is most likely one of these :( */
99
100#define ID_IN_SYNC (4711ULL)
101#define ID_OUT_OF_SYNC (4712ULL)
102
103#define ID_SYNCER (-1ULL)
104#define ID_VACANT 0
105#define is_syncer_block_id(id) ((id) == ID_SYNCER)
106
107struct drbd_conf;
108
109
110/* to shorten dev_warn(DEV, "msg"); and relatives statements */
111#define DEV (disk_to_dev(mdev->vdisk))
112
113#define D_ASSERT(exp) if (!(exp)) \
114 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
115
116#define ERR_IF(exp) if (({ \
117 int _b = (exp) != 0; \
118 if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \
119 __func__, #exp, __FILE__, __LINE__); \
120 _b; \
121 }))
122
123/* Defines to control fault insertion */
124enum {
125 DRBD_FAULT_MD_WR = 0, /* meta data write */
126 DRBD_FAULT_MD_RD = 1, /* read */
127 DRBD_FAULT_RS_WR = 2, /* resync */
128 DRBD_FAULT_RS_RD = 3,
129 DRBD_FAULT_DT_WR = 4, /* data */
130 DRBD_FAULT_DT_RD = 5,
131 DRBD_FAULT_DT_RA = 6, /* data read ahead */
132 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
133 DRBD_FAULT_AL_EE = 8, /* alloc ee */
134
135 DRBD_FAULT_MAX,
136};
137
138extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...);
139
140#ifdef CONFIG_DRBD_FAULT_INJECTION
141extern unsigned int
142_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
143static inline int
144drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
145 return fault_rate &&
146 (enable_faults & (1<<type)) &&
147 _drbd_insert_fault(mdev, type);
148}
149#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
150
151#else
152#define FAULT_ACTIVE(_m, _t) (0)
153#endif
154
155/* integer division, round _UP_ to the next integer */
156#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
157/* usual integer division */
158#define div_floor(A, B) ((A)/(B))
159
160/* drbd_meta-data.c (still in drbd_main.c) */
161/* 4th incarnation of the disk layout. */
162#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
163
164extern struct drbd_conf **minor_table;
165extern struct ratelimit_state drbd_ratelimit_state;
166
/* on the wire */
enum drbd_packets {
	/* receiver (data socket) */
	P_DATA		      = 0x00,
	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
	P_BARRIER	      = 0x03,
	P_BITMAP	      = 0x04,
	P_BECOME_SYNC_TARGET  = 0x05,
	P_BECOME_SYNC_SOURCE  = 0x06,
	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
	P_SYNC_PARAM	      = 0x0a,
	P_PROTOCOL	      = 0x0b,
	P_UUIDS		      = 0x0c,
	P_SIZES		      = 0x0d,
	P_STATE		      = 0x0e,
	P_SYNC_UUID	      = 0x0f,
	P_AUTH_CHALLENGE      = 0x10,
	P_AUTH_RESPONSE	      = 0x11,
	P_STATE_CHG_REQ	      = 0x12,

	/* asender (meta socket) */
	P_PING		      = 0x13,
	P_PING_ACK	      = 0x14,
	P_RECV_ACK	      = 0x15, /* Used in protocol B */
	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
	P_BARRIER_ACK	      = 0x1c,
	P_STATE_CHG_REPLY     = 0x1d,

	/* "new" commands, no longer fitting into the ordering scheme above */

	P_OV_REQUEST	      = 0x1e, /* data socket */
	P_OV_REPLY	      = 0x1f,
	P_OV_RESULT	      = 0x20, /* meta socket */
	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */

	P_MAX_CMD	      = 0x25,
	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
	P_MAX_OPT_CMD	      = 0x101,

	/* special command ids for handshake */

	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */

	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
};

/**
 * cmdname() - human readable name of a packet command, for log messages
 * @cmd: command id as found in the packet header
 *
 * Never returns NULL: the three handshake ids have fixed names, every
 * command below P_MAX_CMD has an entry in the table, and anything else
 * (including a table slot that was forgotten when a command was added)
 * is reported as "Unknown".
 */
static inline const char *cmdname(enum drbd_packets cmd)
{
	/* THINK may need to become several global tables
	 * when we want to support more than
	 * one PRO_VERSION */
	static const char * const cmdnames[] = {
		[P_DATA]	        = "Data",
		[P_DATA_REPLY]	        = "DataReply",
		[P_RS_DATA_REPLY]	= "RSDataReply",
		[P_BARRIER]	        = "Barrier",
		[P_BITMAP]	        = "ReportBitMap",
		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
		[P_UNPLUG_REMOTE]	= "UnplugRemote",
		[P_DATA_REQUEST]	= "DataRequest",
		[P_RS_DATA_REQUEST]     = "RSDataRequest",
		[P_SYNC_PARAM]	        = "SyncParam",
		[P_SYNC_PARAM89]	= "SyncParam89",
		[P_PROTOCOL]            = "ReportProtocol",
		[P_UUIDS]	        = "ReportUUIDs",
		[P_SIZES]	        = "ReportSizes",
		[P_STATE]	        = "ReportState",
		[P_SYNC_UUID]           = "ReportSyncUUID",
		[P_AUTH_CHALLENGE]      = "AuthChallenge",
		[P_AUTH_RESPONSE]	= "AuthResponse",
		[P_PING]		= "Ping",
		[P_PING_ACK]	        = "PingAck",
		[P_RECV_ACK]	        = "RecvAck",
		[P_WRITE_ACK]	        = "WriteAck",
		[P_RS_WRITE_ACK]	= "RSWriteAck",
		[P_DISCARD_ACK]	        = "DiscardAck",
		[P_NEG_ACK]	        = "NegAck",
		[P_NEG_DREPLY]	        = "NegDReply",
		[P_NEG_RS_DREPLY]	= "NegRSDReply",
		[P_BARRIER_ACK]	        = "BarrierAck",
		[P_STATE_CHG_REQ]       = "StateChgRequest",
		[P_STATE_CHG_REPLY]     = "StateChgReply",
		[P_OV_REQUEST]          = "OVRequest",
		[P_OV_REPLY]            = "OVReply",
		[P_OV_RESULT]           = "OVResult",
		/* these three were missing, cmdname() returned NULL for them */
		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
		[P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
		[P_COMPRESSED_BITMAP]   = "CBitmap",
		[P_MAX_CMD]	        = NULL,
	};
	const char *name;

	if (cmd == P_HAND_SHAKE_M)
		return "HandShakeM";
	if (cmd == P_HAND_SHAKE_S)
		return "HandShakeS";
	if (cmd == P_HAND_SHAKE)
		return "HandShake";
	/* the cast also rejects a negative value, should the enum be signed */
	if ((unsigned int)cmd >= P_MAX_CMD)
		return "Unknown";

	name = cmdnames[cmd];
	return name ? name : "Unknown";
}
278
279/* for sending/receiving the bitmap,
280 * possibly in some encoding scheme */
281struct bm_xfer_ctx {
282 /* "const"
283 * stores total bits and long words
284 * of the bitmap, so we don't need to
285 * call the accessor functions over and again. */
286 unsigned long bm_bits;
287 unsigned long bm_words;
288 /* during xfer, current position within the bitmap */
289 unsigned long bit_offset;
290 unsigned long word_offset;
291
292 /* statistics; index: (h->command == P_BITMAP) */
293 unsigned packets[2];
294 unsigned bytes[2];
295};
296
297extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
298 const char *direction, struct bm_xfer_ctx *c);
299
300static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
301{
302 /* word_offset counts "native long words" (32 or 64 bit),
303 * aligned at 64 bit.
304 * Encoded packet may end at an unaligned bit offset.
305 * In case a fallback clear text packet is transmitted in
306 * between, we adjust this offset back to the last 64bit
307 * aligned "native long word", which makes coding and decoding
308 * the plain text bitmap much more convenient. */
309#if BITS_PER_LONG == 64
310 c->word_offset = c->bit_offset >> 6;
311#elif BITS_PER_LONG == 32
312 c->word_offset = c->bit_offset >> 5;
313 c->word_offset &= ~(1UL);
314#else
315# error "unsupported BITS_PER_LONG"
316#endif
317}
318
319#ifndef __packed
320#define __packed __attribute__((packed))
321#endif
322
323/* This is the layout for a packet on the wire.
324 * The byteorder is the network byte order.
325 * (except block_id and barrier fields.
326 * these are pointers to local structs
327 * and have no relevance for the partner,
328 * which just echoes them as received.)
329 *
330 * NOTE that the payload starts at a long aligned offset,
331 * regardless of 32 or 64 bit arch!
332 */
333struct p_header {
334 u32 magic;
335 u16 command;
336 u16 length; /* bytes of data after this header */
337 u8 payload[0];
338} __packed;
339/* 8 bytes. packet FIXED for the next century! */
340
341/*
342 * short commands, packets without payload, plain p_header:
343 * P_PING
344 * P_PING_ACK
345 * P_BECOME_SYNC_TARGET
346 * P_BECOME_SYNC_SOURCE
347 * P_UNPLUG_REMOTE
348 */
349
350/*
351 * commands with out-of-struct payload:
352 * P_BITMAP (no additional fields)
353 * P_DATA, P_DATA_REPLY (see p_data)
354 * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
355 */
356
357/* these defines must not be changed without changing the protocol version */
358#define DP_HARDBARRIER 1
359#define DP_RW_SYNC 2
360#define DP_MAY_SET_IN_SYNC 4
361
362struct p_data {
363 struct p_header head;
364 u64 sector; /* 64 bits sector number */
365 u64 block_id; /* to identify the request in protocol B&C */
366 u32 seq_num;
367 u32 dp_flags;
368} __packed;
369
370/*
371 * commands which share a struct:
372 * p_block_ack:
373 * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
374 * P_DISCARD_ACK (proto C, two-primaries conflict detection)
375 * p_block_req:
376 * P_DATA_REQUEST, P_RS_DATA_REQUEST
377 */
378struct p_block_ack {
379 struct p_header head;
380 u64 sector;
381 u64 block_id;
382 u32 blksize;
383 u32 seq_num;
384} __packed;
385
386
387struct p_block_req {
388 struct p_header head;
389 u64 sector;
390 u64 block_id;
391 u32 blksize;
392 u32 pad; /* to multiple of 8 Byte */
393} __packed;
394
395/*
396 * commands with their own struct for additional fields:
397 * P_HAND_SHAKE
398 * P_BARRIER
399 * P_BARRIER_ACK
400 * P_SYNC_PARAM
401 * ReportParams
402 */
403
404struct p_handshake {
405 struct p_header head; /* 8 bytes */
406 u32 protocol_min;
407 u32 feature_flags;
408 u32 protocol_max;
409
410 /* should be more than enough for future enhancements
411 * for now, feature_flags and the reserverd array shall be zero.
412 */
413
414 u32 _pad;
415 u64 reserverd[7];
416} __packed;
417/* 80 bytes, FIXED for the next century */
418
419struct p_barrier {
420 struct p_header head;
421 u32 barrier; /* barrier number _handle_ only */
422 u32 pad; /* to multiple of 8 Byte */
423} __packed;
424
425struct p_barrier_ack {
426 struct p_header head;
427 u32 barrier;
428 u32 set_size;
429} __packed;
430
431struct p_rs_param {
432 struct p_header head;
433 u32 rate;
434
435 /* Since protocol version 88 and higher. */
436 char verify_alg[0];
437} __packed;
438
439struct p_rs_param_89 {
440 struct p_header head;
441 u32 rate;
442 /* protocol version 89: */
443 char verify_alg[SHARED_SECRET_MAX];
444 char csums_alg[SHARED_SECRET_MAX];
445} __packed;
446
447struct p_protocol {
448 struct p_header head;
449 u32 protocol;
450 u32 after_sb_0p;
451 u32 after_sb_1p;
452 u32 after_sb_2p;
453 u32 want_lose;
454 u32 two_primaries;
455
456 /* Since protocol version 87 and higher. */
457 char integrity_alg[0];
458
459} __packed;
460
461struct p_uuids {
462 struct p_header head;
463 u64 uuid[UI_EXTENDED_SIZE];
464} __packed;
465
466struct p_rs_uuid {
467 struct p_header head;
468 u64 uuid;
469} __packed;
470
471struct p_sizes {
472 struct p_header head;
473 u64 d_size; /* size of disk */
474 u64 u_size; /* user requested size */
475 u64 c_size; /* current exported size */
476 u32 max_segment_size; /* Maximal size of a BIO */
477 u32 queue_order_type;
478} __packed;
479
480struct p_state {
481 struct p_header head;
482 u32 state;
483} __packed;
484
485struct p_req_state {
486 struct p_header head;
487 u32 mask;
488 u32 val;
489} __packed;
490
491struct p_req_state_reply {
492 struct p_header head;
493 u32 retcode;
494} __packed;
495
496struct p_drbd06_param {
497 u64 size;
498 u32 state;
499 u32 blksize;
500 u32 protocol;
501 u32 version;
502 u32 gen_cnt[5];
503 u32 bit_map_gen[5];
504} __packed;
505
506struct p_discard {
507 struct p_header head;
508 u64 block_id;
509 u32 seq_num;
510 u32 pad;
511} __packed;
512
513/* Valid values for the encoding field.
514 * Bump proto version when changing this. */
515enum drbd_bitmap_code {
516 /* RLE_VLI_Bytes = 0,
517 * and other bit variants had been defined during
518 * algorithm evaluation. */
519 RLE_VLI_Bits = 2,
520};
521
522struct p_compressed_bm {
523 struct p_header head;
524 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
525 * (encoding & 0x80): polarity (set/unset) of first runlength
526 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
527 * used to pad up to head.length bytes
528 */
529 u8 encoding;
530
531 u8 code[0];
532} __packed;
533
534/* DCBP: Drbd Compressed Bitmap Packet ... */
535static inline enum drbd_bitmap_code
536DCBP_get_code(struct p_compressed_bm *p)
537{
538 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
539}
540
541static inline void
542DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
543{
544 BUG_ON(code & ~0xf);
545 p->encoding = (p->encoding & ~0xf) | code;
546}
547
548static inline int
549DCBP_get_start(struct p_compressed_bm *p)
550{
551 return (p->encoding & 0x80) != 0;
552}
553
554static inline void
555DCBP_set_start(struct p_compressed_bm *p, int set)
556{
557 p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
558}
559
560static inline int
561DCBP_get_pad_bits(struct p_compressed_bm *p)
562{
563 return (p->encoding >> 4) & 0x7;
564}
565
566static inline void
567DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
568{
569 BUG_ON(n & ~0x7);
570 p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
571}
572
573/* one bitmap packet, including the p_header,
574 * should fit within one _architecture independent_ page.
575 * so we need to use the fixed size 4KiB page size
576 * most architectures have used for a long time.
577 */
578#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
579#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
580#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
581#if (PAGE_SIZE < 4096)
582/* drbd_send_bitmap / receive_bitmap would break horribly */
583#error "PAGE_SIZE too small"
584#endif
585
586union p_polymorph {
587 struct p_header header;
588 struct p_handshake handshake;
589 struct p_data data;
590 struct p_block_ack block_ack;
591 struct p_barrier barrier;
592 struct p_barrier_ack barrier_ack;
593 struct p_rs_param_89 rs_param_89;
594 struct p_protocol protocol;
595 struct p_sizes sizes;
596 struct p_uuids uuids;
597 struct p_state state;
598 struct p_req_state req_state;
599 struct p_req_state_reply req_state_reply;
600 struct p_block_req block_req;
601} __packed;
602
603/**********************************************************************/
604enum drbd_thread_state {
605 None,
606 Running,
607 Exiting,
608 Restarting
609};
610
611struct drbd_thread {
612 spinlock_t t_lock;
613 struct task_struct *task;
614 struct completion stop;
615 enum drbd_thread_state t_state;
616 int (*function) (struct drbd_thread *);
617 struct drbd_conf *mdev;
618 int reset_cpu_mask;
619};
620
621static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
622{
623 /* THINK testing the t_state seems to be uncritical in all cases
624 * (but thread_{start,stop}), so we can read it *without* the lock.
625 * --lge */
626
627 smp_rmb();
628 return thi->t_state;
629}
630
631
632/*
633 * Having this as the first member of a struct provides sort of "inheritance".
634 * "derived" structs can be "drbd_queue_work()"ed.
635 * The callback should know and cast back to the descendant struct.
636 * drbd_request and drbd_epoch_entry are descendants of drbd_work.
637 */
638struct drbd_work;
639typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
640struct drbd_work {
641 struct list_head list;
642 drbd_work_cb cb;
643};
644
645struct drbd_tl_epoch;
646struct drbd_request {
647 struct drbd_work w;
648 struct drbd_conf *mdev;
649
650 /* if local IO is not allowed, will be NULL.
651 * if local IO _is_ allowed, holds the locally submitted bio clone,
652 * or, after local IO completion, the ERR_PTR(error).
653 * see drbd_endio_pri(). */
654 struct bio *private_bio;
655
656 struct hlist_node colision;
657 sector_t sector;
658 unsigned int size;
659 unsigned int epoch; /* barrier_nr */
660
661 /* barrier_nr: used to check on "completion" whether this req was in
662 * the current epoch, and we therefore have to close it,
663 * starting a new epoch...
664 */
665
666 /* up to here, the struct layout is identical to drbd_epoch_entry;
667 * we might be able to use that to our advantage... */
668
669 struct list_head tl_requests; /* ring list in the transfer log */
670 struct bio *master_bio; /* master bio pointer */
671 unsigned long rq_state; /* see comments above _req_mod() */
672 int seq_num;
673 unsigned long start_time;
674};
675
676struct drbd_tl_epoch {
677 struct drbd_work w;
678 struct list_head requests; /* requests before */
679 struct drbd_tl_epoch *next; /* pointer to the next barrier */
680 unsigned int br_number; /* the barriers identifier. */
681 int n_req; /* number of requests attached before this barrier */
682};
683
684struct drbd_request;
685
686/* These Tl_epoch_entries may be in one of 6 lists:
687 active_ee .. data packet being written
688 sync_ee .. syncer block being written
689 done_ee .. block written, need to send P_WRITE_ACK
690 read_ee .. [RS]P_DATA_REQUEST being read
691*/
692
693struct drbd_epoch {
694 struct list_head list;
695 unsigned int barrier_nr;
696 atomic_t epoch_size; /* increased on every request added. */
697 atomic_t active; /* increased on every req. added, and dec on every finished. */
698 unsigned long flags;
699};
700
701/* drbd_epoch flag bits */
702enum {
703 DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
704 DE_BARRIER_IN_NEXT_EPOCH_DONE,
705 DE_CONTAINS_A_BARRIER,
706 DE_HAVE_BARRIER_NUMBER,
707 DE_IS_FINISHING,
708};
709
710enum epoch_event {
711 EV_PUT,
712 EV_GOT_BARRIER_NR,
713 EV_BARRIER_DONE,
714 EV_BECAME_LAST,
715 EV_TRACE_FLUSH, /* TRACE_ are not real events, only used for tracing */
716 EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */
717 EV_TRACE_SETTING_BI, /* Barrier is expressed with the first write of the next epoch */
718 EV_TRACE_ALLOC,
719 EV_TRACE_FREE,
720 EV_CLEANUP = 32, /* used as flag */
721};
722
723struct drbd_epoch_entry {
724 struct drbd_work w;
725 struct drbd_conf *mdev;
726 struct bio *private_bio;
727 struct hlist_node colision;
728 sector_t sector;
729 unsigned int size;
730 struct drbd_epoch *epoch;
731
732 /* up to here, the struct layout is identical to drbd_request;
733 * we might be able to use that to our advantage... */
734
735 unsigned int flags;
736 u64 block_id;
737};
738
739struct drbd_wq_barrier {
740 struct drbd_work w;
741 struct completion done;
742};
743
744struct digest_info {
745 int digest_size;
746 void *digest;
747};
748
749/* ee flag bits */
750enum {
751 __EE_CALL_AL_COMPLETE_IO,
752 __EE_CONFLICT_PENDING,
753 __EE_MAY_SET_IN_SYNC,
754 __EE_IS_BARRIER,
755};
756#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
757#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
758#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
759#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
760
761/* global flag bits */
762enum {
763 CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */
764 SIGNAL_ASENDER, /* whether asender wants to be interrupted */
765 SEND_PING, /* whether asender should send a ping asap */
766
767 STOP_SYNC_TIMER, /* tell timer to cancel itself */
768 UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
769 UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
770 MD_DIRTY, /* current uuids and flags not yet on disk */
771 DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
772 USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
773 CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
774 CL_ST_CHG_SUCCESS,
775 CL_ST_CHG_FAIL,
776 CRASHED_PRIMARY, /* This node was a crashed primary.
777 * Gets cleared when the state.conn
778 * goes into C_CONNECTED state. */
779 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
780 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
781 CONSIDER_RESYNC,
782
783 MD_NO_BARRIER, /* meta data device does not support barriers,
784 so don't even try */
785 SUSPEND_IO, /* suspend application io */
786 BITMAP_IO, /* suspend application io;
787 once no more io in flight, start bitmap io */
788 BITMAP_IO_QUEUED, /* Started bitmap IO */
789 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
790 NET_CONGESTED, /* The data socket is congested */
791
792 CONFIG_PENDING, /* serialization of (re)configuration requests.
793 * if set, also prevents the device from dying */
794 DEVICE_DYING, /* device became unconfigured,
795 * but worker thread is still handling the cleanup.
796 * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed,
797 * while this is set. */
798 RESIZE_PENDING, /* Size change detected locally, waiting for the response from
799 * the peer, if it changed there as well. */
800};
801
802struct drbd_bitmap; /* opaque for drbd_conf */
803
804/* TODO sort members for performance
805 * MAYBE group them further */
806
807/* THINK maybe we actually want to use the default "event/%s" worker threads
808 * or similar in linux 2.6, which uses per cpu data and threads.
809 *
810 * The list is protected by the q_lock member below.
811 * NOTE(review): this comment used to name mdev->req_lock as the protecting
812 * lock; confirm against drbd_queue_work which lock is really taken.
813 */
814struct drbd_work_queue {
815 struct list_head q; /* head of the list of queued work */
816 struct semaphore s; /* producers up it, worker down()s it */
817 spinlock_t q_lock; /* to protect the list. */
818};
819
/* State kept per network socket: a work queue, the socket pointer, the mutex
 * taken around its use (see drbd_get_data_sock()), and preallocated packet
 * buffers. struct drbd_conf embeds two of these, "data" and "meta". */
820struct drbd_socket {
821 struct drbd_work_queue work;
822 struct mutex mutex;
823 struct socket *socket; /* may be NULL, see drbd_get_data_sock() */
824 /* this way we get our
825 * send/receive buffers off the stack */
826 union p_polymorph sbuf;
827 union p_polymorph rbuf;
828};
829
/* Meta data bookkeeping; embedded as "md" in struct drbd_backing_dev.
 * Offsets and sizes are in units of sectors, as noted per member. */
830struct drbd_md {
831 u64 md_offset; /* sector offset to 'super' block */
832
833 u64 la_size_sect; /* last agreed size, unit sectors */
834 u64 uuid[UI_SIZE];
835 u64 device_uuid;
836 u32 flags; /* presumably what drbd_md_set_flag()/drbd_md_test_flag() operate on -- TODO confirm */
837 u32 md_size_sect;
838
839 s32 al_offset; /* signed relative sector offset to al area */
840 s32 bm_offset; /* signed relative sector offset to bitmap */
841
842 /* u32 al_nr_extents; important for restoring the AL
843 * is stored into sync_conf.al_extents, which in turn
844 * gets applied to act_log->nr_elements
845 */
846};
847
848/* for sync_conf and other types...
 * With the NL_* macros defined as below, each NL_PACKET() used in
 * linux/drbd_nl.h expands to a plain struct definition: NL_INTEGER becomes
 * an int member, NL_INT64 a __u64, NL_BIT a one-bit bitfield, and NL_STRING
 * a byte array of the given length plus an int named <member>_len.
 * The pn, pr and number arguments are ignored by these definitions. */
849#define NL_PACKET(name, number, fields) struct name { fields };
850#define NL_INTEGER(pn,pr,member) int member;
851#define NL_INT64(pn,pr,member) __u64 member;
852#define NL_BIT(pn,pr,member) unsigned member:1;
853#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
854#include "linux/drbd_nl.h"
855
/* The local backing storage of a device: the data block device, the meta data
 * block device, their struct file handles, the meta data bookkeeping and the
 * user supplied disk configuration. Referenced from struct drbd_conf as ldev. */
856struct drbd_backing_dev {
857 struct block_device *backing_bdev;
858 struct block_device *md_bdev;
859 struct file *lo_file; /* presumably the file behind backing_bdev -- TODO confirm */
860 struct file *md_file; /* presumably the file behind md_bdev -- TODO confirm */
861 struct drbd_md md;
862 struct disk_conf dc; /* The user provided config... */
863 sector_t known_size; /* last known size of that backing device */
864};
865
/* Context for one meta data IO: the owning device, a completion and an error
 * code. NOTE(review): presumably the submitter waits on "event" and then
 * reads "error"; confirm against drbd_md_sync_page_io(). */
866struct drbd_md_io {
867 struct drbd_conf *mdev;
868 struct completion event;
869 int error;
870};
871
/* Work item describing a queued bitmap IO. The io_fn, done and why members
 * have the same types as the arguments of drbd_queue_bitmap_io(). */
872struct bm_io_work {
873 struct drbd_work w;
874 char *why; /* free-form reason string */
875 int (*io_fn)(struct drbd_conf *mdev); /* the bitmap IO to perform */
876 void (*done)(struct drbd_conf *mdev, int rv); /* presumably called with io_fn's result -- TODO confirm */
877};
878
/* Write ordering methods; the selected one is kept in
 * drbd_conf.write_ordering and is an argument of drbd_bump_write_ordering().
 * NOTE(review): the numeric order (WO_none = 0 ... WO_bio_barrier = 3) looks
 * significant to callers; do not reorder without checking them. */
879enum write_ordering_e {
880 WO_none,
881 WO_drain_io,
882 WO_bdev_flush,
883 WO_bio_barrier
884};
885
/* Central per-device structure (see minor_to_mdev()). It bundles the
 * configuration, both sockets, state, statistics counters, resync and online
 * verify progress, the three threads, and the bitmap / activity log
 * bookkeeping of one DRBD device. */
886struct drbd_conf {
887 /* things that are stored as / read from meta data on disk */
888 unsigned long flags; /* bit flags, e.g. CLUSTER_ST_CHANGE (see drbd_state_lock()) */
889
890 /* configured by drbdsetup */
891 struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
892 struct syncer_conf sync_conf;
893 struct drbd_backing_dev *ldev __protected_by(local);
894
895 sector_t p_size; /* partner's disk size */
896 struct request_queue *rq_queue;
897 struct block_device *this_bdev;
898 struct gendisk *vdisk;
899
900 struct drbd_socket data; /* data/barrier/cstate/parameter packets */
901 struct drbd_socket meta; /* ping/ack (metadata) packets */
902 int agreed_pro_version; /* actually used protocol version */
903 unsigned long last_received; /* in jiffies, either socket */
904 unsigned int ko_count;
905 struct drbd_work resync_work,
906 unplug_work,
907 md_sync_work;
908 struct timer_list resync_timer;
909 struct timer_list md_sync_timer;
910
911 /* Used after attach while negotiating new disk state. */
912 union drbd_state new_state_tmp;
913
914 union drbd_state state;
915 wait_queue_head_t misc_wait;
916 wait_queue_head_t state_wait; /* upon each state change. */
917 unsigned int send_cnt;
918 unsigned int recv_cnt;
919 unsigned int read_cnt;
920 unsigned int writ_cnt;
921 unsigned int al_writ_cnt;
922 unsigned int bm_writ_cnt;
923 atomic_t ap_bio_cnt; /* Requests we need to complete */
924 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
925 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
926 atomic_t unacked_cnt; /* Need to send replies for */
927 atomic_t local_cnt; /* Waiting for local completion */
928 atomic_t net_cnt; /* Users of net_conf */
929 spinlock_t req_lock;
930 struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
931 struct drbd_tl_epoch *newest_tle;
932 struct drbd_tl_epoch *oldest_tle;
933 struct list_head out_of_sequence_requests;
934 struct hlist_head *tl_hash;
935 unsigned int tl_hash_s;
936
937 /* blocks to sync in this run [unit BM_BLOCK_SIZE] */
938 unsigned long rs_total;
939 /* number of sync IOs that failed in this run */
940 unsigned long rs_failed;
941 /* Syncer's start time [unit jiffies] */
942 unsigned long rs_start;
943 /* cumulated time in PausedSyncX state [unit jiffies] */
944 unsigned long rs_paused;
945 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
946 unsigned long rs_mark_left;
947 /* mark's time [unit jiffies] */
948 unsigned long rs_mark_time;
949 /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
950 unsigned long rs_same_csum;
951
952 /* where does the admin want us to start? (sector) */
953 sector_t ov_start_sector;
954 /* where are we now? (sector) */
955 sector_t ov_position;
956 /* Start sector of out of sync range (to merge printk reporting). */
957 sector_t ov_last_oos_start;
958 /* size of out-of-sync range in sectors. */
959 sector_t ov_last_oos_size;
960 unsigned long ov_left; /* in bits */
961 struct crypto_hash *csums_tfm;
962 struct crypto_hash *verify_tfm;
963
964 struct drbd_thread receiver;
965 struct drbd_thread worker;
966 struct drbd_thread asender;
967 struct drbd_bitmap *bitmap;
968 unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
969
970 /* Used to track operations of resync... */
971 struct lru_cache *resync;
972 /* Number of locked elements in resync LRU */
973 unsigned int resync_locked;
974 /* resync extent number waiting for application requests */
975 unsigned int resync_wenr;
976
977 int open_cnt;
978 u64 *p_uuid;
979 struct drbd_epoch *current_epoch;
980 spinlock_t epoch_lock;
981 unsigned int epochs;
982 enum write_ordering_e write_ordering;
983 struct list_head active_ee; /* IO in progress */
984 struct list_head sync_ee; /* IO in progress */
985 struct list_head done_ee; /* send ack */
986 struct list_head read_ee; /* IO in progress */
987 struct list_head net_ee; /* zero-copy network send in progress */
988 struct hlist_head *ee_hash; /* is protected by req_lock! */
989 unsigned int ee_hash_s;
990
991 /* this one is protected by ee_lock, single thread */
 /* NOTE(review): this struct has no ee_lock member; the comment above may be
  * stale -- confirm which lock, if any, covers last_write_w_barrier. */
992 struct drbd_epoch_entry *last_write_w_barrier;
993
994 int next_barrier_nr;
995 struct hlist_head *app_reads_hash; /* is protected by req_lock */
996 struct list_head resync_reads;
997 atomic_t pp_in_use;
998 wait_queue_head_t ee_wait;
999 struct page *md_io_page; /* one page buffer for md_io */
1000 struct page *md_io_tmpp; /* for logical_block_size != 512 */
1001 struct mutex md_io_mutex; /* protects the md_io_buffer */
1002 spinlock_t al_lock;
1003 wait_queue_head_t al_wait;
1004 struct lru_cache *act_log; /* activity log */
1005 unsigned int al_tr_number;
1006 int al_tr_cycle;
1007 int al_tr_pos; /* position of the next transaction in the journal */
1008 struct crypto_hash *cram_hmac_tfm;
1009 struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
1010 struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
1011 void *int_dig_out;
1012 void *int_dig_in;
1013 void *int_dig_vv;
1014 wait_queue_head_t seq_wait;
1015 atomic_t packet_seq;
1016 unsigned int peer_seq;
1017 spinlock_t peer_seq_lock;
1018 unsigned int minor; /* returned by mdev_to_minor() */
1019 unsigned long comm_bm_set; /* communicated number of set bits. */
1020 cpumask_var_t cpu_mask;
1021 struct bm_io_work bm_io_work;
1022 u64 ed_uuid; /* UUID of the exposed data */
1023 struct mutex state_mutex;
1024 char congestion_reason; /* Why we were congested... */
1025};
1026
1027static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
1028{
1029 struct drbd_conf *mdev;
1030
1031 mdev = minor < minor_count ? minor_table[minor] : NULL;
1032
1033 return mdev;
1034}
1035
/* Return the minor number stored in @mdev. */
1036static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
1037{
1038 return mdev->minor;
1039}
1040
1041/* returns 1 if it was successfull,
1042 * returns 0 if there was no data socket.
1043 * so wherever you are going to use the data.socket, e.g. do
1044 * if (!drbd_get_data_sock(mdev))
1045 * return 0;
1046 * CODE();
1047 * drbd_put_data_sock(mdev);
1048 */
1049static inline int drbd_get_data_sock(struct drbd_conf *mdev)
1050{
1051 mutex_lock(&mdev->data.mutex);
1052 /* drbd_disconnect() could have called drbd_free_sock()
1053 * while we were waiting in down()... */
1054 if (unlikely(mdev->data.socket == NULL)) {
1055 mutex_unlock(&mdev->data.mutex);
1056 return 0;
1057 }
1058 return 1;
1059}
1060
/* Release mdev->data.mutex; counterpart of a drbd_get_data_sock() call that
 * returned 1. */
1061static inline void drbd_put_data_sock(struct drbd_conf *mdev)
1062{
1063 mutex_unlock(&mdev->data.mutex);
1064}
1065
1066/*
1067 * function declarations
1068 *************************/
1069
1070/* drbd_main.c */
1071
/* Flags accepted by the state change functions (drbd_change_state(),
 * _drbd_request_state(), __drbd_set_state()). Each flag is a distinct bit, so
 * flags can be combined; CS_ORDERED is the combination of CS_WAIT_COMPLETE
 * and CS_SERIALIZE. */
enum chg_state_flags {
	CS_HARD		 = 1 << 0,
	CS_VERBOSE	 = 1 << 1,
	CS_WAIT_COMPLETE = 1 << 2,
	CS_SERIALIZE	 = 1 << 3,
	CS_ORDERED	 = CS_WAIT_COMPLETE | CS_SERIALIZE,
};
1079
1080extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1081extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
1082 union drbd_state mask, union drbd_state val);
1083extern void drbd_force_state(struct drbd_conf *, union drbd_state,
1084 union drbd_state);
1085extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
1086 union drbd_state, enum chg_state_flags);
1087extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
1088 enum chg_state_flags, struct completion *done);
1089extern void print_st_err(struct drbd_conf *, union drbd_state,
1090 union drbd_state, int);
1091extern int drbd_thread_start(struct drbd_thread *thi);
1092extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
1093#ifdef CONFIG_SMP
1094extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
1095extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
1096#else
1097#define drbd_thread_current_set_cpu(A) ({})
1098#define drbd_calc_cpu_mask(A) ({})
1099#endif
1100extern void drbd_free_resources(struct drbd_conf *mdev);
1101extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
1102 unsigned int set_size);
1103extern void tl_clear(struct drbd_conf *mdev);
1104extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
1105extern void drbd_free_sock(struct drbd_conf *mdev);
1106extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1107 void *buf, size_t size, unsigned msg_flags);
1108extern int drbd_send_protocol(struct drbd_conf *mdev);
1109extern int drbd_send_uuids(struct drbd_conf *mdev);
1110extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1111extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
1112extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
1113extern int _drbd_send_state(struct drbd_conf *mdev);
1114extern int drbd_send_state(struct drbd_conf *mdev);
1115extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1116 enum drbd_packets cmd, struct p_header *h,
1117 size_t size, unsigned msg_flags);
1118#define USE_DATA_SOCKET 1
1119#define USE_META_SOCKET 0
1120extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1121 enum drbd_packets cmd, struct p_header *h,
1122 size_t size);
1123extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
1124 char *data, size_t size);
1125extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
1126extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
1127 u32 set_size);
1128extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
1129 struct drbd_epoch_entry *e);
1130extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
1131 struct p_block_req *rp);
1132extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1133 struct p_data *dp);
1134extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1135 sector_t sector, int blksize, u64 block_id);
1136extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
1137 struct drbd_epoch_entry *e);
1138extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
1139extern int _drbd_send_barrier(struct drbd_conf *mdev,
1140 struct drbd_tl_epoch *barrier);
1141extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1142 sector_t sector, int size, u64 block_id);
1143extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
1144 sector_t sector,int size,
1145 void *digest, int digest_size,
1146 enum drbd_packets cmd);
1147extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
1148
1149extern int drbd_send_bitmap(struct drbd_conf *mdev);
1150extern int _drbd_send_bitmap(struct drbd_conf *mdev);
1151extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
1152extern void drbd_free_bc(struct drbd_backing_dev *ldev);
1153extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1154
1155/* drbd_meta-data.c (still in drbd_main.c) */
1156extern void drbd_md_sync(struct drbd_conf *mdev);
1157extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1158/* maybe define them below as inline? */
1159extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1160extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1161extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1162extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1163extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
1164extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
1165extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
1166extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
1167extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
1168extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1169 int (*io_fn)(struct drbd_conf *),
1170 void (*done)(struct drbd_conf *, int),
1171 char *why);
1172extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1173extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1174extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1175
1176
1177/* Meta data layout
1178 * We reserve a 128MB block (4k aligned)
1179 * either at the end of the backing device
1180 * or on a separate meta data device. */
1181
1182#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
1183/* The following numbers are sectors */
1184#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
1185#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
1186/* Allows up to about 3.8TB */
1187#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
1188
1189/* Since the smallest IO unit is usually 512 bytes */
1190#define MD_SECTOR_SHIFT 9
1191#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
1192
1193/* activity log */
1194#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
1195#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
1196#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
1197
1198#if BITS_PER_LONG == 32
1199#define LN2_BPL 5
1200#define cpu_to_lel(A) cpu_to_le32(A)
1201#define lel_to_cpu(A) le32_to_cpu(A)
1202#elif BITS_PER_LONG == 64
1203#define LN2_BPL 6
1204#define cpu_to_lel(A) cpu_to_le64(A)
1205#define lel_to_cpu(A) le64_to_cpu(A)
1206#else
1207#error "LN2 of BITS_PER_LONG unknown!"
1208#endif
1209
1210/* resync bitmap */
1211/* 16MB sized 'bitmap extent' to track syncer usage */
1212struct bm_extent {
1213 int rs_left; /* number of bits set (out of sync) in this extent. */
1214 int rs_failed; /* number of failed resync requests in this extent. */
1215 unsigned long flags; /* holds the BME_* flags defined below */
1216 struct lc_element lce; /* embedded lru_cache element; presumably an entry of mdev->resync -- TODO confirm */
1217};
1218
/* NOTE(review): BME_NO_WRITES is 0, so these look like bit numbers rather
 * than masks; confirm they are only used with the bit operations. */
1219#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
1220#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
1221
1222/* drbd_bitmap.c */
1223/*
1224 * We need to store one bit for a block.
1225 * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
1226 * Bit 0 ==> local node thinks this block is binary identical on both nodes
1227 * Bit 1 ==> local node thinks this block needs to be synced.
1228 */
1229
1230#define BM_BLOCK_SHIFT 12 /* 4k per bit */
1231#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
1232/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
1233 * per sector of on disk bitmap */
1234#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
1235#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
1236
1237#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
1238#error "HAVE YOU FIXED drbdmeta AS WELL??"
1239#endif
1240
1241/* thus many _storage_ sectors are described by one bit */
1242#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
1243#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
1244#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
1245
1246/* bit to represented kilo byte conversion */
1247#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
1248
1249/* in which _bitmap_ extent (resp. sector) the bit for a certain
1250 * _storage_ sector is located in */
1251#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
1252
1253/* how much _storage_ sectors we have per bitmap sector */
1254#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
1255#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
1256
1257/* in one sector of the bitmap, we have this many activity_log extents. */
1258#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1259#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
1260
1261#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1262#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
1263
1264/* the extent in "PER_EXTENT" below is an activity log extent
1265 * we need that many (long words/bytes) to store the bitmap
1266 * of one AL_EXTENT_SIZE chunk of storage.
1267 * we can store the bitmap for that many AL_EXTENTS within
1268 * one sector of the _on_disk_ bitmap:
1269 * bit 0 bit 37 bit 38 bit (512*8)-1
1270 * ...|........|........|.. // ..|........|
1271 * sect. 0 `296 `304 ^(512*8*8)-1
1272 *
1273#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
1274#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
1275#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
1276 */
1277
1278#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
1279#define DRBD_MAX_SECTORS_BM \
1280 ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
1281#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
1282#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
1283#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
1284#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
1285#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
1286#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
1287#else
1288#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
1289/* 16 TB in units of sectors */
1290#if BITS_PER_LONG == 32
1291/* adjust by one page worth of bitmap,
1292 * so we won't wrap around in drbd_bm_find_next_bit.
1293 * you should use 64bit OS for that much storage, anyways. */
1294#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
1295#else
1296#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
1297#endif
1298#endif
1299
1300/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
1301 * With a value of 6 all IO in one 32K block make it to the same slot of the
1302 * hash table. */
1303#define HT_SHIFT 6
1304#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
1305
1306/* Number of elements in the app_reads_hash */
1307#define APP_R_HSIZE 15
1308
1309extern int drbd_bm_init(struct drbd_conf *mdev);
1310extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
1311extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1312extern void drbd_bm_set_all(struct drbd_conf *mdev);
1313extern void drbd_bm_clear_all(struct drbd_conf *mdev);
1314extern int drbd_bm_set_bits(
1315 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1316extern int drbd_bm_clear_bits(
1317 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1318/* bm_set_bits variant for use while holding drbd_bm_lock */
1319extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
1320 const unsigned long s, const unsigned long e);
1321extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
1322extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1323extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
1324extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1325extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1326extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1327 unsigned long al_enr);
1328extern size_t drbd_bm_words(struct drbd_conf *mdev);
1329extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
1330extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
1331extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1332/* bm_find_next variants for use while you hold drbd_bm_lock() */
1333extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1334extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
1335extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
1336extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1337/* for receive_bitmap */
1338extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
1339 size_t number, unsigned long *buffer);
1340/* for _drbd_send_bitmap and drbd_bm_write_sect */
1341extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
1342 size_t number, unsigned long *buffer);
1343
1344extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
1345extern void drbd_bm_unlock(struct drbd_conf *mdev);
1346
1347extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1348/* drbd_main.c */
1349
1350extern struct kmem_cache *drbd_request_cache;
1351extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
1352extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
1353extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
1354extern mempool_t *drbd_request_mempool;
1355extern mempool_t *drbd_ee_mempool;
1356
1357extern struct page *drbd_pp_pool; /* drbd's page pool */
1358extern spinlock_t drbd_pp_lock;
1359extern int drbd_pp_vacant;
1360extern wait_queue_head_t drbd_pp_wait;
1361
1362extern rwlock_t global_state_lock;
1363
1364extern struct drbd_conf *drbd_new_device(unsigned int minor);
1365extern void drbd_free_mdev(struct drbd_conf *mdev);
1366
1367extern int proc_details;
1368
1369/* drbd_req */
1370extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
1371extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
1372extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
1373extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1374
1375
1376/* drbd_nl.c */
1377extern void drbd_suspend_io(struct drbd_conf *mdev);
1378extern void drbd_resume_io(struct drbd_conf *mdev);
1379extern char *ppsize(char *buf, unsigned long long size);
1380extern sector_t drbd_new_dev_size(struct drbd_conf *,
1381 struct drbd_backing_dev *);
1382enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1383extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
1384extern void resync_after_online_grow(struct drbd_conf *);
1385extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1386extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
1387 int force);
1388enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1389extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
1390
1391/* drbd_worker.c */
1392extern int drbd_worker(struct drbd_thread *thi);
1393extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
1394extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
1395extern void resume_next_sg(struct drbd_conf *mdev);
1396extern void suspend_other_sg(struct drbd_conf *mdev);
1397extern int drbd_resync_finished(struct drbd_conf *mdev);
1398/* maybe rather drbd_main.c ? */
1399extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1400 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1401extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1402
1403static inline void ov_oos_print(struct drbd_conf *mdev)
1404{
1405 if (mdev->ov_last_oos_size) {
1406 dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
1407 (unsigned long long)mdev->ov_last_oos_start,
1408 (unsigned long)mdev->ov_last_oos_size);
1409 }
1410 mdev->ov_last_oos_size=0;
1411}
1412
1413
1414extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
1415/* worker callbacks */
1416extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
1417extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
1418extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
1419extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
1420extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
1421extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
1422extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
1423extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
1424extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
1425extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
1426extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
1427extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
1428extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
1429extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
1430extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
1431extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
1432extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
1433extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1434
1435extern void resync_timer_fn(unsigned long data);
1436
1437/* drbd_receiver.c */
1438extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
1439extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1440 u64 id,
1441 sector_t sector,
1442 unsigned int data_size,
1443 gfp_t gfp_mask) __must_hold(local);
1444extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
1445extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1446 struct list_head *head);
1447extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1448 struct list_head *head);
1449extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
1450extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
1451extern void drbd_flush_workqueue(struct drbd_conf *mdev);
1452
1453/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
1454 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
1455static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
1456 char __user *optval, int optlen)
1457{
1458 int err;
1459 if (level == SOL_SOCKET)
1460 err = sock_setsockopt(sock, level, optname, optval, optlen);
1461 else
1462 err = sock->ops->setsockopt(sock, level, optname, optval,
1463 optlen);
1464 return err;
1465}
1466
1467static inline void drbd_tcp_cork(struct socket *sock)
1468{
1469 int __user val = 1;
1470 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1471 (char __user *)&val, sizeof(val));
1472}
1473
1474static inline void drbd_tcp_uncork(struct socket *sock)
1475{
1476 int __user val = 0;
1477 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1478 (char __user *)&val, sizeof(val));
1479}
1480
1481static inline void drbd_tcp_nodelay(struct socket *sock)
1482{
1483 int __user val = 1;
1484 (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
1485 (char __user *)&val, sizeof(val));
1486}
1487
1488static inline void drbd_tcp_quickack(struct socket *sock)
1489{
1490 int __user val = 1;
1491 (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
1492 (char __user *)&val, sizeof(val));
1493}
1494
1495void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
1496
1497/* drbd_proc.c */
1498extern struct proc_dir_entry *drbd_proc;
1499extern struct file_operations drbd_proc_fops;
1500extern const char *drbd_conn_str(enum drbd_conns s);
1501extern const char *drbd_role_str(enum drbd_role s);
1502
1503/* drbd_actlog.c */
1504extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
1505extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
1506extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
1507extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1508extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1509extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
1510extern int drbd_rs_del_all(struct drbd_conf *mdev);
1511extern void drbd_rs_failed_io(struct drbd_conf *mdev,
1512 sector_t sector, int size);
1513extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
1514extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
1515 int size, const char *file, const unsigned int line);
1516#define drbd_set_in_sync(mdev, sector, size) \
1517 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
1518extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1519 int size, const char *file, const unsigned int line);
1520#define drbd_set_out_of_sync(mdev, sector, size) \
1521 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1522extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
1523extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
1524extern void drbd_al_shrink(struct drbd_conf *mdev);
1525
1526
1527/* drbd_nl.c */
1528
1529void drbd_nl_cleanup(void);
1530int __init drbd_nl_init(void);
1531void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
1532void drbd_bcast_sync_progress(struct drbd_conf *mdev);
1533void drbd_bcast_ee(struct drbd_conf *mdev,
1534 const char *reason, const int dgs,
1535 const char* seen_hash, const char* calc_hash,
1536 const struct drbd_epoch_entry* e);
1537
1538
1539/**
1540 * DOC: DRBD State macros
1541 *
1542 * These macros are used to express state changes in easily readable form.
1543 *
1544 * The NS macros expand to a mask and a value, that can be bit ored onto the
1545 * current state as soon as the spinlock (req_lock) was taken.
1546 *
1547 * The _NS macros are used for state functions that get called with the
1548 * spinlock. These macros expand directly to the new state value.
1549 *
1550 * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
1551 * to express state changes that affect more than one aspect of the state.
1552 *
1553 * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
1554 * Means that the network connection was established and that the peer
1555 * is in secondary role.
1556 */
/* Per-field masks of union drbd_state, named <field>_MASK so that the NS*()
 * macros below can select them by token pasting T##_MASK. */
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1

/*
 * NS(), NS2() and NS3() each expand to TWO comma separated expressions of
 * type union drbd_state, meant to be used as two consecutive function
 * arguments: first a mask with the named field(s) set to their full
 * <field>_MASK, then a value with the named field(s) set to S (S1, S2, S3).
 * All other bits of both unions are zero.
 *
 * The temporaries carry a double underscore prefix (like __ns in the _NS*()
 * variants) so that a state expression mentioning a caller variable called
 * "mask" or "val" refers to the caller's variable and is not captured by the
 * macro's own local.
 */
#define NS(T, S) \
	({ union drbd_state __mask; __mask.i = 0; __mask.T = T##_MASK; \
	   __mask; }), \
	({ union drbd_state __val; __val.i = 0; __val.T = (S); __val; })
#define NS2(T1, S1, T2, S2) \
	({ union drbd_state __mask; __mask.i = 0; __mask.T1 = T1##_MASK; \
	   __mask.T2 = T2##_MASK; __mask; }), \
	({ union drbd_state __val; __val.i = 0; __val.T1 = (S1); \
	   __val.T2 = (S2); __val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
	({ union drbd_state __mask; __mask.i = 0; __mask.T1 = T1##_MASK; \
	   __mask.T2 = T2##_MASK; __mask.T3 = T3##_MASK; __mask; }), \
	({ union drbd_state __val; __val.i = 0; __val.T1 = (S1); \
	   __val.T2 = (S2); __val.T3 = (S3); __val; })
1579
1580#define _NS(D, T, S) \
1581 D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
1582#define _NS2(D, T1, S1, T2, S2) \
1583 D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
1584 __ns.T2 = (S2); __ns; })
1585#define _NS3(D, T1, S1, T2, S2, T3, S3) \
1586 D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
1587 __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
1588
1589/*
1590 * inline helper functions
1591 *************************/
1592
1593static inline void drbd_state_lock(struct drbd_conf *mdev)
1594{
1595 wait_event(mdev->misc_wait,
1596 !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
1597}
1598
1599static inline void drbd_state_unlock(struct drbd_conf *mdev)
1600{
1601 clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
1602 wake_up(&mdev->misc_wait);
1603}
1604
1605static inline int _drbd_set_state(struct drbd_conf *mdev,
1606 union drbd_state ns, enum chg_state_flags flags,
1607 struct completion *done)
1608{
1609 int rv;
1610
1611 read_lock(&global_state_lock);
1612 rv = __drbd_set_state(mdev, ns, flags, done);
1613 read_unlock(&global_state_lock);
1614
1615 return rv;
1616}
1617
1618/**
1619 * drbd_request_state() - Reqest a state change
1620 * @mdev: DRBD device.
1621 * @mask: mask of state bits to change.
1622 * @val: value of new state bits.
1623 *
1624 * This is the most graceful way of requesting a state change. It is verbose
1625 * quite verbose in case the state change is not possible, and all those
1626 * state changes are globally serialized.
1627 */
1628static inline int drbd_request_state(struct drbd_conf *mdev,
1629 union drbd_state mask,
1630 union drbd_state val)
1631{
1632 return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
1633}
1634
1635#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
1636static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
1637{
1638 switch (mdev->ldev->dc.on_io_error) {
1639 case EP_PASS_ON:
1640 if (!forcedetach) {
1641 if (printk_ratelimit())
1642 dev_err(DEV, "Local IO failed in %s."
1643 "Passing error on...\n", where);
1644 break;
1645 }
1646 /* NOTE fall through to detach case if forcedetach set */
1647 case EP_DETACH:
1648 case EP_CALL_HELPER:
1649 if (mdev->state.disk > D_FAILED) {
1650 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1651 dev_err(DEV, "Local IO failed in %s."
1652 "Detaching...\n", where);
1653 }
1654 break;
1655 }
1656}
1657
1658/**
1659 * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
1660 * @mdev: DRBD device.
1661 * @error: Error code passed to the IO completion callback
1662 * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
1663 *
1664 * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
1665 */
1666#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
1667static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
1668 int error, int forcedetach, const char *where)
1669{
1670 if (error) {
1671 unsigned long flags;
1672 spin_lock_irqsave(&mdev->req_lock, flags);
1673 __drbd_chk_io_error_(mdev, forcedetach, where);
1674 spin_unlock_irqrestore(&mdev->req_lock, flags);
1675 }
1676}
1677
1678
1679/**
1680 * drbd_md_first_sector() - Returns the first sector number of the meta data area
1681 * @bdev: Meta data block device.
1682 *
1683 * BTW, for internal meta data, this happens to be the maximum capacity
1684 * we could agree upon with our peer node.
1685 */
1686static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1687{
1688 switch (bdev->dc.meta_dev_idx) {
1689 case DRBD_MD_INDEX_INTERNAL:
1690 case DRBD_MD_INDEX_FLEX_INT:
1691 return bdev->md.md_offset + bdev->md.bm_offset;
1692 case DRBD_MD_INDEX_FLEX_EXT:
1693 default:
1694 return bdev->md.md_offset;
1695 }
1696}
1697
1698/**
1699 * drbd_md_last_sector() - Return the last sector number of the meta data area
1700 * @bdev: Meta data block device.
1701 */
1702static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1703{
1704 switch (bdev->dc.meta_dev_idx) {
1705 case DRBD_MD_INDEX_INTERNAL:
1706 case DRBD_MD_INDEX_FLEX_INT:
1707 return bdev->md.md_offset + MD_AL_OFFSET - 1;
1708 case DRBD_MD_INDEX_FLEX_EXT:
1709 default:
1710 return bdev->md.md_offset + bdev->md.md_size_sect;
1711 }
1712}
1713
1714/* Returns the number of 512 byte sectors of the device */
1715static inline sector_t drbd_get_capacity(struct block_device *bdev)
1716{
1717 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
1718 return bdev ? bdev->bd_inode->i_size >> 9 : 0;
1719}
1720
1721/**
1722 * drbd_get_max_capacity() - Returns the capacity we announce to out peer
1723 * @bdev: Meta data block device.
1724 *
1725 * returns the capacity we announce to out peer. we clip ourselves at the
1726 * various MAX_SECTORS, because if we don't, current implementation will
1727 * oops sooner or later
1728 */
1729static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1730{
1731 sector_t s;
1732 switch (bdev->dc.meta_dev_idx) {
1733 case DRBD_MD_INDEX_INTERNAL:
1734 case DRBD_MD_INDEX_FLEX_INT:
1735 s = drbd_get_capacity(bdev->backing_bdev)
1736 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1737 drbd_md_first_sector(bdev))
1738 : 0;
1739 break;
1740 case DRBD_MD_INDEX_FLEX_EXT:
1741 s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1742 drbd_get_capacity(bdev->backing_bdev));
1743 /* clip at maximum size the meta device can support */
1744 s = min_t(sector_t, s,
1745 BM_EXT_TO_SECT(bdev->md.md_size_sect
1746 - bdev->md.bm_offset));
1747 break;
1748 default:
1749 s = min_t(sector_t, DRBD_MAX_SECTORS,
1750 drbd_get_capacity(bdev->backing_bdev));
1751 }
1752 return s;
1753}
1754
1755/**
1756 * drbd_md_ss__() - Return the sector number of our meta data super block
1757 * @mdev: DRBD device.
1758 * @bdev: Meta data block device.
1759 */
1760static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
1761 struct drbd_backing_dev *bdev)
1762{
1763 switch (bdev->dc.meta_dev_idx) {
1764 default: /* external, some index */
1765 return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
1766 case DRBD_MD_INDEX_INTERNAL:
1767 /* with drbd08, internal meta data is always "flexible" */
1768 case DRBD_MD_INDEX_FLEX_INT:
1769 /* sizeof(struct md_on_disk_07) == 4k
1770 * position: last 4k aligned block of 4k size */
1771 if (!bdev->backing_bdev) {
1772 if (__ratelimit(&drbd_ratelimit_state)) {
1773 dev_err(DEV, "bdev->backing_bdev==NULL\n");
1774 dump_stack();
1775 }
1776 return 0;
1777 }
1778 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
1779 - MD_AL_OFFSET;
1780 case DRBD_MD_INDEX_FLEX_EXT:
1781 return 0;
1782 }
1783}
1784
1785static inline void
1786_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1787{
1788 list_add_tail(&w->list, &q->q);
1789 up(&q->s);
1790}
1791
1792static inline void
1793drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
1794{
1795 unsigned long flags;
1796 spin_lock_irqsave(&q->q_lock, flags);
1797 list_add(&w->list, &q->q);
1798 up(&q->s); /* within the spinlock,
1799 see comment near end of drbd_worker() */
1800 spin_unlock_irqrestore(&q->q_lock, flags);
1801}
1802
1803static inline void
1804drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1805{
1806 unsigned long flags;
1807 spin_lock_irqsave(&q->q_lock, flags);
1808 list_add_tail(&w->list, &q->q);
1809 up(&q->s); /* within the spinlock,
1810 see comment near end of drbd_worker() */
1811 spin_unlock_irqrestore(&q->q_lock, flags);
1812}
1813
1814static inline void wake_asender(struct drbd_conf *mdev)
1815{
1816 if (test_bit(SIGNAL_ASENDER, &mdev->flags))
1817 force_sig(DRBD_SIG, mdev->asender.task);
1818}
1819
1820static inline void request_ping(struct drbd_conf *mdev)
1821{
1822 set_bit(SEND_PING, &mdev->flags);
1823 wake_asender(mdev);
1824}
1825
1826static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
1827 enum drbd_packets cmd)
1828{
1829 struct p_header h;
1830 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
1831}
1832
1833static inline int drbd_send_ping(struct drbd_conf *mdev)
1834{
1835 struct p_header h;
1836 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
1837}
1838
1839static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1840{
1841 struct p_header h;
1842 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
1843}
1844
1845static inline void drbd_thread_stop(struct drbd_thread *thi)
1846{
1847 _drbd_thread_stop(thi, FALSE, TRUE);
1848}
1849
1850static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
1851{
1852 _drbd_thread_stop(thi, FALSE, FALSE);
1853}
1854
1855static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
1856{
1857 _drbd_thread_stop(thi, TRUE, FALSE);
1858}
1859
1860/* counts how many answer packets packets we expect from our peer,
1861 * for either explicit application requests,
1862 * or implicit barrier packets as necessary.
1863 * increased:
1864 * w_send_barrier
1865 * _req_mod(req, queue_for_net_write or queue_for_net_read);
1866 * it is much easier and equally valid to count what we queue for the
1867 * worker, even before it actually was queued or send.
1868 * (drbd_make_request_common; recovery path on read io-error)
1869 * decreased:
1870 * got_BarrierAck (respective tl_clear, tl_clear_barrier)
1871 * _req_mod(req, data_received)
1872 * [from receive_DataReply]
1873 * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
1874 * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
1875 * for some reason it is NOT decreased in got_NegAck,
1876 * but in the resulting cleanup code from report_params.
1877 * we should try to remember the reason for that...
1878 * _req_mod(req, send_failed or send_canceled)
1879 * _req_mod(req, connection_lost_while_pending)
1880 * [from tl_clear_barrier]
1881 */
1882static inline void inc_ap_pending(struct drbd_conf *mdev)
1883{
1884 atomic_inc(&mdev->ap_pending_cnt);
1885}
1886
/*
 * Log an error if the atomic counter mdev->which is negative.
 * Expects "mdev" (and whatever DEV needs) to be in scope at the call site.
 *
 * Wrapped in do { } while (0), so that it is a single statement and can
 * not capture the "else" of an enclosing if.
 */
#define ERR_IF_CNT_IS_NEGATIVE(which)					\
	do {								\
		if (atomic_read(&mdev->which) < 0)			\
			dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
				__func__ , __LINE__ ,			\
				atomic_read(&mdev->which));		\
	} while (0)
1892
/*
 * Counterpart of inc_ap_pending(): wakes misc_wait when the counter
 * drops to zero, and complains if it became negative.
 */
#define dec_ap_pending(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
			wake_up(&mdev->misc_wait);			\
		ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt);			\
	} while (0)
1898
1899/* counts how many resync-related answers we still expect from the peer
1900 * increase decrease
1901 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
1902 * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER)
1903 * (or P_NEG_ACK with ID_SYNCER)
1904 */
1905static inline void inc_rs_pending(struct drbd_conf *mdev)
1906{
1907 atomic_inc(&mdev->rs_pending_cnt);
1908}
1909
/* Counterpart of inc_rs_pending(); complains if the counter became negative. */
#define dec_rs_pending(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_dec(&mdev->rs_pending_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt);			\
	} while (0)
1914
1915/* counts how many answers we still need to send to the peer.
1916 * increased on
1917 * receive_Data unless protocol A;
1918 * we need to send a P_RECV_ACK (proto B)
1919 * or P_WRITE_ACK (proto C)
1920 * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
1921 * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
1922 * receive_Barrier_* we need to send a P_BARRIER_ACK
1923 */
1924static inline void inc_unacked(struct drbd_conf *mdev)
1925{
1926 atomic_inc(&mdev->unacked_cnt);
1927}
1928
/* Counterpart of inc_unacked(); complains if the counter became negative. */
#define dec_unacked(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_dec(&mdev->unacked_cnt);				\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);			\
	} while (0)
1933
/* Like dec_unacked(), but subtracts n at once. */
#define sub_unacked(mdev, n)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_sub(n, &mdev->unacked_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);			\
	} while (0)
1938
1939
1940static inline void put_net_conf(struct drbd_conf *mdev)
1941{
1942 if (atomic_dec_and_test(&mdev->net_cnt))
1943 wake_up(&mdev->misc_wait);
1944}
1945
1946/**
1947 * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
1948 * @mdev: DRBD device.
1949 *
1950 * You have to call put_net_conf() when finished working with mdev->net_conf.
1951 */
1952static inline int get_net_conf(struct drbd_conf *mdev)
1953{
1954 int have_net_conf;
1955
1956 atomic_inc(&mdev->net_cnt);
1957 have_net_conf = mdev->state.conn >= C_UNCONNECTED;
1958 if (!have_net_conf)
1959 put_net_conf(mdev);
1960 return have_net_conf;
1961}
1962
/**
 * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
 * @M:		DRBD device.
 *
 * You have to call put_ldev() when finished working with mdev->ldev.
 *
 * get_ldev(M) is get_ldev_if_state(M, D_INCONSISTENT); get_ldev_if_state()
 * takes the minimum disk state as its second argument MINS.
 * Both are wrapped in __cond_lock(local, ...), put_ldev() does the
 * matching __release(local).
 */
#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
1971
1972static inline void put_ldev(struct drbd_conf *mdev)
1973{
1974 __release(local);
1975 if (atomic_dec_and_test(&mdev->local_cnt))
1976 wake_up(&mdev->misc_wait);
1977 D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
1978}
1979
1980#ifndef __CHECKER__
1981static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
1982{
1983 int io_allowed;
1984
1985 atomic_inc(&mdev->local_cnt);
1986 io_allowed = (mdev->state.disk >= mins);
1987 if (!io_allowed)
1988 put_ldev(mdev);
1989 return io_allowed;
1990}
1991#else
1992extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
1993#endif
1994
1995/* you must have an "get_ldev" reference */
1996static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
1997 unsigned long *bits_left, unsigned int *per_mil_done)
1998{
1999 /*
2000 * this is to break it at compile time when we change that
2001 * (we may feel 4TB maximum storage per drbd is not enough)
2002 */
2003 typecheck(unsigned long, mdev->rs_total);
2004
2005 /* note: both rs_total and rs_left are in bits, i.e. in
2006 * units of BM_BLOCK_SIZE.
2007 * for the percentage, we don't care. */
2008
2009 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2010 /* >> 10 to prevent overflow,
2011 * +1 to prevent division by zero */
2012 if (*bits_left > mdev->rs_total) {
2013 /* doh. maybe a logic bug somewhere.
2014 * may also be just a race condition
2015 * between this and a disconnect during sync.
2016 * for now, just prevent in-kernel buffer overflow.
2017 */
2018 smp_rmb();
2019 dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
2020 drbd_conn_str(mdev->state.conn),
2021 *bits_left, mdev->rs_total, mdev->rs_failed);
2022 *per_mil_done = 0;
2023 } else {
2024 /* make sure the calculation happens in long context */
2025 unsigned long tmp = 1000UL -
2026 (*bits_left >> 10)*1000UL
2027 / ((mdev->rs_total >> 10) + 1UL);
2028 *per_mil_done = tmp;
2029 }
2030}
2031
2032
2033/* this throttles on-the-fly application requests
2034 * according to max_buffers settings;
2035 * maybe re-implement using semaphores? */
2036static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
2037{
2038 int mxb = 1000000; /* arbitrary limit on open requests */
2039 if (get_net_conf(mdev)) {
2040 mxb = mdev->net_conf->max_buffers;
2041 put_net_conf(mdev);
2042 }
2043 return mxb;
2044}
2045
/*
 * drbd_state_is_stable() - is this a state in which new io may be accepted?
 * @s:	the state to look at; only s.conn and s.disk are evaluated.
 *
 * Returns 1 if both the connection state and the disk state are listed
 * as "stable" below, 0 otherwise.
 */
static inline int drbd_state_is_stable(union drbd_state s)
{

	/* DO NOT add a default clause, we want the compiler to warn us
	 * for any newly introduced state we may have forgotten to add here */

	switch ((enum drbd_conns)s.conn) {
	/* new io only accepted when there is no connection, ... */
	case C_STANDALONE:
	case C_WF_CONNECTION:
	/* ... or there is a well established connection. */
	case C_CONNECTED:
	case C_SYNC_SOURCE:
	case C_SYNC_TARGET:
	case C_VERIFY_S:
	case C_VERIFY_T:
	case C_PAUSED_SYNC_S:
	case C_PAUSED_SYNC_T:
		/* maybe stable, look at the disk state */
		break;

	/* no new io accepted during transitional states
	 * like handshake or teardown */
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_REPORT_PARAMS:
	case C_STARTING_SYNC_S:
	case C_STARTING_SYNC_T:
	case C_WF_BITMAP_S:
	case C_WF_BITMAP_T:
	case C_WF_SYNC_UUID:
	case C_MASK:
		/* not "stable" */
		return 0;
	}

	switch ((enum drbd_disk_state)s.disk) {
	case D_DISKLESS:
	case D_INCONSISTENT:
	case D_OUTDATED:
	case D_CONSISTENT:
	case D_UP_TO_DATE:
		/* disk state is stable as well. */
		break;

	/* no new io accepted during transitional states */
	case D_ATTACHING:
	case D_FAILED:
	case D_NEGOTIATING:
	case D_UNKNOWN:
	case D_MASK:
		/* not "stable" */
		return 0;
	}

	return 1;
}
2108
2109static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2110{
2111 int mxb = drbd_get_max_buffers(mdev);
2112
2113 if (mdev->state.susp)
2114 return 0;
2115 if (test_bit(SUSPEND_IO, &mdev->flags))
2116 return 0;
2117
2118 /* to avoid potential deadlock or bitmap corruption,
2119 * in various places, we only allow new application io
2120 * to start during "stable" states. */
2121
2122 /* no new io accepted when attaching or detaching the disk */
2123 if (!drbd_state_is_stable(mdev->state))
2124 return 0;
2125
2126 /* since some older kernels don't have atomic_add_unless,
2127 * and we are within the spinlock anyways, we have this workaround. */
2128 if (atomic_read(&mdev->ap_bio_cnt) > mxb)
2129 return 0;
2130 if (test_bit(BITMAP_IO, &mdev->flags))
2131 return 0;
2132 return 1;
2133}
2134
/*
 * inc_ap_bio() - account one_or_two application bios, waiting if necessary
 * @mdev:	DRBD device.
 * @one_or_two:	the amount that is added to mdev->ap_bio_cnt.
 *
 * Sleeps uninterruptibly on misc_wait until __inc_ap_bio_cond() is true.
 * Takes the req_lock with spin_lock_irq() itself.
 *
 * I'd like to use wait_event_lock_irq,
 * but I'm not sure when it got introduced,
 * and not sure when it has 3 or 4 arguments */
static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
{
	/* compare with after_state_ch,
	 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
	DEFINE_WAIT(wait);

	/* we wait here
	 *    as long as the device is suspended
	 *    until the bitmap is no longer on the fly during connection
	 *    handshake as long as we would exceed the max_buffer limit.
	 *
	 * to avoid races with the reconnect code,
	 * we need to atomic_inc within the spinlock. */

	spin_lock_irq(&mdev->req_lock);
	/* NOTE(review): the condition is evaluated before prepare_to_wait();
	 * a wake_up() from a context that does not hold the req_lock could
	 * slip in between and be missed -- confirm that this is harmless. */
	while (!__inc_ap_bio_cond(mdev)) {
		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mdev->req_lock);
		schedule();
		finish_wait(&mdev->misc_wait, &wait);
		spin_lock_irq(&mdev->req_lock);
	}
	atomic_add(one_or_two, &mdev->ap_bio_cnt);
	spin_unlock_irq(&mdev->req_lock);
}
2163
2164static inline void dec_ap_bio(struct drbd_conf *mdev)
2165{
2166 int mxb = drbd_get_max_buffers(mdev);
2167 int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
2168
2169 D_ASSERT(ap_bio >= 0);
2170 /* this currently does wake_up for every dec_ap_bio!
2171 * maybe rather introduce some type of hysteresis?
2172 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
2173 if (ap_bio < mxb)
2174 wake_up(&mdev->misc_wait);
2175 if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
2176 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
2177 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
2178 }
2179}
2180
2181static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
2182{
2183 mdev->ed_uuid = val;
2184}
2185
2186static inline int seq_cmp(u32 a, u32 b)
2187{
2188 /* we assume wrap around at 32bit.
2189 * for wrap around at 24bit (old atomic_t),
2190 * we'd have to
2191 * a <<= 8; b <<= 8;
2192 */
2193 return (s32)(a) - (s32)(b);
2194}
2195#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
2196#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
2197#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
2198#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
2199/* CAUTION: please no side effects in arguments! */
2200#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
2201
2202static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
2203{
2204 unsigned int m;
2205 spin_lock(&mdev->peer_seq_lock);
2206 m = seq_max(mdev->peer_seq, new_seq);
2207 mdev->peer_seq = m;
2208 spin_unlock(&mdev->peer_seq_lock);
2209 if (m == new_seq)
2210 wake_up(&mdev->seq_wait);
2211}
2212
2213static inline void drbd_update_congested(struct drbd_conf *mdev)
2214{
2215 struct sock *sk = mdev->data.socket->sk;
2216 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
2217 set_bit(NET_CONGESTED, &mdev->flags);
2218}
2219
/* Always QUEUE_ORDERED_NONE, @mdev is not looked at. */
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
	/* sorry, we currently have no working implementation
	 * of distributed TCQ stuff */
#ifndef QUEUE_ORDERED_NONE
#define QUEUE_ORDERED_NONE 0
#endif
	return QUEUE_ORDERED_NONE;
}
2229
2230static inline void drbd_blk_run_queue(struct request_queue *q)
2231{
2232 if (q && q->unplug_fn)
2233 q->unplug_fn(q);
2234}
2235
2236static inline void drbd_kick_lo(struct drbd_conf *mdev)
2237{
2238 if (get_ldev(mdev)) {
2239 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
2240 put_ldev(mdev);
2241 }
2242}
2243
2244static inline void drbd_md_flush(struct drbd_conf *mdev)
2245{
2246 int r;
2247
2248 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2249 return;
2250
2251 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
2252 if (r) {
2253 set_bit(MD_NO_BARRIER, &mdev->flags);
2254 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2255 }
2256}
2257
2258#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000000..edf0b8031e69
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3735 @@
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/autoconf.h>
30#include <linux/module.h>
31#include <linux/version.h>
32#include <linux/drbd.h>
33#include <asm/uaccess.h>
34#include <asm/types.h>
35#include <net/sock.h>
36#include <linux/ctype.h>
37#include <linux/smp_lock.h>
38#include <linux/fs.h>
39#include <linux/file.h>
40#include <linux/proc_fs.h>
41#include <linux/init.h>
42#include <linux/mm.h>
43#include <linux/memcontrol.h>
44#include <linux/mm_inline.h>
45#include <linux/slab.h>
46#include <linux/random.h>
47#include <linux/reboot.h>
48#include <linux/notifier.h>
49#include <linux/kthread.h>
50
51#define __KERNEL_SYSCALLS__
52#include <linux/unistd.h>
53#include <linux/vmalloc.h>
54
55#include <linux/drbd_limits.h>
56#include "drbd_int.h"
57#include "drbd_tracing.h"
58#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
59
60#include "drbd_vli.h"
61
/* Work item carrying a state change: a struct drbd_work plus the
 * old state, the new state, the change flags and a completion.
 * Presumably consumed by w_after_state_ch() -- confirm at its definition. */
struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;		/* old state */
	union drbd_state ns;		/* new state */
	enum chg_state_flags flags;
	struct completion *done;
};
69
/* thread functions, each taking its struct drbd_thread */
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
/* forward declarations of file-local functions */
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
83
/* one DEFINE_TRACE() per drbd trace point */
DEFINE_TRACE(drbd_unplug);
DEFINE_TRACE(drbd_uuid);
DEFINE_TRACE(drbd_ee);
DEFINE_TRACE(drbd_packet);
DEFINE_TRACE(drbd_md_io);
DEFINE_TRACE(drbd_epoch);
DEFINE_TRACE(drbd_netlink);
DEFINE_TRACE(drbd_actlog);
DEFINE_TRACE(drbd_bio);
DEFINE_TRACE(_drbd_resync);
DEFINE_TRACE(drbd_req);
95
/* module meta data */
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
/* the last argument is the permission mode of the parameter;
 * allow_oos is registered with mode 0 */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
114
/* fault injection knobs, only built with CONFIG_DRBD_FAULT_INJECTION */
#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
/* NOTE(review): registered with 0644, the three above with 0664 -- intended? */
module_param(fault_devs, int, 0644);
#endif
129
/* module parameter, defined */
unsigned int minor_count = 32;	/* see MODULE_PARM_DESC: 1-255 */
int disable_sendpage;
int allow_oos;			/* allow_open_on_secondary */
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

/* exposed as a string parameter, limited to the size of the buffer above */
module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
142
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

/* slab caches, and the mempools for requests and epoch entries */
struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache; /* epoch entries */
struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
154
/* I do not use a standard mempool, because:
 1) I want to hand out the pre-allocated objects first.
 2) I want to be able to interrupt sleeping allocation with a signal.
 Note: This is a single linked list, the next pointer is the private
 member of struct page.
 */
struct page *drbd_pp_pool;	/* head of that page list */
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

/* shared rate limit state: interval 5 * HZ, burst 5 */
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
167
/* block device operations: only open and release are provided */
static struct block_device_operations drbd_ops = {
	.owner = THIS_MODULE,
	.open = drbd_open,
	.release = drbd_release,
};
173
/* Number of elements of the array A. Only valid for real arrays,
 * not for pointers. A is parenthesized, so expressions are safe. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
175
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function sparse works.

   Takes a reference on the local disk first, and drops it again
   (waking misc_wait if it was the last one) if the disk state is below mins.
   Returns 1 if the reference is held, 0 otherwise.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);
	if (mdev->state.disk >= mins)
		return 1;

	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);
	return 0;
}

#endif
194
195/**
196 * DOC: The transfer log
197 *
198 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
199 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
200 * of the list. There is always at least one &struct drbd_tl_epoch object.
201 *
202 * Each &struct drbd_tl_epoch has a circular double linked list of requests
203 * attached.
204 */
205static int tl_init(struct drbd_conf *mdev)
206{
207 struct drbd_tl_epoch *b;
208
209 /* during device minor initialization, we may well use GFP_KERNEL */
210 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
211 if (!b)
212 return 0;
213 INIT_LIST_HEAD(&b->requests);
214 INIT_LIST_HEAD(&b->w.list);
215 b->next = NULL;
216 b->br_number = 4711;
217 b->n_req = 0;
218 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
219
220 mdev->oldest_tle = b;
221 mdev->newest_tle = b;
222 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
223
224 mdev->tl_hash = NULL;
225 mdev->tl_hash_s = 0;
226
227 return 1;
228}
229
230static void tl_cleanup(struct drbd_conf *mdev)
231{
232 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
233 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
234 kfree(mdev->oldest_tle);
235 mdev->oldest_tle = NULL;
236 kfree(mdev->unused_spare_tle);
237 mdev->unused_spare_tle = NULL;
238 kfree(mdev->tl_hash);
239 mdev->tl_hash = NULL;
240 mdev->tl_hash_s = 0;
241}
242
243/**
244 * _tl_add_barrier() - Adds a barrier to the transfer log
245 * @mdev: DRBD device.
246 * @new: Barrier to be added before the current head of the TL.
247 *
248 * The caller must hold the req_lock.
249 */
250void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
251{
252 struct drbd_tl_epoch *newest_before;
253
254 INIT_LIST_HEAD(&new->requests);
255 INIT_LIST_HEAD(&new->w.list);
256 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
257 new->next = NULL;
258 new->n_req = 0;
259
260 newest_before = mdev->newest_tle;
261 /* never send a barrier number == 0, because that is special-cased
262 * when using TCQ for our write ordering code */
263 new->br_number = (newest_before->br_number+1) ?: 1;
264 if (mdev->newest_tle != new) {
265 mdev->newest_tle->next = new;
266 mdev->newest_tle = new;
267 }
268}
269
/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 *
 * Takes the req_lock itself. On success ap_pending_cnt is decremented;
 * on the mismatch ("bail") path it is not.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
 barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
 barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_req != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
 barrier_nr, set_size, b->n_req);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	/* the _safe iterator is used, as _req_mod() may move r off this list */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
 of the write to the local disk. To avoid corruptions of
 slab's data structures we have to remove the lists head.

 Also there could have been a barrier ack out of sequence, overtaking
 the write acks - which would be a bug and violating write ordering.
 To not deadlock in case we lose connection while such requests are
 still pending, we need some way to find them for the
 _req_mod(connection_lost_while_pending).

 These have been list_move'd to the out_of_sequence_requests list in
 _req_mod(, barrier_acked) above.
 */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		/* recycle b as the new newest epoch instead of freeing it */
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
 barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	/* mismatch: force the connection into C_PROTOCOL_ERROR */
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
350
351
352/**
353 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
354 * @mdev: DRBD device.
355 *
356 * This is called after the connection to the peer was lost. The storage covered
357 * by the requests on the transfer gets marked as our of sync. Called from the
358 * receiver thread and the worker thread.
359 */
360void tl_clear(struct drbd_conf *mdev)
361{
362 struct drbd_tl_epoch *b, *tmp;
363 struct list_head *le, *tle;
364 struct drbd_request *r;
365 int new_initial_bnr = net_random();
366
367 spin_lock_irq(&mdev->req_lock);
368
369 b = mdev->oldest_tle;
370 while (b) {
371 list_for_each_safe(le, tle, &b->requests) {
372 r = list_entry(le, struct drbd_request, tl_requests);
373 /* It would be nice to complete outside of spinlock.
374 * But this is easier for now. */
375 _req_mod(r, connection_lost_while_pending);
376 }
377 tmp = b->next;
378
379 /* there could still be requests on that ring list,
380 * in case local io is still pending */
381 list_del(&b->requests);
382
383 /* dec_ap_pending corresponding to queue_barrier.
384 * the newest barrier may not have been queued yet,
385 * in which case w.cb is still NULL. */
386 if (b->w.cb != NULL)
387 dec_ap_pending(mdev);
388
389 if (b == mdev->newest_tle) {
390 /* recycle, but reinit! */
391 D_ASSERT(tmp == NULL);
392 INIT_LIST_HEAD(&b->requests);
393 INIT_LIST_HEAD(&b->w.list);
394 b->w.cb = NULL;
395 b->br_number = new_initial_bnr;
396 b->n_req = 0;
397
398 mdev->oldest_tle = b;
399 break;
400 }
401 kfree(b);
402 b = tmp;
403 }
404
405 /* we expect this list to be empty. */
406 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
407
408 /* but just in case, clean it up anyways! */
409 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
410 r = list_entry(le, struct drbd_request, tl_requests);
411 /* It would be nice to complete outside of spinlock.
412 * But this is easier for now. */
413 _req_mod(r, connection_lost_while_pending);
414 }
415
416 /* ensure bit indicating barrier is required is clear */
417 clear_bit(CREATE_BARRIER, &mdev->flags);
418
419 spin_unlock_irq(&mdev->req_lock);
420}
421
422/**
423 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
424 * @mdev: DRBD device.
425 * @os: old (current) state.
426 * @ns: new (wanted) state.
427 */
428static int cl_wide_st_chg(struct drbd_conf *mdev,
429 union drbd_state os, union drbd_state ns)
430{
431 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
432 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
433 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
434 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
435 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
436 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
437 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
438}
439
440int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
441 union drbd_state mask, union drbd_state val)
442{
443 unsigned long flags;
444 union drbd_state os, ns;
445 int rv;
446
447 spin_lock_irqsave(&mdev->req_lock, flags);
448 os = mdev->state;
449 ns.i = (os.i & ~mask.i) | val.i;
450 rv = _drbd_set_state(mdev, ns, f, NULL);
451 ns = mdev->state;
452 spin_unlock_irqrestore(&mdev->req_lock, flags);
453
454 return rv;
455}
456
457/**
458 * drbd_force_state() - Impose a change which happens outside our control on our state
459 * @mdev: DRBD device.
460 * @mask: mask of state bits to change.
461 * @val: value of new state bits.
462 */
463void drbd_force_state(struct drbd_conf *mdev,
464 union drbd_state mask, union drbd_state val)
465{
466 drbd_change_state(mdev, CS_HARD, mask, val);
467}
468
469static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
470static int is_valid_state_transition(struct drbd_conf *,
471 union drbd_state, union drbd_state);
472static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
473 union drbd_state ns, int *warn_sync_abort);
474int drbd_send_state_req(struct drbd_conf *,
475 union drbd_state, union drbd_state);
476
477static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
478 union drbd_state mask, union drbd_state val)
479{
480 union drbd_state os, ns;
481 unsigned long flags;
482 int rv;
483
484 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
485 return SS_CW_SUCCESS;
486
487 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
488 return SS_CW_FAILED_BY_PEER;
489
490 rv = 0;
491 spin_lock_irqsave(&mdev->req_lock, flags);
492 os = mdev->state;
493 ns.i = (os.i & ~mask.i) | val.i;
494 ns = sanitize_state(mdev, os, ns, NULL);
495
496 if (!cl_wide_st_chg(mdev, os, ns))
497 rv = SS_CW_NO_NEED;
498 if (!rv) {
499 rv = is_valid_state(mdev, ns);
500 if (rv == SS_SUCCESS) {
501 rv = is_valid_state_transition(mdev, ns, os);
502 if (rv == SS_SUCCESS)
503 rv = 0; /* cont waiting, otherwise fail. */
504 }
505 }
506 spin_unlock_irqrestore(&mdev->req_lock, flags);
507
508 return rv;
509}
510
511/**
512 * drbd_req_state() - Perform an eventually cluster wide state change
513 * @mdev: DRBD device.
514 * @mask: mask of state bits to change.
515 * @val: value of new state bits.
516 * @f: flags
517 *
518 * Should not be called directly, use drbd_request_state() or
519 * _drbd_request_state().
520 */
521static int drbd_req_state(struct drbd_conf *mdev,
522 union drbd_state mask, union drbd_state val,
523 enum chg_state_flags f)
524{
525 struct completion done;
526 unsigned long flags;
527 union drbd_state os, ns;
528 int rv;
529
530 init_completion(&done);
531
532 if (f & CS_SERIALIZE)
533 mutex_lock(&mdev->state_mutex);
534
535 spin_lock_irqsave(&mdev->req_lock, flags);
536 os = mdev->state;
537 ns.i = (os.i & ~mask.i) | val.i;
538 ns = sanitize_state(mdev, os, ns, NULL);
539
540 if (cl_wide_st_chg(mdev, os, ns)) {
541 rv = is_valid_state(mdev, ns);
542 if (rv == SS_SUCCESS)
543 rv = is_valid_state_transition(mdev, ns, os);
544 spin_unlock_irqrestore(&mdev->req_lock, flags);
545
546 if (rv < SS_SUCCESS) {
547 if (f & CS_VERBOSE)
548 print_st_err(mdev, os, ns, rv);
549 goto abort;
550 }
551
552 drbd_state_lock(mdev);
553 if (!drbd_send_state_req(mdev, mask, val)) {
554 drbd_state_unlock(mdev);
555 rv = SS_CW_FAILED_BY_PEER;
556 if (f & CS_VERBOSE)
557 print_st_err(mdev, os, ns, rv);
558 goto abort;
559 }
560
561 wait_event(mdev->state_wait,
562 (rv = _req_st_cond(mdev, mask, val)));
563
564 if (rv < SS_SUCCESS) {
565 drbd_state_unlock(mdev);
566 if (f & CS_VERBOSE)
567 print_st_err(mdev, os, ns, rv);
568 goto abort;
569 }
570 spin_lock_irqsave(&mdev->req_lock, flags);
571 os = mdev->state;
572 ns.i = (os.i & ~mask.i) | val.i;
573 rv = _drbd_set_state(mdev, ns, f, &done);
574 drbd_state_unlock(mdev);
575 } else {
576 rv = _drbd_set_state(mdev, ns, f, &done);
577 }
578
579 spin_unlock_irqrestore(&mdev->req_lock, flags);
580
581 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
582 D_ASSERT(current != mdev->worker.task);
583 wait_for_completion(&done);
584 }
585
586abort:
587 if (f & CS_SERIALIZE)
588 mutex_unlock(&mdev->state_mutex);
589
590 return rv;
591}
592
593/**
594 * _drbd_request_state() - Request a state change (with flags)
595 * @mdev: DRBD device.
596 * @mask: mask of state bits to change.
597 * @val: value of new state bits.
598 * @f: flags
599 *
600 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
601 * flag, or when logging of failed state change requests is not desired.
602 */
603int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
604 union drbd_state val, enum chg_state_flags f)
605{
606 int rv;
607
608 wait_event(mdev->state_wait,
609 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
610
611 return rv;
612}
613
614static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
615{
616 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
617 name,
618 drbd_conn_str(ns.conn),
619 drbd_role_str(ns.role),
620 drbd_role_str(ns.peer),
621 drbd_disk_str(ns.disk),
622 drbd_disk_str(ns.pdsk),
623 ns.susp ? 's' : 'r',
624 ns.aftr_isp ? 'a' : '-',
625 ns.peer_isp ? 'p' : '-',
626 ns.user_isp ? 'u' : '-'
627 );
628}
629
630void print_st_err(struct drbd_conf *mdev,
631 union drbd_state os, union drbd_state ns, int err)
632{
633 if (err == SS_IN_TRANSIENT_STATE)
634 return;
635 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
636 print_st(mdev, " state", os);
637 print_st(mdev, "wanted", ns);
638}
639
640
/* String helpers so that PSC() can build drbd_<field>_str() for every
 * field of union drbd_state it is used with: peer and pdsk reuse the role
 * and disk name tables, the single-bit flags print as "1" or "0". */
#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

/* Print State Change: if field A differs between the states 'os' and 'ns',
 * append "A( old -> new ) " at 'pbp' and advance 'pbp'.  Expects 'os',
 * 'ns' and 'pbp' to exist in the scope where the macro is used. */
#define PSC(A) \
	do { \
		if (ns.A != os.A) \
			pbp += sprintf(pbp, #A "( %s -> %s ) ", \
				       drbd_##A##_str(os.A), \
				       drbd_##A##_str(ns.A)); \
	} while (0)
655
656/**
657 * is_valid_state() - Returns an SS_ error code if ns is not valid
658 * @mdev: DRBD device.
659 * @ns: State to consider.
660 */
661static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
662{
663 /* See drbd_state_sw_errors in drbd_strings.c */
664
665 enum drbd_fencing_p fp;
666 int rv = SS_SUCCESS;
667
668 fp = FP_DONT_CARE;
669 if (get_ldev(mdev)) {
670 fp = mdev->ldev->dc.fencing;
671 put_ldev(mdev);
672 }
673
674 if (get_net_conf(mdev)) {
675 if (!mdev->net_conf->two_primaries &&
676 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
677 rv = SS_TWO_PRIMARIES;
678 put_net_conf(mdev);
679 }
680
681 if (rv <= 0)
682 /* already found a reason to abort */;
683 else if (ns.role == R_SECONDARY && mdev->open_cnt)
684 rv = SS_DEVICE_IN_USE;
685
686 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
687 rv = SS_NO_UP_TO_DATE_DISK;
688
689 else if (fp >= FP_RESOURCE &&
690 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
691 rv = SS_PRIMARY_NOP;
692
693 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
694 rv = SS_NO_UP_TO_DATE_DISK;
695
696 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
697 rv = SS_NO_LOCAL_DISK;
698
699 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
700 rv = SS_NO_REMOTE_DISK;
701
702 else if ((ns.conn == C_CONNECTED ||
703 ns.conn == C_WF_BITMAP_S ||
704 ns.conn == C_SYNC_SOURCE ||
705 ns.conn == C_PAUSED_SYNC_S) &&
706 ns.disk == D_OUTDATED)
707 rv = SS_CONNECTED_OUTDATES;
708
709 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
710 (mdev->sync_conf.verify_alg[0] == 0))
711 rv = SS_NO_VERIFY_ALG;
712
713 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
714 mdev->agreed_pro_version < 88)
715 rv = SS_NOT_SUPPORTED;
716
717 return rv;
718}
719
720/**
721 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
722 * @mdev: DRBD device.
723 * @ns: new state.
724 * @os: old state.
725 */
726static int is_valid_state_transition(struct drbd_conf *mdev,
727 union drbd_state ns, union drbd_state os)
728{
729 int rv = SS_SUCCESS;
730
731 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
732 os.conn > C_CONNECTED)
733 rv = SS_RESYNC_RUNNING;
734
735 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
736 rv = SS_ALREADY_STANDALONE;
737
738 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
739 rv = SS_IS_DISKLESS;
740
741 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
742 rv = SS_NO_NET_CONFIG;
743
744 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
745 rv = SS_LOWER_THAN_OUTDATED;
746
747 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
748 rv = SS_IN_TRANSIENT_STATE;
749
750 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
751 rv = SS_IN_TRANSIENT_STATE;
752
753 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
754 rv = SS_NEED_CONNECTION;
755
756 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
757 ns.conn != os.conn && os.conn > C_CONNECTED)
758 rv = SS_RESYNC_RUNNING;
759
760 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
761 os.conn < C_CONNECTED)
762 rv = SS_NEED_CONNECTION;
763
764 return rv;
765}
766
767/**
768 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
769 * @mdev: DRBD device.
770 * @os: old state.
771 * @ns: new state.
772 * @warn_sync_abort:
773 *
774 * When we loose connection, we have to set the state of the peers disk (pdsk)
775 * to D_UNKNOWN. This rule and many more along those lines are in this function.
776 */
777static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
778 union drbd_state ns, int *warn_sync_abort)
779{
780 enum drbd_fencing_p fp;
781
782 fp = FP_DONT_CARE;
783 if (get_ldev(mdev)) {
784 fp = mdev->ldev->dc.fencing;
785 put_ldev(mdev);
786 }
787
788 /* Disallow Network errors to configure a device's network part */
789 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
790 os.conn <= C_DISCONNECTING)
791 ns.conn = os.conn;
792
793 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
794 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
795 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
796 ns.conn = os.conn;
797
798 /* After C_DISCONNECTING only C_STANDALONE may follow */
799 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
800 ns.conn = os.conn;
801
802 if (ns.conn < C_CONNECTED) {
803 ns.peer_isp = 0;
804 ns.peer = R_UNKNOWN;
805 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
806 ns.pdsk = D_UNKNOWN;
807 }
808
809 /* Clear the aftr_isp when becoming unconfigured */
810 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
811 ns.aftr_isp = 0;
812
813 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
814 ns.pdsk = D_UNKNOWN;
815
816 /* Abort resync if a disk fails/detaches */
817 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
818 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
819 if (warn_sync_abort)
820 *warn_sync_abort = 1;
821 ns.conn = C_CONNECTED;
822 }
823
824 if (ns.conn >= C_CONNECTED &&
825 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
826 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
827 switch (ns.conn) {
828 case C_WF_BITMAP_T:
829 case C_PAUSED_SYNC_T:
830 ns.disk = D_OUTDATED;
831 break;
832 case C_CONNECTED:
833 case C_WF_BITMAP_S:
834 case C_SYNC_SOURCE:
835 case C_PAUSED_SYNC_S:
836 ns.disk = D_UP_TO_DATE;
837 break;
838 case C_SYNC_TARGET:
839 ns.disk = D_INCONSISTENT;
840 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
841 break;
842 }
843 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
844 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
845 }
846
847 if (ns.conn >= C_CONNECTED &&
848 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
849 switch (ns.conn) {
850 case C_CONNECTED:
851 case C_WF_BITMAP_T:
852 case C_PAUSED_SYNC_T:
853 case C_SYNC_TARGET:
854 ns.pdsk = D_UP_TO_DATE;
855 break;
856 case C_WF_BITMAP_S:
857 case C_PAUSED_SYNC_S:
858 ns.pdsk = D_OUTDATED;
859 break;
860 case C_SYNC_SOURCE:
861 ns.pdsk = D_INCONSISTENT;
862 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
863 break;
864 }
865 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
866 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
867 }
868
869 /* Connection breaks down before we finished "Negotiating" */
870 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
871 get_ldev_if_state(mdev, D_NEGOTIATING)) {
872 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
873 ns.disk = mdev->new_state_tmp.disk;
874 ns.pdsk = mdev->new_state_tmp.pdsk;
875 } else {
876 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
877 ns.disk = D_DISKLESS;
878 ns.pdsk = D_UNKNOWN;
879 }
880 put_ldev(mdev);
881 }
882
883 if (fp == FP_STONITH &&
884 (ns.role == R_PRIMARY &&
885 ns.conn < C_CONNECTED &&
886 ns.pdsk > D_OUTDATED))
887 ns.susp = 1;
888
889 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
890 if (ns.conn == C_SYNC_SOURCE)
891 ns.conn = C_PAUSED_SYNC_S;
892 if (ns.conn == C_SYNC_TARGET)
893 ns.conn = C_PAUSED_SYNC_T;
894 } else {
895 if (ns.conn == C_PAUSED_SYNC_S)
896 ns.conn = C_SYNC_SOURCE;
897 if (ns.conn == C_PAUSED_SYNC_T)
898 ns.conn = C_SYNC_TARGET;
899 }
900
901 return ns;
902}
903
904/* helper for __drbd_set_state */
905static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
906{
907 if (cs == C_VERIFY_T) {
908 /* starting online verify from an arbitrary position
909 * does not fit well into the existing protocol.
910 * on C_VERIFY_T, we initialize ov_left and friends
911 * implicitly in receive_DataRequest once the
912 * first P_OV_REQUEST is received */
913 mdev->ov_start_sector = ~(sector_t)0;
914 } else {
915 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
916 if (bit >= mdev->rs_total)
917 mdev->ov_start_sector =
918 BM_BIT_TO_SECT(mdev->rs_total - 1);
919 mdev->ov_position = mdev->ov_start_sector;
920 }
921}
922
923/**
924 * __drbd_set_state() - Set a new DRBD state
925 * @mdev: DRBD device.
926 * @ns: new state.
927 * @flags: Flags
928 * @done: Optional completion, that will get completed after the after_state_ch() finished
929 *
930 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
931 */
932int __drbd_set_state(struct drbd_conf *mdev,
933 union drbd_state ns, enum chg_state_flags flags,
934 struct completion *done)
935{
936 union drbd_state os;
937 int rv = SS_SUCCESS;
938 int warn_sync_abort = 0;
939 struct after_state_chg_work *ascw;
940
941 os = mdev->state;
942
943 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
944
945 if (ns.i == os.i)
946 return SS_NOTHING_TO_DO;
947
948 if (!(flags & CS_HARD)) {
949 /* pre-state-change checks ; only look at ns */
950 /* See drbd_state_sw_errors in drbd_strings.c */
951
952 rv = is_valid_state(mdev, ns);
953 if (rv < SS_SUCCESS) {
954 /* If the old state was illegal as well, then let
955 this happen...*/
956
957 if (is_valid_state(mdev, os) == rv) {
958 dev_err(DEV, "Considering state change from bad state. "
959 "Error would be: '%s'\n",
960 drbd_set_st_err_str(rv));
961 print_st(mdev, "old", os);
962 print_st(mdev, "new", ns);
963 rv = is_valid_state_transition(mdev, ns, os);
964 }
965 } else
966 rv = is_valid_state_transition(mdev, ns, os);
967 }
968
969 if (rv < SS_SUCCESS) {
970 if (flags & CS_VERBOSE)
971 print_st_err(mdev, os, ns, rv);
972 return rv;
973 }
974
975 if (warn_sync_abort)
976 dev_warn(DEV, "Resync aborted.\n");
977
978 {
979 char *pbp, pb[300];
980 pbp = pb;
981 *pbp = 0;
982 PSC(role);
983 PSC(peer);
984 PSC(conn);
985 PSC(disk);
986 PSC(pdsk);
987 PSC(susp);
988 PSC(aftr_isp);
989 PSC(peer_isp);
990 PSC(user_isp);
991 dev_info(DEV, "%s\n", pb);
992 }
993
994 /* solve the race between becoming unconfigured,
995 * worker doing the cleanup, and
996 * admin reconfiguring us:
997 * on (re)configure, first set CONFIG_PENDING,
998 * then wait for a potentially exiting worker,
999 * start the worker, and schedule one no_op.
1000 * then proceed with configuration.
1001 */
1002 if (ns.disk == D_DISKLESS &&
1003 ns.conn == C_STANDALONE &&
1004 ns.role == R_SECONDARY &&
1005 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1006 set_bit(DEVICE_DYING, &mdev->flags);
1007
1008 mdev->state.i = ns.i;
1009 wake_up(&mdev->misc_wait);
1010 wake_up(&mdev->state_wait);
1011
1012 /* post-state-change actions */
1013 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1014 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1015 mod_timer(&mdev->resync_timer, jiffies);
1016 }
1017
1018 /* aborted verify run. log the last position */
1019 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1020 ns.conn < C_CONNECTED) {
1021 mdev->ov_start_sector =
1022 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1023 dev_info(DEV, "Online Verify reached sector %llu\n",
1024 (unsigned long long)mdev->ov_start_sector);
1025 }
1026
1027 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1028 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1029 dev_info(DEV, "Syncer continues.\n");
1030 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1031 if (ns.conn == C_SYNC_TARGET) {
1032 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1033 mod_timer(&mdev->resync_timer, jiffies);
1034 /* This if (!test_bit) is only needed for the case
1035 that a device that has ceased to used its timer,
1036 i.e. it is already in drbd_resync_finished() gets
1037 paused and resumed. */
1038 }
1039 }
1040
1041 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1042 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1043 dev_info(DEV, "Resync suspended\n");
1044 mdev->rs_mark_time = jiffies;
1045 if (ns.conn == C_PAUSED_SYNC_T)
1046 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1047 }
1048
1049 if (os.conn == C_CONNECTED &&
1050 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1051 mdev->ov_position = 0;
1052 mdev->rs_total =
1053 mdev->rs_mark_left = drbd_bm_bits(mdev);
1054 if (mdev->agreed_pro_version >= 90)
1055 set_ov_position(mdev, ns.conn);
1056 else
1057 mdev->ov_start_sector = 0;
1058 mdev->ov_left = mdev->rs_total
1059 - BM_SECT_TO_BIT(mdev->ov_position);
1060 mdev->rs_start =
1061 mdev->rs_mark_time = jiffies;
1062 mdev->ov_last_oos_size = 0;
1063 mdev->ov_last_oos_start = 0;
1064
1065 if (ns.conn == C_VERIFY_S) {
1066 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1067 (unsigned long long)mdev->ov_position);
1068 mod_timer(&mdev->resync_timer, jiffies);
1069 }
1070 }
1071
1072 if (get_ldev(mdev)) {
1073 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1074 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1075 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1076
1077 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1078 mdf |= MDF_CRASHED_PRIMARY;
1079 if (mdev->state.role == R_PRIMARY ||
1080 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1081 mdf |= MDF_PRIMARY_IND;
1082 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1083 mdf |= MDF_CONNECTED_IND;
1084 if (mdev->state.disk > D_INCONSISTENT)
1085 mdf |= MDF_CONSISTENT;
1086 if (mdev->state.disk > D_OUTDATED)
1087 mdf |= MDF_WAS_UP_TO_DATE;
1088 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1089 mdf |= MDF_PEER_OUT_DATED;
1090 if (mdf != mdev->ldev->md.flags) {
1091 mdev->ldev->md.flags = mdf;
1092 drbd_md_mark_dirty(mdev);
1093 }
1094 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1095 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1096 put_ldev(mdev);
1097 }
1098
1099 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1100 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1101 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1102 set_bit(CONSIDER_RESYNC, &mdev->flags);
1103
1104 /* Receiver should clean up itself */
1105 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1106 drbd_thread_stop_nowait(&mdev->receiver);
1107
1108 /* Now the receiver finished cleaning up itself, it should die */
1109 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1110 drbd_thread_stop_nowait(&mdev->receiver);
1111
1112 /* Upon network failure, we need to restart the receiver. */
1113 if (os.conn > C_TEAR_DOWN &&
1114 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1115 drbd_thread_restart_nowait(&mdev->receiver);
1116
1117 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1118 if (ascw) {
1119 ascw->os = os;
1120 ascw->ns = ns;
1121 ascw->flags = flags;
1122 ascw->w.cb = w_after_state_ch;
1123 ascw->done = done;
1124 drbd_queue_work(&mdev->data.work, &ascw->w);
1125 } else {
1126 dev_warn(DEV, "Could not kmalloc an ascw\n");
1127 }
1128
1129 return rv;
1130}
1131
1132static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1133{
1134 struct after_state_chg_work *ascw =
1135 container_of(w, struct after_state_chg_work, w);
1136 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1137 if (ascw->flags & CS_WAIT_COMPLETE) {
1138 D_ASSERT(ascw->done != NULL);
1139 complete(ascw->done);
1140 }
1141 kfree(ascw);
1142
1143 return 1;
1144}
1145
1146static void abw_start_sync(struct drbd_conf *mdev, int rv)
1147{
1148 if (rv) {
1149 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1150 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1151 return;
1152 }
1153
1154 switch (mdev->state.conn) {
1155 case C_STARTING_SYNC_T:
1156 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1157 break;
1158 case C_STARTING_SYNC_S:
1159 drbd_start_resync(mdev, C_SYNC_SOURCE);
1160 break;
1161 }
1162}
1163
1164/**
1165 * after_state_ch() - Perform after state change actions that may sleep
1166 * @mdev: DRBD device.
1167 * @os: old state.
1168 * @ns: new state.
1169 * @flags: Flags
1170 */
1171static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1172 union drbd_state ns, enum chg_state_flags flags)
1173{
1174 enum drbd_fencing_p fp;
1175
1176 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1177 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1178 if (mdev->p_uuid)
1179 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1180 }
1181
1182 fp = FP_DONT_CARE;
1183 if (get_ldev(mdev)) {
1184 fp = mdev->ldev->dc.fencing;
1185 put_ldev(mdev);
1186 }
1187
1188 /* Inform userspace about the change... */
1189 drbd_bcast_state(mdev, ns);
1190
1191 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1192 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1193 drbd_khelper(mdev, "pri-on-incon-degr");
1194
1195 /* Here we have the actions that are performed after a
1196 state change. This function might sleep */
1197
1198 if (fp == FP_STONITH && ns.susp) {
1199 /* case1: The outdate peer handler is successful:
1200 * case2: The connection was established again: */
1201 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1202 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1203 tl_clear(mdev);
1204 spin_lock_irq(&mdev->req_lock);
1205 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1206 spin_unlock_irq(&mdev->req_lock);
1207 }
1208 }
1209 /* Do not change the order of the if above and the two below... */
1210 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1211 drbd_send_uuids(mdev);
1212 drbd_send_state(mdev);
1213 }
1214 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1215 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1216
1217 /* Lost contact to peer's copy of the data */
1218 if ((os.pdsk >= D_INCONSISTENT &&
1219 os.pdsk != D_UNKNOWN &&
1220 os.pdsk != D_OUTDATED)
1221 && (ns.pdsk < D_INCONSISTENT ||
1222 ns.pdsk == D_UNKNOWN ||
1223 ns.pdsk == D_OUTDATED)) {
1224 kfree(mdev->p_uuid);
1225 mdev->p_uuid = NULL;
1226 if (get_ldev(mdev)) {
1227 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1228 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1229 drbd_uuid_new_current(mdev);
1230 drbd_send_uuids(mdev);
1231 }
1232 put_ldev(mdev);
1233 }
1234 }
1235
1236 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1237 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1238 drbd_uuid_new_current(mdev);
1239
1240 /* D_DISKLESS Peer becomes secondary */
1241 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1242 drbd_al_to_on_disk_bm(mdev);
1243 put_ldev(mdev);
1244 }
1245
1246 /* Last part of the attaching process ... */
1247 if (ns.conn >= C_CONNECTED &&
1248 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1249 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1250 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1251 drbd_send_sizes(mdev, 0); /* to start sync... */
1252 drbd_send_uuids(mdev);
1253 drbd_send_state(mdev);
1254 }
1255
1256 /* We want to pause/continue resync, tell peer. */
1257 if (ns.conn >= C_CONNECTED &&
1258 ((os.aftr_isp != ns.aftr_isp) ||
1259 (os.user_isp != ns.user_isp)))
1260 drbd_send_state(mdev);
1261
1262 /* In case one of the isp bits got set, suspend other devices. */
1263 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1264 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1265 suspend_other_sg(mdev);
1266
1267 /* Make sure the peer gets informed about eventual state
1268 changes (ISP bits) while we were in WFReportParams. */
1269 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1270 drbd_send_state(mdev);
1271
1272 /* We are in the progress to start a full sync... */
1273 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1274 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1275 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1276
1277 /* We are invalidating our self... */
1278 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1279 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1280 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1281
1282 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1283 enum drbd_io_error_p eh;
1284
1285 eh = EP_PASS_ON;
1286 if (get_ldev_if_state(mdev, D_FAILED)) {
1287 eh = mdev->ldev->dc.on_io_error;
1288 put_ldev(mdev);
1289 }
1290
1291 drbd_rs_cancel_all(mdev);
1292 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1293 and it is D_DISKLESS here, local_cnt can only go down, it can
1294 not increase... It will reach zero */
1295 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1296 mdev->rs_total = 0;
1297 mdev->rs_failed = 0;
1298 atomic_set(&mdev->rs_pending_cnt, 0);
1299
1300 spin_lock_irq(&mdev->req_lock);
1301 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1302 spin_unlock_irq(&mdev->req_lock);
1303
1304 if (eh == EP_CALL_HELPER)
1305 drbd_khelper(mdev, "local-io-error");
1306 }
1307
1308 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1309
1310 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1311 if (drbd_send_state(mdev))
1312 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1313 else
1314 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1315 }
1316
1317 lc_destroy(mdev->resync);
1318 mdev->resync = NULL;
1319 lc_destroy(mdev->act_log);
1320 mdev->act_log = NULL;
1321 __no_warn(local,
1322 drbd_free_bc(mdev->ldev);
1323 mdev->ldev = NULL;);
1324
1325 if (mdev->md_io_tmpp)
1326 __free_page(mdev->md_io_tmpp);
1327 }
1328
1329 /* Disks got bigger while they were detached */
1330 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1331 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1332 if (ns.conn == C_CONNECTED)
1333 resync_after_online_grow(mdev);
1334 }
1335
1336 /* A resync finished or aborted, wake paused devices... */
1337 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1338 (os.peer_isp && !ns.peer_isp) ||
1339 (os.user_isp && !ns.user_isp))
1340 resume_next_sg(mdev);
1341
1342 /* Upon network connection, we need to start the receiver */
1343 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1344 drbd_thread_start(&mdev->receiver);
1345
1346 /* Terminate worker thread if we are unconfigured - it will be
1347 restarted as needed... */
1348 if (ns.disk == D_DISKLESS &&
1349 ns.conn == C_STANDALONE &&
1350 ns.role == R_SECONDARY) {
1351 if (os.aftr_isp != ns.aftr_isp)
1352 resume_next_sg(mdev);
1353 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1354 if (test_bit(DEVICE_DYING, &mdev->flags))
1355 drbd_thread_stop_nowait(&mdev->worker);
1356 }
1357
1358 drbd_md_sync(mdev);
1359}
1360
1361
1362static int drbd_thread_setup(void *arg)
1363{
1364 struct drbd_thread *thi = (struct drbd_thread *) arg;
1365 struct drbd_conf *mdev = thi->mdev;
1366 unsigned long flags;
1367 int retval;
1368
1369restart:
1370 retval = thi->function(thi);
1371
1372 spin_lock_irqsave(&thi->t_lock, flags);
1373
1374 /* if the receiver has been "Exiting", the last thing it did
1375 * was set the conn state to "StandAlone",
1376 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1377 * and receiver thread will be "started".
1378 * drbd_thread_start needs to set "Restarting" in that case.
1379 * t_state check and assignment needs to be within the same spinlock,
1380 * so either thread_start sees Exiting, and can remap to Restarting,
1381 * or thread_start see None, and can proceed as normal.
1382 */
1383
1384 if (thi->t_state == Restarting) {
1385 dev_info(DEV, "Restarting %s\n", current->comm);
1386 thi->t_state = Running;
1387 spin_unlock_irqrestore(&thi->t_lock, flags);
1388 goto restart;
1389 }
1390
1391 thi->task = NULL;
1392 thi->t_state = None;
1393 smp_mb();
1394 complete(&thi->stop);
1395 spin_unlock_irqrestore(&thi->t_lock, flags);
1396
1397 dev_info(DEV, "Terminating %s\n", current->comm);
1398
1399 /* Release mod reference taken when thread was started */
1400 module_put(THIS_MODULE);
1401 return retval;
1402}
1403
1404static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1405 int (*func) (struct drbd_thread *))
1406{
1407 spin_lock_init(&thi->t_lock);
1408 thi->task = NULL;
1409 thi->t_state = None;
1410 thi->function = func;
1411 thi->mdev = mdev;
1412}
1413
1414int drbd_thread_start(struct drbd_thread *thi)
1415{
1416 struct drbd_conf *mdev = thi->mdev;
1417 struct task_struct *nt;
1418 unsigned long flags;
1419
1420 const char *me =
1421 thi == &mdev->receiver ? "receiver" :
1422 thi == &mdev->asender ? "asender" :
1423 thi == &mdev->worker ? "worker" : "NONSENSE";
1424
1425 /* is used from state engine doing drbd_thread_stop_nowait,
1426 * while holding the req lock irqsave */
1427 spin_lock_irqsave(&thi->t_lock, flags);
1428
1429 switch (thi->t_state) {
1430 case None:
1431 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1432 me, current->comm, current->pid);
1433
1434 /* Get ref on module for thread - this is released when thread exits */
1435 if (!try_module_get(THIS_MODULE)) {
1436 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1437 spin_unlock_irqrestore(&thi->t_lock, flags);
1438 return FALSE;
1439 }
1440
1441 init_completion(&thi->stop);
1442 D_ASSERT(thi->task == NULL);
1443 thi->reset_cpu_mask = 1;
1444 thi->t_state = Running;
1445 spin_unlock_irqrestore(&thi->t_lock, flags);
1446 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1447
1448 nt = kthread_create(drbd_thread_setup, (void *) thi,
1449 "drbd%d_%s", mdev_to_minor(mdev), me);
1450
1451 if (IS_ERR(nt)) {
1452 dev_err(DEV, "Couldn't start thread\n");
1453
1454 module_put(THIS_MODULE);
1455 return FALSE;
1456 }
1457 spin_lock_irqsave(&thi->t_lock, flags);
1458 thi->task = nt;
1459 thi->t_state = Running;
1460 spin_unlock_irqrestore(&thi->t_lock, flags);
1461 wake_up_process(nt);
1462 break;
1463 case Exiting:
1464 thi->t_state = Restarting;
1465 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1466 me, current->comm, current->pid);
1467 /* fall through */
1468 case Running:
1469 case Restarting:
1470 default:
1471 spin_unlock_irqrestore(&thi->t_lock, flags);
1472 break;
1473 }
1474
1475 return TRUE;
1476}
1477
1478
1479void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1480{
1481 unsigned long flags;
1482
1483 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1484
1485 /* may be called from state engine, holding the req lock irqsave */
1486 spin_lock_irqsave(&thi->t_lock, flags);
1487
1488 if (thi->t_state == None) {
1489 spin_unlock_irqrestore(&thi->t_lock, flags);
1490 if (restart)
1491 drbd_thread_start(thi);
1492 return;
1493 }
1494
1495 if (thi->t_state != ns) {
1496 if (thi->task == NULL) {
1497 spin_unlock_irqrestore(&thi->t_lock, flags);
1498 return;
1499 }
1500
1501 thi->t_state = ns;
1502 smp_mb();
1503 init_completion(&thi->stop);
1504 if (thi->task != current)
1505 force_sig(DRBD_SIGKILL, thi->task);
1506
1507 }
1508
1509 spin_unlock_irqrestore(&thi->t_lock, flags);
1510
1511 if (wait)
1512 wait_for_completion(&thi->stop);
1513}
1514
1515#ifdef CONFIG_SMP
1516/**
1517 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1518 * @mdev: DRBD device.
1519 *
1520 * Forces all threads of a device onto the same CPU. This is beneficial for
1521 * DRBD's performance. May be overwritten by user's configuration.
1522 */
1523void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1524{
1525 int ord, cpu;
1526
1527 /* user override. */
1528 if (cpumask_weight(mdev->cpu_mask))
1529 return;
1530
1531 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1532 for_each_online_cpu(cpu) {
1533 if (ord-- == 0) {
1534 cpumask_set_cpu(cpu, mdev->cpu_mask);
1535 return;
1536 }
1537 }
1538 /* should not be reached */
1539 cpumask_setall(mdev->cpu_mask);
1540}
1541
1542/**
1543 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1544 * @mdev: DRBD device.
1545 *
1546 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1547 * prematurely.
1548 */
1549void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1550{
1551 struct task_struct *p = current;
1552 struct drbd_thread *thi =
1553 p == mdev->asender.task ? &mdev->asender :
1554 p == mdev->receiver.task ? &mdev->receiver :
1555 p == mdev->worker.task ? &mdev->worker :
1556 NULL;
1557 ERR_IF(thi == NULL)
1558 return;
1559 if (!thi->reset_cpu_mask)
1560 return;
1561 thi->reset_cpu_mask = 0;
1562 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1563}
1564#endif
1565
1566/* the appropriate socket mutex must be held already */
1567int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1568 enum drbd_packets cmd, struct p_header *h,
1569 size_t size, unsigned msg_flags)
1570{
1571 int sent, ok;
1572
1573 ERR_IF(!h) return FALSE;
1574 ERR_IF(!size) return FALSE;
1575
1576 h->magic = BE_DRBD_MAGIC;
1577 h->command = cpu_to_be16(cmd);
1578 h->length = cpu_to_be16(size-sizeof(struct p_header));
1579
1580 trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__);
1581 sent = drbd_send(mdev, sock, h, size, msg_flags);
1582
1583 ok = (sent == size);
1584 if (!ok)
1585 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1586 cmdname(cmd), (int)size, sent);
1587 return ok;
1588}
1589
1590/* don't pass the socket. we may only look at it
1591 * when we hold the appropriate socket mutex.
1592 */
1593int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1594 enum drbd_packets cmd, struct p_header *h, size_t size)
1595{
1596 int ok = 0;
1597 struct socket *sock;
1598
1599 if (use_data_socket) {
1600 mutex_lock(&mdev->data.mutex);
1601 sock = mdev->data.socket;
1602 } else {
1603 mutex_lock(&mdev->meta.mutex);
1604 sock = mdev->meta.socket;
1605 }
1606
1607 /* drbd_disconnect() could have called drbd_free_sock()
1608 * while we were waiting in down()... */
1609 if (likely(sock != NULL))
1610 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1611
1612 if (use_data_socket)
1613 mutex_unlock(&mdev->data.mutex);
1614 else
1615 mutex_unlock(&mdev->meta.mutex);
1616 return ok;
1617}
1618
1619int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1620 size_t size)
1621{
1622 struct p_header h;
1623 int ok;
1624
1625 h.magic = BE_DRBD_MAGIC;
1626 h.command = cpu_to_be16(cmd);
1627 h.length = cpu_to_be16(size);
1628
1629 if (!drbd_get_data_sock(mdev))
1630 return 0;
1631
1632 trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__);
1633
1634 ok = (sizeof(h) ==
1635 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1636 ok = ok && (size ==
1637 drbd_send(mdev, mdev->data.socket, data, size, 0));
1638
1639 drbd_put_data_sock(mdev);
1640
1641 return ok;
1642}
1643
1644int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1645{
1646 struct p_rs_param_89 *p;
1647 struct socket *sock;
1648 int size, rv;
1649 const int apv = mdev->agreed_pro_version;
1650
1651 size = apv <= 87 ? sizeof(struct p_rs_param)
1652 : apv == 88 ? sizeof(struct p_rs_param)
1653 + strlen(mdev->sync_conf.verify_alg) + 1
1654 : /* 89 */ sizeof(struct p_rs_param_89);
1655
1656 /* used from admin command context and receiver/worker context.
1657 * to avoid kmalloc, grab the socket right here,
1658 * then use the pre-allocated sbuf there */
1659 mutex_lock(&mdev->data.mutex);
1660 sock = mdev->data.socket;
1661
1662 if (likely(sock != NULL)) {
1663 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1664
1665 p = &mdev->data.sbuf.rs_param_89;
1666
1667 /* initialize verify_alg and csums_alg */
1668 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1669
1670 p->rate = cpu_to_be32(sc->rate);
1671
1672 if (apv >= 88)
1673 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1674 if (apv >= 89)
1675 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1676
1677 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1678 } else
1679 rv = 0; /* not ok */
1680
1681 mutex_unlock(&mdev->data.mutex);
1682
1683 return rv;
1684}
1685
1686int drbd_send_protocol(struct drbd_conf *mdev)
1687{
1688 struct p_protocol *p;
1689 int size, rv;
1690
1691 size = sizeof(struct p_protocol);
1692
1693 if (mdev->agreed_pro_version >= 87)
1694 size += strlen(mdev->net_conf->integrity_alg) + 1;
1695
1696 /* we must not recurse into our own queue,
1697 * as that is blocked during handshake */
1698 p = kmalloc(size, GFP_NOIO);
1699 if (p == NULL)
1700 return 0;
1701
1702 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1703 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1704 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1705 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1706 p->want_lose = cpu_to_be32(mdev->net_conf->want_lose);
1707 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1708
1709 if (mdev->agreed_pro_version >= 87)
1710 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1711
1712 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1713 (struct p_header *)p, size);
1714 kfree(p);
1715 return rv;
1716}
1717
1718int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1719{
1720 struct p_uuids p;
1721 int i;
1722
1723 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1724 return 1;
1725
1726 for (i = UI_CURRENT; i < UI_SIZE; i++)
1727 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1728
1729 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1730 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1731 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1732 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1733 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1734 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1735
1736 put_ldev(mdev);
1737
1738 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1739 (struct p_header *)&p, sizeof(p));
1740}
1741
1742int drbd_send_uuids(struct drbd_conf *mdev)
1743{
1744 return _drbd_send_uuids(mdev, 0);
1745}
1746
1747int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1748{
1749 return _drbd_send_uuids(mdev, 8);
1750}
1751
1752
1753int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1754{
1755 struct p_rs_uuid p;
1756
1757 p.uuid = cpu_to_be64(val);
1758
1759 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1760 (struct p_header *)&p, sizeof(p));
1761}
1762
1763int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1764{
1765 struct p_sizes p;
1766 sector_t d_size, u_size;
1767 int q_order_type;
1768 int ok;
1769
1770 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1771 D_ASSERT(mdev->ldev->backing_bdev);
1772 d_size = drbd_get_max_capacity(mdev->ldev);
1773 u_size = mdev->ldev->dc.disk_size;
1774 q_order_type = drbd_queue_order_type(mdev);
1775 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1776 put_ldev(mdev);
1777 } else {
1778 d_size = 0;
1779 u_size = 0;
1780 q_order_type = QUEUE_ORDERED_NONE;
1781 }
1782
1783 p.d_size = cpu_to_be64(d_size);
1784 p.u_size = cpu_to_be64(u_size);
1785 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1786 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1787 p.queue_order_type = cpu_to_be32(q_order_type);
1788
1789 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1790 (struct p_header *)&p, sizeof(p));
1791 return ok;
1792}
1793
1794/**
1795 * drbd_send_state() - Sends the drbd state to the peer
1796 * @mdev: DRBD device.
1797 */
1798int drbd_send_state(struct drbd_conf *mdev)
1799{
1800 struct socket *sock;
1801 struct p_state p;
1802 int ok = 0;
1803
1804 /* Grab state lock so we wont send state if we're in the middle
1805 * of a cluster wide state change on another thread */
1806 drbd_state_lock(mdev);
1807
1808 mutex_lock(&mdev->data.mutex);
1809
1810 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1811 sock = mdev->data.socket;
1812
1813 if (likely(sock != NULL)) {
1814 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1815 (struct p_header *)&p, sizeof(p), 0);
1816 }
1817
1818 mutex_unlock(&mdev->data.mutex);
1819
1820 drbd_state_unlock(mdev);
1821 return ok;
1822}
1823
1824int drbd_send_state_req(struct drbd_conf *mdev,
1825 union drbd_state mask, union drbd_state val)
1826{
1827 struct p_req_state p;
1828
1829 p.mask = cpu_to_be32(mask.i);
1830 p.val = cpu_to_be32(val.i);
1831
1832 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1833 (struct p_header *)&p, sizeof(p));
1834}
1835
1836int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1837{
1838 struct p_req_state_reply p;
1839
1840 p.retcode = cpu_to_be32(retcode);
1841
1842 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1843 (struct p_header *)&p, sizeof(p));
1844}
1845
1846int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1847 struct p_compressed_bm *p,
1848 struct bm_xfer_ctx *c)
1849{
1850 struct bitstream bs;
1851 unsigned long plain_bits;
1852 unsigned long tmp;
1853 unsigned long rl;
1854 unsigned len;
1855 unsigned toggle;
1856 int bits;
1857
1858 /* may we use this feature? */
1859 if ((mdev->sync_conf.use_rle == 0) ||
1860 (mdev->agreed_pro_version < 90))
1861 return 0;
1862
1863 if (c->bit_offset >= c->bm_bits)
1864 return 0; /* nothing to do. */
1865
1866 /* use at most thus many bytes */
1867 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1868 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1869 /* plain bits covered in this code string */
1870 plain_bits = 0;
1871
1872 /* p->encoding & 0x80 stores whether the first run length is set.
1873 * bit offset is implicit.
1874 * start with toggle == 2 to be able to tell the first iteration */
1875 toggle = 2;
1876
1877 /* see how much plain bits we can stuff into one packet
1878 * using RLE and VLI. */
1879 do {
1880 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1881 : _drbd_bm_find_next(mdev, c->bit_offset);
1882 if (tmp == -1UL)
1883 tmp = c->bm_bits;
1884 rl = tmp - c->bit_offset;
1885
1886 if (toggle == 2) { /* first iteration */
1887 if (rl == 0) {
1888 /* the first checked bit was set,
1889 * store start value, */
1890 DCBP_set_start(p, 1);
1891 /* but skip encoding of zero run length */
1892 toggle = !toggle;
1893 continue;
1894 }
1895 DCBP_set_start(p, 0);
1896 }
1897
1898 /* paranoia: catch zero runlength.
1899 * can only happen if bitmap is modified while we scan it. */
1900 if (rl == 0) {
1901 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1902 "t:%u bo:%lu\n", toggle, c->bit_offset);
1903 return -1;
1904 }
1905
1906 bits = vli_encode_bits(&bs, rl);
1907 if (bits == -ENOBUFS) /* buffer full */
1908 break;
1909 if (bits <= 0) {
1910 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1911 return 0;
1912 }
1913
1914 toggle = !toggle;
1915 plain_bits += rl;
1916 c->bit_offset = tmp;
1917 } while (c->bit_offset < c->bm_bits);
1918
1919 len = bs.cur.b - p->code + !!bs.cur.bit;
1920
1921 if (plain_bits < (len << 3)) {
1922 /* incompressible with this method.
1923 * we need to rewind both word and bit position. */
1924 c->bit_offset -= plain_bits;
1925 bm_xfer_ctx_bit_to_word_offset(c);
1926 c->bit_offset = c->word_offset * BITS_PER_LONG;
1927 return 0;
1928 }
1929
1930 /* RLE + VLI was able to compress it just fine.
1931 * update c->word_offset. */
1932 bm_xfer_ctx_bit_to_word_offset(c);
1933
1934 /* store pad_bits */
1935 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1936
1937 return len;
1938}
1939
1940enum { OK, FAILED, DONE }
1941send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1942 struct p_header *h, struct bm_xfer_ctx *c)
1943{
1944 struct p_compressed_bm *p = (void*)h;
1945 unsigned long num_words;
1946 int len;
1947 int ok;
1948
1949 len = fill_bitmap_rle_bits(mdev, p, c);
1950
1951 if (len < 0)
1952 return FAILED;
1953
1954 if (len) {
1955 DCBP_set_code(p, RLE_VLI_Bits);
1956 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1957 sizeof(*p) + len, 0);
1958
1959 c->packets[0]++;
1960 c->bytes[0] += sizeof(*p) + len;
1961
1962 if (c->bit_offset >= c->bm_bits)
1963 len = 0; /* DONE */
1964 } else {
1965 /* was not compressible.
1966 * send a buffer full of plain text bits instead. */
1967 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1968 len = num_words * sizeof(long);
1969 if (len)
1970 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1971 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1972 h, sizeof(struct p_header) + len, 0);
1973 c->word_offset += num_words;
1974 c->bit_offset = c->word_offset * BITS_PER_LONG;
1975
1976 c->packets[1]++;
1977 c->bytes[1] += sizeof(struct p_header) + len;
1978
1979 if (c->bit_offset > c->bm_bits)
1980 c->bit_offset = c->bm_bits;
1981 }
1982 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1983
1984 if (ok == DONE)
1985 INFO_bm_xfer_stats(mdev, "send", c);
1986 return ok;
1987}
1988
1989/* See the comment at receive_bitmap() */
1990int _drbd_send_bitmap(struct drbd_conf *mdev)
1991{
1992 struct bm_xfer_ctx c;
1993 struct p_header *p;
1994 int ret;
1995
1996 ERR_IF(!mdev->bitmap) return FALSE;
1997
1998 /* maybe we should use some per thread scratch page,
1999 * and allocate that during initial device creation? */
2000 p = (struct p_header *) __get_free_page(GFP_NOIO);
2001 if (!p) {
2002 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2003 return FALSE;
2004 }
2005
2006 if (get_ldev(mdev)) {
2007 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2008 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2009 drbd_bm_set_all(mdev);
2010 if (drbd_bm_write(mdev)) {
2011 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2012 * but otherwise process as per normal - need to tell other
2013 * side that a full resync is required! */
2014 dev_err(DEV, "Failed to write bitmap to disk!\n");
2015 } else {
2016 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2017 drbd_md_sync(mdev);
2018 }
2019 }
2020 put_ldev(mdev);
2021 }
2022
2023 c = (struct bm_xfer_ctx) {
2024 .bm_bits = drbd_bm_bits(mdev),
2025 .bm_words = drbd_bm_words(mdev),
2026 };
2027
2028 do {
2029 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2030 } while (ret == OK);
2031
2032 free_page((unsigned long) p);
2033 return (ret == DONE);
2034}
2035
/*
 * Send the bitmap with the data socket held.
 *
 * Returns -1 if the data socket could not be obtained, 0 on success and
 * 1 if _drbd_send_bitmap() failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed;

	if (!drbd_get_data_sock(mdev))
		return -1;

	failed = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return failed;
}
2046
2047int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2048{
2049 int ok;
2050 struct p_barrier_ack p;
2051
2052 p.barrier = barrier_nr;
2053 p.set_size = cpu_to_be32(set_size);
2054
2055 if (mdev->state.conn < C_CONNECTED)
2056 return FALSE;
2057 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2058 (struct p_header *)&p, sizeof(p));
2059 return ok;
2060}
2061
2062/**
2063 * _drbd_send_ack() - Sends an ack packet
2064 * @mdev: DRBD device.
2065 * @cmd: Packet command code.
2066 * @sector: sector, needs to be in big endian byte order
2067 * @blksize: size in byte, needs to be in big endian byte order
2068 * @block_id: Id, big endian byte order
2069 */
2070static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2071 u64 sector,
2072 u32 blksize,
2073 u64 block_id)
2074{
2075 int ok;
2076 struct p_block_ack p;
2077
2078 p.sector = sector;
2079 p.block_id = block_id;
2080 p.blksize = blksize;
2081 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2082
2083 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2084 return FALSE;
2085 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2086 (struct p_header *)&p, sizeof(p));
2087 return ok;
2088}
2089
2090int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2091 struct p_data *dp)
2092{
2093 const int header_size = sizeof(struct p_data)
2094 - sizeof(struct p_header);
2095 int data_size = ((struct p_header *)dp)->length - header_size;
2096
2097 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2098 dp->block_id);
2099}
2100
2101int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2102 struct p_block_req *rp)
2103{
2104 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2105}
2106
2107/**
2108 * drbd_send_ack() - Sends an ack packet
2109 * @mdev: DRBD device.
2110 * @cmd: Packet command code.
2111 * @e: Epoch entry.
2112 */
2113int drbd_send_ack(struct drbd_conf *mdev,
2114 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2115{
2116 return _drbd_send_ack(mdev, cmd,
2117 cpu_to_be64(e->sector),
2118 cpu_to_be32(e->size),
2119 e->block_id);
2120}
2121
2122/* This function misuses the block_id field to signal if the blocks
2123 * are is sync or not. */
2124int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2125 sector_t sector, int blksize, u64 block_id)
2126{
2127 return _drbd_send_ack(mdev, cmd,
2128 cpu_to_be64(sector),
2129 cpu_to_be32(blksize),
2130 cpu_to_be64(block_id));
2131}
2132
2133int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2134 sector_t sector, int size, u64 block_id)
2135{
2136 int ok;
2137 struct p_block_req p;
2138
2139 p.sector = cpu_to_be64(sector);
2140 p.block_id = block_id;
2141 p.blksize = cpu_to_be32(size);
2142
2143 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2144 (struct p_header *)&p, sizeof(p));
2145 return ok;
2146}
2147
2148int drbd_send_drequest_csum(struct drbd_conf *mdev,
2149 sector_t sector, int size,
2150 void *digest, int digest_size,
2151 enum drbd_packets cmd)
2152{
2153 int ok;
2154 struct p_block_req p;
2155
2156 p.sector = cpu_to_be64(sector);
2157 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2158 p.blksize = cpu_to_be32(size);
2159
2160 p.head.magic = BE_DRBD_MAGIC;
2161 p.head.command = cpu_to_be16(cmd);
2162 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2163
2164 mutex_lock(&mdev->data.mutex);
2165
2166 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2167 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2168
2169 mutex_unlock(&mdev->data.mutex);
2170
2171 return ok;
2172}
2173
2174int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2175{
2176 int ok;
2177 struct p_block_req p;
2178
2179 p.sector = cpu_to_be64(sector);
2180 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2181 p.blksize = cpu_to_be32(size);
2182
2183 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2184 (struct p_header *)&p, sizeof(p));
2185 return ok;
2186}
2187
2188/* called on sndtimeo
2189 * returns FALSE if we should retry,
2190 * TRUE if we think connection is dead
2191 */
2192static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2193{
2194 int drop_it;
2195 /* long elapsed = (long)(jiffies - mdev->last_received); */
2196
2197 drop_it = mdev->meta.socket == sock
2198 || !mdev->asender.task
2199 || get_t_state(&mdev->asender) != Running
2200 || mdev->state.conn < C_CONNECTED;
2201
2202 if (drop_it)
2203 return TRUE;
2204
2205 drop_it = !--mdev->ko_count;
2206 if (!drop_it) {
2207 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2208 current->comm, current->pid, mdev->ko_count);
2209 request_ping(mdev);
2210 }
2211
2212 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2213}
2214
2215/* The idea of sendpage seems to be to put some kind of reference
2216 * to the page into the skb, and to hand it over to the NIC. In
2217 * this process get_page() gets called.
2218 *
2219 * As soon as the page was really sent over the network put_page()
2220 * gets called by some part of the network layer. [ NIC driver? ]
2221 *
2222 * [ get_page() / put_page() increment/decrement the count. If count
2223 * reaches 0 the page will be freed. ]
2224 *
2225 * This works nicely with pages from FSs.
2226 * But this means that in protocol A we might signal IO completion too early!
2227 *
2228 * In order not to corrupt data during a resync we must make sure
2229 * that we do not reuse our own buffer pages (EEs) too early, therefore
2230 * we have the net_ee list.
2231 *
2232 * XFS seems to have problems, still, it submits pages with page_count == 0!
2233 * As a workaround, we disable sendpage on pages
2234 * with page_count == 0 or PageSlab.
2235 */
2236static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2237 int offset, size_t size)
2238{
2239 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2240 kunmap(page);
2241 if (sent == size)
2242 mdev->send_cnt += size>>9;
2243 return sent == size;
2244}
2245
2246static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2247 int offset, size_t size)
2248{
2249 mm_segment_t oldfs = get_fs();
2250 int sent, ok;
2251 int len = size;
2252
2253 /* e.g. XFS meta- & log-data is in slab pages, which have a
2254 * page_count of 0 and/or have PageSlab() set.
2255 * we cannot use send_page for those, as that does get_page();
2256 * put_page(); and would cause either a VM_BUG directly, or
2257 * __page_cache_release a page that would actually still be referenced
2258 * by someone, leading to some obscure delayed Oops somewhere else. */
2259 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2260 return _drbd_no_send_page(mdev, page, offset, size);
2261
2262 drbd_update_congested(mdev);
2263 set_fs(KERNEL_DS);
2264 do {
2265 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2266 offset, len,
2267 MSG_NOSIGNAL);
2268 if (sent == -EAGAIN) {
2269 if (we_should_drop_the_connection(mdev,
2270 mdev->data.socket))
2271 break;
2272 else
2273 continue;
2274 }
2275 if (sent <= 0) {
2276 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2277 __func__, (int)size, len, sent);
2278 break;
2279 }
2280 len -= sent;
2281 offset += sent;
2282 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2283 set_fs(oldfs);
2284 clear_bit(NET_CONGESTED, &mdev->flags);
2285
2286 ok = (len == 0);
2287 if (likely(ok))
2288 mdev->send_cnt += size>>9;
2289 return ok;
2290}
2291
2292static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2293{
2294 struct bio_vec *bvec;
2295 int i;
2296 __bio_for_each_segment(bvec, bio, i, 0) {
2297 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2298 bvec->bv_offset, bvec->bv_len))
2299 return 0;
2300 }
2301 return 1;
2302}
2303
2304static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2305{
2306 struct bio_vec *bvec;
2307 int i;
2308 __bio_for_each_segment(bvec, bio, i, 0) {
2309 if (!_drbd_send_page(mdev, bvec->bv_page,
2310 bvec->bv_offset, bvec->bv_len))
2311 return 0;
2312 }
2313
2314 return 1;
2315}
2316
2317/* Used to send write requests
2318 * R_PRIMARY -> Peer (P_DATA)
2319 */
2320int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2321{
2322 int ok = 1;
2323 struct p_data p;
2324 unsigned int dp_flags = 0;
2325 void *dgb;
2326 int dgs;
2327
2328 if (!drbd_get_data_sock(mdev))
2329 return 0;
2330
2331 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2332 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2333
2334 p.head.magic = BE_DRBD_MAGIC;
2335 p.head.command = cpu_to_be16(P_DATA);
2336 p.head.length =
2337 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2338
2339 p.sector = cpu_to_be64(req->sector);
2340 p.block_id = (unsigned long)req;
2341 p.seq_num = cpu_to_be32(req->seq_num =
2342 atomic_add_return(1, &mdev->packet_seq));
2343 dp_flags = 0;
2344
2345 /* NOTE: no need to check if barriers supported here as we would
2346 * not pass the test in make_request_common in that case
2347 */
2348 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2349 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2350 /* dp_flags |= DP_HARDBARRIER; */
2351 }
2352 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2353 dp_flags |= DP_RW_SYNC;
2354 /* for now handle SYNCIO and UNPLUG
2355 * as if they still were one and the same flag */
2356 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2357 dp_flags |= DP_RW_SYNC;
2358 if (mdev->state.conn >= C_SYNC_SOURCE &&
2359 mdev->state.conn <= C_PAUSED_SYNC_T)
2360 dp_flags |= DP_MAY_SET_IN_SYNC;
2361
2362 p.dp_flags = cpu_to_be32(dp_flags);
2363 trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__);
2364 set_bit(UNPLUG_REMOTE, &mdev->flags);
2365 ok = (sizeof(p) ==
2366 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2367 if (ok && dgs) {
2368 dgb = mdev->int_dig_out;
2369 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2370 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2371 }
2372 if (ok) {
2373 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2374 ok = _drbd_send_bio(mdev, req->master_bio);
2375 else
2376 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2377 }
2378
2379 drbd_put_data_sock(mdev);
2380 return ok;
2381}
2382
2383/* answer packet, used to send data back for read requests:
2384 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2385 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2386 */
2387int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2388 struct drbd_epoch_entry *e)
2389{
2390 int ok;
2391 struct p_data p;
2392 void *dgb;
2393 int dgs;
2394
2395 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2396 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2397
2398 p.head.magic = BE_DRBD_MAGIC;
2399 p.head.command = cpu_to_be16(cmd);
2400 p.head.length =
2401 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2402
2403 p.sector = cpu_to_be64(e->sector);
2404 p.block_id = e->block_id;
2405 /* p.seq_num = 0; No sequence numbers here.. */
2406
2407 /* Only called by our kernel thread.
2408 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2409 * in response to admin command or module unload.
2410 */
2411 if (!drbd_get_data_sock(mdev))
2412 return 0;
2413
2414 trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__);
2415 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2416 sizeof(p), MSG_MORE);
2417 if (ok && dgs) {
2418 dgb = mdev->int_dig_out;
2419 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2420 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2421 }
2422 if (ok)
2423 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2424
2425 drbd_put_data_sock(mdev);
2426 return ok;
2427}
2428
2429/*
2430 drbd_send distinguishes two cases:
2431
2432 Packets sent via the data socket "sock"
2433 and packets sent via the meta data socket "msock"
2434
2435 sock msock
2436 -----------------+-------------------------+------------------------------
2437 timeout conf.timeout / 2 conf.timeout / 2
2438 timeout action send a ping via msock Abort communication
2439 and close all sockets
2440*/
2441
2442/*
2443 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2444 */
2445int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2446 void *buf, size_t size, unsigned msg_flags)
2447{
2448 struct kvec iov;
2449 struct msghdr msg;
2450 int rv, sent = 0;
2451
2452 if (!sock)
2453 return -1000;
2454
2455 /* THINK if (signal_pending) return ... ? */
2456
2457 iov.iov_base = buf;
2458 iov.iov_len = size;
2459
2460 msg.msg_name = NULL;
2461 msg.msg_namelen = 0;
2462 msg.msg_control = NULL;
2463 msg.msg_controllen = 0;
2464 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2465
2466 if (sock == mdev->data.socket) {
2467 mdev->ko_count = mdev->net_conf->ko_count;
2468 drbd_update_congested(mdev);
2469 }
2470 do {
2471 /* STRANGE
2472 * tcp_sendmsg does _not_ use its size parameter at all ?
2473 *
2474 * -EAGAIN on timeout, -EINTR on signal.
2475 */
2476/* THINK
2477 * do we need to block DRBD_SIG if sock == &meta.socket ??
2478 * otherwise wake_asender() might interrupt some send_*Ack !
2479 */
2480 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2481 if (rv == -EAGAIN) {
2482 if (we_should_drop_the_connection(mdev, sock))
2483 break;
2484 else
2485 continue;
2486 }
2487 D_ASSERT(rv != 0);
2488 if (rv == -EINTR) {
2489 flush_signals(current);
2490 rv = 0;
2491 }
2492 if (rv < 0)
2493 break;
2494 sent += rv;
2495 iov.iov_base += rv;
2496 iov.iov_len -= rv;
2497 } while (sent < size);
2498
2499 if (sock == mdev->data.socket)
2500 clear_bit(NET_CONGESTED, &mdev->flags);
2501
2502 if (rv <= 0) {
2503 if (rv != -EAGAIN) {
2504 dev_err(DEV, "%s_sendmsg returned %d\n",
2505 sock == mdev->meta.socket ? "msock" : "sock",
2506 rv);
2507 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2508 } else
2509 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2510 }
2511
2512 return sent;
2513}
2514
2515static int drbd_open(struct block_device *bdev, fmode_t mode)
2516{
2517 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2518 unsigned long flags;
2519 int rv = 0;
2520
2521 spin_lock_irqsave(&mdev->req_lock, flags);
2522 /* to have a stable mdev->state.role
2523 * and no race with updating open_cnt */
2524
2525 if (mdev->state.role != R_PRIMARY) {
2526 if (mode & FMODE_WRITE)
2527 rv = -EROFS;
2528 else if (!allow_oos)
2529 rv = -EMEDIUMTYPE;
2530 }
2531
2532 if (!rv)
2533 mdev->open_cnt++;
2534 spin_unlock_irqrestore(&mdev->req_lock, flags);
2535
2536 return rv;
2537}
2538
2539static int drbd_release(struct gendisk *gd, fmode_t mode)
2540{
2541 struct drbd_conf *mdev = gd->private_data;
2542 mdev->open_cnt--;
2543 return 0;
2544}
2545
2546static void drbd_unplug_fn(struct request_queue *q)
2547{
2548 struct drbd_conf *mdev = q->queuedata;
2549
2550 trace_drbd_unplug(mdev, "got unplugged");
2551
2552 /* unplug FIRST */
2553 spin_lock_irq(q->queue_lock);
2554 blk_remove_plug(q);
2555 spin_unlock_irq(q->queue_lock);
2556
2557 /* only if connected */
2558 spin_lock_irq(&mdev->req_lock);
2559 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2560 D_ASSERT(mdev->state.role == R_PRIMARY);
2561 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2562 /* add to the data.work queue,
2563 * unless already queued.
2564 * XXX this might be a good addition to drbd_queue_work
2565 * anyways, to detect "double queuing" ... */
2566 if (list_empty(&mdev->unplug_work.list))
2567 drbd_queue_work(&mdev->data.work,
2568 &mdev->unplug_work);
2569 }
2570 }
2571 spin_unlock_irq(&mdev->req_lock);
2572
2573 if (mdev->state.disk >= D_INCONSISTENT)
2574 drbd_kick_lo(mdev);
2575}
2576
2577static void drbd_set_defaults(struct drbd_conf *mdev)
2578{
2579 mdev->sync_conf.after = DRBD_AFTER_DEF;
2580 mdev->sync_conf.rate = DRBD_RATE_DEF;
2581 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2582 mdev->state = (union drbd_state) {
2583 { .role = R_SECONDARY,
2584 .peer = R_UNKNOWN,
2585 .conn = C_STANDALONE,
2586 .disk = D_DISKLESS,
2587 .pdsk = D_UNKNOWN,
2588 .susp = 0
2589 } };
2590}
2591
2592void drbd_init_set_defaults(struct drbd_conf *mdev)
2593{
2594 /* the memset(,0,) did most of this.
2595 * note: only assignments, no allocation in here */
2596
2597 drbd_set_defaults(mdev);
2598
2599 /* for now, we do NOT yet support it,
2600 * even though we start some framework
2601 * to eventually support barriers */
2602 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2603
2604 atomic_set(&mdev->ap_bio_cnt, 0);
2605 atomic_set(&mdev->ap_pending_cnt, 0);
2606 atomic_set(&mdev->rs_pending_cnt, 0);
2607 atomic_set(&mdev->unacked_cnt, 0);
2608 atomic_set(&mdev->local_cnt, 0);
2609 atomic_set(&mdev->net_cnt, 0);
2610 atomic_set(&mdev->packet_seq, 0);
2611 atomic_set(&mdev->pp_in_use, 0);
2612
2613 mutex_init(&mdev->md_io_mutex);
2614 mutex_init(&mdev->data.mutex);
2615 mutex_init(&mdev->meta.mutex);
2616 sema_init(&mdev->data.work.s, 0);
2617 sema_init(&mdev->meta.work.s, 0);
2618 mutex_init(&mdev->state_mutex);
2619
2620 spin_lock_init(&mdev->data.work.q_lock);
2621 spin_lock_init(&mdev->meta.work.q_lock);
2622
2623 spin_lock_init(&mdev->al_lock);
2624 spin_lock_init(&mdev->req_lock);
2625 spin_lock_init(&mdev->peer_seq_lock);
2626 spin_lock_init(&mdev->epoch_lock);
2627
2628 INIT_LIST_HEAD(&mdev->active_ee);
2629 INIT_LIST_HEAD(&mdev->sync_ee);
2630 INIT_LIST_HEAD(&mdev->done_ee);
2631 INIT_LIST_HEAD(&mdev->read_ee);
2632 INIT_LIST_HEAD(&mdev->net_ee);
2633 INIT_LIST_HEAD(&mdev->resync_reads);
2634 INIT_LIST_HEAD(&mdev->data.work.q);
2635 INIT_LIST_HEAD(&mdev->meta.work.q);
2636 INIT_LIST_HEAD(&mdev->resync_work.list);
2637 INIT_LIST_HEAD(&mdev->unplug_work.list);
2638 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2639 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2640 mdev->resync_work.cb = w_resync_inactive;
2641 mdev->unplug_work.cb = w_send_write_hint;
2642 mdev->md_sync_work.cb = w_md_sync;
2643 mdev->bm_io_work.w.cb = w_bitmap_io;
2644 init_timer(&mdev->resync_timer);
2645 init_timer(&mdev->md_sync_timer);
2646 mdev->resync_timer.function = resync_timer_fn;
2647 mdev->resync_timer.data = (unsigned long) mdev;
2648 mdev->md_sync_timer.function = md_sync_timer_fn;
2649 mdev->md_sync_timer.data = (unsigned long) mdev;
2650
2651 init_waitqueue_head(&mdev->misc_wait);
2652 init_waitqueue_head(&mdev->state_wait);
2653 init_waitqueue_head(&mdev->ee_wait);
2654 init_waitqueue_head(&mdev->al_wait);
2655 init_waitqueue_head(&mdev->seq_wait);
2656
2657 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2658 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2659 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2660
2661 mdev->agreed_pro_version = PRO_VERSION_MAX;
2662 mdev->write_ordering = WO_bio_barrier;
2663 mdev->resync_wenr = LC_FREE;
2664}
2665
2666void drbd_mdev_cleanup(struct drbd_conf *mdev)
2667{
2668 if (mdev->receiver.t_state != None)
2669 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2670 mdev->receiver.t_state);
2671
2672 /* no need to lock it, I'm the only thread alive */
2673 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2674 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2675 mdev->al_writ_cnt =
2676 mdev->bm_writ_cnt =
2677 mdev->read_cnt =
2678 mdev->recv_cnt =
2679 mdev->send_cnt =
2680 mdev->writ_cnt =
2681 mdev->p_size =
2682 mdev->rs_start =
2683 mdev->rs_total =
2684 mdev->rs_failed =
2685 mdev->rs_mark_left =
2686 mdev->rs_mark_time = 0;
2687 D_ASSERT(mdev->net_conf == NULL);
2688
2689 drbd_set_my_capacity(mdev, 0);
2690 if (mdev->bitmap) {
2691 /* maybe never allocated. */
2692 drbd_bm_resize(mdev, 0);
2693 drbd_bm_cleanup(mdev);
2694 }
2695
2696 drbd_free_resources(mdev);
2697
2698 /*
2699 * currently we drbd_init_ee only on module load, so
2700 * we may do drbd_release_ee only on module unload!
2701 */
2702 D_ASSERT(list_empty(&mdev->active_ee));
2703 D_ASSERT(list_empty(&mdev->sync_ee));
2704 D_ASSERT(list_empty(&mdev->done_ee));
2705 D_ASSERT(list_empty(&mdev->read_ee));
2706 D_ASSERT(list_empty(&mdev->net_ee));
2707 D_ASSERT(list_empty(&mdev->resync_reads));
2708 D_ASSERT(list_empty(&mdev->data.work.q));
2709 D_ASSERT(list_empty(&mdev->meta.work.q));
2710 D_ASSERT(list_empty(&mdev->resync_work.list));
2711 D_ASSERT(list_empty(&mdev->unplug_work.list));
2712
2713}
2714
2715
2716static void drbd_destroy_mempools(void)
2717{
2718 struct page *page;
2719
2720 while (drbd_pp_pool) {
2721 page = drbd_pp_pool;
2722 drbd_pp_pool = (struct page *)page_private(page);
2723 __free_page(page);
2724 drbd_pp_vacant--;
2725 }
2726
2727 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2728
2729 if (drbd_ee_mempool)
2730 mempool_destroy(drbd_ee_mempool);
2731 if (drbd_request_mempool)
2732 mempool_destroy(drbd_request_mempool);
2733 if (drbd_ee_cache)
2734 kmem_cache_destroy(drbd_ee_cache);
2735 if (drbd_request_cache)
2736 kmem_cache_destroy(drbd_request_cache);
2737 if (drbd_bm_ext_cache)
2738 kmem_cache_destroy(drbd_bm_ext_cache);
2739 if (drbd_al_ext_cache)
2740 kmem_cache_destroy(drbd_al_ext_cache);
2741
2742 drbd_ee_mempool = NULL;
2743 drbd_request_mempool = NULL;
2744 drbd_ee_cache = NULL;
2745 drbd_request_cache = NULL;
2746 drbd_bm_ext_cache = NULL;
2747 drbd_al_ext_cache = NULL;
2748
2749 return;
2750}
2751
2752static int drbd_create_mempools(void)
2753{
2754 struct page *page;
2755 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2756 int i;
2757
2758 /* prepare our caches and mempools */
2759 drbd_request_mempool = NULL;
2760 drbd_ee_cache = NULL;
2761 drbd_request_cache = NULL;
2762 drbd_bm_ext_cache = NULL;
2763 drbd_al_ext_cache = NULL;
2764 drbd_pp_pool = NULL;
2765
2766 /* caches */
2767 drbd_request_cache = kmem_cache_create(
2768 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2769 if (drbd_request_cache == NULL)
2770 goto Enomem;
2771
2772 drbd_ee_cache = kmem_cache_create(
2773 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2774 if (drbd_ee_cache == NULL)
2775 goto Enomem;
2776
2777 drbd_bm_ext_cache = kmem_cache_create(
2778 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2779 if (drbd_bm_ext_cache == NULL)
2780 goto Enomem;
2781
2782 drbd_al_ext_cache = kmem_cache_create(
2783 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2784 if (drbd_al_ext_cache == NULL)
2785 goto Enomem;
2786
2787 /* mempools */
2788 drbd_request_mempool = mempool_create(number,
2789 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2790 if (drbd_request_mempool == NULL)
2791 goto Enomem;
2792
2793 drbd_ee_mempool = mempool_create(number,
2794 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2795 if (drbd_request_mempool == NULL)
2796 goto Enomem;
2797
2798 /* drbd's page pool */
2799 spin_lock_init(&drbd_pp_lock);
2800
2801 for (i = 0; i < number; i++) {
2802 page = alloc_page(GFP_HIGHUSER);
2803 if (!page)
2804 goto Enomem;
2805 set_page_private(page, (unsigned long)drbd_pp_pool);
2806 drbd_pp_pool = page;
2807 }
2808 drbd_pp_vacant = number;
2809
2810 return 0;
2811
2812Enomem:
2813 drbd_destroy_mempools(); /* in case we allocated some */
2814 return -ENOMEM;
2815}
2816
2817static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2818 void *unused)
2819{
2820 /* just so we have it. you never know what interesting things we
2821 * might want to do here some day...
2822 */
2823
2824 return NOTIFY_DONE;
2825}
2826
2827static struct notifier_block drbd_notifier = {
2828 .notifier_call = drbd_notify_sys,
2829};
2830
2831static void drbd_release_ee_lists(struct drbd_conf *mdev)
2832{
2833 int rr;
2834
2835 rr = drbd_release_ee(mdev, &mdev->active_ee);
2836 if (rr)
2837 dev_err(DEV, "%d EEs in active list found!\n", rr);
2838
2839 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2840 if (rr)
2841 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2842
2843 rr = drbd_release_ee(mdev, &mdev->read_ee);
2844 if (rr)
2845 dev_err(DEV, "%d EEs in read list found!\n", rr);
2846
2847 rr = drbd_release_ee(mdev, &mdev->done_ee);
2848 if (rr)
2849 dev_err(DEV, "%d EEs in done list found!\n", rr);
2850
2851 rr = drbd_release_ee(mdev, &mdev->net_ee);
2852 if (rr)
2853 dev_err(DEV, "%d EEs in net list found!\n", rr);
2854}
2855
2856/* caution. no locking.
2857 * currently only used from module cleanup code. */
2858static void drbd_delete_device(unsigned int minor)
2859{
2860 struct drbd_conf *mdev = minor_to_mdev(minor);
2861
2862 if (!mdev)
2863 return;
2864
2865 /* paranoia asserts */
2866 if (mdev->open_cnt != 0)
2867 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2868 __FILE__ , __LINE__);
2869
2870 ERR_IF (!list_empty(&mdev->data.work.q)) {
2871 struct list_head *lp;
2872 list_for_each(lp, &mdev->data.work.q) {
2873 dev_err(DEV, "lp = %p\n", lp);
2874 }
2875 };
2876 /* end paranoia asserts */
2877
2878 del_gendisk(mdev->vdisk);
2879
2880 /* cleanup stuff that may have been allocated during
2881 * device (re-)configuration or state changes */
2882
2883 if (mdev->this_bdev)
2884 bdput(mdev->this_bdev);
2885
2886 drbd_free_resources(mdev);
2887
2888 drbd_release_ee_lists(mdev);
2889
2890 /* should be free'd on disconnect? */
2891 kfree(mdev->ee_hash);
2892 /*
2893 mdev->ee_hash_s = 0;
2894 mdev->ee_hash = NULL;
2895 */
2896
2897 lc_destroy(mdev->act_log);
2898 lc_destroy(mdev->resync);
2899
2900 kfree(mdev->p_uuid);
2901 /* mdev->p_uuid = NULL; */
2902
2903 kfree(mdev->int_dig_out);
2904 kfree(mdev->int_dig_in);
2905 kfree(mdev->int_dig_vv);
2906
2907 /* cleanup the rest that has been
2908 * allocated from drbd_new_device
2909 * and actually free the mdev itself */
2910 drbd_free_mdev(mdev);
2911}
2912
2913static void drbd_cleanup(void)
2914{
2915 unsigned int i;
2916
2917 unregister_reboot_notifier(&drbd_notifier);
2918
2919 drbd_nl_cleanup();
2920
2921 if (minor_table) {
2922 if (drbd_proc)
2923 remove_proc_entry("drbd", NULL);
2924 i = minor_count;
2925 while (i--)
2926 drbd_delete_device(i);
2927 drbd_destroy_mempools();
2928 }
2929
2930 kfree(minor_table);
2931
2932 unregister_blkdev(DRBD_MAJOR, "drbd");
2933
2934 printk(KERN_INFO "drbd: module cleanup done.\n");
2935}
2936
2937/**
2938 * drbd_congested() - Callback for pdflush
2939 * @congested_data: User data
2940 * @bdi_bits: Bits pdflush is currently interested in
2941 *
2942 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2943 */
2944static int drbd_congested(void *congested_data, int bdi_bits)
2945{
2946 struct drbd_conf *mdev = congested_data;
2947 struct request_queue *q;
2948 char reason = '-';
2949 int r = 0;
2950
2951 if (!__inc_ap_bio_cond(mdev)) {
2952 /* DRBD has frozen IO */
2953 r = bdi_bits;
2954 reason = 'd';
2955 goto out;
2956 }
2957
2958 if (get_ldev(mdev)) {
2959 q = bdev_get_queue(mdev->ldev->backing_bdev);
2960 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2961 put_ldev(mdev);
2962 if (r)
2963 reason = 'b';
2964 }
2965
2966 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2967 r |= (1 << BDI_async_congested);
2968 reason = reason == 'b' ? 'a' : 'n';
2969 }
2970
2971out:
2972 mdev->congestion_reason = reason;
2973 return r;
2974}
2975
2976struct drbd_conf *drbd_new_device(unsigned int minor)
2977{
2978 struct drbd_conf *mdev;
2979 struct gendisk *disk;
2980 struct request_queue *q;
2981
2982 /* GFP_KERNEL, we are outside of all write-out paths */
2983 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2984 if (!mdev)
2985 return NULL;
2986 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2987 goto out_no_cpumask;
2988
2989 mdev->minor = minor;
2990
2991 drbd_init_set_defaults(mdev);
2992
2993 q = blk_alloc_queue(GFP_KERNEL);
2994 if (!q)
2995 goto out_no_q;
2996 mdev->rq_queue = q;
2997 q->queuedata = mdev;
2998 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
2999
3000 disk = alloc_disk(1);
3001 if (!disk)
3002 goto out_no_disk;
3003 mdev->vdisk = disk;
3004
3005 set_disk_ro(disk, TRUE);
3006
3007 disk->queue = q;
3008 disk->major = DRBD_MAJOR;
3009 disk->first_minor = minor;
3010 disk->fops = &drbd_ops;
3011 sprintf(disk->disk_name, "drbd%d", minor);
3012 disk->private_data = mdev;
3013
3014 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3015 /* we have no partitions. we contain only ourselves. */
3016 mdev->this_bdev->bd_contains = mdev->this_bdev;
3017
3018 q->backing_dev_info.congested_fn = drbd_congested;
3019 q->backing_dev_info.congested_data = mdev;
3020
3021 blk_queue_make_request(q, drbd_make_request_26);
3022 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3023 blk_queue_merge_bvec(q, drbd_merge_bvec);
3024 q->queue_lock = &mdev->req_lock; /* needed since we use */
3025 /* plugging on a queue, that actually has no requests! */
3026 q->unplug_fn = drbd_unplug_fn;
3027
3028 mdev->md_io_page = alloc_page(GFP_KERNEL);
3029 if (!mdev->md_io_page)
3030 goto out_no_io_page;
3031
3032 if (drbd_bm_init(mdev))
3033 goto out_no_bitmap;
3034 /* no need to lock access, we are still initializing this minor device. */
3035 if (!tl_init(mdev))
3036 goto out_no_tl;
3037
3038 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3039 if (!mdev->app_reads_hash)
3040 goto out_no_app_reads;
3041
3042 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3043 if (!mdev->current_epoch)
3044 goto out_no_epoch;
3045
3046 INIT_LIST_HEAD(&mdev->current_epoch->list);
3047 mdev->epochs = 1;
3048
3049 return mdev;
3050
3051/* out_whatever_else:
3052 kfree(mdev->current_epoch); */
3053out_no_epoch:
3054 kfree(mdev->app_reads_hash);
3055out_no_app_reads:
3056 tl_cleanup(mdev);
3057out_no_tl:
3058 drbd_bm_cleanup(mdev);
3059out_no_bitmap:
3060 __free_page(mdev->md_io_page);
3061out_no_io_page:
3062 put_disk(disk);
3063out_no_disk:
3064 blk_cleanup_queue(q);
3065out_no_q:
3066 free_cpumask_var(mdev->cpu_mask);
3067out_no_cpumask:
3068 kfree(mdev);
3069 return NULL;
3070}
3071
3072/* counterpart of drbd_new_device.
3073 * last part of drbd_delete_device. */
3074void drbd_free_mdev(struct drbd_conf *mdev)
3075{
3076 kfree(mdev->current_epoch);
3077 kfree(mdev->app_reads_hash);
3078 tl_cleanup(mdev);
3079 if (mdev->bitmap) /* should no longer be there. */
3080 drbd_bm_cleanup(mdev);
3081 __free_page(mdev->md_io_page);
3082 put_disk(mdev->vdisk);
3083 blk_cleanup_queue(mdev->rq_queue);
3084 free_cpumask_var(mdev->cpu_mask);
3085 kfree(mdev);
3086}
3087
3088
3089int __init drbd_init(void)
3090{
3091 int err;
3092
3093 if (sizeof(struct p_handshake) != 80) {
3094 printk(KERN_ERR
3095 "drbd: never change the size or layout "
3096 "of the HandShake packet.\n");
3097 return -EINVAL;
3098 }
3099
3100 if (1 > minor_count || minor_count > 255) {
3101 printk(KERN_ERR
3102 "drbd: invalid minor_count (%d)\n", minor_count);
3103#ifdef MODULE
3104 return -EINVAL;
3105#else
3106 minor_count = 8;
3107#endif
3108 }
3109
3110 err = drbd_nl_init();
3111 if (err)
3112 return err;
3113
3114 err = register_blkdev(DRBD_MAJOR, "drbd");
3115 if (err) {
3116 printk(KERN_ERR
3117 "drbd: unable to register block device major %d\n",
3118 DRBD_MAJOR);
3119 return err;
3120 }
3121
3122 register_reboot_notifier(&drbd_notifier);
3123
3124 /*
3125 * allocate all necessary structs
3126 */
3127 err = -ENOMEM;
3128
3129 init_waitqueue_head(&drbd_pp_wait);
3130
3131 drbd_proc = NULL; /* play safe for drbd_cleanup */
3132 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3133 GFP_KERNEL);
3134 if (!minor_table)
3135 goto Enomem;
3136
3137 err = drbd_create_mempools();
3138 if (err)
3139 goto Enomem;
3140
3141 drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
3142 if (!drbd_proc) {
3143 printk(KERN_ERR "drbd: unable to register proc file\n");
3144 goto Enomem;
3145 }
3146
3147 rwlock_init(&global_state_lock);
3148
3149 printk(KERN_INFO "drbd: initialized. "
3150 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3151 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3152 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3153 printk(KERN_INFO "drbd: registered as block device major %d\n",
3154 DRBD_MAJOR);
3155 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3156
3157 return 0; /* Success! */
3158
3159Enomem:
3160 drbd_cleanup();
3161 if (err == -ENOMEM)
3162 /* currently always the case */
3163 printk(KERN_ERR "drbd: ran out of memory\n");
3164 else
3165 printk(KERN_ERR "drbd: initialization failure\n");
3166 return err;
3167}
3168
3169void drbd_free_bc(struct drbd_backing_dev *ldev)
3170{
3171 if (ldev == NULL)
3172 return;
3173
3174 bd_release(ldev->backing_bdev);
3175 bd_release(ldev->md_bdev);
3176
3177 fput(ldev->lo_file);
3178 fput(ldev->md_file);
3179
3180 kfree(ldev);
3181}
3182
3183void drbd_free_sock(struct drbd_conf *mdev)
3184{
3185 if (mdev->data.socket) {
3186 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3187 sock_release(mdev->data.socket);
3188 mdev->data.socket = NULL;
3189 }
3190 if (mdev->meta.socket) {
3191 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3192 sock_release(mdev->meta.socket);
3193 mdev->meta.socket = NULL;
3194 }
3195}
3196
3197
3198void drbd_free_resources(struct drbd_conf *mdev)
3199{
3200 crypto_free_hash(mdev->csums_tfm);
3201 mdev->csums_tfm = NULL;
3202 crypto_free_hash(mdev->verify_tfm);
3203 mdev->verify_tfm = NULL;
3204 crypto_free_hash(mdev->cram_hmac_tfm);
3205 mdev->cram_hmac_tfm = NULL;
3206 crypto_free_hash(mdev->integrity_w_tfm);
3207 mdev->integrity_w_tfm = NULL;
3208 crypto_free_hash(mdev->integrity_r_tfm);
3209 mdev->integrity_r_tfm = NULL;
3210
3211 drbd_free_sock(mdev);
3212
3213 __no_warn(local,
3214 drbd_free_bc(mdev->ldev);
3215 mdev->ldev = NULL;);
3216}
3217
3218/* meta data management */
3219
3220struct meta_data_on_disk {
3221 u64 la_size; /* last agreed size. */
3222 u64 uuid[UI_SIZE]; /* UUIDs. */
3223 u64 device_uuid;
3224 u64 reserved_u64_1;
3225 u32 flags; /* MDF */
3226 u32 magic;
3227 u32 md_size_sect;
3228 u32 al_offset; /* offset to this block */
3229 u32 al_nr_extents; /* important for restoring the AL */
3230 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3231 u32 bm_offset; /* offset to the bitmap, from here */
3232 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3233 u32 reserved_u32[4];
3234
3235} __packed;
3236
3237/**
3238 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3239 * @mdev: DRBD device.
3240 */
3241void drbd_md_sync(struct drbd_conf *mdev)
3242{
3243 struct meta_data_on_disk *buffer;
3244 sector_t sector;
3245 int i;
3246
3247 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3248 return;
3249 del_timer(&mdev->md_sync_timer);
3250
3251 /* We use here D_FAILED and not D_ATTACHING because we try to write
3252 * metadata even if we detach due to a disk failure! */
3253 if (!get_ldev_if_state(mdev, D_FAILED))
3254 return;
3255
3256 trace_drbd_md_io(mdev, WRITE, mdev->ldev);
3257
3258 mutex_lock(&mdev->md_io_mutex);
3259 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3260 memset(buffer, 0, 512);
3261
3262 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3263 for (i = UI_CURRENT; i < UI_SIZE; i++)
3264 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3265 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3266 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3267
3268 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3269 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3270 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3271 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3272 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3273
3274 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3275
3276 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3277 sector = mdev->ldev->md.md_offset;
3278
3279 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3280 clear_bit(MD_DIRTY, &mdev->flags);
3281 } else {
3282 /* this was a try anyways ... */
3283 dev_err(DEV, "meta data update failed!\n");
3284
3285 drbd_chk_io_error(mdev, 1, TRUE);
3286 }
3287
3288 /* Update mdev->ldev->md.la_size_sect,
3289 * since we updated it on metadata. */
3290 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3291
3292 mutex_unlock(&mdev->md_io_mutex);
3293 put_ldev(mdev);
3294}
3295
3296/**
3297 * drbd_md_read() - Reads in the meta data super block
3298 * @mdev: DRBD device.
3299 * @bdev: Device from which the meta data should be read in.
3300 *
3301 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3302 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3303 */
3304int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3305{
3306 struct meta_data_on_disk *buffer;
3307 int i, rv = NO_ERROR;
3308
3309 if (!get_ldev_if_state(mdev, D_ATTACHING))
3310 return ERR_IO_MD_DISK;
3311
3312 trace_drbd_md_io(mdev, READ, bdev);
3313
3314 mutex_lock(&mdev->md_io_mutex);
3315 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3316
3317 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3318 /* NOTE: cant do normal error processing here as this is
3319 called BEFORE disk is attached */
3320 dev_err(DEV, "Error while reading metadata.\n");
3321 rv = ERR_IO_MD_DISK;
3322 goto err;
3323 }
3324
3325 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3326 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3327 rv = ERR_MD_INVALID;
3328 goto err;
3329 }
3330 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3331 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3332 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3333 rv = ERR_MD_INVALID;
3334 goto err;
3335 }
3336 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3337 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3338 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3339 rv = ERR_MD_INVALID;
3340 goto err;
3341 }
3342 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3343 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3344 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3345 rv = ERR_MD_INVALID;
3346 goto err;
3347 }
3348
3349 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3350 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3351 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3352 rv = ERR_MD_INVALID;
3353 goto err;
3354 }
3355
3356 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3357 for (i = UI_CURRENT; i < UI_SIZE; i++)
3358 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3359 bdev->md.flags = be32_to_cpu(buffer->flags);
3360 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3361 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3362
3363 if (mdev->sync_conf.al_extents < 7)
3364 mdev->sync_conf.al_extents = 127;
3365
3366 err:
3367 mutex_unlock(&mdev->md_io_mutex);
3368 put_ldev(mdev);
3369
3370 return rv;
3371}
3372
3373/**
3374 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3375 * @mdev: DRBD device.
3376 *
3377 * Call this function if you change anything that should be written to
3378 * the meta-data super block. This function sets MD_DIRTY, and starts a
3379 * timer that ensures that within five seconds you have to call drbd_md_sync().
3380 */
3381void drbd_md_mark_dirty(struct drbd_conf *mdev)
3382{
3383 set_bit(MD_DIRTY, &mdev->flags);
3384 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3385}
3386
3387
3388static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3389{
3390 int i;
3391
3392 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3393 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3394
3395 trace_drbd_uuid(mdev, i+1);
3396 }
3397}
3398
3399void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3400{
3401 if (idx == UI_CURRENT) {
3402 if (mdev->state.role == R_PRIMARY)
3403 val |= 1;
3404 else
3405 val &= ~((u64)1);
3406
3407 drbd_set_ed_uuid(mdev, val);
3408 }
3409
3410 mdev->ldev->md.uuid[idx] = val;
3411 trace_drbd_uuid(mdev, idx);
3412 drbd_md_mark_dirty(mdev);
3413}
3414
3415
3416void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3417{
3418 if (mdev->ldev->md.uuid[idx]) {
3419 drbd_uuid_move_history(mdev);
3420 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3421 trace_drbd_uuid(mdev, UI_HISTORY_START);
3422 }
3423 _drbd_uuid_set(mdev, idx, val);
3424}
3425
3426/**
3427 * drbd_uuid_new_current() - Creates a new current UUID
3428 * @mdev: DRBD device.
3429 *
3430 * Creates a new current UUID, and rotates the old current UUID into
3431 * the bitmap slot. Causes an incremental resync upon next connect.
3432 */
3433void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3434{
3435 u64 val;
3436
3437 dev_info(DEV, "Creating new current UUID\n");
3438 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3439 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3440 trace_drbd_uuid(mdev, UI_BITMAP);
3441
3442 get_random_bytes(&val, sizeof(u64));
3443 _drbd_uuid_set(mdev, UI_CURRENT, val);
3444}
3445
3446void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3447{
3448 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3449 return;
3450
3451 if (val == 0) {
3452 drbd_uuid_move_history(mdev);
3453 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3454 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3455 trace_drbd_uuid(mdev, UI_HISTORY_START);
3456 trace_drbd_uuid(mdev, UI_BITMAP);
3457 } else {
3458 if (mdev->ldev->md.uuid[UI_BITMAP])
3459 dev_warn(DEV, "bm UUID already set");
3460
3461 mdev->ldev->md.uuid[UI_BITMAP] = val;
3462 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3463
3464 trace_drbd_uuid(mdev, UI_BITMAP);
3465 }
3466 drbd_md_mark_dirty(mdev);
3467}
3468
3469/**
3470 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3471 * @mdev: DRBD device.
3472 *
3473 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3474 */
3475int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3476{
3477 int rv = -EIO;
3478
3479 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3480 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3481 drbd_md_sync(mdev);
3482 drbd_bm_set_all(mdev);
3483
3484 rv = drbd_bm_write(mdev);
3485
3486 if (!rv) {
3487 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3488 drbd_md_sync(mdev);
3489 }
3490
3491 put_ldev(mdev);
3492 }
3493
3494 return rv;
3495}
3496
3497/**
3498 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3499 * @mdev: DRBD device.
3500 *
3501 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3502 */
3503int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3504{
3505 int rv = -EIO;
3506
3507 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3508 drbd_bm_clear_all(mdev);
3509 rv = drbd_bm_write(mdev);
3510 put_ldev(mdev);
3511 }
3512
3513 return rv;
3514}
3515
3516static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3517{
3518 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3519 int rv;
3520
3521 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3522
3523 drbd_bm_lock(mdev, work->why);
3524 rv = work->io_fn(mdev);
3525 drbd_bm_unlock(mdev);
3526
3527 clear_bit(BITMAP_IO, &mdev->flags);
3528 wake_up(&mdev->misc_wait);
3529
3530 if (work->done)
3531 work->done(mdev, rv);
3532
3533 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3534 work->why = NULL;
3535
3536 return 1;
3537}
3538
3539/**
3540 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3541 * @mdev: DRBD device.
3542 * @io_fn: IO callback to be called when bitmap IO is possible
3543 * @done: callback to be called after the bitmap IO was performed
3544 * @why: Descriptive text of the reason for doing the IO
3545 *
3546 * While IO on the bitmap happens we freeze application IO thus we ensure
3547 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3548 * called from worker context. It MUST NOT be used while a previous such
3549 * work is still pending!
3550 */
3551void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3552 int (*io_fn)(struct drbd_conf *),
3553 void (*done)(struct drbd_conf *, int),
3554 char *why)
3555{
3556 D_ASSERT(current == mdev->worker.task);
3557
3558 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3559 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3560 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3561 if (mdev->bm_io_work.why)
3562 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3563 why, mdev->bm_io_work.why);
3564
3565 mdev->bm_io_work.io_fn = io_fn;
3566 mdev->bm_io_work.done = done;
3567 mdev->bm_io_work.why = why;
3568
3569 set_bit(BITMAP_IO, &mdev->flags);
3570 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3571 if (list_empty(&mdev->bm_io_work.w.list)) {
3572 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3573 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3574 } else
3575 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3576 }
3577}
3578
3579/**
3580 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3581 * @mdev: DRBD device.
3582 * @io_fn: IO callback to be called when bitmap IO is possible
3583 * @why: Descriptive text of the reason for doing the IO
3584 *
3585 * freezes application IO while that the actual IO operations runs. This
3586 * functions MAY NOT be called from worker context.
3587 */
3588int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3589{
3590 int rv;
3591
3592 D_ASSERT(current != mdev->worker.task);
3593
3594 drbd_suspend_io(mdev);
3595
3596 drbd_bm_lock(mdev, why);
3597 rv = io_fn(mdev);
3598 drbd_bm_unlock(mdev);
3599
3600 drbd_resume_io(mdev);
3601
3602 return rv;
3603}
3604
3605void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3606{
3607 if ((mdev->ldev->md.flags & flag) != flag) {
3608 drbd_md_mark_dirty(mdev);
3609 mdev->ldev->md.flags |= flag;
3610 }
3611}
3612
3613void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3614{
3615 if ((mdev->ldev->md.flags & flag) != 0) {
3616 drbd_md_mark_dirty(mdev);
3617 mdev->ldev->md.flags &= ~flag;
3618 }
3619}
3620int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3621{
3622 return (bdev->md.flags & flag) != 0;
3623}
3624
3625static void md_sync_timer_fn(unsigned long data)
3626{
3627 struct drbd_conf *mdev = (struct drbd_conf *) data;
3628
3629 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3630}
3631
3632static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3633{
3634 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3635 drbd_md_sync(mdev);
3636
3637 return 1;
3638}
3639
3640#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;
	/* Must be signed: _drbd_fault_random() detects "time to reseed" by
	 * this counter dropping below zero.  With an unsigned counter that
	 * test can never be true and the generator is never reseeded. */
	long count;
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes(): whenever the
 * countdown in @rsp drops below zero, fresh random bytes are mixed into
 * the state and the countdown restarts at FAULT_RANDOM_REFRESH.
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	if (--rsp->count < 0) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
3669
3670static char *
3671_drbd_fault_str(unsigned int type) {
3672 static char *_faults[] = {
3673 [DRBD_FAULT_MD_WR] = "Meta-data write",
3674 [DRBD_FAULT_MD_RD] = "Meta-data read",
3675 [DRBD_FAULT_RS_WR] = "Resync write",
3676 [DRBD_FAULT_RS_RD] = "Resync read",
3677 [DRBD_FAULT_DT_WR] = "Data write",
3678 [DRBD_FAULT_DT_RD] = "Data read",
3679 [DRBD_FAULT_DT_RA] = "Data read ahead",
3680 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3681 [DRBD_FAULT_AL_EE] = "EE allocation"
3682 };
3683
3684 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3685}
3686
3687unsigned int
3688_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3689{
3690 static struct fault_random_state rrs = {0, 0};
3691
3692 unsigned int ret = (
3693 (fault_devs == 0 ||
3694 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3695 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3696
3697 if (ret) {
3698 fault_count++;
3699
3700 if (printk_ratelimit())
3701 dev_warn(DEV, "***Simulating %s failure\n",
3702 _drbd_fault_str(type));
3703 }
3704
3705 return ret;
3706}
3707#endif
3708
/*
 * Returns a static string identifying this build.  The string is produced
 * on the first call: "srcversion: <srcversion>" when built as a module,
 * "built-in" otherwise.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	/* The leading NUL marks "not initialized yet"; storing 'b' there
	 * turns the buffer into "built-in". */
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded, so an unexpectedly long srcversion can
			 * not overrun the static buffer */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3727
3728module_init(drbd_init)
3729module_exit(drbd_cleanup)
3730
3731/* For drbd_tracing: */
3732EXPORT_SYMBOL(drbd_conn_str);
3733EXPORT_SYMBOL(drbd_role_str);
3734EXPORT_SYMBOL(drbd_disk_str);
3735EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000000..1927acefe230
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2365 @@
1/*
2 drbd_nl.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/autoconf.h>
27#include <linux/module.h>
28#include <linux/drbd.h>
29#include <linux/in.h>
30#include <linux/fs.h>
31#include <linux/file.h>
32#include <linux/slab.h>
33#include <linux/connector.h>
34#include <linux/blkpg.h>
35#include <linux/cpumask.h>
36#include "drbd_int.h"
37#include "drbd_tracing.h"
38#include "drbd_wrappers.h"
39#include <asm/unaligned.h>
40#include <linux/drbd_tag_magic.h>
41#include <linux/drbd_limits.h>
42
43static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
44static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
45static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
46
47/* see get_sb_bdev and bd_claim */
48static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
49
50/* Generate the tag_list to struct functions */
51#define NL_PACKET(name, number, fields) \
52static int name ## _from_tags(struct drbd_conf *mdev, \
53 unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
54static int name ## _from_tags(struct drbd_conf *mdev, \
55 unsigned short *tags, struct name *arg) \
56{ \
57 int tag; \
58 int dlen; \
59 \
60 while ((tag = get_unaligned(tags++)) != TT_END) { \
61 dlen = get_unaligned(tags++); \
62 switch (tag_number(tag)) { \
63 fields \
64 default: \
65 if (tag & T_MANDATORY) { \
66 dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
67 return 0; \
68 } \
69 } \
70 tags = (unsigned short *)((char *)tags + dlen); \
71 } \
72 return 1; \
73}
74#define NL_INTEGER(pn, pr, member) \
75 case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
76 arg->member = get_unaligned((int *)(tags)); \
77 break;
78#define NL_INT64(pn, pr, member) \
79 case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
80 arg->member = get_unaligned((u64 *)(tags)); \
81 break;
82#define NL_BIT(pn, pr, member) \
83 case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
84 arg->member = *(char *)(tags) ? 1 : 0; \
85 break;
86#define NL_STRING(pn, pr, member, len) \
87 case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
88 if (dlen > len) { \
89 dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
90 #member, dlen, (unsigned int)len); \
91 return 0; \
92 } \
93 arg->member ## _len = dlen; \
94 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
95 break;
96#include "linux/drbd_nl.h"
97
98/* Generate the struct to tag_list functions */
99#define NL_PACKET(name, number, fields) \
100static unsigned short* \
101name ## _to_tags(struct drbd_conf *mdev, \
102 struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
103static unsigned short* \
104name ## _to_tags(struct drbd_conf *mdev, \
105 struct name *arg, unsigned short *tags) \
106{ \
107 fields \
108 return tags; \
109}
110
111#define NL_INTEGER(pn, pr, member) \
112 put_unaligned(pn | pr | TT_INTEGER, tags++); \
113 put_unaligned(sizeof(int), tags++); \
114 put_unaligned(arg->member, (int *)tags); \
115 tags = (unsigned short *)((char *)tags+sizeof(int));
116#define NL_INT64(pn, pr, member) \
117 put_unaligned(pn | pr | TT_INT64, tags++); \
118 put_unaligned(sizeof(u64), tags++); \
119 put_unaligned(arg->member, (u64 *)tags); \
120 tags = (unsigned short *)((char *)tags+sizeof(u64));
121#define NL_BIT(pn, pr, member) \
122 put_unaligned(pn | pr | TT_BIT, tags++); \
123 put_unaligned(sizeof(char), tags++); \
124 *(char *)tags = arg->member; \
125 tags = (unsigned short *)((char *)tags+sizeof(char));
126#define NL_STRING(pn, pr, member, len) \
127 put_unaligned(pn | pr | TT_STRING, tags++); \
128 put_unaligned(arg->member ## _len, tags++); \
129 memcpy(tags, arg->member, arg->member ## _len); \
130 tags = (unsigned short *)((char *)tags + arg->member ## _len);
131#include "linux/drbd_nl.h"
132
133void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
134void drbd_nl_send_reply(struct cn_msg *, int);
135
136int drbd_khelper(struct drbd_conf *mdev, char *cmd)
137{
138 char *envp[] = { "HOME=/",
139 "TERM=linux",
140 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
141 NULL, /* Will be set to address family */
142 NULL, /* Will be set to address */
143 NULL };
144
145 char mb[12], af[20], ad[60], *afs;
146 char *argv[] = {usermode_helper, cmd, mb, NULL };
147 int ret;
148
149 snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
150
151 if (get_net_conf(mdev)) {
152 switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
153 case AF_INET6:
154 afs = "ipv6";
155 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
156 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
157 break;
158 case AF_INET:
159 afs = "ipv4";
160 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
161 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
162 break;
163 default:
164 afs = "ssocks";
165 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
166 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
167 }
168 snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
169 envp[3]=af;
170 envp[4]=ad;
171 put_net_conf(mdev);
172 }
173
174 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
175
176 drbd_bcast_ev_helper(mdev, cmd);
177 ret = call_usermodehelper(usermode_helper, argv, envp, 1);
178 if (ret)
179 dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
180 usermode_helper, cmd, mb,
181 (ret >> 8) & 0xff, ret);
182 else
183 dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
184 usermode_helper, cmd, mb,
185 (ret >> 8) & 0xff, ret);
186
187 if (ret < 0) /* Ignore any ERRNOs we got. */
188 ret = 0;
189
190 return ret;
191}
192
193enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
194{
195 char *ex_to_string;
196 int r;
197 enum drbd_disk_state nps;
198 enum drbd_fencing_p fp;
199
200 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
201
202 if (get_ldev_if_state(mdev, D_CONSISTENT)) {
203 fp = mdev->ldev->dc.fencing;
204 put_ldev(mdev);
205 } else {
206 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
207 return mdev->state.pdsk;
208 }
209
210 if (fp == FP_STONITH)
211 _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
212
213 r = drbd_khelper(mdev, "fence-peer");
214
215 switch ((r>>8) & 0xff) {
216 case 3: /* peer is inconsistent */
217 ex_to_string = "peer is inconsistent or worse";
218 nps = D_INCONSISTENT;
219 break;
220 case 4: /* peer got outdated, or was already outdated */
221 ex_to_string = "peer was fenced";
222 nps = D_OUTDATED;
223 break;
224 case 5: /* peer was down */
225 if (mdev->state.disk == D_UP_TO_DATE) {
226 /* we will(have) create(d) a new UUID anyways... */
227 ex_to_string = "peer is unreachable, assumed to be dead";
228 nps = D_OUTDATED;
229 } else {
230 ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
231 nps = mdev->state.pdsk;
232 }
233 break;
234 case 6: /* Peer is primary, voluntarily outdate myself.
235 * This is useful when an unconnected R_SECONDARY is asked to
236 * become R_PRIMARY, but finds the other peer being active. */
237 ex_to_string = "peer is active";
238 dev_warn(DEV, "Peer is primary, outdating myself.\n");
239 nps = D_UNKNOWN;
240 _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
241 break;
242 case 7:
243 if (fp != FP_STONITH)
244 dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
245 ex_to_string = "peer was stonithed";
246 nps = D_OUTDATED;
247 break;
248 default:
249 /* The script is broken ... */
250 nps = D_UNKNOWN;
251 dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
252 return nps;
253 }
254
255 dev_info(DEV, "fence-peer helper returned %d (%s)\n",
256 (r>>8) & 0xff, ex_to_string);
257 return nps;
258}
259
260
261int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
262{
263 const int max_tries = 4;
264 int r = 0;
265 int try = 0;
266 int forced = 0;
267 union drbd_state mask, val;
268 enum drbd_disk_state nps;
269
270 if (new_role == R_PRIMARY)
271 request_ping(mdev); /* Detect a dead peer ASAP */
272
273 mutex_lock(&mdev->state_mutex);
274
275 mask.i = 0; mask.role = R_MASK;
276 val.i = 0; val.role = new_role;
277
278 while (try++ < max_tries) {
279 r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
280
281 /* in case we first succeeded to outdate,
282 * but now suddenly could establish a connection */
283 if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
284 val.pdsk = 0;
285 mask.pdsk = 0;
286 continue;
287 }
288
289 if (r == SS_NO_UP_TO_DATE_DISK && force &&
290 (mdev->state.disk == D_INCONSISTENT ||
291 mdev->state.disk == D_OUTDATED)) {
292 mask.disk = D_MASK;
293 val.disk = D_UP_TO_DATE;
294 forced = 1;
295 continue;
296 }
297
298 if (r == SS_NO_UP_TO_DATE_DISK &&
299 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
300 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
301 nps = drbd_try_outdate_peer(mdev);
302
303 if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
304 val.disk = D_UP_TO_DATE;
305 mask.disk = D_MASK;
306 }
307
308 val.pdsk = nps;
309 mask.pdsk = D_MASK;
310
311 continue;
312 }
313
314 if (r == SS_NOTHING_TO_DO)
315 goto fail;
316 if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
317 nps = drbd_try_outdate_peer(mdev);
318
319 if (force && nps > D_OUTDATED) {
320 dev_warn(DEV, "Forced into split brain situation!\n");
321 nps = D_OUTDATED;
322 }
323
324 mask.pdsk = D_MASK;
325 val.pdsk = nps;
326
327 continue;
328 }
329 if (r == SS_TWO_PRIMARIES) {
330 /* Maybe the peer is detected as dead very soon...
331 retry at most once more in this case. */
332 __set_current_state(TASK_INTERRUPTIBLE);
333 schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
334 if (try < max_tries)
335 try = max_tries - 1;
336 continue;
337 }
338 if (r < SS_SUCCESS) {
339 r = _drbd_request_state(mdev, mask, val,
340 CS_VERBOSE + CS_WAIT_COMPLETE);
341 if (r < SS_SUCCESS)
342 goto fail;
343 }
344 break;
345 }
346
347 if (r < SS_SUCCESS)
348 goto fail;
349
350 if (forced)
351 dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
352
353 /* Wait until nothing is on the fly :) */
354 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
355
356 if (new_role == R_SECONDARY) {
357 set_disk_ro(mdev->vdisk, TRUE);
358 if (get_ldev(mdev)) {
359 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
360 put_ldev(mdev);
361 }
362 } else {
363 if (get_net_conf(mdev)) {
364 mdev->net_conf->want_lose = 0;
365 put_net_conf(mdev);
366 }
367 set_disk_ro(mdev->vdisk, FALSE);
368 if (get_ldev(mdev)) {
369 if (((mdev->state.conn < C_CONNECTED ||
370 mdev->state.pdsk <= D_FAILED)
371 && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
372 drbd_uuid_new_current(mdev);
373
374 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
375 put_ldev(mdev);
376 }
377 }
378
379 if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
380 drbd_al_to_on_disk_bm(mdev);
381 put_ldev(mdev);
382 }
383
384 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
385 /* if this was forced, we should consider sync */
386 if (forced)
387 drbd_send_uuids(mdev);
388 drbd_send_state(mdev);
389 }
390
391 drbd_md_sync(mdev);
392
393 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
394 fail:
395 mutex_unlock(&mdev->state_mutex);
396 return r;
397}
398
399
400static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
401 struct drbd_nl_cfg_reply *reply)
402{
403 struct primary primary_args;
404
405 memset(&primary_args, 0, sizeof(struct primary));
406 if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
407 reply->ret_code = ERR_MANDATORY_TAG;
408 return 0;
409 }
410
411 reply->ret_code =
412 drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
413
414 return 0;
415}
416
417static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
418 struct drbd_nl_cfg_reply *reply)
419{
420 reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
421
422 return 0;
423}
424
425/* initializes the md.*_offset members, so we are able to find
426 * the on disk meta data */
427static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
428 struct drbd_backing_dev *bdev)
429{
430 sector_t md_size_sect = 0;
431 switch (bdev->dc.meta_dev_idx) {
432 default:
433 /* v07 style fixed size indexed meta data */
434 bdev->md.md_size_sect = MD_RESERVED_SECT;
435 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
436 bdev->md.al_offset = MD_AL_OFFSET;
437 bdev->md.bm_offset = MD_BM_OFFSET;
438 break;
439 case DRBD_MD_INDEX_FLEX_EXT:
440 /* just occupy the full device; unit: sectors */
441 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
442 bdev->md.md_offset = 0;
443 bdev->md.al_offset = MD_AL_OFFSET;
444 bdev->md.bm_offset = MD_BM_OFFSET;
445 break;
446 case DRBD_MD_INDEX_INTERNAL:
447 case DRBD_MD_INDEX_FLEX_INT:
448 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
449 /* al size is still fixed */
450 bdev->md.al_offset = -MD_AL_MAX_SIZE;
451 /* we need (slightly less than) ~ this much bitmap sectors: */
452 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
453 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
454 md_size_sect = BM_SECT_TO_EXT(md_size_sect);
455 md_size_sect = ALIGN(md_size_sect, 8);
456
457 /* plus the "drbd meta data super block",
458 * and the activity log; */
459 md_size_sect += MD_BM_OFFSET;
460
461 bdev->md.md_size_sect = md_size_sect;
462 /* bitmap offset is adjusted by 'super' block size */
463 bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
464 break;
465 }
466}
467
/*
 * ppsize() - pretty print a size given in KB
 * @buf:	output buffer, at least 9 bytes
 * @size:	size in units of 1 KB
 *
 * Scales @size down by factors of 1024 (with rounding) until it is below
 * 10000 or the largest known unit is reached, then prints it as
 * "<number> <unit>B" into @buf.  Returns @buf.
 */
char *ppsize(char *buf, unsigned long long size)
{
	/* Needs 9 bytes at max including trailing NUL:
	 * -1ULL ==> "16384 EB" */
	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	int base = 0;

	/* Stop at the last unit: without this bound, sizes of roughly
	 * 2^63 KB and above would index past the end of units[]. */
	while (size >= 10000 && base < (int)sizeof(units) - 1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	sprintf(buf, "%llu %cB", size, units[base]);

	return buf;
}
482
483/* there is still a theoretical deadlock when called from receiver
484 * on an D_INCONSISTENT R_PRIMARY:
485 * remote READ does inc_ap_bio, receiver would need to receive answer
486 * packet from remote to dec_ap_bio again.
487 * receiver receive_sizes(), comes here,
488 * waits for ap_bio_cnt == 0. -> deadlock.
489 * but this cannot happen, actually, because:
490 * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
491 * (not connected, or bad/no disk on peer):
492 * see drbd_fail_request_early, ap_bio_cnt is zero.
493 * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
494 * peer may not initiate a resize.
495 */
496void drbd_suspend_io(struct drbd_conf *mdev)
497{
498 set_bit(SUSPEND_IO, &mdev->flags);
499 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
500}
501
502void drbd_resume_io(struct drbd_conf *mdev)
503{
504 clear_bit(SUSPEND_IO, &mdev->flags);
505 wake_up(&mdev->misc_wait);
506}
507
508/**
509 * drbd_determine_dev_size() - Sets the right device size obeying all constraints
510 * @mdev: DRBD device.
511 *
512 * Returns 0 on success, negative return values indicate errors.
513 * You should call drbd_md_sync() after calling this function.
514 */
515enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
516{
517 sector_t prev_first_sect, prev_size; /* previous meta location */
518 sector_t la_size;
519 sector_t size;
520 char ppb[10];
521
522 int md_moved, la_size_changed;
523 enum determine_dev_size rv = unchanged;
524
525 /* race:
526 * application request passes inc_ap_bio,
527 * but then cannot get an AL-reference.
528 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
529 *
530 * to avoid that:
531 * Suspend IO right here.
532 * still lock the act_log to not trigger ASSERTs there.
533 */
534 drbd_suspend_io(mdev);
535
536 /* no wait necessary anymore, actually we could assert that */
537 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
538
539 prev_first_sect = drbd_md_first_sector(mdev->ldev);
540 prev_size = mdev->ldev->md.md_size_sect;
541 la_size = mdev->ldev->md.la_size_sect;
542
543 /* TODO: should only be some assert here, not (re)init... */
544 drbd_md_set_sector_offsets(mdev, mdev->ldev);
545
546 size = drbd_new_dev_size(mdev, mdev->ldev);
547
548 if (drbd_get_capacity(mdev->this_bdev) != size ||
549 drbd_bm_capacity(mdev) != size) {
550 int err;
551 err = drbd_bm_resize(mdev, size);
552 if (unlikely(err)) {
553 /* currently there is only one error: ENOMEM! */
554 size = drbd_bm_capacity(mdev)>>1;
555 if (size == 0) {
556 dev_err(DEV, "OUT OF MEMORY! "
557 "Could not allocate bitmap!\n");
558 } else {
559 dev_err(DEV, "BM resizing failed. "
560 "Leaving size unchanged at size = %lu KB\n",
561 (unsigned long)size);
562 }
563 rv = dev_size_error;
564 }
565 /* racy, see comments above. */
566 drbd_set_my_capacity(mdev, size);
567 mdev->ldev->md.la_size_sect = size;
568 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
569 (unsigned long long)size>>1);
570 }
571 if (rv == dev_size_error)
572 goto out;
573
574 la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
575
576 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
577 || prev_size != mdev->ldev->md.md_size_sect;
578
579 if (la_size_changed || md_moved) {
580 drbd_al_shrink(mdev); /* All extents inactive. */
581 dev_info(DEV, "Writing the whole bitmap, %s\n",
582 la_size_changed && md_moved ? "size changed and md moved" :
583 la_size_changed ? "size changed" : "md moved");
584 rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
585 drbd_md_mark_dirty(mdev);
586 }
587
588 if (size > la_size)
589 rv = grew;
590 if (size < la_size)
591 rv = shrunk;
592out:
593 lc_unlock(mdev->act_log);
594 wake_up(&mdev->al_wait);
595 drbd_resume_io(mdev);
596
597 return rv;
598}
599
600sector_t
601drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
602{
603 sector_t p_size = mdev->p_size; /* partner's disk size. */
604 sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
605 sector_t m_size; /* my size */
606 sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
607 sector_t size = 0;
608
609 m_size = drbd_get_max_capacity(bdev);
610
611 if (p_size && m_size) {
612 size = min_t(sector_t, p_size, m_size);
613 } else {
614 if (la_size) {
615 size = la_size;
616 if (m_size && m_size < size)
617 size = m_size;
618 if (p_size && p_size < size)
619 size = p_size;
620 } else {
621 if (m_size)
622 size = m_size;
623 if (p_size)
624 size = p_size;
625 }
626 }
627
628 if (size == 0)
629 dev_err(DEV, "Both nodes diskless!\n");
630
631 if (u_size) {
632 if (u_size > size)
633 dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
634 (unsigned long)u_size>>1, (unsigned long)size>>1);
635 else
636 size = u_size;
637 }
638
639 return size;
640}
641
642/**
643 * drbd_check_al_size() - Ensures that the AL is of the right size
644 * @mdev: DRBD device.
645 *
646 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
647 * failed, and 0 on success. You should call drbd_md_sync() after you called
648 * this function.
649 */
650static int drbd_check_al_size(struct drbd_conf *mdev)
651{
652 struct lru_cache *n, *t;
653 struct lc_element *e;
654 unsigned int in_use;
655 int i;
656
657 ERR_IF(mdev->sync_conf.al_extents < 7)
658 mdev->sync_conf.al_extents = 127;
659
660 if (mdev->act_log &&
661 mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
662 return 0;
663
664 in_use = 0;
665 t = mdev->act_log;
666 n = lc_create("act_log", drbd_al_ext_cache,
667 mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
668
669 if (n == NULL) {
670 dev_err(DEV, "Cannot allocate act_log lru!\n");
671 return -ENOMEM;
672 }
673 spin_lock_irq(&mdev->al_lock);
674 if (t) {
675 for (i = 0; i < t->nr_elements; i++) {
676 e = lc_element_by_index(t, i);
677 if (e->refcnt)
678 dev_err(DEV, "refcnt(%d)==%d\n",
679 e->lc_number, e->refcnt);
680 in_use += e->refcnt;
681 }
682 }
683 if (!in_use)
684 mdev->act_log = n;
685 spin_unlock_irq(&mdev->al_lock);
686 if (in_use) {
687 dev_err(DEV, "Activity log still in use!\n");
688 lc_destroy(n);
689 return -EBUSY;
690 } else {
691 if (t)
692 lc_destroy(t);
693 }
694 drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
695 return 0;
696}
697
698void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
699{
700 struct request_queue * const q = mdev->rq_queue;
701 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
702 int max_segments = mdev->ldev->dc.max_bio_bvecs;
703
704 if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
705 max_seg_s = PAGE_SIZE;
706
707 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
708
709 blk_queue_max_sectors(q, max_seg_s >> 9);
710 blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
711 blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
712 blk_queue_max_segment_size(q, max_seg_s);
713 blk_queue_logical_block_size(q, 512);
714 blk_queue_segment_boundary(q, PAGE_SIZE-1);
715 blk_stack_limits(&q->limits, &b->limits, 0);
716
717 if (b->merge_bvec_fn)
718 dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
719 b->merge_bvec_fn);
720 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
721
722 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
723 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
724 q->backing_dev_info.ra_pages,
725 b->backing_dev_info.ra_pages);
726 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
727 }
728}
729
730/* serialize deconfig (worker exiting, doing cleanup)
731 * and reconfig (drbdsetup disk, drbdsetup net)
732 *
733 * wait for a potentially exiting worker, then restart it,
734 * or start a new one.
735 */
736static void drbd_reconfig_start(struct drbd_conf *mdev)
737{
738 wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags));
739 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
740 drbd_thread_start(&mdev->worker);
741}
742
743/* if still unconfigured, stops worker again.
744 * if configured now, clears CONFIG_PENDING.
745 * wakes potential waiters */
746static void drbd_reconfig_done(struct drbd_conf *mdev)
747{
748 spin_lock_irq(&mdev->req_lock);
749 if (mdev->state.disk == D_DISKLESS &&
750 mdev->state.conn == C_STANDALONE &&
751 mdev->state.role == R_SECONDARY) {
752 set_bit(DEVICE_DYING, &mdev->flags);
753 drbd_thread_stop_nowait(&mdev->worker);
754 } else
755 clear_bit(CONFIG_PENDING, &mdev->flags);
756 spin_unlock_irq(&mdev->req_lock);
757 wake_up(&mdev->state_wait);
758}
759
760/* does always return 0;
761 * interesting return code is in reply->ret_code */
762static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
763 struct drbd_nl_cfg_reply *reply)
764{
765 enum drbd_ret_codes retcode;
766 enum determine_dev_size dd;
767 sector_t max_possible_sectors;
768 sector_t min_md_device_sectors;
769 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
770 struct inode *inode, *inode2;
771 struct lru_cache *resync_lru = NULL;
772 union drbd_state ns, os;
773 int rv;
774 int cp_discovered = 0;
775 int logical_block_size;
776
777 drbd_reconfig_start(mdev);
778
779 /* if you want to reconfigure, please tear down first */
780 if (mdev->state.disk > D_DISKLESS) {
781 retcode = ERR_DISK_CONFIGURED;
782 goto fail;
783 }
784
785 /* allocation not in the IO path, cqueue thread context */
786 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
787 if (!nbc) {
788 retcode = ERR_NOMEM;
789 goto fail;
790 }
791
792 nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
793 nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
794 nbc->dc.fencing = DRBD_FENCING_DEF;
795 nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
796
797 if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
798 retcode = ERR_MANDATORY_TAG;
799 goto fail;
800 }
801
802 if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
803 retcode = ERR_MD_IDX_INVALID;
804 goto fail;
805 }
806
807 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
808 if (IS_ERR(nbc->lo_file)) {
809 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
810 PTR_ERR(nbc->lo_file));
811 nbc->lo_file = NULL;
812 retcode = ERR_OPEN_DISK;
813 goto fail;
814 }
815
816 inode = nbc->lo_file->f_dentry->d_inode;
817
818 if (!S_ISBLK(inode->i_mode)) {
819 retcode = ERR_DISK_NOT_BDEV;
820 goto fail;
821 }
822
823 nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
824 if (IS_ERR(nbc->md_file)) {
825 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
826 PTR_ERR(nbc->md_file));
827 nbc->md_file = NULL;
828 retcode = ERR_OPEN_MD_DISK;
829 goto fail;
830 }
831
832 inode2 = nbc->md_file->f_dentry->d_inode;
833
834 if (!S_ISBLK(inode2->i_mode)) {
835 retcode = ERR_MD_NOT_BDEV;
836 goto fail;
837 }
838
839 nbc->backing_bdev = inode->i_bdev;
840 if (bd_claim(nbc->backing_bdev, mdev)) {
841 printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
842 nbc->backing_bdev, mdev,
843 nbc->backing_bdev->bd_holder,
844 nbc->backing_bdev->bd_contains->bd_holder,
845 nbc->backing_bdev->bd_holders);
846 retcode = ERR_BDCLAIM_DISK;
847 goto fail;
848 }
849
850 resync_lru = lc_create("resync", drbd_bm_ext_cache,
851 61, sizeof(struct bm_extent),
852 offsetof(struct bm_extent, lce));
853 if (!resync_lru) {
854 retcode = ERR_NOMEM;
855 goto release_bdev_fail;
856 }
857
858 /* meta_dev_idx >= 0: external fixed size,
859 * possibly multiple drbd sharing one meta device.
860 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
861 * not yet used by some other drbd minor!
862 * (if you use drbd.conf + drbdadm,
863 * that should check it for you already; but if you don't, or someone
864 * fooled it, we need to double check here) */
865 nbc->md_bdev = inode2->i_bdev;
866 if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
867 : (void *) drbd_m_holder)) {
868 retcode = ERR_BDCLAIM_MD_DISK;
869 goto release_bdev_fail;
870 }
871
872 if ((nbc->backing_bdev == nbc->md_bdev) !=
873 (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
874 nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
875 retcode = ERR_MD_IDX_INVALID;
876 goto release_bdev2_fail;
877 }
878
879 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
880 drbd_md_set_sector_offsets(mdev, nbc);
881
882 if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
883 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
884 (unsigned long long) drbd_get_max_capacity(nbc),
885 (unsigned long long) nbc->dc.disk_size);
886 retcode = ERR_DISK_TO_SMALL;
887 goto release_bdev2_fail;
888 }
889
890 if (nbc->dc.meta_dev_idx < 0) {
891 max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
892 /* at least one MB, otherwise it does not make sense */
893 min_md_device_sectors = (2<<10);
894 } else {
895 max_possible_sectors = DRBD_MAX_SECTORS;
896 min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
897 }
898
899 if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors)
900 dev_warn(DEV, "truncating very big lower level device "
901 "to currently maximum possible %llu sectors\n",
902 (unsigned long long) max_possible_sectors);
903
904 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
905 retcode = ERR_MD_DISK_TO_SMALL;
906 dev_warn(DEV, "refusing attach: md-device too small, "
907 "at least %llu sectors needed for this meta-disk type\n",
908 (unsigned long long) min_md_device_sectors);
909 goto release_bdev2_fail;
910 }
911
912 /* Make sure the new disk is big enough
913 * (we may currently be R_PRIMARY with no local disk...) */
914 if (drbd_get_max_capacity(nbc) <
915 drbd_get_capacity(mdev->this_bdev)) {
916 retcode = ERR_DISK_TO_SMALL;
917 goto release_bdev2_fail;
918 }
919
920 nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
921
922 drbd_suspend_io(mdev);
923 /* also wait for the last barrier ack. */
924 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
925 /* and for any other previously queued work */
926 drbd_flush_workqueue(mdev);
927
928 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
929 drbd_resume_io(mdev);
930 if (retcode < SS_SUCCESS)
931 goto release_bdev2_fail;
932
933 if (!get_ldev_if_state(mdev, D_ATTACHING))
934 goto force_diskless;
935
936 drbd_md_set_sector_offsets(mdev, nbc);
937
938 if (!mdev->bitmap) {
939 if (drbd_bm_init(mdev)) {
940 retcode = ERR_NOMEM;
941 goto force_diskless_dec;
942 }
943 }
944
945 retcode = drbd_md_read(mdev, nbc);
946 if (retcode != NO_ERROR)
947 goto force_diskless_dec;
948
949 if (mdev->state.conn < C_CONNECTED &&
950 mdev->state.role == R_PRIMARY &&
951 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
952 dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
953 (unsigned long long)mdev->ed_uuid);
954 retcode = ERR_DATA_NOT_CURRENT;
955 goto force_diskless_dec;
956 }
957
958 /* Since we are diskless, fix the activity log first... */
959 if (drbd_check_al_size(mdev)) {
960 retcode = ERR_NOMEM;
961 goto force_diskless_dec;
962 }
963
964 /* Prevent shrinking of consistent devices ! */
965 if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
966 drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
967 dev_warn(DEV, "refusing to truncate a consistent device\n");
968 retcode = ERR_DISK_TO_SMALL;
969 goto force_diskless_dec;
970 }
971
972 if (!drbd_al_read_log(mdev, nbc)) {
973 retcode = ERR_IO_MD_DISK;
974 goto force_diskless_dec;
975 }
976
977 /* allocate a second IO page if logical_block_size != 512 */
978 logical_block_size = bdev_logical_block_size(nbc->md_bdev);
979 if (logical_block_size == 0)
980 logical_block_size = MD_SECTOR_SIZE;
981
982 if (logical_block_size != MD_SECTOR_SIZE) {
983 if (!mdev->md_io_tmpp) {
984 struct page *page = alloc_page(GFP_NOIO);
985 if (!page)
986 goto force_diskless_dec;
987
988 dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
989 logical_block_size, MD_SECTOR_SIZE);
990 dev_warn(DEV, "Workaround engaged (has performance impact).\n");
991
992 mdev->md_io_tmpp = page;
993 }
994 }
995
996 /* Reset the "barriers don't work" bits here, then force meta data to
997 * be written, to ensure we determine if barriers are supported. */
998 if (nbc->dc.no_md_flush)
999 set_bit(MD_NO_BARRIER, &mdev->flags);
1000 else
1001 clear_bit(MD_NO_BARRIER, &mdev->flags);
1002
1003 /* Point of no return reached.
1004 * Devices and memory are no longer released by error cleanup below.
1005 * now mdev takes over responsibility, and the state engine should
1006 * clean it up somewhere. */
1007 D_ASSERT(mdev->ldev == NULL);
1008 mdev->ldev = nbc;
1009 mdev->resync = resync_lru;
1010 nbc = NULL;
1011 resync_lru = NULL;
1012
1013 mdev->write_ordering = WO_bio_barrier;
1014 drbd_bump_write_ordering(mdev, WO_bio_barrier);
1015
1016 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1017 set_bit(CRASHED_PRIMARY, &mdev->flags);
1018 else
1019 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1020
1021 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
1022 set_bit(CRASHED_PRIMARY, &mdev->flags);
1023 cp_discovered = 1;
1024 }
1025
1026 mdev->send_cnt = 0;
1027 mdev->recv_cnt = 0;
1028 mdev->read_cnt = 0;
1029 mdev->writ_cnt = 0;
1030
1031 drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
1032
1033 /* If I am currently not R_PRIMARY,
1034 * but meta data primary indicator is set,
1035 * I just now recover from a hard crash,
1036 * and have been R_PRIMARY before that crash.
1037 *
1038 * Now, if I had no connection before that crash
1039 * (have been degraded R_PRIMARY), chances are that
1040 * I won't find my peer now either.
1041 *
1042 * In that case, and _only_ in that case,
1043 * we use the degr-wfc-timeout instead of the default,
1044 * so we can automatically recover from a crash of a
1045 * degraded but active "cluster" after a certain timeout.
1046 */
1047 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
1048 if (mdev->state.role != R_PRIMARY &&
1049 drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1050 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1051 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1052
1053 dd = drbd_determin_dev_size(mdev);
1054 if (dd == dev_size_error) {
1055 retcode = ERR_NOMEM_BITMAP;
1056 goto force_diskless_dec;
1057 } else if (dd == grew)
1058 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1059
1060 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1061 dev_info(DEV, "Assuming that all blocks are out of sync "
1062 "(aka FullSync)\n");
1063 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
1064 retcode = ERR_IO_MD_DISK;
1065 goto force_diskless_dec;
1066 }
1067 } else {
1068 if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
1069 retcode = ERR_IO_MD_DISK;
1070 goto force_diskless_dec;
1071 }
1072 }
1073
1074 if (cp_discovered) {
1075 drbd_al_apply_to_bm(mdev);
1076 drbd_al_to_on_disk_bm(mdev);
1077 }
1078
1079 spin_lock_irq(&mdev->req_lock);
1080 os = mdev->state;
1081 ns.i = os.i;
1082 /* If MDF_CONSISTENT is not set go into inconsistent state,
1083 otherwise investigate MDF_WasUpToDate...
1084 If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1085 otherwise into D_CONSISTENT state.
1086 */
1087 if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
1088 if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
1089 ns.disk = D_CONSISTENT;
1090 else
1091 ns.disk = D_OUTDATED;
1092 } else {
1093 ns.disk = D_INCONSISTENT;
1094 }
1095
1096 if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
1097 ns.pdsk = D_OUTDATED;
1098
1099 if ( ns.disk == D_CONSISTENT &&
1100 (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
1101 ns.disk = D_UP_TO_DATE;
1102
1103 /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1104 MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1105 this point, because drbd_request_state() modifies these
1106 flags. */
1107
1108 /* In case we are C_CONNECTED postpone any decision on the new disk
1109 state after the negotiation phase. */
1110 if (mdev->state.conn == C_CONNECTED) {
1111 mdev->new_state_tmp.i = ns.i;
1112 ns.i = os.i;
1113 ns.disk = D_NEGOTIATING;
1114 }
1115
1116 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1117 ns = mdev->state;
1118 spin_unlock_irq(&mdev->req_lock);
1119
1120 if (rv < SS_SUCCESS)
1121 goto force_diskless_dec;
1122
1123 if (mdev->state.role == R_PRIMARY)
1124 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
1125 else
1126 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1127
1128 drbd_md_mark_dirty(mdev);
1129 drbd_md_sync(mdev);
1130
1131 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1132 put_ldev(mdev);
1133 reply->ret_code = retcode;
1134 drbd_reconfig_done(mdev);
1135 return 0;
1136
1137 force_diskless_dec:
1138 put_ldev(mdev);
1139 force_diskless:
1140 drbd_force_state(mdev, NS(disk, D_DISKLESS));
1141 drbd_md_sync(mdev);
1142 release_bdev2_fail:
1143 if (nbc)
1144 bd_release(nbc->md_bdev);
1145 release_bdev_fail:
1146 if (nbc)
1147 bd_release(nbc->backing_bdev);
1148 fail:
1149 if (nbc) {
1150 if (nbc->lo_file)
1151 fput(nbc->lo_file);
1152 if (nbc->md_file)
1153 fput(nbc->md_file);
1154 kfree(nbc);
1155 }
1156 lc_destroy(resync_lru);
1157
1158 reply->ret_code = retcode;
1159 drbd_reconfig_done(mdev);
1160 return 0;
1161}
1162
1163static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1164 struct drbd_nl_cfg_reply *reply)
1165{
1166 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1167 return 0;
1168}
1169
1170static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1171 struct drbd_nl_cfg_reply *reply)
1172{
1173 int i, ns;
1174 enum drbd_ret_codes retcode;
1175 struct net_conf *new_conf = NULL;
1176 struct crypto_hash *tfm = NULL;
1177 struct crypto_hash *integrity_w_tfm = NULL;
1178 struct crypto_hash *integrity_r_tfm = NULL;
1179 struct hlist_head *new_tl_hash = NULL;
1180 struct hlist_head *new_ee_hash = NULL;
1181 struct drbd_conf *odev;
1182 char hmac_name[CRYPTO_MAX_ALG_NAME];
1183 void *int_dig_out = NULL;
1184 void *int_dig_in = NULL;
1185 void *int_dig_vv = NULL;
1186 struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
1187
1188 drbd_reconfig_start(mdev);
1189
1190 if (mdev->state.conn > C_STANDALONE) {
1191 retcode = ERR_NET_CONFIGURED;
1192 goto fail;
1193 }
1194
1195 /* allocation not in the IO path, cqueue thread context */
1196 new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
1197 if (!new_conf) {
1198 retcode = ERR_NOMEM;
1199 goto fail;
1200 }
1201
1202 memset(new_conf, 0, sizeof(struct net_conf));
1203 new_conf->timeout = DRBD_TIMEOUT_DEF;
1204 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
1205 new_conf->ping_int = DRBD_PING_INT_DEF;
1206 new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
1207 new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
1208 new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
1209 new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
1210 new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
1211 new_conf->ko_count = DRBD_KO_COUNT_DEF;
1212 new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
1213 new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
1214 new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
1215 new_conf->want_lose = 0;
1216 new_conf->two_primaries = 0;
1217 new_conf->wire_protocol = DRBD_PROT_C;
1218 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
1219 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
1220
1221 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
1222 retcode = ERR_MANDATORY_TAG;
1223 goto fail;
1224 }
1225
1226 if (new_conf->two_primaries
1227 && (new_conf->wire_protocol != DRBD_PROT_C)) {
1228 retcode = ERR_NOT_PROTO_C;
1229 goto fail;
1230 };
1231
1232 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1233 retcode = ERR_DISCARD;
1234 goto fail;
1235 }
1236
1237 retcode = NO_ERROR;
1238
1239 new_my_addr = (struct sockaddr *)&new_conf->my_addr;
1240 new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
1241 for (i = 0; i < minor_count; i++) {
1242 odev = minor_to_mdev(i);
1243 if (!odev || odev == mdev)
1244 continue;
1245 if (get_net_conf(odev)) {
1246 taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
1247 if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
1248 !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
1249 retcode = ERR_LOCAL_ADDR;
1250
1251 taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
1252 if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
1253 !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
1254 retcode = ERR_PEER_ADDR;
1255
1256 put_net_conf(odev);
1257 if (retcode != NO_ERROR)
1258 goto fail;
1259 }
1260 }
1261
1262 if (new_conf->cram_hmac_alg[0] != 0) {
1263 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
1264 new_conf->cram_hmac_alg);
1265 tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
1266 if (IS_ERR(tfm)) {
1267 tfm = NULL;
1268 retcode = ERR_AUTH_ALG;
1269 goto fail;
1270 }
1271
1272 if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
1273 != CRYPTO_ALG_TYPE_HASH) {
1274 retcode = ERR_AUTH_ALG_ND;
1275 goto fail;
1276 }
1277 }
1278
1279 if (new_conf->integrity_alg[0]) {
1280 integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1281 if (IS_ERR(integrity_w_tfm)) {
1282 integrity_w_tfm = NULL;
1283 retcode=ERR_INTEGRITY_ALG;
1284 goto fail;
1285 }
1286
1287 if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
1288 retcode=ERR_INTEGRITY_ALG_ND;
1289 goto fail;
1290 }
1291
1292 integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1293 if (IS_ERR(integrity_r_tfm)) {
1294 integrity_r_tfm = NULL;
1295 retcode=ERR_INTEGRITY_ALG;
1296 goto fail;
1297 }
1298 }
1299
1300 ns = new_conf->max_epoch_size/8;
1301 if (mdev->tl_hash_s != ns) {
1302 new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1303 if (!new_tl_hash) {
1304 retcode = ERR_NOMEM;
1305 goto fail;
1306 }
1307 }
1308
1309 ns = new_conf->max_buffers/8;
1310 if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
1311 new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1312 if (!new_ee_hash) {
1313 retcode = ERR_NOMEM;
1314 goto fail;
1315 }
1316 }
1317
1318 ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
1319
1320 if (integrity_w_tfm) {
1321 i = crypto_hash_digestsize(integrity_w_tfm);
1322 int_dig_out = kmalloc(i, GFP_KERNEL);
1323 if (!int_dig_out) {
1324 retcode = ERR_NOMEM;
1325 goto fail;
1326 }
1327 int_dig_in = kmalloc(i, GFP_KERNEL);
1328 if (!int_dig_in) {
1329 retcode = ERR_NOMEM;
1330 goto fail;
1331 }
1332 int_dig_vv = kmalloc(i, GFP_KERNEL);
1333 if (!int_dig_vv) {
1334 retcode = ERR_NOMEM;
1335 goto fail;
1336 }
1337 }
1338
1339 if (!mdev->bitmap) {
1340 if(drbd_bm_init(mdev)) {
1341 retcode = ERR_NOMEM;
1342 goto fail;
1343 }
1344 }
1345
1346 spin_lock_irq(&mdev->req_lock);
1347 if (mdev->net_conf != NULL) {
1348 retcode = ERR_NET_CONFIGURED;
1349 spin_unlock_irq(&mdev->req_lock);
1350 goto fail;
1351 }
1352 mdev->net_conf = new_conf;
1353
1354 mdev->send_cnt = 0;
1355 mdev->recv_cnt = 0;
1356
1357 if (new_tl_hash) {
1358 kfree(mdev->tl_hash);
1359 mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
1360 mdev->tl_hash = new_tl_hash;
1361 }
1362
1363 if (new_ee_hash) {
1364 kfree(mdev->ee_hash);
1365 mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
1366 mdev->ee_hash = new_ee_hash;
1367 }
1368
1369 crypto_free_hash(mdev->cram_hmac_tfm);
1370 mdev->cram_hmac_tfm = tfm;
1371
1372 crypto_free_hash(mdev->integrity_w_tfm);
1373 mdev->integrity_w_tfm = integrity_w_tfm;
1374
1375 crypto_free_hash(mdev->integrity_r_tfm);
1376 mdev->integrity_r_tfm = integrity_r_tfm;
1377
1378 kfree(mdev->int_dig_out);
1379 kfree(mdev->int_dig_in);
1380 kfree(mdev->int_dig_vv);
1381 mdev->int_dig_out=int_dig_out;
1382 mdev->int_dig_in=int_dig_in;
1383 mdev->int_dig_vv=int_dig_vv;
1384 spin_unlock_irq(&mdev->req_lock);
1385
1386 retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
1387
1388 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1389 reply->ret_code = retcode;
1390 drbd_reconfig_done(mdev);
1391 return 0;
1392
1393fail:
1394 kfree(int_dig_out);
1395 kfree(int_dig_in);
1396 kfree(int_dig_vv);
1397 crypto_free_hash(tfm);
1398 crypto_free_hash(integrity_w_tfm);
1399 crypto_free_hash(integrity_r_tfm);
1400 kfree(new_tl_hash);
1401 kfree(new_ee_hash);
1402 kfree(new_conf);
1403
1404 reply->ret_code = retcode;
1405 drbd_reconfig_done(mdev);
1406 return 0;
1407}
1408
1409static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1410 struct drbd_nl_cfg_reply *reply)
1411{
1412 int retcode;
1413
1414 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1415
1416 if (retcode == SS_NOTHING_TO_DO)
1417 goto done;
1418 else if (retcode == SS_ALREADY_STANDALONE)
1419 goto done;
1420 else if (retcode == SS_PRIMARY_NOP) {
1421 /* Our statche checking code wants to see the peer outdated. */
1422 retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1423 pdsk, D_OUTDATED));
1424 } else if (retcode == SS_CW_FAILED_BY_PEER) {
1425 /* The peer probably wants to see us outdated. */
1426 retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1427 disk, D_OUTDATED),
1428 CS_ORDERED);
1429 if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
1430 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1431 retcode = SS_SUCCESS;
1432 }
1433 }
1434
1435 if (retcode < SS_SUCCESS)
1436 goto fail;
1437
1438 if (wait_event_interruptible(mdev->state_wait,
1439 mdev->state.conn != C_DISCONNECTING)) {
1440 /* Do not test for mdev->state.conn == C_STANDALONE, since
1441 someone else might connect us in the mean time! */
1442 retcode = ERR_INTR;
1443 goto fail;
1444 }
1445
1446 done:
1447 retcode = NO_ERROR;
1448 fail:
1449 drbd_md_sync(mdev);
1450 reply->ret_code = retcode;
1451 return 0;
1452}
1453
1454void resync_after_online_grow(struct drbd_conf *mdev)
1455{
1456 int iass; /* I am sync source */
1457
1458 dev_info(DEV, "Resync of new storage after online grow\n");
1459 if (mdev->state.role != mdev->state.peer)
1460 iass = (mdev->state.role == R_PRIMARY);
1461 else
1462 iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1463
1464 if (iass)
1465 drbd_start_resync(mdev, C_SYNC_SOURCE);
1466 else
1467 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
1468}
1469
1470static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1471 struct drbd_nl_cfg_reply *reply)
1472{
1473 struct resize rs;
1474 int retcode = NO_ERROR;
1475 int ldsc = 0; /* local disk size changed */
1476 enum determine_dev_size dd;
1477
1478 memset(&rs, 0, sizeof(struct resize));
1479 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
1480 retcode = ERR_MANDATORY_TAG;
1481 goto fail;
1482 }
1483
1484 if (mdev->state.conn > C_CONNECTED) {
1485 retcode = ERR_RESIZE_RESYNC;
1486 goto fail;
1487 }
1488
1489 if (mdev->state.role == R_SECONDARY &&
1490 mdev->state.peer == R_SECONDARY) {
1491 retcode = ERR_NO_PRIMARY;
1492 goto fail;
1493 }
1494
1495 if (!get_ldev(mdev)) {
1496 retcode = ERR_NO_DISK;
1497 goto fail;
1498 }
1499
1500 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
1501 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1502 ldsc = 1;
1503 }
1504
1505 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1506 dd = drbd_determin_dev_size(mdev);
1507 drbd_md_sync(mdev);
1508 put_ldev(mdev);
1509 if (dd == dev_size_error) {
1510 retcode = ERR_NOMEM_BITMAP;
1511 goto fail;
1512 }
1513
1514 if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
1515 if (dd == grew)
1516 set_bit(RESIZE_PENDING, &mdev->flags);
1517
1518 drbd_send_uuids(mdev);
1519 drbd_send_sizes(mdev, 1);
1520 }
1521
1522 fail:
1523 reply->ret_code = retcode;
1524 return 0;
1525}
1526
1527static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1528 struct drbd_nl_cfg_reply *reply)
1529{
1530 int retcode = NO_ERROR;
1531 int err;
1532 int ovr; /* online verify running */
1533 int rsr; /* re-sync running */
1534 struct crypto_hash *verify_tfm = NULL;
1535 struct crypto_hash *csums_tfm = NULL;
1536 struct syncer_conf sc;
1537 cpumask_var_t new_cpu_mask;
1538
1539 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1540 retcode = ERR_NOMEM;
1541 goto fail;
1542 }
1543
1544 if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
1545 memset(&sc, 0, sizeof(struct syncer_conf));
1546 sc.rate = DRBD_RATE_DEF;
1547 sc.after = DRBD_AFTER_DEF;
1548 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1549 } else
1550 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1551
1552 if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
1553 retcode = ERR_MANDATORY_TAG;
1554 goto fail;
1555 }
1556
1557 /* re-sync running */
1558 rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
1559 mdev->state.conn == C_SYNC_TARGET ||
1560 mdev->state.conn == C_PAUSED_SYNC_S ||
1561 mdev->state.conn == C_PAUSED_SYNC_T );
1562
1563 if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
1564 retcode = ERR_CSUMS_RESYNC_RUNNING;
1565 goto fail;
1566 }
1567
1568 if (!rsr && sc.csums_alg[0]) {
1569 csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
1570 if (IS_ERR(csums_tfm)) {
1571 csums_tfm = NULL;
1572 retcode = ERR_CSUMS_ALG;
1573 goto fail;
1574 }
1575
1576 if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
1577 retcode = ERR_CSUMS_ALG_ND;
1578 goto fail;
1579 }
1580 }
1581
1582 /* online verify running */
1583 ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
1584
1585 if (ovr) {
1586 if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
1587 retcode = ERR_VERIFY_RUNNING;
1588 goto fail;
1589 }
1590 }
1591
1592 if (!ovr && sc.verify_alg[0]) {
1593 verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
1594 if (IS_ERR(verify_tfm)) {
1595 verify_tfm = NULL;
1596 retcode = ERR_VERIFY_ALG;
1597 goto fail;
1598 }
1599
1600 if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
1601 retcode = ERR_VERIFY_ALG_ND;
1602 goto fail;
1603 }
1604 }
1605
1606 /* silently ignore cpu mask on UP kernel */
1607 if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
1608 err = __bitmap_parse(sc.cpu_mask, 32, 0,
1609 cpumask_bits(new_cpu_mask), nr_cpu_ids);
1610 if (err) {
1611 dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
1612 retcode = ERR_CPU_MASK_PARSE;
1613 goto fail;
1614 }
1615 }
1616
1617 ERR_IF (sc.rate < 1) sc.rate = 1;
1618 ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
1619#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
1620 if (sc.al_extents > AL_MAX) {
1621 dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
1622 sc.al_extents = AL_MAX;
1623 }
1624#undef AL_MAX
1625
1626 /* most sanity checks done, try to assign the new sync-after
1627 * dependency. need to hold the global lock in there,
1628 * to avoid a race in the dependency loop check. */
1629 retcode = drbd_alter_sa(mdev, sc.after);
1630 if (retcode != NO_ERROR)
1631 goto fail;
1632
1633 /* ok, assign the rest of it as well.
1634 * lock against receive_SyncParam() */
1635 spin_lock(&mdev->peer_seq_lock);
1636 mdev->sync_conf = sc;
1637
1638 if (!rsr) {
1639 crypto_free_hash(mdev->csums_tfm);
1640 mdev->csums_tfm = csums_tfm;
1641 csums_tfm = NULL;
1642 }
1643
1644 if (!ovr) {
1645 crypto_free_hash(mdev->verify_tfm);
1646 mdev->verify_tfm = verify_tfm;
1647 verify_tfm = NULL;
1648 }
1649 spin_unlock(&mdev->peer_seq_lock);
1650
1651 if (get_ldev(mdev)) {
1652 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
1653 drbd_al_shrink(mdev);
1654 err = drbd_check_al_size(mdev);
1655 lc_unlock(mdev->act_log);
1656 wake_up(&mdev->al_wait);
1657
1658 put_ldev(mdev);
1659 drbd_md_sync(mdev);
1660
1661 if (err) {
1662 retcode = ERR_NOMEM;
1663 goto fail;
1664 }
1665 }
1666
1667 if (mdev->state.conn >= C_CONNECTED)
1668 drbd_send_sync_param(mdev, &sc);
1669
1670 if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
1671 cpumask_copy(mdev->cpu_mask, new_cpu_mask);
1672 drbd_calc_cpu_mask(mdev);
1673 mdev->receiver.reset_cpu_mask = 1;
1674 mdev->asender.reset_cpu_mask = 1;
1675 mdev->worker.reset_cpu_mask = 1;
1676 }
1677
1678 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1679fail:
1680 free_cpumask_var(new_cpu_mask);
1681 crypto_free_hash(csums_tfm);
1682 crypto_free_hash(verify_tfm);
1683 reply->ret_code = retcode;
1684 return 0;
1685}
1686
1687static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1688 struct drbd_nl_cfg_reply *reply)
1689{
1690 int retcode;
1691
1692 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1693
1694 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
1695 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1696
1697 while (retcode == SS_NEED_CONNECTION) {
1698 spin_lock_irq(&mdev->req_lock);
1699 if (mdev->state.conn < C_CONNECTED)
1700 retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
1701 spin_unlock_irq(&mdev->req_lock);
1702
1703 if (retcode != SS_NEED_CONNECTION)
1704 break;
1705
1706 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1707 }
1708
1709 reply->ret_code = retcode;
1710 return 0;
1711}
1712
1713static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1714 struct drbd_nl_cfg_reply *reply)
1715{
1716
1717 reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
1718
1719 return 0;
1720}
1721
1722static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1723 struct drbd_nl_cfg_reply *reply)
1724{
1725 int retcode = NO_ERROR;
1726
1727 if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
1728 retcode = ERR_PAUSE_IS_SET;
1729
1730 reply->ret_code = retcode;
1731 return 0;
1732}
1733
1734static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1735 struct drbd_nl_cfg_reply *reply)
1736{
1737 int retcode = NO_ERROR;
1738
1739 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
1740 retcode = ERR_PAUSE_IS_CLEAR;
1741
1742 reply->ret_code = retcode;
1743 return 0;
1744}
1745
1746static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1747 struct drbd_nl_cfg_reply *reply)
1748{
1749 reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
1750
1751 return 0;
1752}
1753
1754static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1755 struct drbd_nl_cfg_reply *reply)
1756{
1757 reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
1758 return 0;
1759}
1760
1761static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1762 struct drbd_nl_cfg_reply *reply)
1763{
1764 reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
1765 return 0;
1766}
1767
1768static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1769 struct drbd_nl_cfg_reply *reply)
1770{
1771 unsigned short *tl;
1772
1773 tl = reply->tag_list;
1774
1775 if (get_ldev(mdev)) {
1776 tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
1777 put_ldev(mdev);
1778 }
1779
1780 if (get_net_conf(mdev)) {
1781 tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
1782 put_net_conf(mdev);
1783 }
1784 tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
1785
1786 put_unaligned(TT_END, tl++); /* Close the tag list */
1787
1788 return (int)((char *)tl - (char *)reply->tag_list);
1789}
1790
1791static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1792 struct drbd_nl_cfg_reply *reply)
1793{
1794 unsigned short *tl = reply->tag_list;
1795 union drbd_state s = mdev->state;
1796 unsigned long rs_left;
1797 unsigned int res;
1798
1799 tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
1800
1801 /* no local ref, no bitmap, no syncer progress. */
1802 if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
1803 if (get_ldev(mdev)) {
1804 drbd_get_syncer_progress(mdev, &rs_left, &res);
1805 tl = tl_add_int(tl, T_sync_progress, &res);
1806 put_ldev(mdev);
1807 }
1808 }
1809 put_unaligned(TT_END, tl++); /* Close the tag list */
1810
1811 return (int)((char *)tl - (char *)reply->tag_list);
1812}
1813
1814static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1815 struct drbd_nl_cfg_reply *reply)
1816{
1817 unsigned short *tl;
1818
1819 tl = reply->tag_list;
1820
1821 if (get_ldev(mdev)) {
1822 tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
1823 tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
1824 put_ldev(mdev);
1825 }
1826 put_unaligned(TT_END, tl++); /* Close the tag list */
1827
1828 return (int)((char *)tl - (char *)reply->tag_list);
1829}
1830
1831/**
1832 * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
1833 * @mdev: DRBD device.
1834 * @nlp: Netlink/connector packet from drbdsetup
1835 * @reply: Reply packet for drbdsetup
1836 */
1837static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1838 struct drbd_nl_cfg_reply *reply)
1839{
1840 unsigned short *tl;
1841 char rv;
1842
1843 tl = reply->tag_list;
1844
1845 rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
1846 test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
1847
1848 tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
1849 put_unaligned(TT_END, tl++); /* Close the tag list */
1850
1851 return (int)((char *)tl - (char *)reply->tag_list);
1852}
1853
1854static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1855 struct drbd_nl_cfg_reply *reply)
1856{
1857 /* default to resume from last known position, if possible */
1858 struct start_ov args =
1859 { .start_sector = mdev->ov_start_sector };
1860
1861 if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
1862 reply->ret_code = ERR_MANDATORY_TAG;
1863 return 0;
1864 }
1865 /* w_make_ov_request expects position to be aligned */
1866 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
1867 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
1868 return 0;
1869}
1870
1871
1872static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1873 struct drbd_nl_cfg_reply *reply)
1874{
1875 int retcode = NO_ERROR;
1876 int skip_initial_sync = 0;
1877 int err;
1878
1879 struct new_c_uuid args;
1880
1881 memset(&args, 0, sizeof(struct new_c_uuid));
1882 if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
1883 reply->ret_code = ERR_MANDATORY_TAG;
1884 return 0;
1885 }
1886
1887 mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
1888
1889 if (!get_ldev(mdev)) {
1890 retcode = ERR_NO_DISK;
1891 goto out;
1892 }
1893
1894 /* this is "skip initial sync", assume to be clean */
1895 if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
1896 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
1897 dev_info(DEV, "Preparing to skip initial sync\n");
1898 skip_initial_sync = 1;
1899 } else if (mdev->state.conn != C_STANDALONE) {
1900 retcode = ERR_CONNECTED;
1901 goto out_dec;
1902 }
1903
1904 drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
1905 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
1906
1907 if (args.clear_bm) {
1908 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
1909 if (err) {
1910 dev_err(DEV, "Writing bitmap failed with %d\n",err);
1911 retcode = ERR_IO_MD_DISK;
1912 }
1913 if (skip_initial_sync) {
1914 drbd_send_uuids_skip_initial_sync(mdev);
1915 _drbd_uuid_set(mdev, UI_BITMAP, 0);
1916 spin_lock_irq(&mdev->req_lock);
1917 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
1918 CS_VERBOSE, NULL);
1919 spin_unlock_irq(&mdev->req_lock);
1920 }
1921 }
1922
1923 drbd_md_sync(mdev);
1924out_dec:
1925 put_ldev(mdev);
1926out:
1927 mutex_unlock(&mdev->state_mutex);
1928
1929 reply->ret_code = retcode;
1930 return 0;
1931}
1932
1933static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
1934{
1935 struct drbd_conf *mdev;
1936
1937 if (nlp->drbd_minor >= minor_count)
1938 return NULL;
1939
1940 mdev = minor_to_mdev(nlp->drbd_minor);
1941
1942 if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
1943 struct gendisk *disk = NULL;
1944 mdev = drbd_new_device(nlp->drbd_minor);
1945
1946 spin_lock_irq(&drbd_pp_lock);
1947 if (minor_table[nlp->drbd_minor] == NULL) {
1948 minor_table[nlp->drbd_minor] = mdev;
1949 disk = mdev->vdisk;
1950 mdev = NULL;
1951 } /* else: we lost the race */
1952 spin_unlock_irq(&drbd_pp_lock);
1953
1954 if (disk) /* we won the race above */
1955 /* in case we ever add a drbd_delete_device(),
1956 * don't forget the del_gendisk! */
1957 add_disk(disk);
1958 else /* we lost the race above */
1959 drbd_free_mdev(mdev);
1960
1961 mdev = minor_to_mdev(nlp->drbd_minor);
1962 }
1963
1964 return mdev;
1965}
1966
/*
 * One entry of the netlink/connector dispatch table:
 * @function is the handler for a packet type; its return value is added to
 * the reply length by drbd_connector_callback().
 * @reply_body_size is the extra space reserved for the reply's tag list.
 */
struct cn_handler_struct {
	int (*function)(struct drbd_conf *,
			struct drbd_nl_cfg_req *,
			struct drbd_nl_cfg_reply *);
	int reply_body_size;
};
1973
1974static struct cn_handler_struct cnd_table[] = {
1975 [ P_primary ] = { &drbd_nl_primary, 0 },
1976 [ P_secondary ] = { &drbd_nl_secondary, 0 },
1977 [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
1978 [ P_detach ] = { &drbd_nl_detach, 0 },
1979 [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
1980 [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
1981 [ P_resize ] = { &drbd_nl_resize, 0 },
1982 [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
1983 [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
1984 [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
1985 [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
1986 [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
1987 [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
1988 [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
1989 [ P_outdate ] = { &drbd_nl_outdate, 0 },
1990 [ P_get_config ] = { &drbd_nl_get_config,
1991 sizeof(struct syncer_conf_tag_len_struct) +
1992 sizeof(struct disk_conf_tag_len_struct) +
1993 sizeof(struct net_conf_tag_len_struct) },
1994 [ P_get_state ] = { &drbd_nl_get_state,
1995 sizeof(struct get_state_tag_len_struct) +
1996 sizeof(struct sync_progress_tag_len_struct) },
1997 [ P_get_uuids ] = { &drbd_nl_get_uuids,
1998 sizeof(struct get_uuids_tag_len_struct) },
1999 [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
2000 sizeof(struct get_timeout_flag_tag_len_struct)},
2001 [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
2002 [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
2003};
2004
2005static void drbd_connector_callback(struct cn_msg *req)
2006{
2007 struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
2008 struct cn_handler_struct *cm;
2009 struct cn_msg *cn_reply;
2010 struct drbd_nl_cfg_reply *reply;
2011 struct drbd_conf *mdev;
2012 int retcode, rr;
2013 int reply_size = sizeof(struct cn_msg)
2014 + sizeof(struct drbd_nl_cfg_reply)
2015 + sizeof(short int);
2016
2017 if (!try_module_get(THIS_MODULE)) {
2018 printk(KERN_ERR "drbd: try_module_get() failed!\n");
2019 return;
2020 }
2021
2022 mdev = ensure_mdev(nlp);
2023 if (!mdev) {
2024 retcode = ERR_MINOR_INVALID;
2025 goto fail;
2026 }
2027
2028 trace_drbd_netlink(req, 1);
2029
2030 if (nlp->packet_type >= P_nl_after_last_packet) {
2031 retcode = ERR_PACKET_NR;
2032 goto fail;
2033 }
2034
2035 cm = cnd_table + nlp->packet_type;
2036
2037 /* This may happen if packet number is 0: */
2038 if (cm->function == NULL) {
2039 retcode = ERR_PACKET_NR;
2040 goto fail;
2041 }
2042
2043 reply_size += cm->reply_body_size;
2044
2045 /* allocation not in the IO path, cqueue thread context */
2046 cn_reply = kmalloc(reply_size, GFP_KERNEL);
2047 if (!cn_reply) {
2048 retcode = ERR_NOMEM;
2049 goto fail;
2050 }
2051 reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
2052
2053 reply->packet_type =
2054 cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
2055 reply->minor = nlp->drbd_minor;
2056 reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
2057 /* reply->tag_list; might be modified by cm->function. */
2058
2059 rr = cm->function(mdev, nlp, reply);
2060
2061 cn_reply->id = req->id;
2062 cn_reply->seq = req->seq;
2063 cn_reply->ack = req->ack + 1;
2064 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
2065 cn_reply->flags = 0;
2066
2067 trace_drbd_netlink(cn_reply, 0);
2068 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
2069 if (rr && rr != -ESRCH)
2070 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2071
2072 kfree(cn_reply);
2073 module_put(THIS_MODULE);
2074 return;
2075 fail:
2076 drbd_nl_send_reply(req, retcode);
2077 module_put(THIS_MODULE);
2078}
2079
2080static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
2081
2082static unsigned short *
2083__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
2084 unsigned short len, int nul_terminated)
2085{
2086 unsigned short l = tag_descriptions[tag_number(tag)].max_len;
2087 len = (len < l) ? len : l;
2088 put_unaligned(tag, tl++);
2089 put_unaligned(len, tl++);
2090 memcpy(tl, data, len);
2091 tl = (unsigned short*)((char*)tl + len);
2092 if (nul_terminated)
2093 *((char*)tl - 1) = 0;
2094 return tl;
2095}
2096
2097static unsigned short *
2098tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
2099{
2100 return __tl_add_blob(tl, tag, data, len, 0);
2101}
2102
2103static unsigned short *
2104tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
2105{
2106 return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
2107}
2108
2109static unsigned short *
2110tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
2111{
2112 put_unaligned(tag, tl++);
2113 switch(tag_type(tag)) {
2114 case TT_INTEGER:
2115 put_unaligned(sizeof(int), tl++);
2116 put_unaligned(*(int *)val, (int *)tl);
2117 tl = (unsigned short*)((char*)tl+sizeof(int));
2118 break;
2119 case TT_INT64:
2120 put_unaligned(sizeof(u64), tl++);
2121 put_unaligned(*(u64 *)val, (u64 *)tl);
2122 tl = (unsigned short*)((char*)tl+sizeof(u64));
2123 break;
2124 default:
2125 /* someone did something stupid. */
2126 ;
2127 }
2128 return tl;
2129}
2130
2131void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
2132{
2133 char buffer[sizeof(struct cn_msg)+
2134 sizeof(struct drbd_nl_cfg_reply)+
2135 sizeof(struct get_state_tag_len_struct)+
2136 sizeof(short int)];
2137 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2138 struct drbd_nl_cfg_reply *reply =
2139 (struct drbd_nl_cfg_reply *)cn_reply->data;
2140 unsigned short *tl = reply->tag_list;
2141
2142 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2143
2144 tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
2145
2146 put_unaligned(TT_END, tl++); /* Close the tag list */
2147
2148 cn_reply->id.idx = CN_IDX_DRBD;
2149 cn_reply->id.val = CN_VAL_DRBD;
2150
2151 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2152 cn_reply->ack = 0; /* not used here. */
2153 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2154 (int)((char *)tl - (char *)reply->tag_list);
2155 cn_reply->flags = 0;
2156
2157 reply->packet_type = P_get_state;
2158 reply->minor = mdev_to_minor(mdev);
2159 reply->ret_code = NO_ERROR;
2160
2161 trace_drbd_netlink(cn_reply, 0);
2162 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2163}
2164
2165void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
2166{
2167 char buffer[sizeof(struct cn_msg)+
2168 sizeof(struct drbd_nl_cfg_reply)+
2169 sizeof(struct call_helper_tag_len_struct)+
2170 sizeof(short int)];
2171 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2172 struct drbd_nl_cfg_reply *reply =
2173 (struct drbd_nl_cfg_reply *)cn_reply->data;
2174 unsigned short *tl = reply->tag_list;
2175
2176 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2177
2178 tl = tl_add_str(tl, T_helper, helper_name);
2179 put_unaligned(TT_END, tl++); /* Close the tag list */
2180
2181 cn_reply->id.idx = CN_IDX_DRBD;
2182 cn_reply->id.val = CN_VAL_DRBD;
2183
2184 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2185 cn_reply->ack = 0; /* not used here. */
2186 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2187 (int)((char *)tl - (char *)reply->tag_list);
2188 cn_reply->flags = 0;
2189
2190 reply->packet_type = P_call_helper;
2191 reply->minor = mdev_to_minor(mdev);
2192 reply->ret_code = NO_ERROR;
2193
2194 trace_drbd_netlink(cn_reply, 0);
2195 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2196}
2197
2198void drbd_bcast_ee(struct drbd_conf *mdev,
2199 const char *reason, const int dgs,
2200 const char* seen_hash, const char* calc_hash,
2201 const struct drbd_epoch_entry* e)
2202{
2203 struct cn_msg *cn_reply;
2204 struct drbd_nl_cfg_reply *reply;
2205 struct bio_vec *bvec;
2206 unsigned short *tl;
2207 int i;
2208
2209 if (!e)
2210 return;
2211 if (!reason || !reason[0])
2212 return;
2213
2214 /* apparently we have to memcpy twice, first to prepare the data for the
2215 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
2216 * netlink skb. */
2217 /* receiver thread context, which is not in the writeout path (of this node),
2218 * but may be in the writeout path of the _other_ node.
2219 * GFP_NOIO to avoid potential "distributed deadlock". */
2220 cn_reply = kmalloc(
2221 sizeof(struct cn_msg)+
2222 sizeof(struct drbd_nl_cfg_reply)+
2223 sizeof(struct dump_ee_tag_len_struct)+
2224 sizeof(short int),
2225 GFP_NOIO);
2226
2227 if (!cn_reply) {
2228 dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
2229 (unsigned long long)e->sector, e->size);
2230 return;
2231 }
2232
2233 reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
2234 tl = reply->tag_list;
2235
2236 tl = tl_add_str(tl, T_dump_ee_reason, reason);
2237 tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
2238 tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
2239 tl = tl_add_int(tl, T_ee_sector, &e->sector);
2240 tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
2241
2242 put_unaligned(T_ee_data, tl++);
2243 put_unaligned(e->size, tl++);
2244
2245 __bio_for_each_segment(bvec, e->private_bio, i, 0) {
2246 void *d = kmap(bvec->bv_page);
2247 memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
2248 kunmap(bvec->bv_page);
2249 tl=(unsigned short*)((char*)tl + bvec->bv_len);
2250 }
2251 put_unaligned(TT_END, tl++); /* Close the tag list */
2252
2253 cn_reply->id.idx = CN_IDX_DRBD;
2254 cn_reply->id.val = CN_VAL_DRBD;
2255
2256 cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
2257 cn_reply->ack = 0; // not used here.
2258 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2259 (int)((char*)tl - (char*)reply->tag_list);
2260 cn_reply->flags = 0;
2261
2262 reply->packet_type = P_dump_ee;
2263 reply->minor = mdev_to_minor(mdev);
2264 reply->ret_code = NO_ERROR;
2265
2266 trace_drbd_netlink(cn_reply, 0);
2267 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2268 kfree(cn_reply);
2269}
2270
2271void drbd_bcast_sync_progress(struct drbd_conf *mdev)
2272{
2273 char buffer[sizeof(struct cn_msg)+
2274 sizeof(struct drbd_nl_cfg_reply)+
2275 sizeof(struct sync_progress_tag_len_struct)+
2276 sizeof(short int)];
2277 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2278 struct drbd_nl_cfg_reply *reply =
2279 (struct drbd_nl_cfg_reply *)cn_reply->data;
2280 unsigned short *tl = reply->tag_list;
2281 unsigned long rs_left;
2282 unsigned int res;
2283
2284 /* no local ref, no bitmap, no syncer progress, no broadcast. */
2285 if (!get_ldev(mdev))
2286 return;
2287 drbd_get_syncer_progress(mdev, &rs_left, &res);
2288 put_ldev(mdev);
2289
2290 tl = tl_add_int(tl, T_sync_progress, &res);
2291 put_unaligned(TT_END, tl++); /* Close the tag list */
2292
2293 cn_reply->id.idx = CN_IDX_DRBD;
2294 cn_reply->id.val = CN_VAL_DRBD;
2295
2296 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2297 cn_reply->ack = 0; /* not used here. */
2298 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2299 (int)((char *)tl - (char *)reply->tag_list);
2300 cn_reply->flags = 0;
2301
2302 reply->packet_type = P_sync_progress;
2303 reply->minor = mdev_to_minor(mdev);
2304 reply->ret_code = NO_ERROR;
2305
2306 trace_drbd_netlink(cn_reply, 0);
2307 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2308}
2309
2310int __init drbd_nl_init(void)
2311{
2312 static struct cb_id cn_id_drbd;
2313 int err, try=10;
2314
2315 cn_id_drbd.val = CN_VAL_DRBD;
2316 do {
2317 cn_id_drbd.idx = cn_idx;
2318 err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
2319 if (!err)
2320 break;
2321 cn_idx = (cn_idx + CN_IDX_STEP);
2322 } while (try--);
2323
2324 if (err) {
2325 printk(KERN_ERR "drbd: cn_drbd failed to register\n");
2326 return err;
2327 }
2328
2329 return 0;
2330}
2331
2332void drbd_nl_cleanup(void)
2333{
2334 static struct cb_id cn_id_drbd;
2335
2336 cn_id_drbd.idx = cn_idx;
2337 cn_id_drbd.val = CN_VAL_DRBD;
2338
2339 cn_del_callback(&cn_id_drbd);
2340}
2341
2342void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2343{
2344 char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
2345 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2346 struct drbd_nl_cfg_reply *reply =
2347 (struct drbd_nl_cfg_reply *)cn_reply->data;
2348 int rr;
2349
2350 cn_reply->id = req->id;
2351
2352 cn_reply->seq = req->seq;
2353 cn_reply->ack = req->ack + 1;
2354 cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2355 cn_reply->flags = 0;
2356
2357 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2358 reply->ret_code = ret_code;
2359
2360 trace_drbd_netlink(cn_reply, 0);
2361 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2362 if (rr && rr != -ESRCH)
2363 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2364}
2365
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000000..98fcb7450c76
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,266 @@
1/*
2 drbd_proc.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/autoconf.h>
27#include <linux/module.h>
28
29#include <asm/uaccess.h>
30#include <linux/fs.h>
31#include <linux/file.h>
32#include <linux/slab.h>
33#include <linux/proc_fs.h>
34#include <linux/seq_file.h>
35#include <linux/drbd.h>
36#include "drbd_int.h"
37
38static int drbd_proc_open(struct inode *inode, struct file *file);
39
40
41struct proc_dir_entry *drbd_proc;
42struct file_operations drbd_proc_fops = {
43 .owner = THIS_MODULE,
44 .open = drbd_proc_open,
45 .read = seq_read,
46 .llseek = seq_lseek,
47 .release = single_release,
48};
49
50
51/*lge
52 * progress bars shamelessly adapted from driver/md/md.c
53 * output looks like
54 * [=====>..............] 33.5% (23456/123456)
55 * finish: 2:20:20 speed: 6,345 (6,456) K/sec
56 */
57static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
58{
59 unsigned long db, dt, dbdt, rt, rs_left;
60 unsigned int res;
61 int i, x, y;
62
63 drbd_get_syncer_progress(mdev, &rs_left, &res);
64
65 x = res/50;
66 y = 20-x;
67 seq_printf(seq, "\t[");
68 for (i = 1; i < x; i++)
69 seq_printf(seq, "=");
70 seq_printf(seq, ">");
71 for (i = 0; i < y; i++)
72 seq_printf(seq, ".");
73 seq_printf(seq, "] ");
74
75 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
76 /* if more than 1 GB display in MB */
77 if (mdev->rs_total > 0x100000L)
78 seq_printf(seq, "(%lu/%lu)M\n\t",
79 (unsigned long) Bit2KB(rs_left >> 10),
80 (unsigned long) Bit2KB(mdev->rs_total >> 10));
81 else
82 seq_printf(seq, "(%lu/%lu)K\n\t",
83 (unsigned long) Bit2KB(rs_left),
84 (unsigned long) Bit2KB(mdev->rs_total));
85
86 /* see drivers/md/md.c
87 * We do not want to overflow, so the order of operands and
88 * the * 100 / 100 trick are important. We do a +1 to be
89 * safe against division by zero. We only estimate anyway.
90 *
91 * dt: time from mark until now
92 * db: blocks written from mark until now
93 * rt: remaining time
94 */
95 dt = (jiffies - mdev->rs_mark_time) / HZ;
96
97 if (dt > 20) {
98 /* if we made no update to rs_mark_time for too long,
99 * we are stalled. show that. */
100 seq_printf(seq, "stalled\n");
101 return;
102 }
103
104 if (!dt)
105 dt++;
106 db = mdev->rs_mark_left - rs_left;
107 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
108
109 seq_printf(seq, "finish: %lu:%02lu:%02lu",
110 rt / 3600, (rt % 3600) / 60, rt % 60);
111
112 /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
113 dbdt = Bit2KB(db/dt);
114 if (dbdt > 1000)
115 seq_printf(seq, " speed: %ld,%03ld",
116 dbdt/1000, dbdt % 1000);
117 else
118 seq_printf(seq, " speed: %ld", dbdt);
119
120 /* mean speed since syncer started
121 * we do account for PausedSync periods */
122 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
123 if (dt <= 0)
124 dt = 1;
125 db = mdev->rs_total - rs_left;
126 dbdt = Bit2KB(db/dt);
127 if (dbdt > 1000)
128 seq_printf(seq, " (%ld,%03ld)",
129 dbdt/1000, dbdt % 1000);
130 else
131 seq_printf(seq, " (%ld)", dbdt);
132
133 seq_printf(seq, " K/sec\n");
134}
135
136static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
137{
138 struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
139
140 seq_printf(seq, "%5d %s %s\n", bme->rs_left,
141 bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
142 bme->flags & BME_LOCKED ? "LOCKED" : "------"
143 );
144}
145
146static int drbd_seq_show(struct seq_file *seq, void *v)
147{
148 int i, hole = 0;
149 const char *sn;
150 struct drbd_conf *mdev;
151
152 static char write_ordering_chars[] = {
153 [WO_none] = 'n',
154 [WO_drain_io] = 'd',
155 [WO_bdev_flush] = 'f',
156 [WO_bio_barrier] = 'b',
157 };
158
159 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
160 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
161
162 /*
163 cs .. connection state
164 ro .. node role (local/remote)
165 ds .. disk state (local/remote)
166 protocol
167 various flags
168 ns .. network send
169 nr .. network receive
170 dw .. disk write
171 dr .. disk read
172 al .. activity log write count
173 bm .. bitmap update write count
174 pe .. pending (waiting for ack or data reply)
175 ua .. unack'd (still need to send ack or data reply)
176 ap .. application requests accepted, but not yet completed
177 ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
178 wo .. write ordering mode currently in use
179 oos .. known out-of-sync kB
180 */
181
182 for (i = 0; i < minor_count; i++) {
183 mdev = minor_to_mdev(i);
184 if (!mdev) {
185 hole = 1;
186 continue;
187 }
188 if (hole) {
189 hole = 0;
190 seq_printf(seq, "\n");
191 }
192
193 sn = drbd_conn_str(mdev->state.conn);
194
195 if (mdev->state.conn == C_STANDALONE &&
196 mdev->state.disk == D_DISKLESS &&
197 mdev->state.role == R_SECONDARY) {
198 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
199 } else {
200 seq_printf(seq,
201 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
202 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
203 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
204 i, sn,
205 drbd_role_str(mdev->state.role),
206 drbd_role_str(mdev->state.peer),
207 drbd_disk_str(mdev->state.disk),
208 drbd_disk_str(mdev->state.pdsk),
209 (mdev->net_conf == NULL ? ' ' :
210 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
211 mdev->state.susp ? 's' : 'r',
212 mdev->state.aftr_isp ? 'a' : '-',
213 mdev->state.peer_isp ? 'p' : '-',
214 mdev->state.user_isp ? 'u' : '-',
215 mdev->congestion_reason ?: '-',
216 mdev->send_cnt/2,
217 mdev->recv_cnt/2,
218 mdev->writ_cnt/2,
219 mdev->read_cnt/2,
220 mdev->al_writ_cnt,
221 mdev->bm_writ_cnt,
222 atomic_read(&mdev->local_cnt),
223 atomic_read(&mdev->ap_pending_cnt) +
224 atomic_read(&mdev->rs_pending_cnt),
225 atomic_read(&mdev->unacked_cnt),
226 atomic_read(&mdev->ap_bio_cnt),
227 mdev->epochs,
228 write_ordering_chars[mdev->write_ordering]
229 );
230 seq_printf(seq, " oos:%lu\n",
231 Bit2KB(drbd_bm_total_weight(mdev)));
232 }
233 if (mdev->state.conn == C_SYNC_SOURCE ||
234 mdev->state.conn == C_SYNC_TARGET)
235 drbd_syncer_progress(mdev, seq);
236
237 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
238 seq_printf(seq, "\t%3d%% %lu/%lu\n",
239 (int)((mdev->rs_total-mdev->ov_left) /
240 (mdev->rs_total/100+1)),
241 mdev->rs_total - mdev->ov_left,
242 mdev->rs_total);
243
244 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
245 lc_seq_printf_stats(seq, mdev->resync);
246 lc_seq_printf_stats(seq, mdev->act_log);
247 put_ldev(mdev);
248 }
249
250 if (proc_details >= 2) {
251 if (mdev->resync) {
252 lc_seq_dump_details(seq, mdev->resync, "rs_left",
253 resync_dump_detail);
254 }
255 }
256 }
257
258 return 0;
259}
260
261static int drbd_proc_open(struct inode *inode, struct file *file)
262{
263 return single_open(file, drbd_seq_show, PDE(inode)->data);
264}
265
266/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000000..63686c4d85cf
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4456 @@
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
26#include <linux/autoconf.h>
27#include <linux/module.h>
28
29#include <asm/uaccess.h>
30#include <net/sock.h>
31
32#include <linux/version.h>
33#include <linux/drbd.h>
34#include <linux/fs.h>
35#include <linux/file.h>
36#include <linux/in.h>
37#include <linux/mm.h>
38#include <linux/memcontrol.h>
39#include <linux/mm_inline.h>
40#include <linux/slab.h>
41#include <linux/smp_lock.h>
42#include <linux/pkt_sched.h>
43#define __KERNEL_SYSCALLS__
44#include <linux/unistd.h>
45#include <linux/vmalloc.h>
46#include <linux/random.h>
47#include <linux/mm.h>
48#include <linux/string.h>
49#include <linux/scatterlist.h>
50#include "drbd_int.h"
51#include "drbd_tracing.h"
52#include "drbd_req.h"
53
54#include "drbd_vli.h"
55
56struct flush_work {
57 struct drbd_work w;
58 struct drbd_epoch *epoch;
59};
60
/* Result type of drbd_may_finish_epoch(). */
enum finish_epoch {
	FE_STILL_LIVE,
	FE_DESTROYED,
	FE_RECYCLED,
};
66
67static int drbd_do_handshake(struct drbd_conf *mdev);
68static int drbd_do_auth(struct drbd_conf *mdev);
69
70static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
71static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
72
73static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
74{
75 struct drbd_epoch *prev;
76 spin_lock(&mdev->epoch_lock);
77 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
78 if (prev == epoch || prev == mdev->current_epoch)
79 prev = NULL;
80 spin_unlock(&mdev->epoch_lock);
81 return prev;
82}
83
84#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
85
86static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
87{
88 struct page *page = NULL;
89
90 /* Yes, testing drbd_pp_vacant outside the lock is racy.
91 * So what. It saves a spin_lock. */
92 if (drbd_pp_vacant > 0) {
93 spin_lock(&drbd_pp_lock);
94 page = drbd_pp_pool;
95 if (page) {
96 drbd_pp_pool = (struct page *)page_private(page);
97 set_page_private(page, 0); /* just to be polite */
98 drbd_pp_vacant--;
99 }
100 spin_unlock(&drbd_pp_lock);
101 }
102 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
103 * "criss-cross" setup, that might cause write-out on some other DRBD,
104 * which in turn might block on the other node at this very place. */
105 if (!page)
106 page = alloc_page(GFP_TRY);
107 if (page)
108 atomic_inc(&mdev->pp_in_use);
109 return page;
110}
111
112/* kick lower level device, if we have more than (arbitrary number)
113 * reference counts on it, which typically are locally submitted io
114 * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
115static void maybe_kick_lo(struct drbd_conf *mdev)
116{
117 if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
118 drbd_kick_lo(mdev);
119}
120
121static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
122{
123 struct drbd_epoch_entry *e;
124 struct list_head *le, *tle;
125
126 /* The EEs are always appended to the end of the list. Since
127 they are sent in order over the wire, they have to finish
128 in order. As soon as we see the first not finished we can
129 stop to examine the list... */
130
131 list_for_each_safe(le, tle, &mdev->net_ee) {
132 e = list_entry(le, struct drbd_epoch_entry, w.list);
133 if (drbd_bio_has_active_page(e->private_bio))
134 break;
135 list_move(le, to_be_freed);
136 }
137}
138
139static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
140{
141 LIST_HEAD(reclaimed);
142 struct drbd_epoch_entry *e, *t;
143
144 maybe_kick_lo(mdev);
145 spin_lock_irq(&mdev->req_lock);
146 reclaim_net_ee(mdev, &reclaimed);
147 spin_unlock_irq(&mdev->req_lock);
148
149 list_for_each_entry_safe(e, t, &reclaimed, w.list)
150 drbd_free_ee(mdev, e);
151}
152
153/**
154 * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
155 * @mdev: DRBD device.
156 * @retry: whether or not to retry allocation forever (or until signalled)
157 *
158 * Tries to allocate a page, first from our own page pool, then from the
159 * kernel, unless this allocation would exceed the max_buffers setting.
160 * If @retry is non-zero, retry until DRBD frees a page somewhere else.
161 */
162static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
163{
164 struct page *page = NULL;
165 DEFINE_WAIT(wait);
166
167 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
168 page = drbd_pp_first_page_or_try_alloc(mdev);
169 if (page)
170 return page;
171 }
172
173 for (;;) {
174 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
175
176 drbd_kick_lo_and_reclaim_net(mdev);
177
178 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
179 page = drbd_pp_first_page_or_try_alloc(mdev);
180 if (page)
181 break;
182 }
183
184 if (!retry)
185 break;
186
187 if (signal_pending(current)) {
188 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
189 break;
190 }
191
192 schedule();
193 }
194 finish_wait(&drbd_pp_wait, &wait);
195
196 return page;
197}
198
199/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
200 * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
201static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
202{
203 int free_it;
204
205 spin_lock(&drbd_pp_lock);
206 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
207 free_it = 1;
208 } else {
209 set_page_private(page, (unsigned long)drbd_pp_pool);
210 drbd_pp_pool = page;
211 drbd_pp_vacant++;
212 free_it = 0;
213 }
214 spin_unlock(&drbd_pp_lock);
215
216 atomic_dec(&mdev->pp_in_use);
217
218 if (free_it)
219 __free_page(page);
220
221 wake_up(&drbd_pp_wait);
222}
223
224static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
225{
226 struct page *p_to_be_freed = NULL;
227 struct page *page;
228 struct bio_vec *bvec;
229 int i;
230
231 spin_lock(&drbd_pp_lock);
232 __bio_for_each_segment(bvec, bio, i, 0) {
233 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
234 set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
235 p_to_be_freed = bvec->bv_page;
236 } else {
237 set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
238 drbd_pp_pool = bvec->bv_page;
239 drbd_pp_vacant++;
240 }
241 }
242 spin_unlock(&drbd_pp_lock);
243 atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
244
245 while (p_to_be_freed) {
246 page = p_to_be_freed;
247 p_to_be_freed = (struct page *)page_private(page);
248 set_page_private(page, 0); /* just to be polite */
249 put_page(page);
250 }
251
252 wake_up(&drbd_pp_wait);
253}
254
255/*
256You need to hold the req_lock:
257 _drbd_wait_ee_list_empty()
258
259You must not have the req_lock:
260 drbd_free_ee()
261 drbd_alloc_ee()
262 drbd_init_ee()
263 drbd_release_ee()
264 drbd_ee_fix_bhs()
265 drbd_process_done_ee()
266 drbd_clear_done_ee()
267 drbd_wait_ee_list_empty()
268*/
269
270struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
271 u64 id,
272 sector_t sector,
273 unsigned int data_size,
274 gfp_t gfp_mask) __must_hold(local)
275{
276 struct request_queue *q;
277 struct drbd_epoch_entry *e;
278 struct page *page;
279 struct bio *bio;
280 unsigned int ds;
281
282 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
283 return NULL;
284
285 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
286 if (!e) {
287 if (!(gfp_mask & __GFP_NOWARN))
288 dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
289 return NULL;
290 }
291
292 bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
293 if (!bio) {
294 if (!(gfp_mask & __GFP_NOWARN))
295 dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
296 goto fail1;
297 }
298
299 bio->bi_bdev = mdev->ldev->backing_bdev;
300 bio->bi_sector = sector;
301
302 ds = data_size;
303 while (ds) {
304 page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
305 if (!page) {
306 if (!(gfp_mask & __GFP_NOWARN))
307 dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
308 goto fail2;
309 }
310 if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
311 drbd_pp_free(mdev, page);
312 dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
313 "data_size=%u,ds=%u) failed\n",
314 (unsigned long long)sector, data_size, ds);
315
316 q = bdev_get_queue(bio->bi_bdev);
317 if (q->merge_bvec_fn) {
318 struct bvec_merge_data bvm = {
319 .bi_bdev = bio->bi_bdev,
320 .bi_sector = bio->bi_sector,
321 .bi_size = bio->bi_size,
322 .bi_rw = bio->bi_rw,
323 };
324 int l = q->merge_bvec_fn(q, &bvm,
325 &bio->bi_io_vec[bio->bi_vcnt]);
326 dev_err(DEV, "merge_bvec_fn() = %d\n", l);
327 }
328
329 /* dump more of the bio. */
330 dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
331 dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
332 dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
333 dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
334
335 goto fail2;
336 break;
337 }
338 ds -= min_t(int, ds, PAGE_SIZE);
339 }
340
341 D_ASSERT(data_size == bio->bi_size);
342
343 bio->bi_private = e;
344 e->mdev = mdev;
345 e->sector = sector;
346 e->size = bio->bi_size;
347
348 e->private_bio = bio;
349 e->block_id = id;
350 INIT_HLIST_NODE(&e->colision);
351 e->epoch = NULL;
352 e->flags = 0;
353
354 trace_drbd_ee(mdev, e, "allocated");
355
356 return e;
357
358 fail2:
359 drbd_pp_free_bio_pages(mdev, bio);
360 bio_put(bio);
361 fail1:
362 mempool_free(e, drbd_ee_mempool);
363
364 return NULL;
365}
366
367void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
368{
369 struct bio *bio = e->private_bio;
370 trace_drbd_ee(mdev, e, "freed");
371 drbd_pp_free_bio_pages(mdev, bio);
372 bio_put(bio);
373 D_ASSERT(hlist_unhashed(&e->colision));
374 mempool_free(e, drbd_ee_mempool);
375}
376
377int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
378{
379 LIST_HEAD(work_list);
380 struct drbd_epoch_entry *e, *t;
381 int count = 0;
382
383 spin_lock_irq(&mdev->req_lock);
384 list_splice_init(list, &work_list);
385 spin_unlock_irq(&mdev->req_lock);
386
387 list_for_each_entry_safe(e, t, &work_list, w.list) {
388 drbd_free_ee(mdev, e);
389 count++;
390 }
391 return count;
392}
393
394
395/*
396 * This function is called from _asender only_
397 * but see also comments in _req_mod(,barrier_acked)
398 * and receive_Barrier.
399 *
400 * Move entries from net_ee to done_ee, if ready.
401 * Grab done_ee, call all callbacks, free the entries.
402 * The callbacks typically send out ACKs.
403 */
404static int drbd_process_done_ee(struct drbd_conf *mdev)
405{
406 LIST_HEAD(work_list);
407 LIST_HEAD(reclaimed);
408 struct drbd_epoch_entry *e, *t;
409 int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
410
411 spin_lock_irq(&mdev->req_lock);
412 reclaim_net_ee(mdev, &reclaimed);
413 list_splice_init(&mdev->done_ee, &work_list);
414 spin_unlock_irq(&mdev->req_lock);
415
416 list_for_each_entry_safe(e, t, &reclaimed, w.list)
417 drbd_free_ee(mdev, e);
418
419 /* possible callbacks here:
420 * e_end_block, and e_end_resync_block, e_send_discard_ack.
421 * all ignore the last argument.
422 */
423 list_for_each_entry_safe(e, t, &work_list, w.list) {
424 trace_drbd_ee(mdev, e, "process_done_ee");
425 /* list_del not necessary, next/prev members not touched */
426 ok = e->w.cb(mdev, &e->w, !ok) && ok;
427 drbd_free_ee(mdev, e);
428 }
429 wake_up(&mdev->ee_wait);
430
431 return ok;
432}
433
434void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
435{
436 DEFINE_WAIT(wait);
437
438 /* avoids spin_lock/unlock
439 * and calling prepare_to_wait in the fast path */
440 while (!list_empty(head)) {
441 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
442 spin_unlock_irq(&mdev->req_lock);
443 drbd_kick_lo(mdev);
444 schedule();
445 finish_wait(&mdev->ee_wait, &wait);
446 spin_lock_irq(&mdev->req_lock);
447 }
448}
449
/*
 * Locked wrapper around _drbd_wait_ee_list_empty(): takes req_lock (with
 * interrupts disabled), waits until @head is empty and releases the lock
 * again. The caller must therefore not hold req_lock.
 */
void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
{
	spin_lock_irq(&mdev->req_lock);
	_drbd_wait_ee_list_empty(mdev, head);
	spin_unlock_irq(&mdev->req_lock);
}
456
457/* see also kernel_accept; which is only present since 2.6.18.
458 * also we want to log which part of it failed, exactly */
459static int drbd_accept(struct drbd_conf *mdev, const char **what,
460 struct socket *sock, struct socket **newsock)
461{
462 struct sock *sk = sock->sk;
463 int err = 0;
464
465 *what = "listen";
466 err = sock->ops->listen(sock, 5);
467 if (err < 0)
468 goto out;
469
470 *what = "sock_create_lite";
471 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
472 newsock);
473 if (err < 0)
474 goto out;
475
476 *what = "accept";
477 err = sock->ops->accept(sock, *newsock, 0);
478 if (err < 0) {
479 sock_release(*newsock);
480 *newsock = NULL;
481 goto out;
482 }
483 (*newsock)->ops = sock->ops;
484
485out:
486 return err;
487}
488
489static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
490 void *buf, size_t size, int flags)
491{
492 mm_segment_t oldfs;
493 struct kvec iov = {
494 .iov_base = buf,
495 .iov_len = size,
496 };
497 struct msghdr msg = {
498 .msg_iovlen = 1,
499 .msg_iov = (struct iovec *)&iov,
500 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
501 };
502 int rv;
503
504 oldfs = get_fs();
505 set_fs(KERNEL_DS);
506 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
507 set_fs(oldfs);
508
509 return rv;
510}
511
512static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
513{
514 mm_segment_t oldfs;
515 struct kvec iov = {
516 .iov_base = buf,
517 .iov_len = size,
518 };
519 struct msghdr msg = {
520 .msg_iovlen = 1,
521 .msg_iov = (struct iovec *)&iov,
522 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
523 };
524 int rv;
525
526 oldfs = get_fs();
527 set_fs(KERNEL_DS);
528
529 for (;;) {
530 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
531 if (rv == size)
532 break;
533
534 /* Note:
535 * ECONNRESET other side closed the connection
536 * ERESTARTSYS (on sock) we got a signal
537 */
538
539 if (rv < 0) {
540 if (rv == -ECONNRESET)
541 dev_info(DEV, "sock was reset by peer\n");
542 else if (rv != -ERESTARTSYS)
543 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
544 break;
545 } else if (rv == 0) {
546 dev_info(DEV, "sock was shut down by peer\n");
547 break;
548 } else {
549 /* signal came in, or peer/link went down,
550 * after we read a partial message
551 */
552 /* D_ASSERT(signal_pending(current)); */
553 break;
554 }
555 };
556
557 set_fs(oldfs);
558
559 if (rv != size)
560 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
561
562 return rv;
563}
564
/*
 * Try once to actively connect to the peer.
 *
 * Creates a TCP socket, binds it to the configured local address (with the
 * port forced to 0) and connects to the configured peer address.
 * Returns the connected socket, or NULL. Failures up to and including
 * bind() force the connection state to C_DISCONNECTING, unless the error
 * is one of the timeout/signal/peer-unreachable codes listed below.
 */
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	const char *step;
	struct socket *s;
	struct sockaddr_in6 local;
	int rv;
	int fatal = 1;

	if (!get_net_conf(mdev))
		return NULL;

	step = "sock_create_kern";
	rv = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
			      SOCK_STREAM, IPPROTO_TCP, &s);
	if (rv < 0) {
		s = NULL;
		goto failed;
	}

	s->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
	s->sk->sk_rcvtimeo = s->sk->sk_sndtimeo;

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	memcpy(&local, mdev->net_conf->my_addr,
	       min_t(int, mdev->net_conf->my_addr_len, sizeof(local)));
	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
		local.sin6_port = 0;
	else
		((struct sockaddr_in *)&local)->sin_port = 0; /* AF_INET & AF_SCI */

	step = "bind before connect";
	rv = s->ops->bind(s, (struct sockaddr *) &local,
			  mdev->net_conf->my_addr_len);
	if (rv < 0)
		goto failed;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	fatal = 0;
	step = "connect";
	rv = s->ops->connect(s,
			     (struct sockaddr *)mdev->net_conf->peer_addr,
			     mdev->net_conf->peer_addr_len, 0);
	if (rv >= 0)
		goto done;

failed:
	if (s) {
		sock_release(s);
		s = NULL;
	}
	switch (-rv) {
		/* timeout, busy, signal pending */
	case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
	case EINTR: case ERESTARTSYS:
		/* peer not (yet) available, network problem */
	case ECONNREFUSED: case ENETUNREACH:
	case EHOSTDOWN:    case EHOSTUNREACH:
		fatal = 0;
		break;
	default:
		dev_err(DEV, "%s failed, err = %d\n", step, rv);
	}
	if (fatal)
		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));

done:
	put_net_conf(mdev);
	return s;
}
640
641static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
642{
643 int timeo, err;
644 struct socket *s_estab = NULL, *s_listen;
645 const char *what;
646
647 if (!get_net_conf(mdev))
648 return NULL;
649
650 what = "sock_create_kern";
651 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
652 SOCK_STREAM, IPPROTO_TCP, &s_listen);
653 if (err) {
654 s_listen = NULL;
655 goto out;
656 }
657
658 timeo = mdev->net_conf->try_connect_int * HZ;
659 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
660
661 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
662 s_listen->sk->sk_rcvtimeo = timeo;
663 s_listen->sk->sk_sndtimeo = timeo;
664
665 what = "bind before listen";
666 err = s_listen->ops->bind(s_listen,
667 (struct sockaddr *) mdev->net_conf->my_addr,
668 mdev->net_conf->my_addr_len);
669 if (err < 0)
670 goto out;
671
672 err = drbd_accept(mdev, &what, s_listen, &s_estab);
673
674out:
675 if (s_listen)
676 sock_release(s_listen);
677 if (err < 0) {
678 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
679 dev_err(DEV, "%s failed, err = %d\n", what, err);
680 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
681 }
682 }
683 put_net_conf(mdev);
684
685 return s_estab;
686}
687
/*
 * Send the "first packet" on a fresh socket: a bare header carrying @cmd,
 * built in the data socket's send buffer (mdev->data.sbuf.header).
 * Returns whatever _drbd_send_cmd() returns.
 */
static int drbd_send_fp(struct drbd_conf *mdev,
	struct socket *sock, enum drbd_packets cmd)
{
	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;

	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
}
695
696static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
697{
698 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
699 int rr;
700
701 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
702
703 if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
704 return be16_to_cpu(h->command);
705
706 return 0xffff;
707}
708
709/**
710 * drbd_socket_okay() - Free the socket if its connection is not okay
711 * @mdev: DRBD device.
712 * @sock: pointer to the pointer to the socket.
713 */
714static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
715{
716 int rr;
717 char tb[4];
718
719 if (!*sock)
720 return FALSE;
721
722 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
723
724 if (rr > 0 || rr == -EAGAIN) {
725 return TRUE;
726 } else {
727 sock_release(*sock);
728 *sock = NULL;
729 return FALSE;
730 }
731}
732
733/*
734 * return values:
735 * 1 yes, we have a valid connection
736 * 0 oops, did not work out, please try again
737 * -1 peer talks different language,
738 * no point in trying again, please go standalone.
739 * -2 We do not have a network config...
740 */
741static int drbd_connect(struct drbd_conf *mdev)
742{
743 struct socket *s, *sock, *msock;
744 int try, h, ok;
745
746 D_ASSERT(!mdev->data.socket);
747
748 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
749 dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
750
751 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
752 return -2;
753
754 clear_bit(DISCARD_CONCURRENT, &mdev->flags);
755
756 sock = NULL;
757 msock = NULL;
758
759 do {
760 for (try = 0;;) {
761 /* 3 tries, this should take less than a second! */
762 s = drbd_try_connect(mdev);
763 if (s || ++try >= 3)
764 break;
765 /* give the other side time to call bind() & listen() */
766 __set_current_state(TASK_INTERRUPTIBLE);
767 schedule_timeout(HZ / 10);
768 }
769
770 if (s) {
771 if (!sock) {
772 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
773 sock = s;
774 s = NULL;
775 } else if (!msock) {
776 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
777 msock = s;
778 s = NULL;
779 } else {
780 dev_err(DEV, "Logic error in drbd_connect()\n");
781 goto out_release_sockets;
782 }
783 }
784
785 if (sock && msock) {
786 __set_current_state(TASK_INTERRUPTIBLE);
787 schedule_timeout(HZ / 10);
788 ok = drbd_socket_okay(mdev, &sock);
789 ok = drbd_socket_okay(mdev, &msock) && ok;
790 if (ok)
791 break;
792 }
793
794retry:
795 s = drbd_wait_for_connect(mdev);
796 if (s) {
797 try = drbd_recv_fp(mdev, s);
798 drbd_socket_okay(mdev, &sock);
799 drbd_socket_okay(mdev, &msock);
800 switch (try) {
801 case P_HAND_SHAKE_S:
802 if (sock) {
803 dev_warn(DEV, "initial packet S crossed\n");
804 sock_release(sock);
805 }
806 sock = s;
807 break;
808 case P_HAND_SHAKE_M:
809 if (msock) {
810 dev_warn(DEV, "initial packet M crossed\n");
811 sock_release(msock);
812 }
813 msock = s;
814 set_bit(DISCARD_CONCURRENT, &mdev->flags);
815 break;
816 default:
817 dev_warn(DEV, "Error receiving initial packet\n");
818 sock_release(s);
819 if (random32() & 1)
820 goto retry;
821 }
822 }
823
824 if (mdev->state.conn <= C_DISCONNECTING)
825 goto out_release_sockets;
826 if (signal_pending(current)) {
827 flush_signals(current);
828 smp_rmb();
829 if (get_t_state(&mdev->receiver) == Exiting)
830 goto out_release_sockets;
831 }
832
833 if (sock && msock) {
834 ok = drbd_socket_okay(mdev, &sock);
835 ok = drbd_socket_okay(mdev, &msock) && ok;
836 if (ok)
837 break;
838 }
839 } while (1);
840
841 msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
842 sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
843
844 sock->sk->sk_allocation = GFP_NOIO;
845 msock->sk->sk_allocation = GFP_NOIO;
846
847 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
848 msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
849
850 if (mdev->net_conf->sndbuf_size) {
851 sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
852 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
853 }
854
855 if (mdev->net_conf->rcvbuf_size) {
856 sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
857 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
858 }
859
860 /* NOT YET ...
861 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
862 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
863 * first set it to the P_HAND_SHAKE timeout,
864 * which we set to 4x the configured ping_timeout. */
865 sock->sk->sk_sndtimeo =
866 sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
867
868 msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
869 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
870
871 /* we don't want delays.
872 * we use TCP_CORK where apropriate, though */
873 drbd_tcp_nodelay(sock);
874 drbd_tcp_nodelay(msock);
875
876 mdev->data.socket = sock;
877 mdev->meta.socket = msock;
878 mdev->last_received = jiffies;
879
880 D_ASSERT(mdev->asender.task == NULL);
881
882 h = drbd_do_handshake(mdev);
883 if (h <= 0)
884 return h;
885
886 if (mdev->cram_hmac_tfm) {
887 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
888 if (!drbd_do_auth(mdev)) {
889 dev_err(DEV, "Authentication of peer failed\n");
890 return -1;
891 }
892 }
893
894 if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
895 return 0;
896
897 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
898 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
899
900 atomic_set(&mdev->packet_seq, 0);
901 mdev->peer_seq = 0;
902
903 drbd_thread_start(&mdev->asender);
904
905 drbd_send_protocol(mdev);
906 drbd_send_sync_param(mdev, &mdev->sync_conf);
907 drbd_send_sizes(mdev, 0);
908 drbd_send_uuids(mdev);
909 drbd_send_state(mdev);
910 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
911 clear_bit(RESIZE_PENDING, &mdev->flags);
912
913 return 1;
914
915out_release_sockets:
916 if (sock)
917 sock_release(sock);
918 if (msock)
919 sock_release(msock);
920 return -1;
921}
922
923static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
924{
925 int r;
926
927 r = drbd_recv(mdev, h, sizeof(*h));
928
929 if (unlikely(r != sizeof(*h))) {
930 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
931 return FALSE;
932 };
933 h->command = be16_to_cpu(h->command);
934 h->length = be16_to_cpu(h->length);
935 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
936 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
937 (long)be32_to_cpu(h->magic),
938 h->command, h->length);
939 return FALSE;
940 }
941 mdev->last_received = jiffies;
942
943 return TRUE;
944}
945
946static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
947{
948 int rv;
949
950 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
951 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
952 if (rv) {
953 dev_err(DEV, "local disk flush failed with status %d\n", rv);
954 /* would rather check on EOPNOTSUPP, but that is not reliable.
955 * don't try again for ANY return value != 0
956 * if (rv == -EOPNOTSUPP) */
957 drbd_bump_write_ordering(mdev, WO_drain_io);
958 }
959 put_ldev(mdev);
960 }
961
962 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
963}
964
965static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
966{
967 struct flush_work *fw = (struct flush_work *)w;
968 struct drbd_epoch *epoch = fw->epoch;
969
970 kfree(w);
971
972 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
973 drbd_flush_after_epoch(mdev, epoch);
974
975 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
976 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
977
978 return 1;
979}
980
981/**
982 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
983 * @mdev: DRBD device.
984 * @epoch: Epoch object.
985 * @ev: Epoch event.
986 */
987static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
988 struct drbd_epoch *epoch,
989 enum epoch_event ev)
990{
991 int finish, epoch_size;
992 struct drbd_epoch *next_epoch;
993 int schedule_flush = 0;
994 enum finish_epoch rv = FE_STILL_LIVE;
995
996 spin_lock(&mdev->epoch_lock);
997 do {
998 next_epoch = NULL;
999 finish = 0;
1000
1001 epoch_size = atomic_read(&epoch->epoch_size);
1002
1003 switch (ev & ~EV_CLEANUP) {
1004 case EV_PUT:
1005 atomic_dec(&epoch->active);
1006 break;
1007 case EV_GOT_BARRIER_NR:
1008 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1009
1010 /* Special case: If we just switched from WO_bio_barrier to
1011 WO_bdev_flush we should not finish the current epoch */
1012 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1013 mdev->write_ordering != WO_bio_barrier &&
1014 epoch == mdev->current_epoch)
1015 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1016 break;
1017 case EV_BARRIER_DONE:
1018 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1019 break;
1020 case EV_BECAME_LAST:
1021 /* nothing to do*/
1022 break;
1023 }
1024
1025 trace_drbd_epoch(mdev, epoch, ev);
1026
1027 if (epoch_size != 0 &&
1028 atomic_read(&epoch->active) == 0 &&
1029 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1030 epoch->list.prev == &mdev->current_epoch->list &&
1031 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1032 /* Nearly all conditions are met to finish that epoch... */
1033 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1034 mdev->write_ordering == WO_none ||
1035 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1036 ev & EV_CLEANUP) {
1037 finish = 1;
1038 set_bit(DE_IS_FINISHING, &epoch->flags);
1039 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1040 mdev->write_ordering == WO_bio_barrier) {
1041 atomic_inc(&epoch->active);
1042 schedule_flush = 1;
1043 }
1044 }
1045 if (finish) {
1046 if (!(ev & EV_CLEANUP)) {
1047 spin_unlock(&mdev->epoch_lock);
1048 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1049 spin_lock(&mdev->epoch_lock);
1050 }
1051 dec_unacked(mdev);
1052
1053 if (mdev->current_epoch != epoch) {
1054 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1055 list_del(&epoch->list);
1056 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1057 mdev->epochs--;
1058 trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE);
1059 kfree(epoch);
1060
1061 if (rv == FE_STILL_LIVE)
1062 rv = FE_DESTROYED;
1063 } else {
1064 epoch->flags = 0;
1065 atomic_set(&epoch->epoch_size, 0);
1066 /* atomic_set(&epoch->active, 0); is alrady zero */
1067 if (rv == FE_STILL_LIVE)
1068 rv = FE_RECYCLED;
1069 }
1070 }
1071
1072 if (!next_epoch)
1073 break;
1074
1075 epoch = next_epoch;
1076 } while (1);
1077
1078 spin_unlock(&mdev->epoch_lock);
1079
1080 if (schedule_flush) {
1081 struct flush_work *fw;
1082 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1083 if (fw) {
1084 trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH);
1085 fw->w.cb = w_flush;
1086 fw->epoch = epoch;
1087 drbd_queue_work(&mdev->data.work, &fw->w);
1088 } else {
1089 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1090 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1091 /* That is not a recursion, only one level */
1092 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1093 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1094 }
1095 }
1096
1097 return rv;
1098}
1099
1100/**
1101 * drbd_bump_write_ordering() - Fall back to an other write ordering method
1102 * @mdev: DRBD device.
1103 * @wo: Write ordering method to try.
1104 */
1105void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1106{
1107 enum write_ordering_e pwo;
1108 static char *write_ordering_str[] = {
1109 [WO_none] = "none",
1110 [WO_drain_io] = "drain",
1111 [WO_bdev_flush] = "flush",
1112 [WO_bio_barrier] = "barrier",
1113 };
1114
1115 pwo = mdev->write_ordering;
1116 wo = min(pwo, wo);
1117 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1118 wo = WO_bdev_flush;
1119 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1120 wo = WO_drain_io;
1121 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1122 wo = WO_none;
1123 mdev->write_ordering = wo;
1124 if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1125 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1126}
1127
1128/**
1129 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1130 * @mdev: DRBD device.
1131 * @w: work object.
1132 * @cancel: The connection will be closed anyways (unused in this callback)
1133 */
1134int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1135{
1136 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1137 struct bio *bio = e->private_bio;
1138
1139 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1140 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1141 so that we can finish that epoch in drbd_may_finish_epoch().
1142 That is necessary if we already have a long chain of Epochs, before
1143 we realize that BIO_RW_BARRIER is actually not supported */
1144
1145 /* As long as the -ENOTSUPP on the barrier is reported immediately
1146 that will never trigger. If it is reported late, we will just
1147 print that warning and continue correctly for all future requests
1148 with WO_bdev_flush */
1149 if (previous_epoch(mdev, e->epoch))
1150 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1151
1152 /* prepare bio for re-submit,
1153 * re-init volatile members */
1154 /* we still have a local reference,
1155 * get_ldev was done in receive_Data. */
1156 bio->bi_bdev = mdev->ldev->backing_bdev;
1157 bio->bi_sector = e->sector;
1158 bio->bi_size = e->size;
1159 bio->bi_idx = 0;
1160
1161 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1162 bio->bi_flags |= 1 << BIO_UPTODATE;
1163
1164 /* don't know whether this is necessary: */
1165 bio->bi_phys_segments = 0;
1166 bio->bi_next = NULL;
1167
1168 /* these should be unchanged: */
1169 /* bio->bi_end_io = drbd_endio_write_sec; */
1170 /* bio->bi_vcnt = whatever; */
1171
1172 e->w.cb = e_end_block;
1173
1174 /* This is no longer a barrier request. */
1175 bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
1176
1177 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
1178
1179 return 1;
1180}
1181
1182static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1183{
1184 int rv, issue_flush;
1185 struct p_barrier *p = (struct p_barrier *)h;
1186 struct drbd_epoch *epoch;
1187
1188 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1189
1190 rv = drbd_recv(mdev, h->payload, h->length);
1191 ERR_IF(rv != h->length) return FALSE;
1192
1193 inc_unacked(mdev);
1194
1195 if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1196 drbd_kick_lo(mdev);
1197
1198 mdev->current_epoch->barrier_nr = p->barrier;
1199 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1200
1201 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1202 * the activity log, which means it would not be resynced in case the
1203 * R_PRIMARY crashes now.
1204 * Therefore we must send the barrier_ack after the barrier request was
1205 * completed. */
1206 switch (mdev->write_ordering) {
1207 case WO_bio_barrier:
1208 case WO_none:
1209 if (rv == FE_RECYCLED)
1210 return TRUE;
1211 break;
1212
1213 case WO_bdev_flush:
1214 case WO_drain_io:
1215 D_ASSERT(rv == FE_STILL_LIVE);
1216 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1217 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1218 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1219 if (rv == FE_RECYCLED)
1220 return TRUE;
1221
1222 /* The asender will send all the ACKs and barrier ACKs out, since
1223 all EEs moved from the active_ee to the done_ee. We need to
1224 provide a new epoch object for the EEs that come in soon */
1225 break;
1226 }
1227
1228 /* receiver context, in the writeout path of the other node.
1229 * avoid potential distributed deadlock */
1230 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1231 if (!epoch) {
1232 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1233 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1234 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1235 if (issue_flush) {
1236 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1237 if (rv == FE_RECYCLED)
1238 return TRUE;
1239 }
1240
1241 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1242
1243 return TRUE;
1244 }
1245
1246 epoch->flags = 0;
1247 atomic_set(&epoch->epoch_size, 0);
1248 atomic_set(&epoch->active, 0);
1249
1250 spin_lock(&mdev->epoch_lock);
1251 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1252 list_add(&epoch->list, &mdev->current_epoch->list);
1253 mdev->current_epoch = epoch;
1254 mdev->epochs++;
1255 trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC);
1256 } else {
1257 /* The current_epoch got recycled while we allocated this one... */
1258 kfree(epoch);
1259 }
1260 spin_unlock(&mdev->epoch_lock);
1261
1262 return TRUE;
1263}
1264
1265/* used from receive_RSDataReply (recv_resync_read)
1266 * and from receive_Data */
1267static struct drbd_epoch_entry *
1268read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1269{
1270 struct drbd_epoch_entry *e;
1271 struct bio_vec *bvec;
1272 struct page *page;
1273 struct bio *bio;
1274 int dgs, ds, i, rr;
1275 void *dig_in = mdev->int_dig_in;
1276 void *dig_vv = mdev->int_dig_vv;
1277
1278 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1279 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1280
1281 if (dgs) {
1282 rr = drbd_recv(mdev, dig_in, dgs);
1283 if (rr != dgs) {
1284 dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1285 rr, dgs);
1286 return NULL;
1287 }
1288 }
1289
1290 data_size -= dgs;
1291
1292 ERR_IF(data_size & 0x1ff) return NULL;
1293 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1294
1295 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1296 * "criss-cross" setup, that might cause write-out on some other DRBD,
1297 * which in turn might block on the other node at this very place. */
1298 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1299 if (!e)
1300 return NULL;
1301 bio = e->private_bio;
1302 ds = data_size;
1303 bio_for_each_segment(bvec, bio, i) {
1304 page = bvec->bv_page;
1305 rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
1306 kunmap(page);
1307 if (rr != min_t(int, ds, PAGE_SIZE)) {
1308 drbd_free_ee(mdev, e);
1309 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1310 rr, min_t(int, ds, PAGE_SIZE));
1311 return NULL;
1312 }
1313 ds -= rr;
1314 }
1315
1316 if (dgs) {
1317 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1318 if (memcmp(dig_in, dig_vv, dgs)) {
1319 dev_err(DEV, "Digest integrity check FAILED.\n");
1320 drbd_bcast_ee(mdev, "digest failed",
1321 dgs, dig_in, dig_vv, e);
1322 drbd_free_ee(mdev, e);
1323 return NULL;
1324 }
1325 }
1326 mdev->recv_cnt += data_size>>9;
1327 return e;
1328}
1329
1330/* drbd_drain_block() just takes a data block
1331 * out of the socket input buffer, and discards it.
1332 */
1333static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1334{
1335 struct page *page;
1336 int rr, rv = 1;
1337 void *data;
1338
1339 page = drbd_pp_alloc(mdev, 1);
1340
1341 data = kmap(page);
1342 while (data_size) {
1343 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1344 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1345 rv = 0;
1346 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1347 rr, min_t(int, data_size, PAGE_SIZE));
1348 break;
1349 }
1350 data_size -= rr;
1351 }
1352 kunmap(page);
1353 drbd_pp_free(mdev, page);
1354 return rv;
1355}
1356
1357static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1358 sector_t sector, int data_size)
1359{
1360 struct bio_vec *bvec;
1361 struct bio *bio;
1362 int dgs, rr, i, expect;
1363 void *dig_in = mdev->int_dig_in;
1364 void *dig_vv = mdev->int_dig_vv;
1365
1366 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1367 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1368
1369 if (dgs) {
1370 rr = drbd_recv(mdev, dig_in, dgs);
1371 if (rr != dgs) {
1372 dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1373 rr, dgs);
1374 return 0;
1375 }
1376 }
1377
1378 data_size -= dgs;
1379
1380 /* optimistically update recv_cnt. if receiving fails below,
1381 * we disconnect anyways, and counters will be reset. */
1382 mdev->recv_cnt += data_size>>9;
1383
1384 bio = req->master_bio;
1385 D_ASSERT(sector == bio->bi_sector);
1386
1387 bio_for_each_segment(bvec, bio, i) {
1388 expect = min_t(int, data_size, bvec->bv_len);
1389 rr = drbd_recv(mdev,
1390 kmap(bvec->bv_page)+bvec->bv_offset,
1391 expect);
1392 kunmap(bvec->bv_page);
1393 if (rr != expect) {
1394 dev_warn(DEV, "short read receiving data reply: "
1395 "read %d expected %d\n",
1396 rr, expect);
1397 return 0;
1398 }
1399 data_size -= rr;
1400 }
1401
1402 if (dgs) {
1403 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1404 if (memcmp(dig_in, dig_vv, dgs)) {
1405 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1406 return 0;
1407 }
1408 }
1409
1410 D_ASSERT(data_size == 0);
1411 return 1;
1412}
1413
1414/* e_end_resync_block() is called via
1415 * drbd_process_done_ee() by asender only */
1416static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1417{
1418 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1419 sector_t sector = e->sector;
1420 int ok;
1421
1422 D_ASSERT(hlist_unhashed(&e->colision));
1423
1424 if (likely(drbd_bio_uptodate(e->private_bio))) {
1425 drbd_set_in_sync(mdev, sector, e->size);
1426 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1427 } else {
1428 /* Record failure to sync */
1429 drbd_rs_failed_io(mdev, sector, e->size);
1430
1431 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1432 }
1433 dec_unacked(mdev);
1434
1435 return ok;
1436}
1437
1438static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1439{
1440 struct drbd_epoch_entry *e;
1441
1442 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1443 if (!e) {
1444 put_ldev(mdev);
1445 return FALSE;
1446 }
1447
1448 dec_rs_pending(mdev);
1449
1450 e->private_bio->bi_end_io = drbd_endio_write_sec;
1451 e->private_bio->bi_rw = WRITE;
1452 e->w.cb = e_end_resync_block;
1453
1454 inc_unacked(mdev);
1455 /* corresponding dec_unacked() in e_end_resync_block()
1456 * respective _drbd_clear_done_ee */
1457
1458 spin_lock_irq(&mdev->req_lock);
1459 list_add(&e->w.list, &mdev->sync_ee);
1460 spin_unlock_irq(&mdev->req_lock);
1461
1462 trace_drbd_ee(mdev, e, "submitting for (rs)write");
1463 trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL);
1464 drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
1465 /* accounting done in endio */
1466
1467 maybe_kick_lo(mdev);
1468 return TRUE;
1469}
1470
1471static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1472{
1473 struct drbd_request *req;
1474 sector_t sector;
1475 unsigned int header_size, data_size;
1476 int ok;
1477 struct p_data *p = (struct p_data *)h;
1478
1479 header_size = sizeof(*p) - sizeof(*h);
1480 data_size = h->length - header_size;
1481
1482 ERR_IF(data_size == 0) return FALSE;
1483
1484 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1485 return FALSE;
1486
1487 sector = be64_to_cpu(p->sector);
1488
1489 spin_lock_irq(&mdev->req_lock);
1490 req = _ar_id_to_req(mdev, p->block_id, sector);
1491 spin_unlock_irq(&mdev->req_lock);
1492 if (unlikely(!req)) {
1493 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1494 return FALSE;
1495 }
1496
1497 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1498 * special casing it there for the various failure cases.
1499 * still no race with drbd_fail_pending_reads */
1500 ok = recv_dless_read(mdev, req, sector, data_size);
1501
1502 if (ok)
1503 req_mod(req, data_received);
1504 /* else: nothing. handled from drbd_disconnect...
1505 * I don't think we may complete this just yet
1506 * in case we are "on-disconnect: freeze" */
1507
1508 return ok;
1509}
1510
1511static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1512{
1513 sector_t sector;
1514 unsigned int header_size, data_size;
1515 int ok;
1516 struct p_data *p = (struct p_data *)h;
1517
1518 header_size = sizeof(*p) - sizeof(*h);
1519 data_size = h->length - header_size;
1520
1521 ERR_IF(data_size == 0) return FALSE;
1522
1523 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1524 return FALSE;
1525
1526 sector = be64_to_cpu(p->sector);
1527 D_ASSERT(p->block_id == ID_SYNCER);
1528
1529 if (get_ldev(mdev)) {
1530 /* data is submitted to disk within recv_resync_read.
1531 * corresponding put_ldev done below on error,
1532 * or in drbd_endio_write_sec. */
1533 ok = recv_resync_read(mdev, sector, data_size);
1534 } else {
1535 if (__ratelimit(&drbd_ratelimit_state))
1536 dev_err(DEV, "Can not write resync data to local disk.\n");
1537
1538 ok = drbd_drain_block(mdev, data_size);
1539
1540 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1541 }
1542
1543 return ok;
1544}
1545
1546/* e_end_block() is called via drbd_process_done_ee().
1547 * this means this function only runs in the asender thread
1548 */
1549static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1550{
1551 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1552 sector_t sector = e->sector;
1553 struct drbd_epoch *epoch;
1554 int ok = 1, pcmd;
1555
1556 if (e->flags & EE_IS_BARRIER) {
1557 epoch = previous_epoch(mdev, e->epoch);
1558 if (epoch)
1559 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1560 }
1561
1562 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1563 if (likely(drbd_bio_uptodate(e->private_bio))) {
1564 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1565 mdev->state.conn <= C_PAUSED_SYNC_T &&
1566 e->flags & EE_MAY_SET_IN_SYNC) ?
1567 P_RS_WRITE_ACK : P_WRITE_ACK;
1568 ok &= drbd_send_ack(mdev, pcmd, e);
1569 if (pcmd == P_RS_WRITE_ACK)
1570 drbd_set_in_sync(mdev, sector, e->size);
1571 } else {
1572 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1573 /* we expect it to be marked out of sync anyways...
1574 * maybe assert this? */
1575 }
1576 dec_unacked(mdev);
1577 }
1578 /* we delete from the conflict detection hash _after_ we sent out the
1579 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1580 if (mdev->net_conf->two_primaries) {
1581 spin_lock_irq(&mdev->req_lock);
1582 D_ASSERT(!hlist_unhashed(&e->colision));
1583 hlist_del_init(&e->colision);
1584 spin_unlock_irq(&mdev->req_lock);
1585 } else {
1586 D_ASSERT(hlist_unhashed(&e->colision));
1587 }
1588
1589 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1590
1591 return ok;
1592}
1593
1594static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1595{
1596 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1597 int ok = 1;
1598
1599 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1600 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1601
1602 spin_lock_irq(&mdev->req_lock);
1603 D_ASSERT(!hlist_unhashed(&e->colision));
1604 hlist_del_init(&e->colision);
1605 spin_unlock_irq(&mdev->req_lock);
1606
1607 dec_unacked(mdev);
1608
1609 return ok;
1610}
1611
1612/* Called from receive_Data.
1613 * Synchronize packets on sock with packets on msock.
1614 *
1615 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1616 * packet traveling on msock, they are still processed in the order they have
1617 * been sent.
1618 *
1619 * Note: we don't care for Ack packets overtaking P_DATA packets.
1620 *
1621 * In case packet_seq is larger than mdev->peer_seq number, there are
1622 * outstanding packets on the msock. We wait for them to arrive.
1623 * In case we are the logically next packet, we update mdev->peer_seq
1624 * ourselves. Correctly handles 32bit wrap around.
1625 *
1626 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1627 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1628 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1629 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1630 *
1631 * returns 0 if we may process the packet,
1632 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1633static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1634{
1635 DEFINE_WAIT(wait);
1636 unsigned int p_seq;
1637 long timeout;
1638 int ret = 0;
1639 spin_lock(&mdev->peer_seq_lock);
1640 for (;;) {
1641 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1642 if (seq_le(packet_seq, mdev->peer_seq+1))
1643 break;
1644 if (signal_pending(current)) {
1645 ret = -ERESTARTSYS;
1646 break;
1647 }
1648 p_seq = mdev->peer_seq;
1649 spin_unlock(&mdev->peer_seq_lock);
1650 timeout = schedule_timeout(30*HZ);
1651 spin_lock(&mdev->peer_seq_lock);
1652 if (timeout == 0 && p_seq == mdev->peer_seq) {
1653 ret = -ETIMEDOUT;
1654 dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1655 break;
1656 }
1657 }
1658 finish_wait(&mdev->seq_wait, &wait);
1659 if (mdev->peer_seq+1 == packet_seq)
1660 mdev->peer_seq++;
1661 spin_unlock(&mdev->peer_seq_lock);
1662 return ret;
1663}
1664
1665/* mirrored write */
1666static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1667{
1668 sector_t sector;
1669 struct drbd_epoch_entry *e;
1670 struct p_data *p = (struct p_data *)h;
1671 int header_size, data_size;
1672 int rw = WRITE;
1673 u32 dp_flags;
1674
1675 header_size = sizeof(*p) - sizeof(*h);
1676 data_size = h->length - header_size;
1677
1678 ERR_IF(data_size == 0) return FALSE;
1679
1680 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1681 return FALSE;
1682
1683 if (!get_ldev(mdev)) {
1684 if (__ratelimit(&drbd_ratelimit_state))
1685 dev_err(DEV, "Can not write mirrored data block "
1686 "to local disk.\n");
1687 spin_lock(&mdev->peer_seq_lock);
1688 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1689 mdev->peer_seq++;
1690 spin_unlock(&mdev->peer_seq_lock);
1691
1692 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1693 atomic_inc(&mdev->current_epoch->epoch_size);
1694 return drbd_drain_block(mdev, data_size);
1695 }
1696
1697 /* get_ldev(mdev) successful.
1698 * Corresponding put_ldev done either below (on various errors),
1699 * or in drbd_endio_write_sec, if we successfully submit the data at
1700 * the end of this function. */
1701
1702 sector = be64_to_cpu(p->sector);
1703 e = read_in_block(mdev, p->block_id, sector, data_size);
1704 if (!e) {
1705 put_ldev(mdev);
1706 return FALSE;
1707 }
1708
1709 e->private_bio->bi_end_io = drbd_endio_write_sec;
1710 e->w.cb = e_end_block;
1711
1712 spin_lock(&mdev->epoch_lock);
1713 e->epoch = mdev->current_epoch;
1714 atomic_inc(&e->epoch->epoch_size);
1715 atomic_inc(&e->epoch->active);
1716
1717 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1718 struct drbd_epoch *epoch;
1719 /* Issue a barrier if we start a new epoch, and the previous epoch
1720 was not a epoch containing a single request which already was
1721 a Barrier. */
1722 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1723 if (epoch == e->epoch) {
1724 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1725 trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER);
1726 rw |= (1<<BIO_RW_BARRIER);
1727 e->flags |= EE_IS_BARRIER;
1728 } else {
1729 if (atomic_read(&epoch->epoch_size) > 1 ||
1730 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1731 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1732 trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI);
1733 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1734 trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER);
1735 rw |= (1<<BIO_RW_BARRIER);
1736 e->flags |= EE_IS_BARRIER;
1737 }
1738 }
1739 }
1740 spin_unlock(&mdev->epoch_lock);
1741
1742 dp_flags = be32_to_cpu(p->dp_flags);
1743 if (dp_flags & DP_HARDBARRIER) {
1744 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1745 /* rw |= (1<<BIO_RW_BARRIER); */
1746 }
1747 if (dp_flags & DP_RW_SYNC)
1748 rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
1749 if (dp_flags & DP_MAY_SET_IN_SYNC)
1750 e->flags |= EE_MAY_SET_IN_SYNC;
1751
1752 /* I'm the receiver, I do hold a net_cnt reference. */
1753 if (!mdev->net_conf->two_primaries) {
1754 spin_lock_irq(&mdev->req_lock);
1755 } else {
1756 /* don't get the req_lock yet,
1757 * we may sleep in drbd_wait_peer_seq */
1758 const int size = e->size;
1759 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1760 DEFINE_WAIT(wait);
1761 struct drbd_request *i;
1762 struct hlist_node *n;
1763 struct hlist_head *slot;
1764 int first;
1765
1766 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1767 BUG_ON(mdev->ee_hash == NULL);
1768 BUG_ON(mdev->tl_hash == NULL);
1769
1770 /* conflict detection and handling:
1771 * 1. wait on the sequence number,
1772 * in case this data packet overtook ACK packets.
1773 * 2. check our hash tables for conflicting requests.
1774 * we only need to walk the tl_hash, since an ee can not
1775 * have a conflict with an other ee: on the submitting
1776 * node, the corresponding req had already been conflicting,
1777 * and a conflicting req is never sent.
1778 *
1779 * Note: for two_primaries, we are protocol C,
1780 * so there cannot be any request that is DONE
1781 * but still on the transfer log.
1782 *
1783 * unconditionally add to the ee_hash.
1784 *
1785 * if no conflicting request is found:
1786 * submit.
1787 *
1788 * if any conflicting request is found
1789 * that has not yet been acked,
1790 * AND I have the "discard concurrent writes" flag:
1791 * queue (via done_ee) the P_DISCARD_ACK; OUT.
1792 *
1793 * if any conflicting request is found:
1794 * block the receiver, waiting on misc_wait
1795 * until no more conflicting requests are there,
1796 * or we get interrupted (disconnect).
1797 *
1798 * we do not just write after local io completion of those
1799 * requests, but only after req is done completely, i.e.
1800 * we wait for the P_DISCARD_ACK to arrive!
1801 *
1802 * then proceed normally, i.e. submit.
1803 */
1804 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1805 goto out_interrupted;
1806
1807 spin_lock_irq(&mdev->req_lock);
1808
1809 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1810
1811#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1812 slot = tl_hash_slot(mdev, sector);
1813 first = 1;
1814 for (;;) {
1815 int have_unacked = 0;
1816 int have_conflict = 0;
1817 prepare_to_wait(&mdev->misc_wait, &wait,
1818 TASK_INTERRUPTIBLE);
1819 hlist_for_each_entry(i, n, slot, colision) {
1820 if (OVERLAPS) {
1821 /* only ALERT on first iteration,
1822 * we may be woken up early... */
1823 if (first)
1824 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1825 " new: %llus +%u; pending: %llus +%u\n",
1826 current->comm, current->pid,
1827 (unsigned long long)sector, size,
1828 (unsigned long long)i->sector, i->size);
1829 if (i->rq_state & RQ_NET_PENDING)
1830 ++have_unacked;
1831 ++have_conflict;
1832 }
1833 }
1834#undef OVERLAPS
1835 if (!have_conflict)
1836 break;
1837
1838 /* Discard Ack only for the _first_ iteration */
1839 if (first && discard && have_unacked) {
1840 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1841 (unsigned long long)sector);
1842 inc_unacked(mdev);
1843 e->w.cb = e_send_discard_ack;
1844 list_add_tail(&e->w.list, &mdev->done_ee);
1845
1846 spin_unlock_irq(&mdev->req_lock);
1847
1848 /* we could probably send that P_DISCARD_ACK ourselves,
1849 * but I don't like the receiver using the msock */
1850
1851 put_ldev(mdev);
1852 wake_asender(mdev);
1853 finish_wait(&mdev->misc_wait, &wait);
1854 return TRUE;
1855 }
1856
1857 if (signal_pending(current)) {
1858 hlist_del_init(&e->colision);
1859
1860 spin_unlock_irq(&mdev->req_lock);
1861
1862 finish_wait(&mdev->misc_wait, &wait);
1863 goto out_interrupted;
1864 }
1865
1866 spin_unlock_irq(&mdev->req_lock);
1867 if (first) {
1868 first = 0;
1869 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1870 "sec=%llus\n", (unsigned long long)sector);
1871 } else if (discard) {
1872 /* we had none on the first iteration.
1873 * there must be none now. */
1874 D_ASSERT(have_unacked == 0);
1875 }
1876 schedule();
1877 spin_lock_irq(&mdev->req_lock);
1878 }
1879 finish_wait(&mdev->misc_wait, &wait);
1880 }
1881
1882 list_add(&e->w.list, &mdev->active_ee);
1883 spin_unlock_irq(&mdev->req_lock);
1884
1885 switch (mdev->net_conf->wire_protocol) {
1886 case DRBD_PROT_C:
1887 inc_unacked(mdev);
1888 /* corresponding dec_unacked() in e_end_block()
1889 * respective _drbd_clear_done_ee */
1890 break;
1891 case DRBD_PROT_B:
1892 /* I really don't like it that the receiver thread
1893 * sends on the msock, but anyways */
1894 drbd_send_ack(mdev, P_RECV_ACK, e);
1895 break;
1896 case DRBD_PROT_A:
1897 /* nothing to do */
1898 break;
1899 }
1900
1901 if (mdev->state.pdsk == D_DISKLESS) {
1902 /* In case we have the only disk of the cluster, */
1903 drbd_set_out_of_sync(mdev, e->sector, e->size);
1904 e->flags |= EE_CALL_AL_COMPLETE_IO;
1905 drbd_al_begin_io(mdev, e->sector);
1906 }
1907
1908 e->private_bio->bi_rw = rw;
1909 trace_drbd_ee(mdev, e, "submitting for (data)write");
1910 trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL);
1911 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
1912 /* accounting done in endio */
1913
1914 maybe_kick_lo(mdev);
1915 return TRUE;
1916
1917out_interrupted:
1918 /* yes, the epoch_size now is imbalanced.
1919 * but we drop the connection anyways, so we don't have a chance to
1920 * receive a barrier... atomic_inc(&mdev->epoch_size); */
1921 put_ldev(mdev);
1922 drbd_free_ee(mdev, e);
1923 return FALSE;
1924}
1925
1926static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1927{
1928 sector_t sector;
1929 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1930 struct drbd_epoch_entry *e;
1931 struct digest_info *di = NULL;
1932 int size, digest_size;
1933 unsigned int fault_type;
1934 struct p_block_req *p =
1935 (struct p_block_req *)h;
1936 const int brps = sizeof(*p)-sizeof(*h);
1937
1938 if (drbd_recv(mdev, h->payload, brps) != brps)
1939 return FALSE;
1940
1941 sector = be64_to_cpu(p->sector);
1942 size = be32_to_cpu(p->blksize);
1943
1944 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1945 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1946 (unsigned long long)sector, size);
1947 return FALSE;
1948 }
1949 if (sector + (size>>9) > capacity) {
1950 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1951 (unsigned long long)sector, size);
1952 return FALSE;
1953 }
1954
1955 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
1956 if (__ratelimit(&drbd_ratelimit_state))
1957 dev_err(DEV, "Can not satisfy peer's read request, "
1958 "no local data.\n");
1959 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
1960 P_NEG_RS_DREPLY , p);
1961 return TRUE;
1962 }
1963
1964 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1965 * "criss-cross" setup, that might cause write-out on some other DRBD,
1966 * which in turn might block on the other node at this very place. */
1967 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1968 if (!e) {
1969 put_ldev(mdev);
1970 return FALSE;
1971 }
1972
1973 e->private_bio->bi_rw = READ;
1974 e->private_bio->bi_end_io = drbd_endio_read_sec;
1975
1976 switch (h->command) {
1977 case P_DATA_REQUEST:
1978 e->w.cb = w_e_end_data_req;
1979 fault_type = DRBD_FAULT_DT_RD;
1980 break;
1981 case P_RS_DATA_REQUEST:
1982 e->w.cb = w_e_end_rsdata_req;
1983 fault_type = DRBD_FAULT_RS_RD;
1984 /* Eventually this should become asynchronously. Currently it
1985 * blocks the whole receiver just to delay the reading of a
1986 * resync data block.
1987 * the drbd_work_queue mechanism is made for this...
1988 */
1989 if (!drbd_rs_begin_io(mdev, sector)) {
1990 /* we have been interrupted,
1991 * probably connection lost! */
1992 D_ASSERT(signal_pending(current));
1993 goto out_free_e;
1994 }
1995 break;
1996
1997 case P_OV_REPLY:
1998 case P_CSUM_RS_REQUEST:
1999 fault_type = DRBD_FAULT_RS_RD;
2000 digest_size = h->length - brps ;
2001 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2002 if (!di)
2003 goto out_free_e;
2004
2005 di->digest_size = digest_size;
2006 di->digest = (((char *)di)+sizeof(struct digest_info));
2007
2008 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2009 goto out_free_e;
2010
2011 e->block_id = (u64)(unsigned long)di;
2012 if (h->command == P_CSUM_RS_REQUEST) {
2013 D_ASSERT(mdev->agreed_pro_version >= 89);
2014 e->w.cb = w_e_end_csum_rs_req;
2015 } else if (h->command == P_OV_REPLY) {
2016 e->w.cb = w_e_end_ov_reply;
2017 dec_rs_pending(mdev);
2018 break;
2019 }
2020
2021 if (!drbd_rs_begin_io(mdev, sector)) {
2022 /* we have been interrupted, probably connection lost! */
2023 D_ASSERT(signal_pending(current));
2024 goto out_free_e;
2025 }
2026 break;
2027
2028 case P_OV_REQUEST:
2029 if (mdev->state.conn >= C_CONNECTED &&
2030 mdev->state.conn != C_VERIFY_T)
2031 dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2032 drbd_conn_str(mdev->state.conn));
2033 if (mdev->ov_start_sector == ~(sector_t)0 &&
2034 mdev->agreed_pro_version >= 90) {
2035 mdev->ov_start_sector = sector;
2036 mdev->ov_position = sector;
2037 mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2038 dev_info(DEV, "Online Verify start sector: %llu\n",
2039 (unsigned long long)sector);
2040 }
2041 e->w.cb = w_e_end_ov_req;
2042 fault_type = DRBD_FAULT_RS_RD;
2043 /* Eventually this should become asynchronous. Currently it
2044 * blocks the whole receiver just to delay the reading of a
2045 * resync data block.
2046 * the drbd_work_queue mechanism is made for this...
2047 */
2048 if (!drbd_rs_begin_io(mdev, sector)) {
2049 /* we have been interrupted,
2050 * probably connection lost! */
2051 D_ASSERT(signal_pending(current));
2052 goto out_free_e;
2053 }
2054 break;
2055
2056
2057 default:
2058 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2059 cmdname(h->command));
2060 fault_type = DRBD_FAULT_MAX;
2061 }
2062
2063 spin_lock_irq(&mdev->req_lock);
2064 list_add(&e->w.list, &mdev->read_ee);
2065 spin_unlock_irq(&mdev->req_lock);
2066
2067 inc_unacked(mdev);
2068
2069 trace_drbd_ee(mdev, e, "submitting for read");
2070 trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL);
2071 drbd_generic_make_request(mdev, fault_type, e->private_bio);
2072 maybe_kick_lo(mdev);
2073
2074 return TRUE;
2075
2076out_free_e:
2077 kfree(di);
2078 put_ldev(mdev);
2079 drbd_free_ee(mdev, e);
2080 return FALSE;
2081}
2082
2083static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2084{
2085 int self, peer, rv = -100;
2086 unsigned long ch_self, ch_peer;
2087
2088 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2089 peer = mdev->p_uuid[UI_BITMAP] & 1;
2090
2091 ch_peer = mdev->p_uuid[UI_SIZE];
2092 ch_self = mdev->comm_bm_set;
2093
2094 switch (mdev->net_conf->after_sb_0p) {
2095 case ASB_CONSENSUS:
2096 case ASB_DISCARD_SECONDARY:
2097 case ASB_CALL_HELPER:
2098 dev_err(DEV, "Configuration error.\n");
2099 break;
2100 case ASB_DISCONNECT:
2101 break;
2102 case ASB_DISCARD_YOUNGER_PRI:
2103 if (self == 0 && peer == 1) {
2104 rv = -1;
2105 break;
2106 }
2107 if (self == 1 && peer == 0) {
2108 rv = 1;
2109 break;
2110 }
2111 /* Else fall through to one of the other strategies... */
2112 case ASB_DISCARD_OLDER_PRI:
2113 if (self == 0 && peer == 1) {
2114 rv = 1;
2115 break;
2116 }
2117 if (self == 1 && peer == 0) {
2118 rv = -1;
2119 break;
2120 }
2121 /* Else fall through to one of the other strategies... */
2122 dev_warn(DEV, "Discard younger/older primary did not found a decision\n"
2123 "Using discard-least-changes instead\n");
2124 case ASB_DISCARD_ZERO_CHG:
2125 if (ch_peer == 0 && ch_self == 0) {
2126 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2127 ? -1 : 1;
2128 break;
2129 } else {
2130 if (ch_peer == 0) { rv = 1; break; }
2131 if (ch_self == 0) { rv = -1; break; }
2132 }
2133 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2134 break;
2135 case ASB_DISCARD_LEAST_CHG:
2136 if (ch_self < ch_peer)
2137 rv = -1;
2138 else if (ch_self > ch_peer)
2139 rv = 1;
2140 else /* ( ch_self == ch_peer ) */
2141 /* Well, then use something else. */
2142 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2143 ? -1 : 1;
2144 break;
2145 case ASB_DISCARD_LOCAL:
2146 rv = -1;
2147 break;
2148 case ASB_DISCARD_REMOTE:
2149 rv = 1;
2150 }
2151
2152 return rv;
2153}
2154
2155static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2156{
2157 int self, peer, hg, rv = -100;
2158
2159 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2160 peer = mdev->p_uuid[UI_BITMAP] & 1;
2161
2162 switch (mdev->net_conf->after_sb_1p) {
2163 case ASB_DISCARD_YOUNGER_PRI:
2164 case ASB_DISCARD_OLDER_PRI:
2165 case ASB_DISCARD_LEAST_CHG:
2166 case ASB_DISCARD_LOCAL:
2167 case ASB_DISCARD_REMOTE:
2168 dev_err(DEV, "Configuration error.\n");
2169 break;
2170 case ASB_DISCONNECT:
2171 break;
2172 case ASB_CONSENSUS:
2173 hg = drbd_asb_recover_0p(mdev);
2174 if (hg == -1 && mdev->state.role == R_SECONDARY)
2175 rv = hg;
2176 if (hg == 1 && mdev->state.role == R_PRIMARY)
2177 rv = hg;
2178 break;
2179 case ASB_VIOLENTLY:
2180 rv = drbd_asb_recover_0p(mdev);
2181 break;
2182 case ASB_DISCARD_SECONDARY:
2183 return mdev->state.role == R_PRIMARY ? 1 : -1;
2184 case ASB_CALL_HELPER:
2185 hg = drbd_asb_recover_0p(mdev);
2186 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2187 self = drbd_set_role(mdev, R_SECONDARY, 0);
2188 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2189 * we might be here in C_WF_REPORT_PARAMS which is transient.
2190 * we do not need to wait for the after state change work either. */
2191 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2192 if (self != SS_SUCCESS) {
2193 drbd_khelper(mdev, "pri-lost-after-sb");
2194 } else {
2195 dev_warn(DEV, "Successfully gave up primary role.\n");
2196 rv = hg;
2197 }
2198 } else
2199 rv = hg;
2200 }
2201
2202 return rv;
2203}
2204
2205static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2206{
2207 int self, peer, hg, rv = -100;
2208
2209 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2210 peer = mdev->p_uuid[UI_BITMAP] & 1;
2211
2212 switch (mdev->net_conf->after_sb_2p) {
2213 case ASB_DISCARD_YOUNGER_PRI:
2214 case ASB_DISCARD_OLDER_PRI:
2215 case ASB_DISCARD_LEAST_CHG:
2216 case ASB_DISCARD_LOCAL:
2217 case ASB_DISCARD_REMOTE:
2218 case ASB_CONSENSUS:
2219 case ASB_DISCARD_SECONDARY:
2220 dev_err(DEV, "Configuration error.\n");
2221 break;
2222 case ASB_VIOLENTLY:
2223 rv = drbd_asb_recover_0p(mdev);
2224 break;
2225 case ASB_DISCONNECT:
2226 break;
2227 case ASB_CALL_HELPER:
2228 hg = drbd_asb_recover_0p(mdev);
2229 if (hg == -1) {
2230 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2231 * we might be here in C_WF_REPORT_PARAMS which is transient.
2232 * we do not need to wait for the after state change work either. */
2233 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2234 if (self != SS_SUCCESS) {
2235 drbd_khelper(mdev, "pri-lost-after-sb");
2236 } else {
2237 dev_warn(DEV, "Successfully gave up primary role.\n");
2238 rv = hg;
2239 }
2240 } else
2241 rv = hg;
2242 }
2243
2244 return rv;
2245}
2246
2247static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2248 u64 bits, u64 flags)
2249{
2250 if (!uuid) {
2251 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2252 return;
2253 }
2254 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2255 text,
2256 (unsigned long long)uuid[UI_CURRENT],
2257 (unsigned long long)uuid[UI_BITMAP],
2258 (unsigned long long)uuid[UI_HISTORY_START],
2259 (unsigned long long)uuid[UI_HISTORY_END],
2260 (unsigned long long)bits,
2261 (unsigned long long)flags);
2262}
2263
2264/*
2265 100 after split brain try auto recover
2266 2 C_SYNC_SOURCE set BitMap
2267 1 C_SYNC_SOURCE use BitMap
2268 0 no Sync
2269 -1 C_SYNC_TARGET use BitMap
2270 -2 C_SYNC_TARGET set BitMap
2271 -100 after split brain, disconnect
2272-1000 unrelated data
2273 */
2274static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2275{
2276 u64 self, peer;
2277 int i, j;
2278
2279 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2280 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2281
2282 *rule_nr = 10;
2283 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2284 return 0;
2285
2286 *rule_nr = 20;
2287 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2288 peer != UUID_JUST_CREATED)
2289 return -2;
2290
2291 *rule_nr = 30;
2292 if (self != UUID_JUST_CREATED &&
2293 (peer == UUID_JUST_CREATED || peer == (u64)0))
2294 return 2;
2295
2296 if (self == peer) {
2297 int rct, dc; /* roles at crash time */
2298
2299 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2300
2301 if (mdev->agreed_pro_version < 91)
2302 return -1001;
2303
2304 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2305 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2306 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2307 drbd_uuid_set_bm(mdev, 0UL);
2308
2309 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2310 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2311 *rule_nr = 34;
2312 } else {
2313 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2314 *rule_nr = 36;
2315 }
2316
2317 return 1;
2318 }
2319
2320 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2321
2322 if (mdev->agreed_pro_version < 91)
2323 return -1001;
2324
2325 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2326 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2327 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2328
2329 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2330 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2331 mdev->p_uuid[UI_BITMAP] = 0UL;
2332
2333 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2334 *rule_nr = 35;
2335 } else {
2336 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2337 *rule_nr = 37;
2338 }
2339
2340 return -1;
2341 }
2342
2343 /* Common power [off|failure] */
2344 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2345 (mdev->p_uuid[UI_FLAGS] & 2);
2346 /* lowest bit is set when we were primary,
2347 * next bit (weight 2) is set when peer was primary */
2348 *rule_nr = 40;
2349
2350 switch (rct) {
2351 case 0: /* !self_pri && !peer_pri */ return 0;
2352 case 1: /* self_pri && !peer_pri */ return 1;
2353 case 2: /* !self_pri && peer_pri */ return -1;
2354 case 3: /* self_pri && peer_pri */
2355 dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2356 return dc ? -1 : 1;
2357 }
2358 }
2359
2360 *rule_nr = 50;
2361 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2362 if (self == peer)
2363 return -1;
2364
2365 *rule_nr = 51;
2366 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2367 if (self == peer) {
2368 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2369 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2370 if (self == peer) {
2371 /* The last P_SYNC_UUID did not get though. Undo the last start of
2372 resync as sync source modifications of the peer's UUIDs. */
2373
2374 if (mdev->agreed_pro_version < 91)
2375 return -1001;
2376
2377 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2378 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2379 return -1;
2380 }
2381 }
2382
2383 *rule_nr = 60;
2384 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2385 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2386 peer = mdev->p_uuid[i] & ~((u64)1);
2387 if (self == peer)
2388 return -2;
2389 }
2390
2391 *rule_nr = 70;
2392 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2393 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2394 if (self == peer)
2395 return 1;
2396
2397 *rule_nr = 71;
2398 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2399 if (self == peer) {
2400 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2401 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2402 if (self == peer) {
2403 /* The last P_SYNC_UUID did not get though. Undo the last start of
2404 resync as sync source modifications of our UUIDs. */
2405
2406 if (mdev->agreed_pro_version < 91)
2407 return -1001;
2408
2409 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2410 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2411
2412 dev_info(DEV, "Undid last start of resync:\n");
2413
2414 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2415 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2416
2417 return 1;
2418 }
2419 }
2420
2421
2422 *rule_nr = 80;
2423 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2424 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2425 if (self == peer)
2426 return 2;
2427 }
2428
2429 *rule_nr = 90;
2430 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2431 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2432 if (self == peer && self != ((u64)0))
2433 return 100;
2434
2435 *rule_nr = 100;
2436 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2437 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2438 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2439 peer = mdev->p_uuid[j] & ~((u64)1);
2440 if (self == peer)
2441 return -100;
2442 }
2443 }
2444
2445 return -1000;
2446}
2447
2448/* drbd_sync_handshake() returns the new conn state on success, or
2449 CONN_MASK (-1) on failure.
2450 */
2451static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2452 enum drbd_disk_state peer_disk) __must_hold(local)
2453{
2454 int hg, rule_nr;
2455 enum drbd_conns rv = C_MASK;
2456 enum drbd_disk_state mydisk;
2457
2458 mydisk = mdev->state.disk;
2459 if (mydisk == D_NEGOTIATING)
2460 mydisk = mdev->new_state_tmp.disk;
2461
2462 dev_info(DEV, "drbd_sync_handshake:\n");
2463 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2464 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2465 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2466
2467 hg = drbd_uuid_compare(mdev, &rule_nr);
2468
2469 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2470
2471 if (hg == -1000) {
2472 dev_alert(DEV, "Unrelated data, aborting!\n");
2473 return C_MASK;
2474 }
2475 if (hg == -1001) {
2476 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
2477 return C_MASK;
2478 }
2479
2480 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2481 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2482 int f = (hg == -100) || abs(hg) == 2;
2483 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2484 if (f)
2485 hg = hg*2;
2486 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2487 hg > 0 ? "source" : "target");
2488 }
2489
2490 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2491 int pcount = (mdev->state.role == R_PRIMARY)
2492 + (peer_role == R_PRIMARY);
2493 int forced = (hg == -100);
2494
2495 switch (pcount) {
2496 case 0:
2497 hg = drbd_asb_recover_0p(mdev);
2498 break;
2499 case 1:
2500 hg = drbd_asb_recover_1p(mdev);
2501 break;
2502 case 2:
2503 hg = drbd_asb_recover_2p(mdev);
2504 break;
2505 }
2506 if (abs(hg) < 100) {
2507 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2508 "automatically solved. Sync from %s node\n",
2509 pcount, (hg < 0) ? "peer" : "this");
2510 if (forced) {
2511 dev_warn(DEV, "Doing a full sync, since"
2512 " UUIDs where ambiguous.\n");
2513 hg = hg*2;
2514 }
2515 }
2516 }
2517
2518 if (hg == -100) {
2519 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2520 hg = -1;
2521 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2522 hg = 1;
2523
2524 if (abs(hg) < 100)
2525 dev_warn(DEV, "Split-Brain detected, manually solved. "
2526 "Sync from %s node\n",
2527 (hg < 0) ? "peer" : "this");
2528 }
2529
2530 if (hg == -100) {
2531 dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
2532 drbd_khelper(mdev, "split-brain");
2533 return C_MASK;
2534 }
2535
2536 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2537 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2538 return C_MASK;
2539 }
2540
2541 if (hg < 0 && /* by intention we do not use mydisk here. */
2542 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2543 switch (mdev->net_conf->rr_conflict) {
2544 case ASB_CALL_HELPER:
2545 drbd_khelper(mdev, "pri-lost");
2546 /* fall through */
2547 case ASB_DISCONNECT:
2548 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2549 return C_MASK;
2550 case ASB_VIOLENTLY:
2551 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2552 "assumption\n");
2553 }
2554 }
2555
2556 if (abs(hg) >= 2) {
2557 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2558 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2559 return C_MASK;
2560 }
2561
2562 if (hg > 0) { /* become sync source. */
2563 rv = C_WF_BITMAP_S;
2564 } else if (hg < 0) { /* become sync target */
2565 rv = C_WF_BITMAP_T;
2566 } else {
2567 rv = C_CONNECTED;
2568 if (drbd_bm_total_weight(mdev)) {
2569 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2570 drbd_bm_total_weight(mdev));
2571 }
2572 }
2573
2574 return rv;
2575}
2576
2577/* returns 1 if invalid */
2578static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2579{
2580 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2581 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2582 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2583 return 0;
2584
2585 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2586 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2587 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2588 return 1;
2589
2590 /* everything else is valid if they are equal on both sides. */
2591 if (peer == self)
2592 return 0;
2593
2594 /* everything es is invalid. */
2595 return 1;
2596}
2597
2598static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2599{
2600 struct p_protocol *p = (struct p_protocol *)h;
2601 int header_size, data_size;
2602 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2603 int p_want_lose, p_two_primaries;
2604 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2605
2606 header_size = sizeof(*p) - sizeof(*h);
2607 data_size = h->length - header_size;
2608
2609 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2610 return FALSE;
2611
2612 p_proto = be32_to_cpu(p->protocol);
2613 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2614 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2615 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
2616 p_want_lose = be32_to_cpu(p->want_lose);
2617 p_two_primaries = be32_to_cpu(p->two_primaries);
2618
2619 if (p_proto != mdev->net_conf->wire_protocol) {
2620 dev_err(DEV, "incompatible communication protocols\n");
2621 goto disconnect;
2622 }
2623
2624 if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2625 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2626 goto disconnect;
2627 }
2628
2629 if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2630 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2631 goto disconnect;
2632 }
2633
2634 if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2635 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2636 goto disconnect;
2637 }
2638
2639 if (p_want_lose && mdev->net_conf->want_lose) {
2640 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2641 goto disconnect;
2642 }
2643
2644 if (p_two_primaries != mdev->net_conf->two_primaries) {
2645 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2646 goto disconnect;
2647 }
2648
2649 if (mdev->agreed_pro_version >= 87) {
2650 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2651
2652 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2653 return FALSE;
2654
2655 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2656 if (strcmp(p_integrity_alg, my_alg)) {
2657 dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2658 goto disconnect;
2659 }
2660 dev_info(DEV, "data-integrity-alg: %s\n",
2661 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2662 }
2663
2664 return TRUE;
2665
2666disconnect:
2667 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2668 return FALSE;
2669}
2670
2671/* helper function
2672 * input: alg name, feature name
2673 * return: NULL (alg name was "")
2674 * ERR_PTR(error) if something goes wrong
2675 * or the crypto hash ptr, if it worked out ok. */
2676struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2677 const char *alg, const char *name)
2678{
2679 struct crypto_hash *tfm;
2680
2681 if (!alg[0])
2682 return NULL;
2683
2684 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2685 if (IS_ERR(tfm)) {
2686 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2687 alg, name, PTR_ERR(tfm));
2688 return tfm;
2689 }
2690 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2691 crypto_free_hash(tfm);
2692 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2693 return ERR_PTR(-EINVAL);
2694 }
2695 return tfm;
2696}
2697
2698static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2699{
2700 int ok = TRUE;
2701 struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
2702 unsigned int header_size, data_size, exp_max_sz;
2703 struct crypto_hash *verify_tfm = NULL;
2704 struct crypto_hash *csums_tfm = NULL;
2705 const int apv = mdev->agreed_pro_version;
2706
2707 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2708 : apv == 88 ? sizeof(struct p_rs_param)
2709 + SHARED_SECRET_MAX
2710 : /* 89 */ sizeof(struct p_rs_param_89);
2711
2712 if (h->length > exp_max_sz) {
2713 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2714 h->length, exp_max_sz);
2715 return FALSE;
2716 }
2717
2718 if (apv <= 88) {
2719 header_size = sizeof(struct p_rs_param) - sizeof(*h);
2720 data_size = h->length - header_size;
2721 } else /* apv >= 89 */ {
2722 header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
2723 data_size = h->length - header_size;
2724 D_ASSERT(data_size == 0);
2725 }
2726
2727 /* initialize verify_alg and csums_alg */
2728 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2729
2730 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2731 return FALSE;
2732
2733 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2734
2735 if (apv >= 88) {
2736 if (apv == 88) {
2737 if (data_size > SHARED_SECRET_MAX) {
2738 dev_err(DEV, "verify-alg too long, "
2739 "peer wants %u, accepting only %u byte\n",
2740 data_size, SHARED_SECRET_MAX);
2741 return FALSE;
2742 }
2743
2744 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2745 return FALSE;
2746
2747 /* we expect NUL terminated string */
2748 /* but just in case someone tries to be evil */
2749 D_ASSERT(p->verify_alg[data_size-1] == 0);
2750 p->verify_alg[data_size-1] = 0;
2751
2752 } else /* apv >= 89 */ {
2753 /* we still expect NUL terminated strings */
2754 /* but just in case someone tries to be evil */
2755 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2756 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2757 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2758 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2759 }
2760
2761 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2762 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2763 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2764 mdev->sync_conf.verify_alg, p->verify_alg);
2765 goto disconnect;
2766 }
2767 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2768 p->verify_alg, "verify-alg");
2769 if (IS_ERR(verify_tfm)) {
2770 verify_tfm = NULL;
2771 goto disconnect;
2772 }
2773 }
2774
2775 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2776 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2777 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2778 mdev->sync_conf.csums_alg, p->csums_alg);
2779 goto disconnect;
2780 }
2781 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2782 p->csums_alg, "csums-alg");
2783 if (IS_ERR(csums_tfm)) {
2784 csums_tfm = NULL;
2785 goto disconnect;
2786 }
2787 }
2788
2789
2790 spin_lock(&mdev->peer_seq_lock);
2791 /* lock against drbd_nl_syncer_conf() */
2792 if (verify_tfm) {
2793 strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2794 mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2795 crypto_free_hash(mdev->verify_tfm);
2796 mdev->verify_tfm = verify_tfm;
2797 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2798 }
2799 if (csums_tfm) {
2800 strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2801 mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2802 crypto_free_hash(mdev->csums_tfm);
2803 mdev->csums_tfm = csums_tfm;
2804 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2805 }
2806 spin_unlock(&mdev->peer_seq_lock);
2807 }
2808
2809 return ok;
2810disconnect:
2811 /* just for completeness: actually not needed,
2812 * as this is not reached if csums_tfm was ok. */
2813 crypto_free_hash(csums_tfm);
2814 /* but free the verify_tfm again, if csums_tfm did not work out */
2815 crypto_free_hash(verify_tfm);
2816 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2817 return FALSE;
2818}
2819
/*
 * drbd_setup_order_type() - placeholder, intentionally does nothing
 * @mdev:	DRBD device (unused)
 * @peer:	queue order type announced by the peer (unused)
 *
 * There is currently no working implementation of distributed TCQ,
 * so the peer's value is ignored.
 */
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
}
2825
2826/* warn if the arguments differ by more than 12.5% */
2827static void warn_if_differ_considerably(struct drbd_conf *mdev,
2828 const char *s, sector_t a, sector_t b)
2829{
2830 sector_t d;
2831 if (a == 0 || b == 0)
2832 return;
2833 d = (a > b) ? (a - b) : (b - a);
2834 if (d > (a>>3) || d > (b>>3))
2835 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2836 (unsigned long long)a, (unsigned long long)b);
2837}
2838
2839static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2840{
2841 struct p_sizes *p = (struct p_sizes *)h;
2842 enum determine_dev_size dd = unchanged;
2843 unsigned int max_seg_s;
2844 sector_t p_size, p_usize, my_usize;
2845 int ldsc = 0; /* local disk size changed */
2846 enum drbd_conns nconn;
2847
2848 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2849 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2850 return FALSE;
2851
2852 p_size = be64_to_cpu(p->d_size);
2853 p_usize = be64_to_cpu(p->u_size);
2854
2855 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2856 dev_err(DEV, "some backing storage is needed\n");
2857 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2858 return FALSE;
2859 }
2860
2861 /* just store the peer's disk size for now.
2862 * we still need to figure out whether we accept that. */
2863 mdev->p_size = p_size;
2864
2865#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
2866 if (get_ldev(mdev)) {
2867 warn_if_differ_considerably(mdev, "lower level device sizes",
2868 p_size, drbd_get_max_capacity(mdev->ldev));
2869 warn_if_differ_considerably(mdev, "user requested size",
2870 p_usize, mdev->ldev->dc.disk_size);
2871
2872 /* if this is the first connect, or an otherwise expected
2873 * param exchange, choose the minimum */
2874 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2875 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
2876 p_usize);
2877
2878 my_usize = mdev->ldev->dc.disk_size;
2879
2880 if (mdev->ldev->dc.disk_size != p_usize) {
2881 mdev->ldev->dc.disk_size = p_usize;
2882 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
2883 (unsigned long)mdev->ldev->dc.disk_size);
2884 }
2885
2886 /* Never shrink a device with usable data during connect.
2887 But allow online shrinking if we are connected. */
2888 if (drbd_new_dev_size(mdev, mdev->ldev) <
2889 drbd_get_capacity(mdev->this_bdev) &&
2890 mdev->state.disk >= D_OUTDATED &&
2891 mdev->state.conn < C_CONNECTED) {
2892 dev_err(DEV, "The peer's disk size is too small!\n");
2893 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2894 mdev->ldev->dc.disk_size = my_usize;
2895 put_ldev(mdev);
2896 return FALSE;
2897 }
2898 put_ldev(mdev);
2899 }
2900#undef min_not_zero
2901
2902 if (get_ldev(mdev)) {
2903 dd = drbd_determin_dev_size(mdev);
2904 put_ldev(mdev);
2905 if (dd == dev_size_error)
2906 return FALSE;
2907 drbd_md_sync(mdev);
2908 } else {
2909 /* I am diskless, need to accept the peer's size. */
2910 drbd_set_my_capacity(mdev, p_size);
2911 }
2912
2913 if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
2914 nconn = drbd_sync_handshake(mdev,
2915 mdev->state.peer, mdev->state.pdsk);
2916 put_ldev(mdev);
2917
2918 if (nconn == C_MASK) {
2919 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2920 return FALSE;
2921 }
2922
2923 if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
2924 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2925 return FALSE;
2926 }
2927 }
2928
2929 if (get_ldev(mdev)) {
2930 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
2931 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2932 ldsc = 1;
2933 }
2934
2935 max_seg_s = be32_to_cpu(p->max_segment_size);
2936 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
2937 drbd_setup_queue_param(mdev, max_seg_s);
2938
2939 drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
2940 put_ldev(mdev);
2941 }
2942
2943 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
2944 if (be64_to_cpu(p->c_size) !=
2945 drbd_get_capacity(mdev->this_bdev) || ldsc) {
2946 /* we have different sizes, probably peer
2947 * needs to know my new size... */
2948 drbd_send_sizes(mdev, 0);
2949 }
2950 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
2951 (dd == grew && mdev->state.conn == C_CONNECTED)) {
2952 if (mdev->state.pdsk >= D_INCONSISTENT &&
2953 mdev->state.disk >= D_INCONSISTENT)
2954 resync_after_online_grow(mdev);
2955 else
2956 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
2957 }
2958 }
2959
2960 return TRUE;
2961}
2962
2963static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
2964{
2965 struct p_uuids *p = (struct p_uuids *)h;
2966 u64 *p_uuid;
2967 int i;
2968
2969 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2970 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2971 return FALSE;
2972
2973 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
2974
2975 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
2976 p_uuid[i] = be64_to_cpu(p->uuid[i]);
2977
2978 kfree(mdev->p_uuid);
2979 mdev->p_uuid = p_uuid;
2980
2981 if (mdev->state.conn < C_CONNECTED &&
2982 mdev->state.disk < D_INCONSISTENT &&
2983 mdev->state.role == R_PRIMARY &&
2984 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
2985 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
2986 (unsigned long long)mdev->ed_uuid);
2987 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2988 return FALSE;
2989 }
2990
2991 if (get_ldev(mdev)) {
2992 int skip_initial_sync =
2993 mdev->state.conn == C_CONNECTED &&
2994 mdev->agreed_pro_version >= 90 &&
2995 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
2996 (p_uuid[UI_FLAGS] & 8);
2997 if (skip_initial_sync) {
2998 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
2999 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3000 "clear_n_write from receive_uuids");
3001 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3002 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3003 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3004 CS_VERBOSE, NULL);
3005 drbd_md_sync(mdev);
3006 }
3007 put_ldev(mdev);
3008 }
3009
3010 /* Before we test for the disk state, we should wait until an eventually
3011 ongoing cluster wide state change is finished. That is important if
3012 we are primary and are detaching from our disk. We need to see the
3013 new disk state... */
3014 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3015 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3016 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3017
3018 return TRUE;
3019}
3020
3021/**
3022 * convert_state() - Converts the peer's view of the cluster state to our point of view
3023 * @ps: The state as seen by the peer.
3024 */
3025static union drbd_state convert_state(union drbd_state ps)
3026{
3027 union drbd_state ms;
3028
3029 static enum drbd_conns c_tab[] = {
3030 [C_CONNECTED] = C_CONNECTED,
3031
3032 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3033 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3034 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3035 [C_VERIFY_S] = C_VERIFY_T,
3036 [C_MASK] = C_MASK,
3037 };
3038
3039 ms.i = ps.i;
3040
3041 ms.conn = c_tab[ps.conn];
3042 ms.peer = ps.role;
3043 ms.role = ps.peer;
3044 ms.pdsk = ps.disk;
3045 ms.disk = ps.pdsk;
3046 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3047
3048 return ms;
3049}
3050
3051static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3052{
3053 struct p_req_state *p = (struct p_req_state *)h;
3054 union drbd_state mask, val;
3055 int rv;
3056
3057 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3058 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3059 return FALSE;
3060
3061 mask.i = be32_to_cpu(p->mask);
3062 val.i = be32_to_cpu(p->val);
3063
3064 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3065 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3066 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3067 return TRUE;
3068 }
3069
3070 mask = convert_state(mask);
3071 val = convert_state(val);
3072
3073 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3074
3075 drbd_send_sr_reply(mdev, rv);
3076 drbd_md_sync(mdev);
3077
3078 return TRUE;
3079}
3080
3081static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3082{
3083 struct p_state *p = (struct p_state *)h;
3084 enum drbd_conns nconn, oconn;
3085 union drbd_state ns, peer_state;
3086 enum drbd_disk_state real_peer_disk;
3087 int rv;
3088
3089 ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3090 return FALSE;
3091
3092 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3093 return FALSE;
3094
3095 peer_state.i = be32_to_cpu(p->state);
3096
3097 real_peer_disk = peer_state.disk;
3098 if (peer_state.disk == D_NEGOTIATING) {
3099 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3100 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3101 }
3102
3103 spin_lock_irq(&mdev->req_lock);
3104 retry:
3105 oconn = nconn = mdev->state.conn;
3106 spin_unlock_irq(&mdev->req_lock);
3107
3108 if (nconn == C_WF_REPORT_PARAMS)
3109 nconn = C_CONNECTED;
3110
3111 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3112 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3113 int cr; /* consider resync */
3114
3115 /* if we established a new connection */
3116 cr = (oconn < C_CONNECTED);
3117 /* if we had an established connection
3118 * and one of the nodes newly attaches a disk */
3119 cr |= (oconn == C_CONNECTED &&
3120 (peer_state.disk == D_NEGOTIATING ||
3121 mdev->state.disk == D_NEGOTIATING));
3122 /* if we have both been inconsistent, and the peer has been
3123 * forced to be UpToDate with --overwrite-data */
3124 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3125 /* if we had been plain connected, and the admin requested to
3126 * start a sync by "invalidate" or "invalidate-remote" */
3127 cr |= (oconn == C_CONNECTED &&
3128 (peer_state.conn >= C_STARTING_SYNC_S &&
3129 peer_state.conn <= C_WF_BITMAP_T));
3130
3131 if (cr)
3132 nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3133
3134 put_ldev(mdev);
3135 if (nconn == C_MASK) {
3136 if (mdev->state.disk == D_NEGOTIATING) {
3137 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3138 nconn = C_CONNECTED;
3139 } else if (peer_state.disk == D_NEGOTIATING) {
3140 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3141 peer_state.disk = D_DISKLESS;
3142 } else {
3143 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3144 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3145 return FALSE;
3146 }
3147 }
3148 }
3149
3150 spin_lock_irq(&mdev->req_lock);
3151 if (mdev->state.conn != oconn)
3152 goto retry;
3153 clear_bit(CONSIDER_RESYNC, &mdev->flags);
3154 ns.i = mdev->state.i;
3155 ns.conn = nconn;
3156 ns.peer = peer_state.role;
3157 ns.pdsk = real_peer_disk;
3158 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3159 if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3160 ns.disk = mdev->new_state_tmp.disk;
3161
3162 rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
3163 ns = mdev->state;
3164 spin_unlock_irq(&mdev->req_lock);
3165
3166 if (rv < SS_SUCCESS) {
3167 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3168 return FALSE;
3169 }
3170
3171 if (oconn > C_WF_REPORT_PARAMS) {
3172 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3173 peer_state.disk != D_NEGOTIATING ) {
3174 /* we want resync, peer has not yet decided to sync... */
3175 /* Nowadays only used when forcing a node into primary role and
3176 setting its disk to UpToDate with that */
3177 drbd_send_uuids(mdev);
3178 drbd_send_state(mdev);
3179 }
3180 }
3181
3182 mdev->net_conf->want_lose = 0;
3183
3184 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3185
3186 return TRUE;
3187}
3188
3189static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3190{
3191 struct p_rs_uuid *p = (struct p_rs_uuid *)h;
3192
3193 wait_event(mdev->misc_wait,
3194 mdev->state.conn == C_WF_SYNC_UUID ||
3195 mdev->state.conn < C_CONNECTED ||
3196 mdev->state.disk < D_NEGOTIATING);
3197
3198 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3199
3200 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3201 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3202 return FALSE;
3203
3204 /* Here the _drbd_uuid_ functions are right, current should
3205 _not_ be rotated into the history */
3206 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3207 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3208 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3209
3210 drbd_start_resync(mdev, C_SYNC_TARGET);
3211
3212 put_ldev(mdev);
3213 } else
3214 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3215
3216 return TRUE;
3217}
3218
/* Outcome of processing a single bitmap packet. */
enum receive_bitmap_ret {
	OK,	/* packet processed, more packets are expected */
	DONE,	/* the bitmap transfer is complete */
	FAILED	/* receive or decoding error */
};
3220
3221static enum receive_bitmap_ret
3222receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
3223 unsigned long *buffer, struct bm_xfer_ctx *c)
3224{
3225 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3226 unsigned want = num_words * sizeof(long);
3227
3228 if (want != h->length) {
3229 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
3230 return FAILED;
3231 }
3232 if (want == 0)
3233 return DONE;
3234 if (drbd_recv(mdev, buffer, want) != want)
3235 return FAILED;
3236
3237 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3238
3239 c->word_offset += num_words;
3240 c->bit_offset = c->word_offset * BITS_PER_LONG;
3241 if (c->bit_offset > c->bm_bits)
3242 c->bit_offset = c->bm_bits;
3243
3244 return OK;
3245}
3246
3247static enum receive_bitmap_ret
3248recv_bm_rle_bits(struct drbd_conf *mdev,
3249 struct p_compressed_bm *p,
3250 struct bm_xfer_ctx *c)
3251{
3252 struct bitstream bs;
3253 u64 look_ahead;
3254 u64 rl;
3255 u64 tmp;
3256 unsigned long s = c->bit_offset;
3257 unsigned long e;
3258 int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3259 int toggle = DCBP_get_start(p);
3260 int have;
3261 int bits;
3262
3263 bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3264
3265 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3266 if (bits < 0)
3267 return FAILED;
3268
3269 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3270 bits = vli_decode_bits(&rl, look_ahead);
3271 if (bits <= 0)
3272 return FAILED;
3273
3274 if (toggle) {
3275 e = s + rl -1;
3276 if (e >= c->bm_bits) {
3277 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3278 return FAILED;
3279 }
3280 _drbd_bm_set_bits(mdev, s, e);
3281 }
3282
3283 if (have < bits) {
3284 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3285 have, bits, look_ahead,
3286 (unsigned int)(bs.cur.b - p->code),
3287 (unsigned int)bs.buf_len);
3288 return FAILED;
3289 }
3290 look_ahead >>= bits;
3291 have -= bits;
3292
3293 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3294 if (bits < 0)
3295 return FAILED;
3296 look_ahead |= tmp << have;
3297 have += bits;
3298 }
3299
3300 c->bit_offset = s;
3301 bm_xfer_ctx_bit_to_word_offset(c);
3302
3303 return (s == c->bm_bits) ? DONE : OK;
3304}
3305
3306static enum receive_bitmap_ret
3307decode_bitmap_c(struct drbd_conf *mdev,
3308 struct p_compressed_bm *p,
3309 struct bm_xfer_ctx *c)
3310{
3311 if (DCBP_get_code(p) == RLE_VLI_Bits)
3312 return recv_bm_rle_bits(mdev, p, c);
3313
3314 /* other variants had been implemented for evaluation,
3315 * but have been dropped as this one turned out to be "best"
3316 * during all our tests. */
3317
3318 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3319 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3320 return FAILED;
3321}
3322
3323void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3324 const char *direction, struct bm_xfer_ctx *c)
3325{
3326 /* what would it take to transfer it "plaintext" */
3327 unsigned plain = sizeof(struct p_header) *
3328 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3329 + c->bm_words * sizeof(long);
3330 unsigned total = c->bytes[0] + c->bytes[1];
3331 unsigned r;
3332
3333 /* total can not be zero. but just in case: */
3334 if (total == 0)
3335 return;
3336
3337 /* don't report if not compressed */
3338 if (total >= plain)
3339 return;
3340
3341 /* total < plain. check for overflow, still */
3342 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3343 : (1000 * total / plain);
3344
3345 if (r > 1000)
3346 r = 1000;
3347
3348 r = 1000 - r;
3349 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3350 "total %u; compression: %u.%u%%\n",
3351 direction,
3352 c->bytes[1], c->packets[1],
3353 c->bytes[0], c->packets[0],
3354 total, r/10, r % 10);
3355}
3356
3357/* Since we are processing the bitfield from lower addresses to higher,
3358 it does not matter if the process it in 32 bit chunks or 64 bit
3359 chunks as long as it is little endian. (Understand it as byte stream,
3360 beginning with the lowest byte...) If we would use big endian
3361 we would need to process it from the highest address to the lowest,
3362 in order to be agnostic to the 32 vs 64 bits issue.
3363
3364 returns 0 on failure, 1 if we successfully received it. */
3365static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3366{
3367 struct bm_xfer_ctx c;
3368 void *buffer;
3369 enum receive_bitmap_ret ret;
3370 int ok = FALSE;
3371
3372 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3373
3374 drbd_bm_lock(mdev, "receive bitmap");
3375
3376 /* maybe we should use some per thread scratch page,
3377 * and allocate that during initial device creation? */
3378 buffer = (unsigned long *) __get_free_page(GFP_NOIO);
3379 if (!buffer) {
3380 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3381 goto out;
3382 }
3383
3384 c = (struct bm_xfer_ctx) {
3385 .bm_bits = drbd_bm_bits(mdev),
3386 .bm_words = drbd_bm_words(mdev),
3387 };
3388
3389 do {
3390 if (h->command == P_BITMAP) {
3391 ret = receive_bitmap_plain(mdev, h, buffer, &c);
3392 } else if (h->command == P_COMPRESSED_BITMAP) {
3393 /* MAYBE: sanity check that we speak proto >= 90,
3394 * and the feature is enabled! */
3395 struct p_compressed_bm *p;
3396
3397 if (h->length > BM_PACKET_PAYLOAD_BYTES) {
3398 dev_err(DEV, "ReportCBitmap packet too large\n");
3399 goto out;
3400 }
3401 /* use the page buff */
3402 p = buffer;
3403 memcpy(p, h, sizeof(*h));
3404 if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
3405 goto out;
3406 if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3407 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3408 return FAILED;
3409 }
3410 ret = decode_bitmap_c(mdev, p, &c);
3411 } else {
3412 dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
3413 goto out;
3414 }
3415
3416 c.packets[h->command == P_BITMAP]++;
3417 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
3418
3419 if (ret != OK)
3420 break;
3421
3422 if (!drbd_recv_header(mdev, h))
3423 goto out;
3424 } while (ret == OK);
3425 if (ret == FAILED)
3426 goto out;
3427
3428 INFO_bm_xfer_stats(mdev, "receive", &c);
3429
3430 if (mdev->state.conn == C_WF_BITMAP_T) {
3431 ok = !drbd_send_bitmap(mdev);
3432 if (!ok)
3433 goto out;
3434 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3435 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3436 D_ASSERT(ok == SS_SUCCESS);
3437 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3438 /* admin may have requested C_DISCONNECTING,
3439 * other threads may have noticed network errors */
3440 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3441 drbd_conn_str(mdev->state.conn));
3442 }
3443
3444 ok = TRUE;
3445 out:
3446 drbd_bm_unlock(mdev);
3447 if (ok && mdev->state.conn == C_WF_BITMAP_S)
3448 drbd_start_resync(mdev, C_SYNC_SOURCE);
3449 free_page((unsigned long) buffer);
3450 return ok;
3451}
3452
3453static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
3454{
3455 /* TODO zero copy sink :) */
3456 static char sink[128];
3457 int size, want, r;
3458
3459 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3460 h->command, h->length);
3461
3462 size = h->length;
3463 while (size > 0) {
3464 want = min_t(int, size, sizeof(sink));
3465 r = drbd_recv(mdev, sink, want);
3466 ERR_IF(r <= 0) break;
3467 size -= r;
3468 }
3469 return size == 0;
3470}
3471
3472static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3473{
3474 if (mdev->state.disk >= D_INCONSISTENT)
3475 drbd_kick_lo(mdev);
3476
3477 /* Make sure we've acked all the TCP data associated
3478 * with the data requests being unplugged */
3479 drbd_tcp_quickack(mdev->data.socket);
3480
3481 return TRUE;
3482}
3483
3484typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
3485
3486static drbd_cmd_handler_f drbd_default_handler[] = {
3487 [P_DATA] = receive_Data,
3488 [P_DATA_REPLY] = receive_DataReply,
3489 [P_RS_DATA_REPLY] = receive_RSDataReply,
3490 [P_BARRIER] = receive_Barrier,
3491 [P_BITMAP] = receive_bitmap,
3492 [P_COMPRESSED_BITMAP] = receive_bitmap,
3493 [P_UNPLUG_REMOTE] = receive_UnplugRemote,
3494 [P_DATA_REQUEST] = receive_DataRequest,
3495 [P_RS_DATA_REQUEST] = receive_DataRequest,
3496 [P_SYNC_PARAM] = receive_SyncParam,
3497 [P_SYNC_PARAM89] = receive_SyncParam,
3498 [P_PROTOCOL] = receive_protocol,
3499 [P_UUIDS] = receive_uuids,
3500 [P_SIZES] = receive_sizes,
3501 [P_STATE] = receive_state,
3502 [P_STATE_CHG_REQ] = receive_req_state,
3503 [P_SYNC_UUID] = receive_sync_uuid,
3504 [P_OV_REQUEST] = receive_DataRequest,
3505 [P_OV_REPLY] = receive_DataRequest,
3506 [P_CSUM_RS_REQUEST] = receive_DataRequest,
3507 /* anything missing from this table is in
3508 * the asender_tbl, see get_asender_cmd */
3509 [P_MAX_CMD] = NULL,
3510};
3511
3512static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
3513static drbd_cmd_handler_f *drbd_opt_cmd_handler;
3514
3515static void drbdd(struct drbd_conf *mdev)
3516{
3517 drbd_cmd_handler_f handler;
3518 struct p_header *header = &mdev->data.rbuf.header;
3519
3520 while (get_t_state(&mdev->receiver) == Running) {
3521 drbd_thread_current_set_cpu(mdev);
3522 if (!drbd_recv_header(mdev, header))
3523 break;
3524
3525 if (header->command < P_MAX_CMD)
3526 handler = drbd_cmd_handler[header->command];
3527 else if (P_MAY_IGNORE < header->command
3528 && header->command < P_MAX_OPT_CMD)
3529 handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3530 else if (header->command > P_MAX_OPT_CMD)
3531 handler = receive_skip;
3532 else
3533 handler = NULL;
3534
3535 if (unlikely(!handler)) {
3536 dev_err(DEV, "unknown packet type %d, l: %d!\n",
3537 header->command, header->length);
3538 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3539 break;
3540 }
3541 if (unlikely(!handler(mdev, header))) {
3542 dev_err(DEV, "error receiving %s, l: %d!\n",
3543 cmdname(header->command), header->length);
3544 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3545 break;
3546 }
3547
3548 trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf,
3549 __FILE__, __LINE__);
3550 }
3551}
3552
3553static void drbd_fail_pending_reads(struct drbd_conf *mdev)
3554{
3555 struct hlist_head *slot;
3556 struct hlist_node *pos;
3557 struct hlist_node *tmp;
3558 struct drbd_request *req;
3559 int i;
3560
3561 /*
3562 * Application READ requests
3563 */
3564 spin_lock_irq(&mdev->req_lock);
3565 for (i = 0; i < APP_R_HSIZE; i++) {
3566 slot = mdev->app_reads_hash+i;
3567 hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
3568 /* it may (but should not any longer!)
3569 * be on the work queue; if that assert triggers,
3570 * we need to also grab the
3571 * spin_lock_irq(&mdev->data.work.q_lock);
3572 * and list_del_init here. */
3573 D_ASSERT(list_empty(&req->w.list));
3574 /* It would be nice to complete outside of spinlock.
3575 * But this is easier for now. */
3576 _req_mod(req, connection_lost_while_pending);
3577 }
3578 }
3579 for (i = 0; i < APP_R_HSIZE; i++)
3580 if (!hlist_empty(mdev->app_reads_hash+i))
3581 dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
3582 "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
3583
3584 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
3585 spin_unlock_irq(&mdev->req_lock);
3586}
3587
3588void drbd_flush_workqueue(struct drbd_conf *mdev)
3589{
3590 struct drbd_wq_barrier barr;
3591
3592 barr.w.cb = w_prev_work_done;
3593 init_completion(&barr.done);
3594 drbd_queue_work(&mdev->data.work, &barr.w);
3595 wait_for_completion(&barr.done);
3596}
3597
3598static void drbd_disconnect(struct drbd_conf *mdev)
3599{
3600 enum drbd_fencing_p fp;
3601 union drbd_state os, ns;
3602 int rv = SS_UNKNOWN_ERROR;
3603 unsigned int i;
3604
3605 if (mdev->state.conn == C_STANDALONE)
3606 return;
3607 if (mdev->state.conn >= C_WF_CONNECTION)
3608 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3609 drbd_conn_str(mdev->state.conn));
3610
3611 /* asender does not clean up anything. it must not interfere, either */
3612 drbd_thread_stop(&mdev->asender);
3613
3614 mutex_lock(&mdev->data.mutex);
3615 drbd_free_sock(mdev);
3616 mutex_unlock(&mdev->data.mutex);
3617
3618 spin_lock_irq(&mdev->req_lock);
3619 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3620 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3621 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3622 spin_unlock_irq(&mdev->req_lock);
3623
3624 /* We do not have data structures that would allow us to
3625 * get the rs_pending_cnt down to 0 again.
3626 * * On C_SYNC_TARGET we do not have any data structures describing
3627 * the pending RSDataRequest's we have sent.
3628 * * On C_SYNC_SOURCE there is no data structure that tracks
3629 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3630 * And no, it is not the sum of the reference counts in the
3631 * resync_LRU. The resync_LRU tracks the whole operation including
3632 * the disk-IO, while the rs_pending_cnt only tracks the blocks
3633 * on the fly. */
3634 drbd_rs_cancel_all(mdev);
3635 mdev->rs_total = 0;
3636 mdev->rs_failed = 0;
3637 atomic_set(&mdev->rs_pending_cnt, 0);
3638 wake_up(&mdev->misc_wait);
3639
3640 /* make sure syncer is stopped and w_resume_next_sg queued */
3641 del_timer_sync(&mdev->resync_timer);
3642 set_bit(STOP_SYNC_TIMER, &mdev->flags);
3643 resync_timer_fn((unsigned long)mdev);
3644
3645 /* so we can be sure that all remote or resync reads
3646 * made it at least to net_ee */
3647 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
3648
3649 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3650 * w_make_resync_request etc. which may still be on the worker queue
3651 * to be "canceled" */
3652 drbd_flush_workqueue(mdev);
3653
3654 /* This also does reclaim_net_ee(). If we do this too early, we might
3655 * miss some resync ee and pages.*/
3656 drbd_process_done_ee(mdev);
3657
3658 kfree(mdev->p_uuid);
3659 mdev->p_uuid = NULL;
3660
3661 if (!mdev->state.susp)
3662 tl_clear(mdev);
3663
3664 drbd_fail_pending_reads(mdev);
3665
3666 dev_info(DEV, "Connection closed\n");
3667
3668 drbd_md_sync(mdev);
3669
3670 fp = FP_DONT_CARE;
3671 if (get_ldev(mdev)) {
3672 fp = mdev->ldev->dc.fencing;
3673 put_ldev(mdev);
3674 }
3675
3676 if (mdev->state.role == R_PRIMARY) {
3677 if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
3678 enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
3679 drbd_request_state(mdev, NS(pdsk, nps));
3680 }
3681 }
3682
3683 spin_lock_irq(&mdev->req_lock);
3684 os = mdev->state;
3685 if (os.conn >= C_UNCONNECTED) {
3686 /* Do not restart in case we are C_DISCONNECTING */
3687 ns = os;
3688 ns.conn = C_UNCONNECTED;
3689 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3690 }
3691 spin_unlock_irq(&mdev->req_lock);
3692
3693 if (os.conn == C_DISCONNECTING) {
3694 struct hlist_head *h;
3695 wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
3696
3697 /* we must not free the tl_hash
3698 * while application io is still on the fly */
3699 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
3700
3701 spin_lock_irq(&mdev->req_lock);
3702 /* paranoia code */
3703 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3704 if (h->first)
3705 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3706 (int)(h - mdev->ee_hash), h->first);
3707 kfree(mdev->ee_hash);
3708 mdev->ee_hash = NULL;
3709 mdev->ee_hash_s = 0;
3710
3711 /* paranoia code */
3712 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3713 if (h->first)
3714 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3715 (int)(h - mdev->tl_hash), h->first);
3716 kfree(mdev->tl_hash);
3717 mdev->tl_hash = NULL;
3718 mdev->tl_hash_s = 0;
3719 spin_unlock_irq(&mdev->req_lock);
3720
3721 crypto_free_hash(mdev->cram_hmac_tfm);
3722 mdev->cram_hmac_tfm = NULL;
3723
3724 kfree(mdev->net_conf);
3725 mdev->net_conf = NULL;
3726 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3727 }
3728
3729 /* tcp_close and release of sendpage pages can be deferred. I don't
3730 * want to use SO_LINGER, because apparently it can be deferred for
3731 * more than 20 seconds (longest time I checked).
3732 *
3733 * Actually we don't care for exactly when the network stack does its
3734 * put_page(), but release our reference on these pages right here.
3735 */
3736 i = drbd_release_ee(mdev, &mdev->net_ee);
3737 if (i)
3738 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3739 i = atomic_read(&mdev->pp_in_use);
3740 if (i)
3741 dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
3742
3743 D_ASSERT(list_empty(&mdev->read_ee));
3744 D_ASSERT(list_empty(&mdev->active_ee));
3745 D_ASSERT(list_empty(&mdev->sync_ee));
3746 D_ASSERT(list_empty(&mdev->done_ee));
3747
3748 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3749 atomic_set(&mdev->current_epoch->epoch_size, 0);
3750 D_ASSERT(list_empty(&mdev->current_epoch->list));
3751}
3752
3753/*
3754 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3755 * we can agree on is stored in agreed_pro_version.
3756 *
3757 * feature flags and the reserved array should be enough room for future
3758 * enhancements of the handshake protocol, and possible plugins...
3759 *
3760 * for now, they are expected to be zero, but ignored.
3761 */
3762static int drbd_send_handshake(struct drbd_conf *mdev)
3763{
3764 /* ASSERT current == mdev->receiver ... */
3765 struct p_handshake *p = &mdev->data.sbuf.handshake;
3766 int ok;
3767
3768 if (mutex_lock_interruptible(&mdev->data.mutex)) {
3769 dev_err(DEV, "interrupted during initial handshake\n");
3770 return 0; /* interrupted. not ok. */
3771 }
3772
3773 if (mdev->data.socket == NULL) {
3774 mutex_unlock(&mdev->data.mutex);
3775 return 0;
3776 }
3777
3778 memset(p, 0, sizeof(*p));
3779 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3780 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3781 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3782 (struct p_header *)p, sizeof(*p), 0 );
3783 mutex_unlock(&mdev->data.mutex);
3784 return ok;
3785}
3786
3787/*
3788 * return values:
3789 * 1 yes, we have a valid connection
3790 * 0 oops, did not work out, please try again
3791 * -1 peer talks different language,
3792 * no point in trying again, please go standalone.
3793 */
3794static int drbd_do_handshake(struct drbd_conf *mdev)
3795{
3796 /* ASSERT current == mdev->receiver ... */
3797 struct p_handshake *p = &mdev->data.rbuf.handshake;
3798 const int expect = sizeof(struct p_handshake)
3799 -sizeof(struct p_header);
3800 int rv;
3801
3802 rv = drbd_send_handshake(mdev);
3803 if (!rv)
3804 return 0;
3805
3806 rv = drbd_recv_header(mdev, &p->head);
3807 if (!rv)
3808 return 0;
3809
3810 if (p->head.command != P_HAND_SHAKE) {
3811 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3812 cmdname(p->head.command), p->head.command);
3813 return -1;
3814 }
3815
3816 if (p->head.length != expect) {
3817 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3818 expect, p->head.length);
3819 return -1;
3820 }
3821
3822 rv = drbd_recv(mdev, &p->head.payload, expect);
3823
3824 if (rv != expect) {
3825 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3826 return 0;
3827 }
3828
3829 trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf,
3830 __FILE__, __LINE__);
3831
3832 p->protocol_min = be32_to_cpu(p->protocol_min);
3833 p->protocol_max = be32_to_cpu(p->protocol_max);
3834 if (p->protocol_max == 0)
3835 p->protocol_max = p->protocol_min;
3836
3837 if (PRO_VERSION_MAX < p->protocol_min ||
3838 PRO_VERSION_MIN > p->protocol_max)
3839 goto incompat;
3840
3841 mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3842
3843 dev_info(DEV, "Handshake successful: "
3844 "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3845
3846 return 1;
3847
3848 incompat:
3849 dev_err(DEV, "incompatible DRBD dialects: "
3850 "I support %d-%d, peer supports %d-%d\n",
3851 PRO_VERSION_MIN, PRO_VERSION_MAX,
3852 p->protocol_min, p->protocol_max);
3853 return -1;
3854}
3855
3856#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3857static int drbd_do_auth(struct drbd_conf *mdev)
3858{
3859 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
3860 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
3861 return 0;
3862}
3863#else
3864#define CHALLENGE_LEN 64
3865static int drbd_do_auth(struct drbd_conf *mdev)
3866{
3867 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
3868 struct scatterlist sg;
3869 char *response = NULL;
3870 char *right_response = NULL;
3871 char *peers_ch = NULL;
3872 struct p_header p;
3873 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
3874 unsigned int resp_size;
3875 struct hash_desc desc;
3876 int rv;
3877
3878 desc.tfm = mdev->cram_hmac_tfm;
3879 desc.flags = 0;
3880
3881 rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
3882 (u8 *)mdev->net_conf->shared_secret, key_len);
3883 if (rv) {
3884 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
3885 rv = 0;
3886 goto fail;
3887 }
3888
3889 get_random_bytes(my_challenge, CHALLENGE_LEN);
3890
3891 rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
3892 if (!rv)
3893 goto fail;
3894
3895 rv = drbd_recv_header(mdev, &p);
3896 if (!rv)
3897 goto fail;
3898
3899 if (p.command != P_AUTH_CHALLENGE) {
3900 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
3901 cmdname(p.command), p.command);
3902 rv = 0;
3903 goto fail;
3904 }
3905
3906 if (p.length > CHALLENGE_LEN*2) {
3907 dev_err(DEV, "expected AuthChallenge payload too big.\n");
3908 rv = 0;
3909 goto fail;
3910 }
3911
3912 peers_ch = kmalloc(p.length, GFP_NOIO);
3913 if (peers_ch == NULL) {
3914 dev_err(DEV, "kmalloc of peers_ch failed\n");
3915 rv = 0;
3916 goto fail;
3917 }
3918
3919 rv = drbd_recv(mdev, peers_ch, p.length);
3920
3921 if (rv != p.length) {
3922 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
3923 rv = 0;
3924 goto fail;
3925 }
3926
3927 resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
3928 response = kmalloc(resp_size, GFP_NOIO);
3929 if (response == NULL) {
3930 dev_err(DEV, "kmalloc of response failed\n");
3931 rv = 0;
3932 goto fail;
3933 }
3934
3935 sg_init_table(&sg, 1);
3936 sg_set_buf(&sg, peers_ch, p.length);
3937
3938 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
3939 if (rv) {
3940 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3941 rv = 0;
3942 goto fail;
3943 }
3944
3945 rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
3946 if (!rv)
3947 goto fail;
3948
3949 rv = drbd_recv_header(mdev, &p);
3950 if (!rv)
3951 goto fail;
3952
3953 if (p.command != P_AUTH_RESPONSE) {
3954 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
3955 cmdname(p.command), p.command);
3956 rv = 0;
3957 goto fail;
3958 }
3959
3960 if (p.length != resp_size) {
3961 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
3962 rv = 0;
3963 goto fail;
3964 }
3965
3966 rv = drbd_recv(mdev, response , resp_size);
3967
3968 if (rv != resp_size) {
3969 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
3970 rv = 0;
3971 goto fail;
3972 }
3973
3974 right_response = kmalloc(resp_size, GFP_NOIO);
3975 if (response == NULL) {
3976 dev_err(DEV, "kmalloc of right_response failed\n");
3977 rv = 0;
3978 goto fail;
3979 }
3980
3981 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
3982
3983 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
3984 if (rv) {
3985 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3986 rv = 0;
3987 goto fail;
3988 }
3989
3990 rv = !memcmp(response, right_response, resp_size);
3991
3992 if (rv)
3993 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
3994 resp_size, mdev->net_conf->cram_hmac_alg);
3995
3996 fail:
3997 kfree(peers_ch);
3998 kfree(response);
3999 kfree(right_response);
4000
4001 return rv;
4002}
4003#endif
4004
4005int drbdd_init(struct drbd_thread *thi)
4006{
4007 struct drbd_conf *mdev = thi->mdev;
4008 unsigned int minor = mdev_to_minor(mdev);
4009 int h;
4010
4011 sprintf(current->comm, "drbd%d_receiver", minor);
4012
4013 dev_info(DEV, "receiver (re)started\n");
4014
4015 do {
4016 h = drbd_connect(mdev);
4017 if (h == 0) {
4018 drbd_disconnect(mdev);
4019 __set_current_state(TASK_INTERRUPTIBLE);
4020 schedule_timeout(HZ);
4021 }
4022 if (h == -1) {
4023 dev_warn(DEV, "Discarding network configuration.\n");
4024 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4025 }
4026 } while (h == 0);
4027
4028 if (h > 0) {
4029 if (get_net_conf(mdev)) {
4030 drbdd(mdev);
4031 put_net_conf(mdev);
4032 }
4033 }
4034
4035 drbd_disconnect(mdev);
4036
4037 dev_info(DEV, "receiver terminated\n");
4038 return 0;
4039}
4040
4041/* ********* acknowledge sender ******** */
4042
4043static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4044{
4045 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4046
4047 int retcode = be32_to_cpu(p->retcode);
4048
4049 if (retcode >= SS_SUCCESS) {
4050 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4051 } else {
4052 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4053 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4054 drbd_set_st_err_str(retcode), retcode);
4055 }
4056 wake_up(&mdev->state_wait);
4057
4058 return TRUE;
4059}
4060
/* got_Ping() - asender handler for P_PING: answer with a ping ack and
 * pass on the result of sending it. */
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
{
	int sent = drbd_send_ping_ack(mdev);

	return sent;
}
4066
4067static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
4068{
4069 /* restore idle timeout */
4070 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4071
4072 return TRUE;
4073}
4074
4075static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4076{
4077 struct p_block_ack *p = (struct p_block_ack *)h;
4078 sector_t sector = be64_to_cpu(p->sector);
4079 int blksize = be32_to_cpu(p->blksize);
4080
4081 D_ASSERT(mdev->agreed_pro_version >= 89);
4082
4083 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4084
4085 drbd_rs_complete_io(mdev, sector);
4086 drbd_set_in_sync(mdev, sector, blksize);
4087 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4088 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4089 dec_rs_pending(mdev);
4090
4091 return TRUE;
4092}
4093
4094/* when we receive the ACK for a write request,
4095 * verify that we actually know about it */
4096static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4097 u64 id, sector_t sector)
4098{
4099 struct hlist_head *slot = tl_hash_slot(mdev, sector);
4100 struct hlist_node *n;
4101 struct drbd_request *req;
4102
4103 hlist_for_each_entry(req, n, slot, colision) {
4104 if ((unsigned long)req == (unsigned long)id) {
4105 if (req->sector != sector) {
4106 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4107 "wrong sector (%llus versus %llus)\n", req,
4108 (unsigned long long)req->sector,
4109 (unsigned long long)sector);
4110 break;
4111 }
4112 return req;
4113 }
4114 }
4115 dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4116 (void *)(unsigned long)id, (unsigned long long)sector);
4117 return NULL;
4118}
4119
4120typedef struct drbd_request *(req_validator_fn)
4121 (struct drbd_conf *mdev, u64 id, sector_t sector);
4122
4123static int validate_req_change_req_state(struct drbd_conf *mdev,
4124 u64 id, sector_t sector, req_validator_fn validator,
4125 const char *func, enum drbd_req_event what)
4126{
4127 struct drbd_request *req;
4128 struct bio_and_error m;
4129
4130 spin_lock_irq(&mdev->req_lock);
4131 req = validator(mdev, id, sector);
4132 if (unlikely(!req)) {
4133 spin_unlock_irq(&mdev->req_lock);
4134 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4135 return FALSE;
4136 }
4137 __req_mod(req, what, &m);
4138 spin_unlock_irq(&mdev->req_lock);
4139
4140 if (m.bio)
4141 complete_master_bio(mdev, &m);
4142 return TRUE;
4143}
4144
4145static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4146{
4147 struct p_block_ack *p = (struct p_block_ack *)h;
4148 sector_t sector = be64_to_cpu(p->sector);
4149 int blksize = be32_to_cpu(p->blksize);
4150 enum drbd_req_event what;
4151
4152 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4153
4154 if (is_syncer_block_id(p->block_id)) {
4155 drbd_set_in_sync(mdev, sector, blksize);
4156 dec_rs_pending(mdev);
4157 return TRUE;
4158 }
4159 switch (be16_to_cpu(h->command)) {
4160 case P_RS_WRITE_ACK:
4161 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4162 what = write_acked_by_peer_and_sis;
4163 break;
4164 case P_WRITE_ACK:
4165 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4166 what = write_acked_by_peer;
4167 break;
4168 case P_RECV_ACK:
4169 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4170 what = recv_acked_by_peer;
4171 break;
4172 case P_DISCARD_ACK:
4173 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4174 what = conflict_discarded_by_peer;
4175 break;
4176 default:
4177 D_ASSERT(0);
4178 return FALSE;
4179 }
4180
4181 return validate_req_change_req_state(mdev, p->block_id, sector,
4182 _ack_id_to_req, __func__ , what);
4183}
4184
4185static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
4186{
4187 struct p_block_ack *p = (struct p_block_ack *)h;
4188 sector_t sector = be64_to_cpu(p->sector);
4189
4190 if (__ratelimit(&drbd_ratelimit_state))
4191 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
4192
4193 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4194
4195 if (is_syncer_block_id(p->block_id)) {
4196 int size = be32_to_cpu(p->blksize);
4197 dec_rs_pending(mdev);
4198 drbd_rs_failed_io(mdev, sector, size);
4199 return TRUE;
4200 }
4201 return validate_req_change_req_state(mdev, p->block_id, sector,
4202 _ack_id_to_req, __func__ , neg_acked);
4203}
4204
4205static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4206{
4207 struct p_block_ack *p = (struct p_block_ack *)h;
4208 sector_t sector = be64_to_cpu(p->sector);
4209
4210 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4211 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4212 (unsigned long long)sector, be32_to_cpu(p->blksize));
4213
4214 return validate_req_change_req_state(mdev, p->block_id, sector,
4215 _ar_id_to_req, __func__ , neg_acked);
4216}
4217
4218static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4219{
4220 sector_t sector;
4221 int size;
4222 struct p_block_ack *p = (struct p_block_ack *)h;
4223
4224 sector = be64_to_cpu(p->sector);
4225 size = be32_to_cpu(p->blksize);
4226 D_ASSERT(p->block_id == ID_SYNCER);
4227
4228 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4229
4230 dec_rs_pending(mdev);
4231
4232 if (get_ldev_if_state(mdev, D_FAILED)) {
4233 drbd_rs_complete_io(mdev, sector);
4234 drbd_rs_failed_io(mdev, sector, size);
4235 put_ldev(mdev);
4236 }
4237
4238 return TRUE;
4239}
4240
4241static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
4242{
4243 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4244
4245 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4246
4247 return TRUE;
4248}
4249
4250static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4251{
4252 struct p_block_ack *p = (struct p_block_ack *)h;
4253 struct drbd_work *w;
4254 sector_t sector;
4255 int size;
4256
4257 sector = be64_to_cpu(p->sector);
4258 size = be32_to_cpu(p->blksize);
4259
4260 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4261
4262 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4263 drbd_ov_oos_found(mdev, sector, size);
4264 else
4265 ov_oos_print(mdev);
4266
4267 drbd_rs_complete_io(mdev, sector);
4268 dec_rs_pending(mdev);
4269
4270 if (--mdev->ov_left == 0) {
4271 w = kmalloc(sizeof(*w), GFP_NOIO);
4272 if (w) {
4273 w->cb = w_ov_finished;
4274 drbd_queue_work_front(&mdev->data.work, w);
4275 } else {
4276 dev_err(DEV, "kmalloc(w) failed.");
4277 ov_oos_print(mdev);
4278 drbd_resync_finished(mdev);
4279 }
4280 }
4281 return TRUE;
4282}
4283
4284struct asender_cmd {
4285 size_t pkt_size;
4286 int (*process)(struct drbd_conf *mdev, struct p_header *h);
4287};
4288
4289static struct asender_cmd *get_asender_cmd(int cmd)
4290{
4291 static struct asender_cmd asender_tbl[] = {
4292 /* anything missing from this table is in
4293 * the drbd_cmd_handler (drbd_default_handler) table,
4294 * see the beginning of drbdd() */
4295 [P_PING] = { sizeof(struct p_header), got_Ping },
4296 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
4297 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4298 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4299 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4300 [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4301 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4302 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4303 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
4304 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4305 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4306 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4307 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4308 [P_MAX_CMD] = { 0, NULL },
4309 };
4310 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4311 return NULL;
4312 return &asender_tbl[cmd];
4313}
4314
4315int drbd_asender(struct drbd_thread *thi)
4316{
4317 struct drbd_conf *mdev = thi->mdev;
4318 struct p_header *h = &mdev->meta.rbuf.header;
4319 struct asender_cmd *cmd = NULL;
4320
4321 int rv, len;
4322 void *buf = h;
4323 int received = 0;
4324 int expect = sizeof(struct p_header);
4325 int empty;
4326
4327 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4328
4329 current->policy = SCHED_RR; /* Make this a realtime task! */
4330 current->rt_priority = 2; /* more important than all other tasks */
4331
4332 while (get_t_state(thi) == Running) {
4333 drbd_thread_current_set_cpu(mdev);
4334 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4335 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4336 mdev->meta.socket->sk->sk_rcvtimeo =
4337 mdev->net_conf->ping_timeo*HZ/10;
4338 }
4339
4340 /* conditionally cork;
4341 * it may hurt latency if we cork without much to send */
4342 if (!mdev->net_conf->no_cork &&
4343 3 < atomic_read(&mdev->unacked_cnt))
4344 drbd_tcp_cork(mdev->meta.socket);
4345 while (1) {
4346 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4347 flush_signals(current);
4348 if (!drbd_process_done_ee(mdev)) {
4349 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4350 goto reconnect;
4351 }
4352 /* to avoid race with newly queued ACKs */
4353 set_bit(SIGNAL_ASENDER, &mdev->flags);
4354 spin_lock_irq(&mdev->req_lock);
4355 empty = list_empty(&mdev->done_ee);
4356 spin_unlock_irq(&mdev->req_lock);
4357 /* new ack may have been queued right here,
4358 * but then there is also a signal pending,
4359 * and we start over... */
4360 if (empty)
4361 break;
4362 }
4363 /* but unconditionally uncork unless disabled */
4364 if (!mdev->net_conf->no_cork)
4365 drbd_tcp_uncork(mdev->meta.socket);
4366
4367 /* short circuit, recv_msg would return EINTR anyways. */
4368 if (signal_pending(current))
4369 continue;
4370
4371 rv = drbd_recv_short(mdev, mdev->meta.socket,
4372 buf, expect-received, 0);
4373 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4374
4375 flush_signals(current);
4376
4377 /* Note:
4378 * -EINTR (on meta) we got a signal
4379 * -EAGAIN (on meta) rcvtimeo expired
4380 * -ECONNRESET other side closed the connection
4381 * -ERESTARTSYS (on data) we got a signal
4382 * rv < 0 other than above: unexpected error!
4383 * rv == expected: full header or command
4384 * rv < expected: "woken" by signal during receive
4385 * rv == 0 : "connection shut down by peer"
4386 */
4387 if (likely(rv > 0)) {
4388 received += rv;
4389 buf += rv;
4390 } else if (rv == 0) {
4391 dev_err(DEV, "meta connection shut down by peer.\n");
4392 goto reconnect;
4393 } else if (rv == -EAGAIN) {
4394 if (mdev->meta.socket->sk->sk_rcvtimeo ==
4395 mdev->net_conf->ping_timeo*HZ/10) {
4396 dev_err(DEV, "PingAck did not arrive in time.\n");
4397 goto reconnect;
4398 }
4399 set_bit(SEND_PING, &mdev->flags);
4400 continue;
4401 } else if (rv == -EINTR) {
4402 continue;
4403 } else {
4404 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4405 goto reconnect;
4406 }
4407
4408 if (received == expect && cmd == NULL) {
4409 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4410 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
4411 (long)be32_to_cpu(h->magic),
4412 h->command, h->length);
4413 goto reconnect;
4414 }
4415 cmd = get_asender_cmd(be16_to_cpu(h->command));
4416 len = be16_to_cpu(h->length);
4417 if (unlikely(cmd == NULL)) {
4418 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
4419 (long)be32_to_cpu(h->magic),
4420 h->command, h->length);
4421 goto disconnect;
4422 }
4423 expect = cmd->pkt_size;
4424 ERR_IF(len != expect-sizeof(struct p_header)) {
4425 trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
4426 goto reconnect;
4427 }
4428 }
4429 if (received == expect) {
4430 D_ASSERT(cmd != NULL);
4431 trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
4432 if (!cmd->process(mdev, h))
4433 goto reconnect;
4434
4435 buf = h;
4436 received = 0;
4437 expect = sizeof(struct p_header);
4438 cmd = NULL;
4439 }
4440 }
4441
4442 if (0) {
4443reconnect:
4444 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4445 }
4446 if (0) {
4447disconnect:
4448 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4449 }
4450 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4451
4452 D_ASSERT(mdev->state.conn < C_CONNECTED);
4453 dev_info(DEV, "asender terminated\n");
4454
4455 return 0;
4456}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000000..0656cf1edd57
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1132 @@
1/*
2 drbd_req.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/autoconf.h>
27#include <linux/module.h>
28
29#include <linux/slab.h>
30#include <linux/drbd.h>
31#include "drbd_int.h"
32#include "drbd_tracing.h"
33#include "drbd_req.h"
34
35
36/* Update disk stats at start of I/O request */
37static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
38{
39 const int rw = bio_data_dir(bio);
40 int cpu;
41 cpu = part_stat_lock();
42 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
43 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
44 part_stat_unlock();
45 mdev->vdisk->part0.in_flight[rw]++;
46}
47
48/* Update disk stats when completing request upwards */
49static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
50{
51 int rw = bio_data_dir(req->master_bio);
52 unsigned long duration = jiffies - req->start_time;
53 int cpu;
54 cpu = part_stat_lock();
55 part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
56 part_round_stats(cpu, &mdev->vdisk->part0);
57 part_stat_unlock();
58 mdev->vdisk->part0.in_flight[rw]--;
59}
60
61static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
62{
63 const unsigned long s = req->rq_state;
64 /* if it was a write, we may have to set the corresponding
65 * bit(s) out-of-sync first. If it had a local part, we need to
66 * release the reference to the activity log. */
67 if (rw == WRITE) {
68 /* remove it from the transfer log.
69 * well, only if it had been there in the first
70 * place... if it had not (local only or conflicting
71 * and never sent), it should still be "empty" as
72 * initialized in drbd_req_new(), so we can list_del() it
73 * here unconditionally */
74 list_del(&req->tl_requests);
75 /* Set out-of-sync unless both OK flags are set
76 * (local only or remote failed).
77 * Other places where we set out-of-sync:
78 * READ with local io-error */
79 if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
80 drbd_set_out_of_sync(mdev, req->sector, req->size);
81
82 if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
83 drbd_set_in_sync(mdev, req->sector, req->size);
84
85 /* one might be tempted to move the drbd_al_complete_io
86 * to the local io completion callback drbd_endio_pri.
87 * but, if this was a mirror write, we may only
88 * drbd_al_complete_io after this is RQ_NET_DONE,
89 * otherwise the extent could be dropped from the al
90 * before it has actually been written on the peer.
91 * if we crash before our peer knows about the request,
92 * but after the extent has been dropped from the al,
93 * we would forget to resync the corresponding extent.
94 */
95 if (s & RQ_LOCAL_MASK) {
96 if (get_ldev_if_state(mdev, D_FAILED)) {
97 drbd_al_complete_io(mdev, req->sector);
98 put_ldev(mdev);
99 } else if (__ratelimit(&drbd_ratelimit_state)) {
100 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
101 "but my Disk seems to have failed :(\n",
102 (unsigned long long) req->sector);
103 }
104 }
105 }
106
107 /* if it was a local io error, we want to notify our
108 * peer about that, and see if we need to
109 * detach the disk and stuff.
110 * to avoid allocating some special work
111 * struct, reuse the request. */
112
113 /* THINK
114 * why do we do this not when we detect the error,
115 * but delay it until it is "done", i.e. possibly
116 * until the next barrier ack? */
117
118 if (rw == WRITE &&
119 ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
120 if (!(req->w.list.next == LIST_POISON1 ||
121 list_empty(&req->w.list))) {
122 /* DEBUG ASSERT only; if this triggers, we
123 * probably corrupt the worker list here */
124 dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
125 dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
126 }
127 req->w.cb = w_io_error;
128 drbd_queue_work(&mdev->data.work, &req->w);
129 /* drbd_req_free() is done in w_io_error */
130 } else {
131 drbd_req_free(req);
132 }
133}
134
135static void queue_barrier(struct drbd_conf *mdev)
136{
137 struct drbd_tl_epoch *b;
138
139 /* We are within the req_lock. Once we queued the barrier for sending,
140 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
141 * barrier/epoch object is added. This is the only place this bit is
142 * set. It indicates that the barrier for this epoch is already queued,
143 * and no new epoch has been created yet. */
144 if (test_bit(CREATE_BARRIER, &mdev->flags))
145 return;
146
147 b = mdev->newest_tle;
148 b->w.cb = w_send_barrier;
149 /* inc_ap_pending done here, so we won't
150 * get imbalanced on connection loss.
151 * dec_ap_pending will be done in got_BarrierAck
152 * or (on connection loss) in tl_clear. */
153 inc_ap_pending(mdev);
154 drbd_queue_work(&mdev->data.work, &b->w);
155 set_bit(CREATE_BARRIER, &mdev->flags);
156}
157
158static void _about_to_complete_local_write(struct drbd_conf *mdev,
159 struct drbd_request *req)
160{
161 const unsigned long s = req->rq_state;
162 struct drbd_request *i;
163 struct drbd_epoch_entry *e;
164 struct hlist_node *n;
165 struct hlist_head *slot;
166
167 /* before we can signal completion to the upper layers,
168 * we may need to close the current epoch */
169 if (mdev->state.conn >= C_CONNECTED &&
170 req->epoch == mdev->newest_tle->br_number)
171 queue_barrier(mdev);
172
173 /* we need to do the conflict detection stuff,
174 * if we have the ee_hash (two_primaries) and
175 * this has been on the network */
176 if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
177 const sector_t sector = req->sector;
178 const int size = req->size;
179
180 /* ASSERT:
181 * there must be no conflicting requests, since
182 * they must have been failed on the spot */
183#define OVERLAPS overlaps(sector, size, i->sector, i->size)
184 slot = tl_hash_slot(mdev, sector);
185 hlist_for_each_entry(i, n, slot, colision) {
186 if (OVERLAPS) {
187 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
188 "other: %p %llus +%u\n",
189 req, (unsigned long long)sector, size,
190 i, (unsigned long long)i->sector, i->size);
191 }
192 }
193
194 /* maybe "wake" those conflicting epoch entries
195 * that wait for this request to finish.
196 *
197 * currently, there can be only _one_ such ee
198 * (well, or some more, which would be pending
199 * P_DISCARD_ACK not yet sent by the asender...),
200 * since we block the receiver thread upon the
201 * first conflict detection, which will wait on
202 * misc_wait. maybe we want to assert that?
203 *
204 * anyways, if we found one,
205 * we just have to do a wake_up. */
206#undef OVERLAPS
207#define OVERLAPS overlaps(sector, size, e->sector, e->size)
208 slot = ee_hash_slot(mdev, req->sector);
209 hlist_for_each_entry(e, n, slot, colision) {
210 if (OVERLAPS) {
211 wake_up(&mdev->misc_wait);
212 break;
213 }
214 }
215 }
216#undef OVERLAPS
217}
218
219void complete_master_bio(struct drbd_conf *mdev,
220 struct bio_and_error *m)
221{
222 trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL);
223 bio_endio(m->bio, m->error);
224 dec_ap_bio(mdev);
225}
226
227/* Helper for __req_mod().
228 * Set m->bio to the master bio, if it is fit to be completed,
229 * or leave it alone (it is initialized to NULL in __req_mod),
230 * if it has already been completed, or cannot be completed yet.
231 * If m->bio is set, the error status to be returned is placed in m->error.
232 */
233void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
234{
235 const unsigned long s = req->rq_state;
236 struct drbd_conf *mdev = req->mdev;
237 /* only WRITES may end up here without a master bio (on barrier ack) */
238 int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
239
240 trace_drbd_req(req, nothing, "_req_may_be_done");
241
242 /* we must not complete the master bio, while it is
243 * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
244 * not yet acknowledged by the peer
245 * not yet completed by the local io subsystem
246 * these flags may get cleared in any order by
247 * the worker,
248 * the receiver,
249 * the bio_endio completion callbacks.
250 */
251 if (s & RQ_NET_QUEUED)
252 return;
253 if (s & RQ_NET_PENDING)
254 return;
255 if (s & RQ_LOCAL_PENDING)
256 return;
257
258 if (req->master_bio) {
259 /* this is data_received (remote read)
260 * or protocol C P_WRITE_ACK
261 * or protocol B P_RECV_ACK
262 * or protocol A "handed_over_to_network" (SendAck)
263 * or canceled or failed,
264 * or killed from the transfer log due to connection loss.
265 */
266
267 /*
268 * figure out whether to report success or failure.
269 *
270 * report success when at least one of the operations succeeded.
271 * or, to put the other way,
272 * only report failure, when both operations failed.
273 *
274 * what to do about the failures is handled elsewhere.
275 * what we need to do here is just: complete the master_bio.
276 *
277 * local completion error, if any, has been stored as ERR_PTR
278 * in private_bio within drbd_endio_pri.
279 */
280 int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
281 int error = PTR_ERR(req->private_bio);
282
283 /* remove the request from the conflict detection
284 * respective block_id verification hash */
285 if (!hlist_unhashed(&req->colision))
286 hlist_del(&req->colision);
287 else
288 D_ASSERT((s & RQ_NET_MASK) == 0);
289
290 /* for writes we need to do some extra housekeeping */
291 if (rw == WRITE)
292 _about_to_complete_local_write(mdev, req);
293
294 /* Update disk stats */
295 _drbd_end_io_acct(mdev, req);
296
297 m->error = ok ? 0 : (error ?: -EIO);
298 m->bio = req->master_bio;
299 req->master_bio = NULL;
300 }
301
302 if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
303 /* this is disconnected (local only) operation,
304 * or protocol C P_WRITE_ACK,
305 * or protocol A or B P_BARRIER_ACK,
306 * or killed from the transfer log due to connection loss. */
307 _req_is_done(mdev, req, rw);
308 }
309 /* else: network part and not DONE yet. that is
310 * protocol A or B, barrier ack still pending... */
311}
312
313/*
314 * checks whether there was an overlapping request
315 * or ee already registered.
316 *
317 * if so, return 1, in which case this request is completed on the spot,
318 * without ever being submitted or send.
319 *
320 * return 0 if it is ok to submit this request.
321 *
322 * NOTE:
323 * paranoia: assume something above us is broken, and issues different write
324 * requests for the same block simultaneously...
325 *
326 * To ensure these won't be reordered differently on both nodes, resulting in
327 * diverging data sets, we discard the later one(s). Not that this is supposed
328 * to happen, but this is the rationale why we also have to check for
329 * conflicting requests with local origin, and why we have to do so regardless
330 * of whether we allowed multiple primaries.
331 *
332 * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
333 * second hlist_for_each_entry becomes a noop. This is even simpler than to
334 * grab a reference on the net_conf, and check for the two_primaries flag...
335 */
336static int _req_conflicts(struct drbd_request *req)
337{
338 struct drbd_conf *mdev = req->mdev;
339 const sector_t sector = req->sector;
340 const int size = req->size;
341 struct drbd_request *i;
342 struct drbd_epoch_entry *e;
343 struct hlist_node *n;
344 struct hlist_head *slot;
345
346 D_ASSERT(hlist_unhashed(&req->colision));
347
348 if (!get_net_conf(mdev))
349 return 0;
350
351 /* BUG_ON */
352 ERR_IF (mdev->tl_hash_s == 0)
353 goto out_no_conflict;
354 BUG_ON(mdev->tl_hash == NULL);
355
356#define OVERLAPS overlaps(i->sector, i->size, sector, size)
357 slot = tl_hash_slot(mdev, sector);
358 hlist_for_each_entry(i, n, slot, colision) {
359 if (OVERLAPS) {
360 dev_alert(DEV, "%s[%u] Concurrent local write detected! "
361 "[DISCARD L] new: %llus +%u; "
362 "pending: %llus +%u\n",
363 current->comm, current->pid,
364 (unsigned long long)sector, size,
365 (unsigned long long)i->sector, i->size);
366 goto out_conflict;
367 }
368 }
369
370 if (mdev->ee_hash_s) {
371 /* now, check for overlapping requests with remote origin */
372 BUG_ON(mdev->ee_hash == NULL);
373#undef OVERLAPS
374#define OVERLAPS overlaps(e->sector, e->size, sector, size)
375 slot = ee_hash_slot(mdev, sector);
376 hlist_for_each_entry(e, n, slot, colision) {
377 if (OVERLAPS) {
378 dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
379 " [DISCARD L] new: %llus +%u; "
380 "pending: %llus +%u\n",
381 current->comm, current->pid,
382 (unsigned long long)sector, size,
383 (unsigned long long)e->sector, e->size);
384 goto out_conflict;
385 }
386 }
387 }
388#undef OVERLAPS
389
390out_no_conflict:
391 /* this is like it should be, and what we expected.
392 * our users do behave after all... */
393 put_net_conf(mdev);
394 return 0;
395
396out_conflict:
397 put_net_conf(mdev);
398 return 1;
399}
400
/* obviously this could be coded as many single functions
 * instead of one huge switch,
 * or by putting the code directly in the respective locations
 * (as it has been before).
 *
 * but having it this way
 *  enforces that it is all in this one place, where it is easier to audit,
 *  it makes it obvious that whatever "event" "happens" to a request should
 *  happen "atomically" within the req_lock,
 *  and it enforces that we have to think in a very structured manner
 *  about the "events" that may happen to a request during its life time ...
 *
 * __req_mod() - apply one event to a request
 * @req:  the request the event happened to
 * @what: the event; selects which req->rq_state flags get set or cleared
 *        and which counters, hashes and work queues are updated
 * @m:    m->bio is reset to NULL on entry.  Most events end in
 *        _req_may_be_done(), which sets m->bio (and m->error) if the
 *        master bio is fit to be completed.
 *
 * An unknown event is only logged as "LOGIC BUG".
 */
void __req_mod(struct drbd_request *req, enum drbd_req_event what,
		struct bio_and_error *m)
{
	struct drbd_conf *mdev = req->mdev;
	m->bio = NULL;

	trace_drbd_req(req, what, NULL);

	switch (what) {
	default:
		dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
		break;

	/* does not happen...
	 * initialization done in drbd_req_new
	case created:
		break;
		*/

	case to_be_send: /* via network */
		/* reached via drbd_make_request_common
		 * and from w_read_retry_remote */
		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
		req->rq_state |= RQ_NET_PENDING;
		inc_ap_pending(mdev);
		break;

	case to_be_submitted: /* locally */
		/* reached via drbd_make_request_common */
		D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
		req->rq_state |= RQ_LOCAL_PENDING;
		break;

	case completed_ok:
		/* local io succeeded: account the size in 512 byte sectors */
		if (bio_data_dir(req->master_bio) == WRITE)
			mdev->writ_cnt += req->size>>9;
		else
			mdev->read_cnt += req->size>>9;

		req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
		req->rq_state &= ~RQ_LOCAL_PENDING;

		_req_may_be_done(req, m);
		put_ldev(mdev);
		break;

	case write_completed_with_error:
		/* local part completed, but RQ_LOCAL_OK stays unset */
		req->rq_state |= RQ_LOCAL_COMPLETED;
		req->rq_state &= ~RQ_LOCAL_PENDING;

		dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
		      (unsigned long long)req->sector, req->size);
		/* and now: check how to handle local io error. */
		__drbd_chk_io_error(mdev, FALSE);
		_req_may_be_done(req, m);
		put_ldev(mdev);
		break;

	case read_ahead_completed_with_error:
		/* it is legal to fail READA */
		req->rq_state |= RQ_LOCAL_COMPLETED;
		req->rq_state &= ~RQ_LOCAL_PENDING;
		_req_may_be_done(req, m);
		put_ldev(mdev);
		break;

	case read_completed_with_error:
		drbd_set_out_of_sync(mdev, req->sector, req->size);

		req->rq_state |= RQ_LOCAL_COMPLETED;
		req->rq_state &= ~RQ_LOCAL_PENDING;

		dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
		      (unsigned long long)req->sector, req->size);
		/* _req_mod(req,to_be_send); oops, recursion... */
		/* so the to_be_send case is open coded here instead */
		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
		req->rq_state |= RQ_NET_PENDING;
		inc_ap_pending(mdev);

		__drbd_chk_io_error(mdev, FALSE);
		put_ldev(mdev);
		/* NOTE: if we have no connection,
		 * or know the peer has no good data either,
		 * then we don't actually need to "queue_for_net_read",
		 * but we do so anyways, since the drbd_io_error()
		 * and the potential state change to "Diskless"
		 * needs to be done from process context */

		/* fall through: _req_mod(req,queue_for_net_read); */

	case queue_for_net_read:
		/* READ or READA, and
		 *      no local disk,
		 *      or target area marked as invalid,
		 *      or just got an io-error. */
		/* from drbd_make_request_common
		 * or from bio_endio during read io-error recovery */

		/* so we can verify the handle in the answer packet
		 * corresponding hlist_del is in _req_may_be_done() */
		hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));

		set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */

		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		req->rq_state |= RQ_NET_QUEUED;
		/* a request that had a local part got here via the
		 * io-error path above, and is retried remotely */
		req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
			? w_read_retry_remote
			: w_send_read_req;
		drbd_queue_work(&mdev->data.work, &req->w);
		break;

	case queue_for_net_write:
		/* assert something? */
		/* from drbd_make_request_common only */

		hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
		/* corresponding hlist_del is in _req_may_be_done() */

		/* NOTE
		 * In case the req ended up on the transfer log before being
		 * queued on the worker, it could lead to this request being
		 * missed during cleanup after connection loss.
		 * So we have to do both operations here,
		 * within the same lock that protects the transfer log.
		 *
		 * _req_add_to_epoch(req); this has to be after the
		 * _maybe_start_new_epoch(req); which happened in
		 * drbd_make_request_common, because we now may set the bit
		 * again ourselves to close the current epoch.
		 *
		 * Add req to the (now) current epoch (barrier). */

		/* see drbd_make_request_common,
		 * just after it grabs the req_lock */
		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);

		req->epoch = mdev->newest_tle->br_number;
		list_add_tail(&req->tl_requests,
				&mdev->newest_tle->requests);

		/* increment size of current epoch */
		mdev->newest_tle->n_req++;

		/* queue work item to send data */
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		req->rq_state |= RQ_NET_QUEUED;
		req->w.cb =  w_send_dblock;
		drbd_queue_work(&mdev->data.work, &req->w);

		/* close the epoch, in case it outgrew the limit */
		if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
			queue_barrier(mdev);

		break;

	case send_canceled:
		/* treat it the same */
	case send_failed:
		/* real cleanup will be done from tl_clear.  just update flags
		 * so it is no longer marked as on the worker queue */
		req->rq_state &= ~RQ_NET_QUEUED;
		/* if we did it right, tl_clear should be scheduled only after
		 * this, so this should not be necessary! */
		_req_may_be_done(req, m);
		break;

	case handed_over_to_network:
		/* assert something? */
		if (bio_data_dir(req->master_bio) == WRITE &&
		    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
			/* this is what is dangerous about protocol A:
			 * pretend it was successfully written on the peer. */
			if (req->rq_state & RQ_NET_PENDING) {
				dec_ap_pending(mdev);
				req->rq_state &= ~RQ_NET_PENDING;
				req->rq_state |= RQ_NET_OK;
			} /* else: neg-ack was faster... */
			/* it is still not yet RQ_NET_DONE until the
			 * corresponding epoch barrier got acked as well,
			 * so we know what to dirty on connection loss */
		}
		req->rq_state &= ~RQ_NET_QUEUED;
		req->rq_state |= RQ_NET_SENT;
		/* because _drbd_send_zc_bio could sleep, and may want to
		 * dereference the bio even after the "write_acked_by_peer" and
		 * "completed_ok" events came in, once we return from
		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
		 * whether it is done already, and end it.  */
		_req_may_be_done(req, m);
		break;

	case connection_lost_while_pending:
		/* transfer log cleanup after connection loss */
		/* assert something? */
		if (req->rq_state & RQ_NET_PENDING)
			dec_ap_pending(mdev);
		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
		req->rq_state |= RQ_NET_DONE;
		/* if it is still queued, we may not complete it here.
		 * it will be canceled soon. */
		if (!(req->rq_state & RQ_NET_QUEUED))
			_req_may_be_done(req, m);
		break;

	case write_acked_by_peer_and_sis:
		req->rq_state |= RQ_NET_SIS;
		/* fall through; the alert below is skipped for this event,
		 * but RQ_NET_DONE is set for it as well */
	case conflict_discarded_by_peer:
		/* for discarded conflicting writes of multiple primaries,
		 * there is no need to keep anything in the tl, potential
		 * node crashes are covered by the activity log. */
		if (what == conflict_discarded_by_peer)
			dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
			      " DRBD is not a random data generator!\n",
			      (unsigned long long)req->sector, req->size);
		req->rq_state |= RQ_NET_DONE;
		/* fall through */
	case write_acked_by_peer:
		/* protocol C; successfully written on peer.
		 * Nothing to do here.
		 * We want to keep the tl in place for all protocols, to cater
		 * for volatile write-back caches on lower level devices.
		 *
		 * A barrier request is expected to have forced all prior
		 * requests onto stable storage, so completion of a barrier
		 * request could set NET_DONE right here, and not wait for the
		 * P_BARRIER_ACK, but that is an unnecessary optimization. */

		/* this makes it effectively the same as for: */
	case recv_acked_by_peer:
		/* protocol B; pretends to be successfully written on peer.
		 * see also notes above in handed_over_to_network about
		 * protocol != C */
		req->rq_state |= RQ_NET_OK;
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		dec_ap_pending(mdev);
		req->rq_state &= ~RQ_NET_PENDING;
		_req_may_be_done(req, m);
		break;

	case neg_acked:
		/* assert something? */
		if (req->rq_state & RQ_NET_PENDING)
			dec_ap_pending(mdev);
		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);

		req->rq_state |= RQ_NET_DONE;
		_req_may_be_done(req, m);
		/* else: done by handed_over_to_network */
		break;

	case barrier_acked:
		if (req->rq_state & RQ_NET_PENDING) {
			/* barrier came in before all requests have been acked.
			 * this is bad, because if the connection is lost now,
			 * we won't be able to clean them up... */
			dev_err(DEV, "FIXME (barrier_acked but pending)\n");
			trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)");
			list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
		}
		D_ASSERT(req->rq_state & RQ_NET_SENT);
		req->rq_state |= RQ_NET_DONE;
		_req_may_be_done(req, m);
		break;

	case data_received:
		/* answer to a remote read request arrived */
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		dec_ap_pending(mdev);
		req->rq_state &= ~RQ_NET_PENDING;
		req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
		_req_may_be_done(req, m);
		break;
	};
}
687
688/* we may do a local read if:
689 * - we are consistent (of course),
690 * - or we are generally inconsistent,
691 * BUT we are still/already IN SYNC for this area.
692 * since size may be bigger than BM_BLOCK_SIZE,
693 * we may need to check several bits.
694 */
695static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
696{
697 unsigned long sbnr, ebnr;
698 sector_t esector, nr_sectors;
699
700 if (mdev->state.disk == D_UP_TO_DATE)
701 return 1;
702 if (mdev->state.disk >= D_OUTDATED)
703 return 0;
704 if (mdev->state.disk < D_INCONSISTENT)
705 return 0;
706 /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
707 nr_sectors = drbd_get_capacity(mdev->this_bdev);
708 esector = sector + (size >> 9) - 1;
709
710 D_ASSERT(sector < nr_sectors);
711 D_ASSERT(esector < nr_sectors);
712
713 sbnr = BM_SECT_TO_BIT(sector);
714 ebnr = BM_SECT_TO_BIT(esector);
715
716 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
717}
718
/*
 * Set up a drbd_request for @bio and dispatch it to the local disk,
 * to the peer, or to both.
 *
 * WRITEs are always candidates for the peer; READs go to the peer only if
 * they cannot be served locally.  Whether the peer is actually usable is
 * decided from state.pdsk and state.conn, once before and once again
 * within the req_lock.
 *
 * If the request cannot be served at all, @bio is completed with an error
 * right here.  A WRITE that conflicts with a pending request is completed
 * with status 0 without being submitted or sent.
 *
 * Every path returns 0.
 */
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
{
	const int rw = bio_rw(bio);
	const int size = bio->bi_size;
	const sector_t sector = bio->bi_sector;
	struct drbd_tl_epoch *b = NULL;	/* spare epoch object, if allocated */
	struct drbd_request *req;
	int local, remote;
	int err = -EIO;

	/* allocate outside of all locks; */
	req = drbd_req_new(mdev, bio);
	if (!req) {
		dec_ap_bio(mdev);
		/* only pass the error to the upper layers.
		 * if user cannot handle io errors, that's not our business. */
		dev_err(DEV, "could not kmalloc() req\n");
		bio_endio(bio, -ENOMEM);
		return 0;
	}

	trace_drbd_bio(mdev, "Rq", bio, 0, req);

	local = get_ldev(mdev);
	if (!local) {
		bio_put(req->private_bio); /* or we get a bio leak */
		req->private_bio = NULL;
	}
	if (rw == WRITE) {
		remote = 1;
	} else {
		/* READ || READA */
		if (local) {
			if (!drbd_may_do_local_read(mdev, sector, size)) {
				/* we could kick the syncer to
				 * sync this extent asap, wait for
				 * it, then continue locally.
				 * Or just issue the request remotely.
				 */
				local = 0;
				bio_put(req->private_bio);
				req->private_bio = NULL;
				put_ldev(mdev);
			}
		}
		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
	}

	/* If we have a disk, but a READA request is mapped to remote,
	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
	 * Just fail that READA request right here.
	 *
	 * THINK: maybe fail all READA when not local?
	 *        or make this configurable...
	 *        if network is slow, READA won't do any good.
	 */
	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
		err = -EWOULDBLOCK;
		goto fail_and_free_req;
	}

	/* For WRITES going to the local disk, grab a reference on the target
	 * extent.  This waits for any resync activity in the corresponding
	 * resync extent to finish, and, if necessary, pulls in the target
	 * extent into the activity log, which involves further disk io because
	 * of transactional on-disk meta data updates. */
	if (rw == WRITE && local)
		drbd_al_begin_io(mdev, sector);

	/* first check, outside the req_lock; repeated below within the lock */
	remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
			    (mdev->state.pdsk == D_INCONSISTENT &&
			     mdev->state.conn >= C_CONNECTED));

	if (!(local || remote)) {
		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
		goto fail_free_complete;
	}

	/* For WRITE request, we have to make sure that we have an
	 * unused_spare_tle, in case we need to start a new epoch.
	 * I try to be smart and avoid to pre-allocate always "just in case",
	 * but there is a race between testing the bit and pointer outside the
	 * spinlock, and grabbing the spinlock.
	 * if we lost that race, we retry.  */
	if (rw == WRITE && remote &&
	    mdev->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->flags)) {
allocate_barrier:
		/* also entered from below, with the req_lock dropped again */
		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
		if (!b) {
			dev_err(DEV, "Failed to alloc barrier.\n");
			err = -ENOMEM;
			goto fail_free_complete;
		}
	}

	/* GOOD, everything prepared, grab the spin_lock */
	spin_lock_irq(&mdev->req_lock);

	if (remote) {
		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
			    (mdev->state.pdsk == D_INCONSISTENT &&
			     mdev->state.conn >= C_CONNECTED));
		if (!remote)
			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
		if (!(local || remote)) {
			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
			spin_unlock_irq(&mdev->req_lock);
			goto fail_free_complete;
		}
	}

	/* hand our pre-allocated epoch object over, if it is still needed;
	 * otherwise b stays set and is freed after the lock is dropped */
	if (b && mdev->unused_spare_tle == NULL) {
		mdev->unused_spare_tle = b;
		b = NULL;
	}
	if (rw == WRITE && remote &&
	    mdev->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->flags)) {
		/* someone closed the current epoch
		 * while we were grabbing the spinlock */
		spin_unlock_irq(&mdev->req_lock);
		goto allocate_barrier;
	}


	/* Update disk stats */
	_drbd_start_io_acct(mdev, req, bio);

	/* _maybe_start_new_epoch(mdev);
	 * If we need to generate a write barrier packet, we have to add the
	 * new epoch (barrier) object, and queue the barrier packet for sending,
	 * and queue the req's data after it _within the same lock_, otherwise
	 * we have race conditions where the reorder domains could be mixed up.
	 *
	 * Even read requests may start a new epoch and queue the corresponding
	 * barrier packet.  To get the write ordering right, we only have to
	 * make sure that, if this is a write request and it triggered a
	 * barrier packet, this request is queued within the same spinlock. */
	if (remote && mdev->unused_spare_tle &&
	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, mdev->unused_spare_tle);
		mdev->unused_spare_tle = NULL;
	} else {
		D_ASSERT(!(remote && rw == WRITE &&
			   test_bit(CREATE_BARRIER, &mdev->flags)));
	}

	/* NOTE
	 * Actually, 'local' may be wrong here already, since we may have failed
	 * to write to the meta data, and may become wrong anytime because of
	 * local io-error for some other request, which would lead to us
	 * "detaching" the local disk.
	 *
	 * 'remote' may become wrong any time because the network could fail.
	 *
	 * This is a harmless race condition, though, since it is handled
	 * correctly at the appropriate places; so it just defers the failure
	 * of the respective operation.
	 */

	/* mark them early for readability.
	 * this just sets some state flags. */
	if (remote)
		_req_mod(req, to_be_send);
	if (local)
		_req_mod(req, to_be_submitted);

	/* check this request on the collision detection hash tables.
	 * if we have a conflict, just complete it here.
	 * THINK do we want to check reads, too? (I don't think so...) */
	if (rw == WRITE && _req_conflicts(req)) {
		/* this is a conflicting request.
		 * even though it may have been only _partially_
		 * overlapping with one of the currently pending requests,
		 * without even submitting or sending it, we will
		 * pretend that it was successfully served right now.
		 */
		if (local) {
			bio_put(req->private_bio);
			req->private_bio = NULL;
			drbd_al_complete_io(mdev, req->sector);
			put_ldev(mdev);
			local = 0;
		}
		/* balance the inc_ap_pending done for to_be_send above */
		if (remote)
			dec_ap_pending(mdev);
		_drbd_end_io_acct(mdev, req);
		/* THINK: do we want to fail it (-EIO), or pretend success? */
		bio_endio(req->master_bio, 0);
		req->master_bio = NULL;
		dec_ap_bio(mdev);
		drbd_req_free(req);
		/* req is gone; with local and remote both 0,
		 * it is not touched below anymore */
		remote = 0;
	}

	/* NOTE remote first: to get the concurrent write detection right,
	 * we must register the request before start of local IO.  */
	if (remote) {
		/* either WRITE and C_CONNECTED,
		 * or READ, and no local disk,
		 * or READ, but not in sync.
		 */
		_req_mod(req, (rw == WRITE)
				? queue_for_net_write
				: queue_for_net_read);
	}
	spin_unlock_irq(&mdev->req_lock);
	kfree(b); /* if someone else has beaten us to it... */

	if (local) {
		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;

		trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL);

		/* fault injection: complete the private bio with -EIO
		 * instead of submitting it */
		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
				     : rw == READ  ? DRBD_FAULT_DT_RD
				     :               DRBD_FAULT_DT_RA))
			bio_endio(req->private_bio, -EIO);
		else
			generic_make_request(req->private_bio);
	}

	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
	 * we plug after submit, so we won't miss an unplug event */
	drbd_plug_device(mdev);

	return 0;

fail_free_complete:
	/* release the activity log reference taken by drbd_al_begin_io */
	if (rw == WRITE && local)
		drbd_al_complete_io(mdev, sector);
fail_and_free_req:
	if (local) {
		bio_put(req->private_bio);
		req->private_bio = NULL;
		put_ldev(mdev);
	}
	bio_endio(bio, err);
	drbd_req_free(req);
	dec_ap_bio(mdev);
	kfree(b);

	return 0;
}
964
965/* helper function for drbd_make_request
966 * if we can determine just by the mdev (state) that this request will fail,
967 * return 1
968 * otherwise return 0
969 */
970static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
971{
972 /* Unconfigured */
973 if (mdev->state.conn == C_DISCONNECTING &&
974 mdev->state.disk == D_DISKLESS)
975 return 1;
976
977 if (mdev->state.role != R_PRIMARY &&
978 (!allow_oos || is_write)) {
979 if (__ratelimit(&drbd_ratelimit_state)) {
980 dev_err(DEV, "Process %s[%u] tried to %s; "
981 "since we are not in Primary state, "
982 "we cannot allow this\n",
983 current->comm, current->pid,
984 is_write ? "WRITE" : "READ");
985 }
986 return 1;
987 }
988
989 /*
990 * Paranoia: we might have been primary, but sync target, or
991 * even diskless, then lost the connection.
992 * This should have been handled (panic? suspend?) somewhere
993 * else. But maybe it was not, so check again here.
994 * Caution: as long as we do not have a read/write lock on mdev,
995 * to serialize state changes, this is racy, since we may lose
996 * the connection *after* we test for the cstate.
997 */
998 if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
999 if (__ratelimit(&drbd_ratelimit_state))
1000 dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
1001 return 1;
1002 }
1003
1004 return 0;
1005}
1006
1007int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1008{
1009 unsigned int s_enr, e_enr;
1010 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1011
1012 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
1013 bio_endio(bio, -EPERM);
1014 return 0;
1015 }
1016
1017 /* Reject barrier requests if we know the underlying device does
1018 * not support them.
1019 * XXX: Need to get this info from peer as well some how so we
1020 * XXX: reject if EITHER side/data/metadata area does not support them.
1021 *
1022 * because of those XXX, this is not yet enabled,
1023 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
1024 */
1025 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
1026 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1027 bio_endio(bio, -EOPNOTSUPP);
1028 return 0;
1029 }
1030
1031 /*
1032 * what we "blindly" assume:
1033 */
1034 D_ASSERT(bio->bi_size > 0);
1035 D_ASSERT((bio->bi_size & 0x1ff) == 0);
1036 D_ASSERT(bio->bi_idx == 0);
1037
1038 /* to make some things easier, force alignment of requests within the
1039 * granularity of our hash tables */
1040 s_enr = bio->bi_sector >> HT_SHIFT;
1041 e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
1042
1043 if (likely(s_enr == e_enr)) {
1044 inc_ap_bio(mdev, 1);
1045 return drbd_make_request_common(mdev, bio);
1046 }
1047
1048 /* can this bio be split generically?
1049 * Maybe add our own split-arbitrary-bios function. */
1050 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
1051 /* rather error out here than BUG in bio_split */
1052 dev_err(DEV, "bio would need to, but cannot, be split: "
1053 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
1054 bio->bi_vcnt, bio->bi_idx, bio->bi_size,
1055 (unsigned long long)bio->bi_sector);
1056 bio_endio(bio, -EINVAL);
1057 } else {
1058 /* This bio crosses some boundary, so we have to split it. */
1059 struct bio_pair *bp;
1060 /* works for the "do not cross hash slot boundaries" case
1061 * e.g. sector 262269, size 4096
1062 * s_enr = 262269 >> 6 = 4097
1063 * e_enr = (262269+8-1) >> 6 = 4098
1064 * HT_SHIFT = 6
1065 * sps = 64, mask = 63
1066 * first_sectors = 64 - (262269 & 63) = 3
1067 */
1068 const sector_t sect = bio->bi_sector;
1069 const int sps = 1 << HT_SHIFT; /* sectors per slot */
1070 const int mask = sps - 1;
1071 const sector_t first_sectors = sps - (sect & mask);
1072 bp = bio_split(bio,
1073#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
1074 bio_split_pool,
1075#endif
1076 first_sectors);
1077
1078 /* we need to get a "reference count" (ap_bio_cnt)
1079 * to avoid races with the disconnect/reconnect/suspend code.
1080 * In case we need to split the bio here, we need to get two references
1081 * atomically, otherwise we might deadlock when trying to submit the
1082 * second one! */
1083 inc_ap_bio(mdev, 2);
1084
1085 D_ASSERT(e_enr == s_enr + 1);
1086
1087 drbd_make_request_common(mdev, &bp->bio1);
1088 drbd_make_request_common(mdev, &bp->bio2);
1089 bio_pair_release(bp);
1090 }
1091 return 0;
1092}
1093
1094/* This is called by bio_add_page(). With this function we reduce
1095 * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
1096 * units (was AL_EXTENTs).
1097 *
1098 * we do the calculation within the lower 32bit of the byte offsets,
1099 * since we don't care for actual offset, but only check whether it
1100 * would cross "activity log extent" boundaries.
1101 *
1102 * As long as the BIO is empty we have to allow at least one bvec,
1103 * regardless of size and offset. so the resulting bio may still
1104 * cross extent boundaries. those are dealt with (bio_split) in
1105 * drbd_make_request_26.
1106 */
1107int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1108{
1109 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1110 unsigned int bio_offset =
1111 (unsigned int)bvm->bi_sector << 9; /* 32 bit */
1112 unsigned int bio_size = bvm->bi_size;
1113 int limit, backing_limit;
1114
1115 limit = DRBD_MAX_SEGMENT_SIZE
1116 - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
1117 if (limit < 0)
1118 limit = 0;
1119 if (bio_size == 0) {
1120 if (limit <= bvec->bv_len)
1121 limit = bvec->bv_len;
1122 } else if (limit && get_ldev(mdev)) {
1123 struct request_queue * const b =
1124 mdev->ldev->backing_bdev->bd_disk->queue;
1125 if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
1126 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1127 limit = min(limit, backing_limit);
1128 }
1129 put_ldev(mdev);
1130 }
1131 return limit;
1132}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000000..d37ab57f1209
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,327 @@
1/*
2 drbd_req.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
8 Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9
10 DRBD is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 DRBD is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_REQ_H
26#define _DRBD_REQ_H
27
28#include <linux/autoconf.h>
29#include <linux/module.h>
30
31#include <linux/slab.h>
32#include <linux/drbd.h>
33#include "drbd_int.h"
34#include "drbd_wrappers.h"
35
36/* The request callbacks will be called in irq context by the IDE drivers,
37 and in Softirqs/Tasklets/BH context by the SCSI drivers,
38 and by the receiver and worker in kernel-thread context.
39 Try to get the locking right :) */
40
41/*
42 * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
43 * associated with IO requests originating from the block layer above us.
44 *
45 * There are quite a few things that may happen to a drbd request
46 * during its lifetime.
47 *
48 * It will be created.
49 * It will be marked with the intention to be
50 * submitted to local disk and/or
51 * sent via the network.
52 *
53 * It has to be placed on the transfer log and other housekeeping lists,
54 * In case we have a network connection.
55 *
56 * It may be identified as a concurrent (write) request
57 * and be handled accordingly.
58 *
59 * It may be handed over to the local disk subsystem.
60 * It may be completed by the local disk subsystem,
61 * either successfully or with io-error.
62 * In case it is a READ request, and it failed locally,
63 * it may be retried remotely.
64 *
65 * It may be queued for sending.
66 * It may be handed over to the network stack,
67 * which may fail.
68 * It may be acknowledged by the "peer" according to the wire_protocol in use.
69 * this may be a negative ack.
70 * It may receive a faked ack when the network connection is lost and the
71 * transfer log is cleaned up.
72 * Sending may be canceled due to network connection loss.
73 * When it finally has outlived its time,
74 * corresponding dirty bits in the resync-bitmap may be cleared or set,
75 * it will be destroyed,
76 * and completion will be signalled to the originator,
77 * with or without "success".
78 */
79
/* Events (and a few actions) a drbd_request can be subjected to.
 * A value of this type is the "what" argument of __req_mod(), _req_mod()
 * and req_mod() declared further down in this header. */
80enum drbd_req_event {
81 created,
82 to_be_send,
83 to_be_submitted,
84
85 /* XXX yes, now I am inconsistent...
86 * these two are not "events" but "actions"
87 * oh, well... */
88 queue_for_net_write,
89 queue_for_net_read,
90
91 send_canceled,
92 send_failed,
93 handed_over_to_network,
94 connection_lost_while_pending,
95 recv_acked_by_peer,
96 write_acked_by_peer,
97 write_acked_by_peer_and_sis, /* and set_in_sync */
98 conflict_discarded_by_peer,
99 neg_acked,
100 barrier_acked, /* in protocol A and B */
101 data_received, /* (remote read) */
102
103 read_completed_with_error,
104 read_ahead_completed_with_error,
105 write_completed_with_error,
106 completed_ok,
107 nothing, /* for tracing only */
108};
109
110/* encoding of request states for now. we don't actually need that many bits.
111 * we don't need to do atomic bit operations either, since most of the time we
112 * need to look at the connection state and/or manipulate some lists at the
113 * same time, so we should hold the request lock anyways.
114 */
/* Bit numbers within the request state word (req->rq_state).
 * The matching RQ_* bit masks are derived from these as (1UL << __RQ_*).
 * The first three bits describe the local disk part of a request, the
 * remaining ones the network part; the order of the enumerators therefore
 * matters for RQ_LOCAL_MASK and RQ_NET_MASK. */
115enum drbd_req_state_bits {
116 /* 210
117 * 000: no local possible
118 * 001: to be submitted
119 * UNUSED, we could map: 011: submitted, completion still pending
120 * 110: completed ok
121 * 010: completed with error
122 */
123 __RQ_LOCAL_PENDING,
124 __RQ_LOCAL_COMPLETED,
125 __RQ_LOCAL_OK,
126
127 /* 76543
128 * 00000: no network possible
129 * 00001: to be send
130 * 00011: to be send, on worker queue
131 * 00101: sent, expecting recv_ack (B) or write_ack (C)
132 * 11101: sent,
133 * recv_ack (B) or implicit "ack" (A),
134 * still waiting for the barrier ack.
135 * master_bio may already be completed and invalidated.
136 * 11100: write_acked (C),
137 * data_received (for remote read, any protocol)
138 * or finally the barrier ack has arrived (B,A)...
139 * request can be freed
140 * 01100: neg-acked (write, protocol C)
141 * or neg-d-acked (read, any protocol)
142 * or killed from the transfer log
143 * during cleanup after connection loss
144 * request can be freed
145 * 01000: canceled or send failed...
146 * request can be freed
147 */
148
149 /* if "SENT" is not set, yet, this can still fail or be canceled.
150 * if "SENT" is set already, we still wait for an Ack packet.
151 * when cleared, the master_bio may be completed.
152 * in (B,A) the request object may still linger on the transaction log
153 * until the corresponding barrier ack comes in */
154 __RQ_NET_PENDING,
155
156 /* If it is QUEUED, and it is a WRITE, it is also registered in the
157 * transfer log. Currently we need this flag to avoid conflicts between
158 * worker canceling the request and tl_clear_barrier killing it from
159 * transfer log. We should restructure the code so this conflict does
160 * no longer occur. */
161 __RQ_NET_QUEUED,
162
163 /* well, actually only "handed over to the network stack".
164 *
165 * TODO can potentially be dropped because of the similar meaning
166 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
167 * however it is not exactly the same. before we drop it
168 * we must ensure that we can tell a request with network part
169 * from a request without, regardless of what happens to it. */
170 __RQ_NET_SENT,
171
172 /* when set, the request may be freed (if RQ_NET_QUEUED is clear).
173 * basically this means the corresponding P_BARRIER_ACK was received */
174 __RQ_NET_DONE,
175
176 /* whether or not we know (C) or pretend (B,A) that the write
177 * was successfully written on the peer.
178 */
179 __RQ_NET_OK,
180
181 /* peer called drbd_set_in_sync() for this write */
182 __RQ_NET_SIS,
183
184 /* keep this last, it's for the RQ_NET_MASK */
185 __RQ_NET_MAX,
186};
187
/* Bit masks for the local part of the request state,
 * one per __RQ_LOCAL_* bit number. */
188#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
189#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
190#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
191
/* all bits up to and including RQ_LOCAL_OK */
192#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
193
/* Bit masks for the network part of the request state,
 * one per __RQ_NET_* bit number. */
194#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
195#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
196#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
197#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
198#define RQ_NET_OK (1UL << __RQ_NET_OK)
199#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
200
/* all bits below __RQ_NET_MAX that are not part of RQ_LOCAL_MASK */
201/* 0x1f8 */
202#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
203
204/* epoch entries */
205static inline
206struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
207{
208 BUG_ON(mdev->ee_hash_s == 0);
209 return mdev->ee_hash +
210 ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
211}
212
213/* transfer log (drbd_request objects) */
214static inline
215struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
216{
217 BUG_ON(mdev->tl_hash_s == 0);
218 return mdev->tl_hash +
219 ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
220}
221
222/* application reads (drbd_request objects) */
223static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
224{
225 return mdev->app_reads_hash
226 + ((unsigned int)(sector) % APP_R_HSIZE);
227}
228
229/* when we receive the answer for a read request,
230 * verify that we actually know about it */
231static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
232 u64 id, sector_t sector)
233{
234 struct hlist_head *slot = ar_hash_slot(mdev, sector);
235 struct hlist_node *n;
236 struct drbd_request *req;
237
238 hlist_for_each_entry(req, n, slot, colision) {
239 if ((unsigned long)req == (unsigned long)id) {
240 D_ASSERT(req->sector == sector);
241 return req;
242 }
243 }
244 return NULL;
245}
246
247static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
248 struct bio *bio_src)
249{
250 struct bio *bio;
251 struct drbd_request *req =
252 mempool_alloc(drbd_request_mempool, GFP_NOIO);
253 if (likely(req)) {
254 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
255
256 req->rq_state = 0;
257 req->mdev = mdev;
258 req->master_bio = bio_src;
259 req->private_bio = bio;
260 req->epoch = 0;
261 req->sector = bio->bi_sector;
262 req->size = bio->bi_size;
263 req->start_time = jiffies;
264 INIT_HLIST_NODE(&req->colision);
265 INIT_LIST_HEAD(&req->tl_requests);
266 INIT_LIST_HEAD(&req->w.list);
267
268 bio->bi_private = req;
269 bio->bi_end_io = drbd_endio_pri;
270 bio->bi_next = NULL;
271 }
272 return req;
273}
274
/* Give req back to drbd_request_mempool, the pool drbd_req_new()
 * allocates from. Only the request object itself is released here;
 * its bios are not touched. */
275static inline void drbd_req_free(struct drbd_request *req)
276{
277 mempool_free(req, drbd_request_mempool);
278}
279
280static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
281{
282 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
283}
284
285/* Short lived temporary struct on the stack.
286 * We could squirrel the error to be returned into
287 * bio->bi_size, or similar. But that would be too ugly. */
/* Filled in through the last argument of __req_mod() and then handed to
 * complete_master_bio() if bio is set; see _req_mod() and req_mod(). */
288struct bio_and_error {
289 struct bio *bio;
290 int error;
291};
292
293extern void _req_may_be_done(struct drbd_request *req,
294 struct bio_and_error *m);
295extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
296 struct bio_and_error *m);
297extern void complete_master_bio(struct drbd_conf *mdev,
298 struct bio_and_error *m);
299
300/* use this if you don't want to deal with calling complete_master_bio()
301 * outside the spinlock, e.g. when walking some list on cleanup. */
302static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
303{
304 struct drbd_conf *mdev = req->mdev;
305 struct bio_and_error m;
306
307 /* __req_mod possibly frees req, do not touch req after that! */
308 __req_mod(req, what, &m);
309 if (m.bio)
310 complete_master_bio(mdev, &m);
311}
312
313/* completion of master bio is outside of spinlock.
314 * If you need it irqsave, do it your self! */
315static inline void req_mod(struct drbd_request *req,
316 enum drbd_req_event what)
317{
318 struct drbd_conf *mdev = req->mdev;
319 struct bio_and_error m;
320 spin_lock_irq(&mdev->req_lock);
321 __req_mod(req, what, &m);
322 spin_unlock_irq(&mdev->req_lock);
323
324 if (m.bio)
325 complete_master_bio(mdev, &m);
326}
327#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000000..76863e3f05be
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
1/*
2 drbd_strings.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#include <linux/drbd.h>
27
/* Printable names of the connection states, indexed by the value of the
 * enum drbd_conns constant. Looked up by drbd_conn_str(). */
28static const char *drbd_conn_s_names[] = {
29 [C_STANDALONE] = "StandAlone",
30 [C_DISCONNECTING] = "Disconnecting",
31 [C_UNCONNECTED] = "Unconnected",
32 [C_TIMEOUT] = "Timeout",
33 [C_BROKEN_PIPE] = "BrokenPipe",
34 [C_NETWORK_FAILURE] = "NetworkFailure",
35 [C_PROTOCOL_ERROR] = "ProtocolError",
36 [C_WF_CONNECTION] = "WFConnection",
37 [C_WF_REPORT_PARAMS] = "WFReportParams",
38 [C_TEAR_DOWN] = "TearDown",
39 [C_CONNECTED] = "Connected",
40 [C_STARTING_SYNC_S] = "StartingSyncS",
41 [C_STARTING_SYNC_T] = "StartingSyncT",
42 [C_WF_BITMAP_S] = "WFBitMapS",
43 [C_WF_BITMAP_T] = "WFBitMapT",
44 [C_WF_SYNC_UUID] = "WFSyncUUID",
45 [C_SYNC_SOURCE] = "SyncSource",
46 [C_SYNC_TARGET] = "SyncTarget",
47 [C_PAUSED_SYNC_S] = "PausedSyncS",
48 [C_PAUSED_SYNC_T] = "PausedSyncT",
49 [C_VERIFY_S] = "VerifyS",
50 [C_VERIFY_T] = "VerifyT",
51};
52
/* Printable names of the node roles, indexed by the value of the
 * enum drbd_role constant. Looked up by drbd_role_str(). */
53static const char *drbd_role_s_names[] = {
54 [R_PRIMARY] = "Primary",
55 [R_SECONDARY] = "Secondary",
56 [R_UNKNOWN] = "Unknown"
57};
58
/* Printable names of the disk states, indexed by the value of the
 * enum drbd_disk_state constant. Looked up by drbd_disk_str(). */
59static const char *drbd_disk_s_names[] = {
60 [D_DISKLESS] = "Diskless",
61 [D_ATTACHING] = "Attaching",
62 [D_FAILED] = "Failed",
63 [D_NEGOTIATING] = "Negotiating",
64 [D_INCONSISTENT] = "Inconsistent",
65 [D_OUTDATED] = "Outdated",
66 [D_UNKNOWN] = "DUnknown",
67 [D_CONSISTENT] = "Consistent",
68 [D_UP_TO_DATE] = "UpToDate",
69};
70
/* Explanations for refused state changes. The table is indexed by the
 * negated SS_* code (hence the "-" in every designator), which is also how
 * drbd_set_st_err_str() looks an entry up. */
71static const char *drbd_state_sw_errors[] = {
72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
73 [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
77 [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
78 [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
79 [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
80 [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
81 [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
82 [-SS_DEVICE_IN_USE] = "Device is held open by someone",
83 [-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
84 [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
85 [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
86 [-SS_NOT_SUPPORTED] = "Peer does not support protocol",
87 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
88 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
89 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
90};
91
92const char *drbd_conn_str(enum drbd_conns s)
93{
94 /* enums are unsigned... */
95 return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
96}
97
98const char *drbd_role_str(enum drbd_role s)
99{
100 return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
101}
102
103const char *drbd_disk_str(enum drbd_disk_state s)
104{
105 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
106}
107
108const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
109{
110 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
111 err > SS_TWO_PRIMARIES ? "TOO_LARGE"
112 : drbd_state_sw_errors[-err];
113}
diff --git a/drivers/block/drbd/drbd_tracing.c b/drivers/block/drbd/drbd_tracing.c
new file mode 100644
index 000000000000..d18d4f7b4bef
--- /dev/null
+++ b/drivers/block/drbd/drbd_tracing.c
@@ -0,0 +1,752 @@
1/*
2 drbd_tracing.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/ctype.h>
29#include "drbd_int.h"
30#include "drbd_tracing.h"
31#include <linux/drbd_tag_magic.h>
32
/* Module metadata and descriptions of the three module parameters. */
33MODULE_LICENSE("GPL");
34MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg");
35MODULE_DESCRIPTION("DRBD tracepoint probes");
36MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c");
37MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)");
38MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)");
39
/* Backing variables of the module parameters. */
40unsigned int trace_mask = 0; /* Bitmap of events to trace */
41int trace_level; /* Current trace level */
42int trace_devs; /* Bitmap of devices to trace */
43
/* trace_mask is registered with mode 0444 (read-only);
 * trace_level and trace_devs with mode 0644 (writable by the owner). */
44module_param(trace_mask, uint, 0444);
45module_param(trace_level, int, 0644);
46module_param(trace_devs, int, 0644);
47
/* One bit per class of trace events.
 * NOTE(review): presumably these are the bits of trace_mask; the code that
 * tests them is outside this part of the file - confirm there. */
48enum {
49 TRACE_PACKET = 0x0001,
50 TRACE_RQ = 0x0002,
51 TRACE_UUID = 0x0004,
52 TRACE_RESYNC = 0x0008,
53 TRACE_EE = 0x0010,
54 TRACE_UNPLUG = 0x0020,
55 TRACE_NL = 0x0040,
56 TRACE_AL_EXT = 0x0080,
57 TRACE_INT_RQ = 0x0100,
58 TRACE_MD_IO = 0x0200,
59 TRACE_EPOCH = 0x0400,
60};
61
62/* Buffer printing support
63 * dbg_print_flags: values for the flags argument of drbd_print_buffer()
64 * - DBGPRINT_BUFFADDR; if set, each line starts with the
65 * virtual address of the line being output. If clear,
66 * each line starts with the offset from the beginning
67 * of the buffer. */
68enum dbg_print_flags {
69 DBGPRINT_BUFFADDR = 0x0001,
70};
71
72/* Macro stuff */
/* Return the name of netlink packet type packet_type, or "*Unknown*" if
 * the value is outside the name table.
 *
 * The table is generated by including linux/drbd_nl.h with NL_PACKET()
 * defined such that every packet declaration expands to a designated
 * initializer "[P_name] = "name",". The field macros are defined to
 * something that is not valid C, presumably so that any stray expansion of
 * them in this context breaks the build.
 * Note that the helper macros stay defined after this function.
 */
73static char *nl_packet_name(int packet_type)
74{
75/* Generate packet type strings */
76#define NL_PACKET(name, number, fields) \
77 [P_ ## name] = # name,
78#define NL_INTEGER Argh!
79#define NL_BIT Argh!
80#define NL_INT64 Argh!
81#define NL_STRING Argh!
82
83 static char *nl_tag_name[P_nl_after_last_packet] = {
84#include "linux/drbd_nl.h"
85 };
86
 /* packet_type is compared against an unsigned size here, so a
 * negative value converts to a large number and is reported as
 * unknown, too. Slots without a designated initializer are NULL. */
87 return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ?
88 nl_tag_name[packet_type] : "*Unknown*";
89}
90/* /Macro stuff */
91
92static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level)
93{
94 return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs);
95}
96
97static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg)
98{
99 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
100 return;
101
102 dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt));
103}
104
105static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
106{
107 static char *uuid_str[UI_EXTENDED_SIZE] = {
108 [UI_CURRENT] = "CURRENT",
109 [UI_BITMAP] = "BITMAP",
110 [UI_HISTORY_START] = "HISTORY_START",
111 [UI_HISTORY_END] = "HISTORY_END",
112 [UI_SIZE] = "SIZE",
113 [UI_FLAGS] = "FLAGS",
114 };
115
116 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
117 return;
118
119 if (index >= UI_EXTENDED_SIZE) {
120 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
121 return;
122 }
123
124 dev_info(DEV, " uuid[%s] now %016llX\n",
125 uuid_str[index],
126 (unsigned long long)mdev->ldev->md.uuid[index]);
127}
128
129static void probe_drbd_md_io(struct drbd_conf *mdev, int rw,
130 struct drbd_backing_dev *bdev)
131{
132 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
133 return;
134
135 dev_info(DEV, " %s metadata superblock now\n",
136 rw == READ ? "Reading" : "Writing");
137}
138
139static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg)
140{
141 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
142 return;
143
144 dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n",
145 msg, (unsigned long long)e->sector, e->size, e);
146}
147
148static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch,
149 enum epoch_event ev)
150{
151 static char *epoch_event_str[] = {
152 [EV_PUT] = "put",
153 [EV_GOT_BARRIER_NR] = "got_barrier_nr",
154 [EV_BARRIER_DONE] = "barrier_done",
155 [EV_BECAME_LAST] = "became_last",
156 [EV_TRACE_FLUSH] = "issuing_flush",
157 [EV_TRACE_ADD_BARRIER] = "added_barrier",
158 [EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch",
159 };
160
161 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
162 return;
163
164 ev &= ~EV_CLEANUP;
165
166 switch (ev) {
167 case EV_TRACE_ALLOC:
168 dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs);
169 break;
170 case EV_TRACE_FREE:
171 dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n",
172 epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
173 mdev->epochs);
174 break;
175 default:
176 dev_info(DEV, "Update epoch %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n",
177 epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
178 atomic_read(&epoch->active),
179 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-',
180 test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-',
181 test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-',
182 test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-',
183 epoch_event_str[ev]);
184 }
185}
186
187static void probe_drbd_netlink(void *data, int is_req)
188{
189 struct cn_msg *msg = data;
190
191 if (is_req) {
192 struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data;
193
194 printk(KERN_INFO "drbd%d: "
195 "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n",
196 nlp->drbd_minor,
197 nl_packet_name(nlp->packet_type),
198 nlp->packet_type,
199 msg->seq, msg->ack, msg->len);
200 } else {
201 struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data;
202
203 printk(KERN_INFO "drbd%d: "
204 "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n",
205 nlp->minor,
206 nlp->packet_type == P_nl_after_last_packet ?
207 "Empty-Reply" : nl_packet_name(nlp->packet_type),
208 nlp->packet_type,
209 msg->seq, msg->ack, msg->len);
210 }
211}
212
213static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg)
214{
215 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
216
217 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
218 return;
219
220 dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n",
221 msg, (unsigned long long) sector, enr,
222 (int)BM_SECT_TO_EXT(sector));
223}
224
225/**
226 * drbd_print_buffer() - Hexdump arbitrary binary data to the kernel log
227 * @prefix: String is output at the beginning of each line output.
228 * @flags: Currently only defined flag: DBGPRINT_BUFFADDR; if set, each
229 * line starts with the virtual address of the line being
230 * output. If clear, each line starts with the offset from the
231 * beginning of the buffer.
232 * @size: Indicates the size of each entry in the buffer. Supported
233 * values are sizeof(char), sizeof(short) and sizeof(int)
234 * @buffer: Start address of buffer
235 * @buffer_va: Virtual address of start of buffer (normally the same
236 * as Buffer, but having it separate allows it to hold
237 * file address for example)
238 * @length: length of buffer
 *
 * Every output line is printed with printk(KERN_DEBUG) and covers
 * LINE_SIZE bytes: the values in hex, followed by the same bytes as
 * characters between '|', with '.' for anything not printable.
 * The start of the buffer is rounded down and its end rounded up to a
 * multiple of @size. Any other @size is reported and nothing is dumped.
239 */
240static void drbd_print_buffer(const char *prefix, unsigned int flags, int size,
241 const void *buffer, const void *buffer_va,
242 unsigned int length)
243
/* bytes shown per output line, and the number of entries that makes */
244#define LINE_SIZE 16
245#define LINE_ENTRIES (int)(LINE_SIZE/size)
246{
247 const unsigned char *pstart;
248 const unsigned char *pstart_va;
249 const unsigned char *pend;
250 char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8];
251 char *pbytes = bytes_str, *pascii = ascii_str;
252 int offset = 0;
253 long sizemask;
254 int field_width;
255 int index;
256 const unsigned char *pend_str;
257 const unsigned char *p;
258 int count;
259
260 /* verify size parameter */
261 if (size != sizeof(char) &&
262 size != sizeof(short) &&
263 size != sizeof(int)) {
264 printk(KERN_DEBUG "drbd_print_buffer: "
265 "ERROR invalid size %d\n", size);
266 return;
267 }
268
 /* two hex digits per byte of an entry */
269 sizemask = size-1;
270 field_width = size*2;
271
272 /* Adjust start/end to be on appropriate boundary for size */
273 buffer = (const char *)((long)buffer & ~sizemask);
274 pend = (const unsigned char *)
275 (((long)buffer + length + sizemask) & ~sizemask);
276
277 if (flags & DBGPRINT_BUFFADDR) {
278 /* Move start back to nearest multiple of line size,
279 * if printing address. This results in nicely formatted output
280 * with addresses being on line size (16) byte boundaries */
281 pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1));
282 } else {
283 pstart = (const unsigned char *)buffer;
284 }
285
286 /* Set value of start VA to print if addresses asked for */
287 pstart_va = (const unsigned char *)buffer_va
288 - ((const unsigned char *)buffer-pstart);
289
290 /* Calculate end position to nicely align right hand side */
291 pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1));
292
293 /* Init strings */
294 *pbytes = *pascii = '\0';
295
296 /* Start at beginning of first line */
297 p = pstart;
298 count = 0;
299
300 while (p < pend_str) {
301 if (p < (const unsigned char *)buffer || p >= pend) {
302 /* Before start of buffer or after end- print spaces */
303 pbytes += sprintf(pbytes, "%*c ", field_width, ' ');
304 pascii += sprintf(pascii, "%*c", size, ' ');
305 p += size;
306 } else {
307 /* Add hex and ascii to strings */
308 int val;
309 switch (size) {
310 default:
311 case 1:
312 val = *(unsigned char *)p;
313 break;
314 case 2:
315 val = *(unsigned short *)p;
316 break;
317 case 4:
318 val = *(unsigned int *)p;
319 break;
320 }
321
322 pbytes += sprintf(pbytes, "%0*x ", field_width, val);
323
 /* one character per byte of the entry; this also
 * advances p past the entry */
324 for (index = size; index; index--) {
325 *pascii++ = isprint(*p) ? *p : '.';
326 p++;
327 }
328 }
329
330 count++;
331
 /* line complete, or nothing left to dump */
332 if (count == LINE_ENTRIES || p >= pend_str) {
333 /* Null terminate and print record */
334 *pascii = '\0';
335 printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n",
336 prefix,
337 (flags & DBGPRINT_BUFFADDR)
338 ? (long)pstart_va:(long)offset,
339 LINE_ENTRIES*(field_width+1), bytes_str,
340 LINE_SIZE, ascii_str);
341
342 /* Move onto next line */
343 pstart_va += (p-pstart);
344 pstart = p;
345 count = 0;
346 offset += LINE_SIZE;
347
348 /* Re-init strings */
349 pbytes = bytes_str;
350 pascii = ascii_str;
351 *pbytes = *pascii = '\0';
352 }
353 }
354}
355
356static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args)
357{
358 char str[256];
359
360 if (!is_mdev_trace(mdev, level))
361 return;
362
363 if (vsnprintf(str, 256, fmt, args) >= 256)
364 str[255] = 0;
365
366 printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)),
367 dev_name(disk_to_dev(mdev->vdisk)), str);
368}
369
370static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
371 struct drbd_request *r)
372{
373#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD)
374#define SECTOR_FORMAT "%Lx"
375#else
376#define SECTOR_FORMAT "%lx"
377#endif
378#define SECTOR_SHIFT 9
379
380 unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT);
381 char *faddr = (char *)(lowaddr);
382 char rb[sizeof(void *)*2+6] = { 0, };
383 struct bio_vec *bvec;
384 int segno;
385
386 const int rw = bio->bi_rw;
387 const int biorw = (rw & (RW_MASK|RWA_MASK));
388 const int biobarrier = (rw & (1<<BIO_RW_BARRIER));
389 const int biosync = (rw & ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)));
390
391 if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
392 return;
393
394 if (r)
395 sprintf(rb, "Req:%p ", r);
396
397 dev_info(DEV, "%s %s:%s%s%s Bio:%p %s- %soffset " SECTOR_FORMAT ", size %x\n",
398 complete ? "<<<" : ">>>",
399 pfx,
400 biorw == WRITE ? "Write" : "Read",
401 biobarrier ? " : B" : "",
402 biosync ? " : S" : "",
403 bio,
404 rb,
405 complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "",
406 bio->bi_sector << SECTOR_SHIFT,
407 bio->bi_size);
408
409 if (trace_level >= TRACE_LVL_METRICS &&
410 ((biorw == WRITE) ^ complete)) {
411 printk(KERN_DEBUG " ind page offset length\n");
412 __bio_for_each_segment(bvec, bio, segno, 0) {
413 printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n", segno,
414 bvec->bv_page, bvec->bv_offset, bvec->bv_len);
415
416 if (trace_level >= TRACE_LVL_ALL) {
417 char *bvec_buf;
418 unsigned long flags;
419
420 bvec_buf = bvec_kmap_irq(bvec, &flags);
421
422 drbd_print_buffer(" ", DBGPRINT_BUFFADDR, 1,
423 bvec_buf,
424 faddr,
425 (bvec->bv_len <= 0x80)
426 ? bvec->bv_len : 0x80);
427
428 bvec_kunmap_irq(bvec_buf, &flags);
429
430 if (bvec->bv_len > 0x40)
431 printk(KERN_DEBUG " ....\n");
432
433 faddr += bvec->bv_len;
434 }
435 }
436 }
437}
438
439static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg)
440{
441 static const char *rq_event_names[] = {
442 [created] = "created",
443 [to_be_send] = "to_be_send",
444 [to_be_submitted] = "to_be_submitted",
445 [queue_for_net_write] = "queue_for_net_write",
446 [queue_for_net_read] = "queue_for_net_read",
447 [send_canceled] = "send_canceled",
448 [send_failed] = "send_failed",
449 [handed_over_to_network] = "handed_over_to_network",
450 [connection_lost_while_pending] =
451 "connection_lost_while_pending",
452 [recv_acked_by_peer] = "recv_acked_by_peer",
453 [write_acked_by_peer] = "write_acked_by_peer",
454 [neg_acked] = "neg_acked",
455 [conflict_discarded_by_peer] = "conflict_discarded_by_peer",
456 [barrier_acked] = "barrier_acked",
457 [data_received] = "data_received",
458 [read_completed_with_error] = "read_completed_with_error",
459 [read_ahead_completed_with_error] = "reada_completed_with_error",
460 [write_completed_with_error] = "write_completed_with_error",
461 [completed_ok] = "completed_ok",
462 };
463
464 struct drbd_conf *mdev = req->mdev;
465
466 const int rw = (req->master_bio == NULL ||
467 bio_data_dir(req->master_bio) == WRITE) ?
468 'W' : 'R';
469 const unsigned long s = req->rq_state;
470
471 if (what != nothing) {
472 dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]);
473 } else {
474 dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n",
475 msg, req, rw,
476 s & RQ_LOCAL_PENDING ? 'p' : '-',
477 s & RQ_LOCAL_COMPLETED ? 'c' : '-',
478 s & RQ_LOCAL_OK ? 'o' : '-',
479 s & RQ_NET_PENDING ? 'p' : '-',
480 s & RQ_NET_QUEUED ? 'q' : '-',
481 s & RQ_NET_SENT ? 's' : '-',
482 s & RQ_NET_DONE ? 'd' : '-',
483 s & RQ_NET_OK ? 'o' : '-',
484 req->epoch,
485 (unsigned long long)req->sector,
486 req->size,
487 drbd_conn_str(mdev->state.conn));
488 }
489}
490
491
492#define drbd_peer_str drbd_role_str
493#define drbd_pdsk_str drbd_disk_str
494
495#define PSM(A) \
496do { \
497 if (mask.A) { \
498 int i = snprintf(p, len, " " #A "( %s )", \
499 drbd_##A##_str(val.A)); \
500 if (i >= len) \
501 return op; \
502 p += i; \
503 len -= i; \
504 } \
505} while (0)
506
507static char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val)
508{
509 char *op = p;
510 *p = '\0';
511 PSM(role);
512 PSM(peer);
513 PSM(conn);
514 PSM(disk);
515 PSM(pdsk);
516
517 return op;
518}
519
/* Log one packet trace line via dev_info().
 * Relies on `sockname`, `recv`, `file` and `line` of the enclosing function.
 * At TRACE_LVL_ALL and above the call site and the current task are
 * printed as well. */
#define INFOP(fmt, args...)						\
do {									\
	const char *infop_dir = recv ? "<<<" : ">>>";			\
	if (trace_level < TRACE_LVL_ALL) {				\
		dev_info(DEV, "%s %s " fmt, sockname,			\
			 infop_dir ,					\
			 ## args);					\
	} else {							\
		dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt ,		\
			 file, line, current->comm, current->pid,	\
			 sockname, infop_dir ,				\
			 ## args);					\
	}								\
} while (0)
533
534static char *_dump_block_id(u64 block_id, char *buff)
535{
536 if (is_syncer_block_id(block_id))
537 strcpy(buff, "SyncerId");
538 else
539 sprintf(buff, "%llx", (unsigned long long)block_id);
540
541 return buff;
542}
543
544static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock,
545 int recv, union p_polymorph *p, char *file, int line)
546{
547 char *sockname = sock == mdev->meta.socket ? "meta" : "data";
548 int cmd = (recv == 2) ? p->header.command : be16_to_cpu(p->header.command);
549 char tmp[300];
550 union drbd_state m, v;
551
552 switch (cmd) {
553 case P_HAND_SHAKE:
554 INFOP("%s (protocol %u-%u)\n", cmdname(cmd),
555 be32_to_cpu(p->handshake.protocol_min),
556 be32_to_cpu(p->handshake.protocol_max));
557 break;
558
559 case P_BITMAP: /* don't report this */
560 case P_COMPRESSED_BITMAP: /* don't report this */
561 break;
562
563 case P_DATA:
564 INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd),
565 (unsigned long long)be64_to_cpu(p->data.sector),
566 _dump_block_id(p->data.block_id, tmp),
567 be32_to_cpu(p->data.seq_num),
568 be32_to_cpu(p->data.dp_flags)
569 );
570 break;
571
572 case P_DATA_REPLY:
573 case P_RS_DATA_REPLY:
574 INFOP("%s (sector %llus, id %s)\n", cmdname(cmd),
575 (unsigned long long)be64_to_cpu(p->data.sector),
576 _dump_block_id(p->data.block_id, tmp)
577 );
578 break;
579
580 case P_RECV_ACK:
581 case P_WRITE_ACK:
582 case P_RS_WRITE_ACK:
583 case P_DISCARD_ACK:
584 case P_NEG_ACK:
585 case P_NEG_RS_DREPLY:
586 INFOP("%s (sector %llus, size %u, id %s, seq %u)\n",
587 cmdname(cmd),
588 (long long)be64_to_cpu(p->block_ack.sector),
589 be32_to_cpu(p->block_ack.blksize),
590 _dump_block_id(p->block_ack.block_id, tmp),
591 be32_to_cpu(p->block_ack.seq_num)
592 );
593 break;
594
595 case P_DATA_REQUEST:
596 case P_RS_DATA_REQUEST:
597 INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd),
598 (long long)be64_to_cpu(p->block_req.sector),
599 be32_to_cpu(p->block_req.blksize),
600 _dump_block_id(p->block_req.block_id, tmp)
601 );
602 break;
603
604 case P_BARRIER:
605 case P_BARRIER_ACK:
606 INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier);
607 break;
608
609 case P_SYNC_PARAM:
610 case P_SYNC_PARAM89:
611 INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n",
612 cmdname(cmd), be32_to_cpu(p->rs_param_89.rate),
613 p->rs_param_89.verify_alg, p->rs_param_89.csums_alg);
614 break;
615
616 case P_UUIDS:
617 INFOP("%s Curr:%016llX, Bitmap:%016llX, "
618 "HisSt:%016llX, HisEnd:%016llX\n",
619 cmdname(cmd),
620 (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]),
621 (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]),
622 (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]),
623 (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END]));
624 break;
625
626 case P_SIZES:
627 INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, "
628 "max bio %x, q order %x)\n",
629 cmdname(cmd),
630 (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)),
631 (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)),
632 (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)),
633 be32_to_cpu(p->sizes.max_segment_size),
634 be32_to_cpu(p->sizes.queue_order_type));
635 break;
636
637 case P_STATE:
638 v.i = be32_to_cpu(p->state.state);
639 m.i = 0xffffffff;
640 dump_st(tmp, sizeof(tmp), m, v);
641 INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp);
642 break;
643
644 case P_STATE_CHG_REQ:
645 m.i = be32_to_cpu(p->req_state.mask);
646 v.i = be32_to_cpu(p->req_state.val);
647 dump_st(tmp, sizeof(tmp), m, v);
648 INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp);
649 break;
650
651 case P_STATE_CHG_REPLY:
652 INFOP("%s (ret %x)\n", cmdname(cmd),
653 be32_to_cpu(p->req_state_reply.retcode));
654 break;
655
656 case P_PING:
657 case P_PING_ACK:
658 /*
659 * Dont trace pings at summary level
660 */
661 if (trace_level < TRACE_LVL_ALL)
662 break;
663 /* fall through... */
664 default:
665 INFOP("%s (%u)\n", cmdname(cmd), cmd);
666 break;
667 }
668}
669
670
671static int __init drbd_trace_init(void)
672{
673 int ret;
674
675 if (trace_mask & TRACE_UNPLUG) {
676 ret = register_trace_drbd_unplug(probe_drbd_unplug);
677 WARN_ON(ret);
678 }
679 if (trace_mask & TRACE_UUID) {
680 ret = register_trace_drbd_uuid(probe_drbd_uuid);
681 WARN_ON(ret);
682 }
683 if (trace_mask & TRACE_EE) {
684 ret = register_trace_drbd_ee(probe_drbd_ee);
685 WARN_ON(ret);
686 }
687 if (trace_mask & TRACE_PACKET) {
688 ret = register_trace_drbd_packet(probe_drbd_packet);
689 WARN_ON(ret);
690 }
691 if (trace_mask & TRACE_MD_IO) {
692 ret = register_trace_drbd_md_io(probe_drbd_md_io);
693 WARN_ON(ret);
694 }
695 if (trace_mask & TRACE_EPOCH) {
696 ret = register_trace_drbd_epoch(probe_drbd_epoch);
697 WARN_ON(ret);
698 }
699 if (trace_mask & TRACE_NL) {
700 ret = register_trace_drbd_netlink(probe_drbd_netlink);
701 WARN_ON(ret);
702 }
703 if (trace_mask & TRACE_AL_EXT) {
704 ret = register_trace_drbd_actlog(probe_drbd_actlog);
705 WARN_ON(ret);
706 }
707 if (trace_mask & TRACE_RQ) {
708 ret = register_trace_drbd_bio(probe_drbd_bio);
709 WARN_ON(ret);
710 }
711 if (trace_mask & TRACE_INT_RQ) {
712 ret = register_trace_drbd_req(probe_drbd_req);
713 WARN_ON(ret);
714 }
715 if (trace_mask & TRACE_RESYNC) {
716 ret = register_trace__drbd_resync(probe_drbd_resync);
717 WARN_ON(ret);
718 }
719 return 0;
720}
721
722module_init(drbd_trace_init);
723
724static void __exit drbd_trace_exit(void)
725{
726 if (trace_mask & TRACE_UNPLUG)
727 unregister_trace_drbd_unplug(probe_drbd_unplug);
728 if (trace_mask & TRACE_UUID)
729 unregister_trace_drbd_uuid(probe_drbd_uuid);
730 if (trace_mask & TRACE_EE)
731 unregister_trace_drbd_ee(probe_drbd_ee);
732 if (trace_mask & TRACE_PACKET)
733 unregister_trace_drbd_packet(probe_drbd_packet);
734 if (trace_mask & TRACE_MD_IO)
735 unregister_trace_drbd_md_io(probe_drbd_md_io);
736 if (trace_mask & TRACE_EPOCH)
737 unregister_trace_drbd_epoch(probe_drbd_epoch);
738 if (trace_mask & TRACE_NL)
739 unregister_trace_drbd_netlink(probe_drbd_netlink);
740 if (trace_mask & TRACE_AL_EXT)
741 unregister_trace_drbd_actlog(probe_drbd_actlog);
742 if (trace_mask & TRACE_RQ)
743 unregister_trace_drbd_bio(probe_drbd_bio);
744 if (trace_mask & TRACE_INT_RQ)
745 unregister_trace_drbd_req(probe_drbd_req);
746 if (trace_mask & TRACE_RESYNC)
747 unregister_trace__drbd_resync(probe_drbd_resync);
748
749 tracepoint_synchronize_unregister();
750}
751
752module_exit(drbd_trace_exit);
diff --git a/drivers/block/drbd/drbd_tracing.h b/drivers/block/drbd/drbd_tracing.h
new file mode 100644
index 000000000000..c4531a137f65
--- /dev/null
+++ b/drivers/block/drbd/drbd_tracing.h
@@ -0,0 +1,87 @@
1/*
2 drbd_tracing.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#ifndef DRBD_TRACING_H
27#define DRBD_TRACING_H
28
29#include <linux/tracepoint.h>
30#include "drbd_int.h"
31#include "drbd_req.h"
32
/* Trace verbosity levels; a higher value enables more output. */
enum {
	TRACE_LVL_ALWAYS  = 0,
	TRACE_LVL_SUMMARY = 1,
	TRACE_LVL_METRICS = 2,
	TRACE_LVL_ALL     = 3,
	TRACE_LVL_MAX     = 4
};
40
41DECLARE_TRACE(drbd_unplug,
42 TP_PROTO(struct drbd_conf *mdev, char* msg),
43 TP_ARGS(mdev, msg));
44
45DECLARE_TRACE(drbd_uuid,
46 TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index),
47 TP_ARGS(mdev, index));
48
49DECLARE_TRACE(drbd_ee,
50 TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg),
51 TP_ARGS(mdev, e, msg));
52
53DECLARE_TRACE(drbd_md_io,
54 TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev),
55 TP_ARGS(mdev, rw, bdev));
56
57DECLARE_TRACE(drbd_epoch,
58 TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev),
59 TP_ARGS(mdev, epoch, ev));
60
61DECLARE_TRACE(drbd_netlink,
62 TP_PROTO(void *data, int is_req),
63 TP_ARGS(data, is_req));
64
65DECLARE_TRACE(drbd_actlog,
66 TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg),
67 TP_ARGS(mdev, sector, msg));
68
69DECLARE_TRACE(drbd_bio,
70 TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
71 struct drbd_request *r),
72 TP_ARGS(mdev, pfx, bio, complete, r));
73
74DECLARE_TRACE(drbd_req,
75 TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg),
76 TP_ARGS(req, what, msg));
77
78DECLARE_TRACE(drbd_packet,
79 TP_PROTO(struct drbd_conf *mdev, struct socket *sock,
80 int recv, union p_polymorph *p, char *file, int line),
81 TP_ARGS(mdev, sock, recv, p, file, line));
82
83DECLARE_TRACE(_drbd_resync,
84 TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args),
85 TP_ARGS(mdev, level, fmt, args));
86
87#endif
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000000..fc824006e721
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
1/*
2-*- linux-c -*-
3 drbd_vli.h
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_VLI_H
26#define _DRBD_VLI_H
27
28/*
29 * At a granularity of 4KiB storage represented per bit,
30 * and storage sizes of several TiB,
31 * and possibly small-bandwidth replication,
32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text.
34 *
35 * We try to reduce the transferred bitmap information
36 * by encoding runlengths of bit polarity.
37 *
38 * We never actually need to encode a "zero" (runlengths are positive).
39 * But then we have to store the value of the first bit.
40 * The first bit of information thus shall encode if the first runlength
41 * gives the number of set or unset bits.
42 *
43 * We assume that large areas are either completely set or unset,
44 * which gives good compression with any runlength method,
45 * even when encoding the runlength as fixed size 32bit/64bit integers.
46 *
47 * Still, there may be areas where the polarity flips every few bits,
48 * and encoding the runlength sequence of those areas with fix size
49 * integers would be much worse than plaintext.
50 *
51 * We want to encode small runlength values with minimum code length,
52 * while still being able to encode a Huge run of all zeros.
53 *
54 * Thus we need a Variable Length Integer encoding, VLI.
55 *
56 * For some cases, we produce more code bits than plaintext input.
57 * We need to send incompressible chunks as plaintext, skip over them
58 * and then see if the next chunk compresses better.
59 *
60 * We don't care too much about "excellent" compression ratio for large
61 * runlengths (all set/all clear): whether we achieve a factor of 100
62 * or 1000 is not that much of an issue.
63 * We do not want to waste too much on short runlengths in the "noisy"
64 * parts of the bitmap, though.
65 *
66 * There are endless variants of VLI, we experimented with:
67 * * simple byte-based
68 * * various bit based with different code word length.
69 *
70 * To avoid yet another configuration parameter (choice of bitmap compression
71 * algorithm) which was difficult to explain and tune, we just chose the one
72 * variant that turned out best in all test cases.
73 * Based on real world usage patterns, with device sizes ranging from a few GiB
74 * to several TiB, file server/mailserver/webserver/mysql/postgres,
75 * mostly idle to really busy, the all time winner (though sometimes only
76 * marginally better) is:
77 */
78
79/*
80 * encoding is "visualised" as
81 * __little endian__ bitstream, least significant bit first (left most)
82 *
83 * this particular encoding is chosen so that the prefix code
84 * starts as unary encoding the level, then modified so that
85 * 10 levels can be described in 8bit, with minimal overhead
86 * for the smaller levels.
87 *
88 * Number of data bits follow fibonacci sequence, with the exception of the
89 * last level (+1 data bit, so it makes 64bit total). The only worse code when
90 * encoding bit polarity runlength is 1 plain bit => 2 code bits.
91prefix data bits max val Nº data bits
920 x 0x2 1
9310 x 0x4 1
94110 xx 0x8 2
951110 xxx 0x10 3
9611110 xxx xx 0x30 5
97111110 xx xxxxxx 0x130 8
9811111100 xxxxxxxx xxxxx 0x2130 13
9911111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
10011111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
10111111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
102 * maximum encodable value: 0x100000400202130 == 2**56 + some */
103
104/* compression "table":
105 transmitted x 0.29
106 as plaintext x ........................
107 x ........................
108 x ........................
109 x 0.59 0.21........................
110 x ........................................................
111 x .. c ...................................................
112 x 0.44.. o ...................................................
113 x .......... d ...................................................
114 x .......... e ...................................................
115 X............. ...................................................
116 x.............. b ...................................................
1172.0x............... i ...................................................
118 #X................ t ...................................................
119 #................. s ........................... plain bits ..........
120-+-----------------------------------------------------------------------
121 1 16 32 64
122*/
123
124/* LEVEL: (total bits, prefix bits, prefix value),
125 * sorted ascending by number of total bits.
126 * The rest of the code table is calculated at compiletime from this. */
127
/* Code table, data bit counts following the fibonacci sequence 1, 1, ...
 * Each entry is LEVEL(total bits, prefix bits, prefix value);
 * data bits = total bits - prefix bits. */
#define VLI_L_1_1() do {					\
	LEVEL( 2, 1, 0x00);	/*  1 data bit  */		\
	LEVEL( 3, 2, 0x01);	/*  1 data bit  */		\
	LEVEL( 5, 3, 0x03);	/*  2 data bits */		\
	LEVEL( 7, 4, 0x07);	/*  3 data bits */		\
	LEVEL(10, 5, 0x0f);	/*  5 data bits */		\
	LEVEL(14, 6, 0x1f);	/*  8 data bits */		\
	LEVEL(21, 8, 0x3f);	/* 13 data bits */		\
	LEVEL(29, 8, 0x7f);	/* 21 data bits */		\
	LEVEL(42, 8, 0xbf);	/* 34 data bits */		\
	LEVEL(64, 8, 0xff);	/* 56 data bits */		\
	} while (0)
141
142/* finds a suitable level to decode the least significant part of in.
143 * returns number of bits consumed.
144 *
145 * BUG() for bad input, as that would mean a buggy code table. */
146static inline int vli_decode_bits(u64 *out, const u64 in)
147{
148 u64 adj = 1;
149
150#define LEVEL(t,b,v) \
151 do { \
152 if ((in & ((1 << b) -1)) == v) { \
153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
154 return t; \
155 } \
156 adj += 1ULL << (t - b); \
157 } while (0)
158
159 VLI_L_1_1();
160
161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
162 BUG();
163#undef LEVEL
164}
165
166/* return number of code bits needed,
167 * or negative error number */
168static inline int __vli_encode_bits(u64 *out, const u64 in)
169{
170 u64 max = 0;
171 u64 adj = 1;
172
173 if (in == 0)
174 return -EINVAL;
175
176#define LEVEL(t,b,v) do { \
177 max += 1ULL << (t - b); \
178 if (in <= max) { \
179 if (out) \
180 *out = ((in - adj) << b) | v; \
181 return t; \
182 } \
183 adj = max + 1; \
184 } while (0)
185
186 VLI_L_1_1();
187
188 return -EOVERFLOW;
189#undef LEVEL
190}
191
192#undef VLI_L_1_1
193
194/* code from here down is independent of the bit code actually used */
195
196/*
197 * Code length is determined by some unique (e.g. unary) prefix.
198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
199 * not a byte stream.
200 */
201
202/* for the bitstream, we need a cursor */
203struct bitstream_cursor {
204 /* the current byte */
205 u8 *b;
206 /* the current bit within *b, nomalized: 0..7 */
207 unsigned int bit;
208};
209
210/* initialize cursor to point to first bit of stream */
211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
212{
213 cur->b = s;
214 cur->bit = 0;
215}
216
217/* advance cursor by that many bits; maximum expected input value: 64,
218 * but depending on VLI implementation, it may be more. */
219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
220{
221 bits += cur->bit;
222 cur->b = cur->b + (bits >> 3);
223 cur->bit = bits & 7;
224}
225
226/* the bitstream itself knows its length */
227struct bitstream {
228 struct bitstream_cursor cur;
229 unsigned char *buf;
230 size_t buf_len; /* in bytes */
231
232 /* for input stream:
233 * number of trailing 0 bits for padding
234 * total number of valid bits in stream: buf_len * 8 - pad_bits */
235 unsigned int pad_bits;
236};
237
238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
239{
240 bs->buf = s;
241 bs->buf_len = len;
242 bs->pad_bits = pad_bits;
243 bitstream_cursor_reset(&bs->cur, bs->buf);
244}
245
246static inline void bitstream_rewind(struct bitstream *bs)
247{
248 bitstream_cursor_reset(&bs->cur, bs->buf);
249 memset(bs->buf, 0, bs->buf_len);
250}
251
252/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
253 * Ignores "pad_bits".
254 * Returns zero if bits == 0 (nothing to do).
255 * Returns number of bits used if successful.
256 *
257 * If there is not enough room left in bitstream,
258 * leaves bitstream unchanged and returns -ENOBUFS.
259 */
260static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
261{
262 unsigned char *b = bs->cur.b;
263 unsigned int tmp;
264
265 if (bits == 0)
266 return 0;
267
268 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
269 return -ENOBUFS;
270
271 /* paranoia: strip off hi bits; they should not be set anyways. */
272 if (bits < 64)
273 val &= ~0ULL >> (64 - bits);
274
275 *b++ |= (val & 0xff) << bs->cur.bit;
276
277 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
278 *b++ |= (val >> tmp) & 0xff;
279
280 bitstream_cursor_advance(&bs->cur, bits);
281 return bits;
282}
283
284/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
285 *
286 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
287 *
288 * If there are less than the requested number of valid bits left in the
289 * bitstream, still fetches all available bits.
290 *
291 * Returns number of actually fetched bits.
292 */
293static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
294{
295 u64 val;
296 unsigned int n;
297
298 if (bits > 64)
299 return -EINVAL;
300
301 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
302 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
303 - bs->cur.bit - bs->pad_bits;
304
305 if (bits == 0) {
306 *out = 0;
307 return 0;
308 }
309
310 /* get the high bits */
311 val = 0;
312 n = (bs->cur.bit + bits + 7) >> 3;
313 /* n may be at most 9, if cur.bit + bits > 64 */
314 /* which means this copies at most 8 byte */
315 if (n) {
316 memcpy(&val, bs->cur.b+1, n - 1);
317 val = le64_to_cpu(val) << (8 - bs->cur.bit);
318 }
319
320 /* we still need the low bits */
321 val |= bs->cur.b[0] >> bs->cur.bit;
322
323 /* and mask out bits we don't want */
324 val &= ~0ULL >> (64 - bits);
325
326 bitstream_cursor_advance(&bs->cur, bits);
327 *out = val;
328
329 return bits;
330}
331
332/* encodes @in as vli into @bs;
333
334 * return values
335 * > 0: number of bits successfully stored in bitstream
336 * -ENOBUFS @bs is full
337 * -EINVAL input zero (invalid)
338 * -EOVERFLOW input too large for this vli code (invalid)
339 */
340static inline int vli_encode_bits(struct bitstream *bs, u64 in)
341{
342 u64 code = code;
343 int bits = __vli_encode_bits(&code, in);
344
345 if (bits <= 0)
346 return bits;
347
348 return bitstream_put_bits(bs, code, bits);
349}
350
351#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000000..212e9545e634
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1529 @@
1/*
2 drbd_worker.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/autoconf.h>
27#include <linux/module.h>
28#include <linux/version.h>
29#include <linux/drbd.h>
30#include <linux/sched.h>
31#include <linux/smp_lock.h>
32#include <linux/wait.h>
33#include <linux/mm.h>
34#include <linux/memcontrol.h>
35#include <linux/mm_inline.h>
36#include <linux/slab.h>
37#include <linux/random.h>
38#include <linux/mm.h>
39#include <linux/string.h>
40#include <linux/scatterlist.h>
41
42#include "drbd_int.h"
43#include "drbd_req.h"
44#include "drbd_tracing.h"
45
46#define SLEEP_TIME (HZ/10)
47
48static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
49
50
51
52/* defined here:
53 drbd_md_io_complete
54 drbd_endio_write_sec
55 drbd_endio_read_sec
56 drbd_endio_pri
57
58 * more endio handlers:
59 atodb_endio in drbd_actlog.c
60 drbd_bm_async_io_complete in drbd_bitmap.c
61
62 * For all these callbacks, note the following:
63 * The callbacks will be called in irq context by the IDE drivers,
64 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
65 * Try to get the locking right :)
66 *
67 */
68
69
70/* About the global_state_lock
71 Each state transition on a device holds a read lock. In case we have
72 to evaluate the sync after dependencies, we grab a write lock, because
73 we need stable states on all devices for that. */
74rwlock_t global_state_lock;
75
76/* used for synchronous meta data and bitmap IO
77 * submitted by drbd_md_sync_page_io()
78 */
79void drbd_md_io_complete(struct bio *bio, int error)
80{
81 struct drbd_md_io *md_io;
82
83 md_io = (struct drbd_md_io *)bio->bi_private;
84 md_io->error = error;
85
86 trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL);
87
88 complete(&md_io->event);
89}
90
91/* reads on behalf of the partner,
92 * "submitted" by the receiver
93 */
94void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
95{
96 unsigned long flags = 0;
97 struct drbd_epoch_entry *e = NULL;
98 struct drbd_conf *mdev;
99 int uptodate = bio_flagged(bio, BIO_UPTODATE);
100
101 e = bio->bi_private;
102 mdev = e->mdev;
103
104 if (error)
105 dev_warn(DEV, "read: error=%d s=%llus\n", error,
106 (unsigned long long)e->sector);
107 if (!error && !uptodate) {
108 dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
109 (unsigned long long)e->sector);
110 /* strange behavior of some lower level drivers...
111 * fail the request by clearing the uptodate flag,
112 * but do not return any error?! */
113 error = -EIO;
114 }
115
116 D_ASSERT(e->block_id != ID_VACANT);
117
118 trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
119
120 spin_lock_irqsave(&mdev->req_lock, flags);
121 mdev->read_cnt += e->size >> 9;
122 list_del(&e->w.list);
123 if (list_empty(&mdev->read_ee))
124 wake_up(&mdev->ee_wait);
125 spin_unlock_irqrestore(&mdev->req_lock, flags);
126
127 drbd_chk_io_error(mdev, error, FALSE);
128 drbd_queue_work(&mdev->data.work, &e->w);
129 put_ldev(mdev);
130
131 trace_drbd_ee(mdev, e, "read completed");
132}
133
134/* writes on behalf of the partner, or resync writes,
135 * "submitted" by the receiver.
136 */
137void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
138{
139 unsigned long flags = 0;
140 struct drbd_epoch_entry *e = NULL;
141 struct drbd_conf *mdev;
142 sector_t e_sector;
143 int do_wake;
144 int is_syncer_req;
145 int do_al_complete_io;
146 int uptodate = bio_flagged(bio, BIO_UPTODATE);
147 int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
148
149 e = bio->bi_private;
150 mdev = e->mdev;
151
152 if (error)
153 dev_warn(DEV, "write: error=%d s=%llus\n", error,
154 (unsigned long long)e->sector);
155 if (!error && !uptodate) {
156 dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
157 (unsigned long long)e->sector);
158 /* strange behavior of some lower level drivers...
159 * fail the request by clearing the uptodate flag,
160 * but do not return any error?! */
161 error = -EIO;
162 }
163
164 /* error == -ENOTSUPP would be a better test,
165 * alas it is not reliable */
166 if (error && is_barrier && e->flags & EE_IS_BARRIER) {
167 drbd_bump_write_ordering(mdev, WO_bdev_flush);
168 spin_lock_irqsave(&mdev->req_lock, flags);
169 list_del(&e->w.list);
170 e->w.cb = w_e_reissue;
171 /* put_ldev actually happens below, once we come here again. */
172 __release(local);
173 spin_unlock_irqrestore(&mdev->req_lock, flags);
174 drbd_queue_work(&mdev->data.work, &e->w);
175 return;
176 }
177
178 D_ASSERT(e->block_id != ID_VACANT);
179
180 trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
181
182 spin_lock_irqsave(&mdev->req_lock, flags);
183 mdev->writ_cnt += e->size >> 9;
184 is_syncer_req = is_syncer_block_id(e->block_id);
185
186 /* after we moved e to done_ee,
187 * we may no longer access it,
188 * it may be freed/reused already!
189 * (as soon as we release the req_lock) */
190 e_sector = e->sector;
191 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
192
193 list_del(&e->w.list); /* has been on active_ee or sync_ee */
194 list_add_tail(&e->w.list, &mdev->done_ee);
195
196 trace_drbd_ee(mdev, e, "write completed");
197
198 /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
199 * neither did we wake possibly waiting conflicting requests.
200 * done from "drbd_process_done_ee" within the appropriate w.cb
201 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
202
203 do_wake = is_syncer_req
204 ? list_empty(&mdev->sync_ee)
205 : list_empty(&mdev->active_ee);
206
207 if (error)
208 __drbd_chk_io_error(mdev, FALSE);
209 spin_unlock_irqrestore(&mdev->req_lock, flags);
210
211 if (is_syncer_req)
212 drbd_rs_complete_io(mdev, e_sector);
213
214 if (do_wake)
215 wake_up(&mdev->ee_wait);
216
217 if (do_al_complete_io)
218 drbd_al_complete_io(mdev, e_sector);
219
220 wake_asender(mdev);
221 put_ldev(mdev);
222
223}
224
225/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
226 */
227void drbd_endio_pri(struct bio *bio, int error)
228{
229 unsigned long flags;
230 struct drbd_request *req = bio->bi_private;
231 struct drbd_conf *mdev = req->mdev;
232 struct bio_and_error m;
233 enum drbd_req_event what;
234 int uptodate = bio_flagged(bio, BIO_UPTODATE);
235
236 if (error)
237 dev_warn(DEV, "p %s: error=%d\n",
238 bio_data_dir(bio) == WRITE ? "write" : "read", error);
239 if (!error && !uptodate) {
240 dev_warn(DEV, "p %s: setting error to -EIO\n",
241 bio_data_dir(bio) == WRITE ? "write" : "read");
242 /* strange behavior of some lower level drivers...
243 * fail the request by clearing the uptodate flag,
244 * but do not return any error?! */
245 error = -EIO;
246 }
247
248 trace_drbd_bio(mdev, "Pri", bio, 1, NULL);
249
250 /* to avoid recursion in __req_mod */
251 if (unlikely(error)) {
252 what = (bio_data_dir(bio) == WRITE)
253 ? write_completed_with_error
254 : (bio_rw(bio) == READA)
255 ? read_completed_with_error
256 : read_ahead_completed_with_error;
257 } else
258 what = completed_ok;
259
260 bio_put(req->private_bio);
261 req->private_bio = ERR_PTR(error);
262
263 spin_lock_irqsave(&mdev->req_lock, flags);
264 __req_mod(req, what, &m);
265 spin_unlock_irqrestore(&mdev->req_lock, flags);
266
267 if (m.bio)
268 complete_master_bio(mdev, &m);
269}
270
271int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
272{
273 struct drbd_request *req = container_of(w, struct drbd_request, w);
274
275 /* NOTE: mdev->ldev can be NULL by the time we get here! */
276 /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
277
278 /* the only way this callback is scheduled is from _req_may_be_done,
279 * when it is done and had a local write error, see comments there */
280 drbd_req_free(req);
281
282 return TRUE;
283}
284
285int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
286{
287 struct drbd_request *req = container_of(w, struct drbd_request, w);
288
289 /* We should not detach for read io-error,
290 * but try to WRITE the P_DATA_REPLY to the failed location,
291 * to give the disk the chance to relocate that block */
292
293 spin_lock_irq(&mdev->req_lock);
294 if (cancel ||
295 mdev->state.conn < C_CONNECTED ||
296 mdev->state.pdsk <= D_INCONSISTENT) {
297 _req_mod(req, send_canceled);
298 spin_unlock_irq(&mdev->req_lock);
299 dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
300 return 1;
301 }
302 spin_unlock_irq(&mdev->req_lock);
303
304 return w_send_read_req(mdev, w, 0);
305}
306
307int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
308{
309 ERR_IF(cancel) return 1;
310 dev_err(DEV, "resync inactive, but callback triggered??\n");
311 return 1; /* Simply ignore this! */
312}
313
314void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
315{
316 struct hash_desc desc;
317 struct scatterlist sg;
318 struct bio_vec *bvec;
319 int i;
320
321 desc.tfm = tfm;
322 desc.flags = 0;
323
324 sg_init_table(&sg, 1);
325 crypto_hash_init(&desc);
326
327 __bio_for_each_segment(bvec, bio, i, 0) {
328 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
329 crypto_hash_update(&desc, &sg, sg.length);
330 }
331 crypto_hash_final(&desc, digest);
332}
333
334static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
335{
336 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
337 int digest_size;
338 void *digest;
339 int ok;
340
341 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
342
343 if (unlikely(cancel)) {
344 drbd_free_ee(mdev, e);
345 return 1;
346 }
347
348 if (likely(drbd_bio_uptodate(e->private_bio))) {
349 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
350 digest = kmalloc(digest_size, GFP_NOIO);
351 if (digest) {
352 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
353
354 inc_rs_pending(mdev);
355 ok = drbd_send_drequest_csum(mdev,
356 e->sector,
357 e->size,
358 digest,
359 digest_size,
360 P_CSUM_RS_REQUEST);
361 kfree(digest);
362 } else {
363 dev_err(DEV, "kmalloc() of digest failed.\n");
364 ok = 0;
365 }
366 } else
367 ok = 1;
368
369 drbd_free_ee(mdev, e);
370
371 if (unlikely(!ok))
372 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
373 return ok;
374}
375
376#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
377
378static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
379{
380 struct drbd_epoch_entry *e;
381
382 if (!get_ldev(mdev))
383 return 0;
384
385 /* GFP_TRY, because if there is no memory available right now, this may
386 * be rescheduled for later. It is "only" background resync, after all. */
387 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
388 if (!e) {
389 put_ldev(mdev);
390 return 2;
391 }
392
393 spin_lock_irq(&mdev->req_lock);
394 list_add(&e->w.list, &mdev->read_ee);
395 spin_unlock_irq(&mdev->req_lock);
396
397 e->private_bio->bi_end_io = drbd_endio_read_sec;
398 e->private_bio->bi_rw = READ;
399 e->w.cb = w_e_send_csum;
400
401 mdev->read_cnt += size >> 9;
402 drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
403
404 return 1;
405}
406
407void resync_timer_fn(unsigned long data)
408{
409 unsigned long flags;
410 struct drbd_conf *mdev = (struct drbd_conf *) data;
411 int queue;
412
413 spin_lock_irqsave(&mdev->req_lock, flags);
414
415 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
416 queue = 1;
417 if (mdev->state.conn == C_VERIFY_S)
418 mdev->resync_work.cb = w_make_ov_request;
419 else
420 mdev->resync_work.cb = w_make_resync_request;
421 } else {
422 queue = 0;
423 mdev->resync_work.cb = w_resync_inactive;
424 }
425
426 spin_unlock_irqrestore(&mdev->req_lock, flags);
427
428 /* harmless race: list_empty outside data.work.q_lock */
429 if (list_empty(&mdev->resync_work.list) && queue)
430 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
431}
432
433int w_make_resync_request(struct drbd_conf *mdev,
434 struct drbd_work *w, int cancel)
435{
436 unsigned long bit;
437 sector_t sector;
438 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
439 int max_segment_size = queue_max_segment_size(mdev->rq_queue);
440 int number, i, size, pe, mx;
441 int align, queued, sndbuf;
442
443 if (unlikely(cancel))
444 return 1;
445
446 if (unlikely(mdev->state.conn < C_CONNECTED)) {
447 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
448 return 0;
449 }
450
451 if (mdev->state.conn != C_SYNC_TARGET)
452 dev_err(DEV, "%s in w_make_resync_request\n",
453 drbd_conn_str(mdev->state.conn));
454
455 if (!get_ldev(mdev)) {
456 /* Since we only need to access mdev->rsync a
457 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
458 to continue resync with a broken disk makes no sense at
459 all */
460 dev_err(DEV, "Disk broke down during resync!\n");
461 mdev->resync_work.cb = w_resync_inactive;
462 return 1;
463 }
464
465 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
466 pe = atomic_read(&mdev->rs_pending_cnt);
467
468 mutex_lock(&mdev->data.mutex);
469 if (mdev->data.socket)
470 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
471 else
472 mx = 1;
473 mutex_unlock(&mdev->data.mutex);
474
475 /* For resync rates >160MB/sec, allow more pending RS requests */
476 if (number > mx)
477 mx = number;
478
479 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
480 if ((pe + number) > mx) {
481 number = mx - pe;
482 }
483
484 for (i = 0; i < number; i++) {
485 /* Stop generating RS requests, when half of the send buffer is filled */
486 mutex_lock(&mdev->data.mutex);
487 if (mdev->data.socket) {
488 queued = mdev->data.socket->sk->sk_wmem_queued;
489 sndbuf = mdev->data.socket->sk->sk_sndbuf;
490 } else {
491 queued = 1;
492 sndbuf = 0;
493 }
494 mutex_unlock(&mdev->data.mutex);
495 if (queued > sndbuf / 2)
496 goto requeue;
497
498next_sector:
499 size = BM_BLOCK_SIZE;
500 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
501
502 if (bit == -1UL) {
503 mdev->bm_resync_fo = drbd_bm_bits(mdev);
504 mdev->resync_work.cb = w_resync_inactive;
505 put_ldev(mdev);
506 return 1;
507 }
508
509 sector = BM_BIT_TO_SECT(bit);
510
511 if (drbd_try_rs_begin_io(mdev, sector)) {
512 mdev->bm_resync_fo = bit;
513 goto requeue;
514 }
515 mdev->bm_resync_fo = bit + 1;
516
517 if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
518 drbd_rs_complete_io(mdev, sector);
519 goto next_sector;
520 }
521
522#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
523 /* try to find some adjacent bits.
524 * we stop if we have already the maximum req size.
525 *
526 * Additionally always align bigger requests, in order to
527 * be prepared for all stripe sizes of software RAIDs.
528 *
529 * we _do_ care about the agreed-upon q->max_segment_size
530 * here, as splitting up the requests on the other side is more
531 * difficult. the consequence is, that on lvm and md and other
532 * "indirect" devices, this is dead code, since
533 * q->max_segment_size will be PAGE_SIZE.
534 */
535 align = 1;
536 for (;;) {
537 if (size + BM_BLOCK_SIZE > max_segment_size)
538 break;
539
540 /* Be always aligned */
541 if (sector & ((1<<(align+3))-1))
542 break;
543
544 /* do not cross extent boundaries */
545 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
546 break;
547 /* now, is it actually dirty, after all?
548 * caution, drbd_bm_test_bit is tri-state for some
549 * obscure reason; ( b == 0 ) would get the out-of-band
550 * only accidentally right because of the "oddly sized"
551 * adjustment below */
552 if (drbd_bm_test_bit(mdev, bit+1) != 1)
553 break;
554 bit++;
555 size += BM_BLOCK_SIZE;
556 if ((BM_BLOCK_SIZE << align) <= size)
557 align++;
558 i++;
559 }
560 /* if we merged some,
561 * reset the offset to start the next drbd_bm_find_next from */
562 if (size > BM_BLOCK_SIZE)
563 mdev->bm_resync_fo = bit + 1;
564#endif
565
566 /* adjust very last sectors, in case we are oddly sized */
567 if (sector + (size>>9) > capacity)
568 size = (capacity-sector)<<9;
569 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
570 switch (read_for_csum(mdev, sector, size)) {
571 case 0: /* Disk failure*/
572 put_ldev(mdev);
573 return 0;
574 case 2: /* Allocation failed */
575 drbd_rs_complete_io(mdev, sector);
576 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
577 goto requeue;
578 /* case 1: everything ok */
579 }
580 } else {
581 inc_rs_pending(mdev);
582 if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
583 sector, size, ID_SYNCER)) {
584 dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
585 dec_rs_pending(mdev);
586 put_ldev(mdev);
587 return 0;
588 }
589 }
590 }
591
592 if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
593 /* last syncer _request_ was sent,
594 * but the P_RS_DATA_REPLY not yet received. sync will end (and
595 * next sync group will resume), as soon as we receive the last
596 * resync data block, and the last bit is cleared.
597 * until then resync "work" is "inactive" ...
598 */
599 mdev->resync_work.cb = w_resync_inactive;
600 put_ldev(mdev);
601 return 1;
602 }
603
604 requeue:
605 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
606 put_ldev(mdev);
607 return 1;
608}
609
610static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
611{
612 int number, i, size;
613 sector_t sector;
614 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
615
616 if (unlikely(cancel))
617 return 1;
618
619 if (unlikely(mdev->state.conn < C_CONNECTED)) {
620 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
621 return 0;
622 }
623
624 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
625 if (atomic_read(&mdev->rs_pending_cnt) > number)
626 goto requeue;
627
628 number -= atomic_read(&mdev->rs_pending_cnt);
629
630 sector = mdev->ov_position;
631 for (i = 0; i < number; i++) {
632 if (sector >= capacity) {
633 mdev->resync_work.cb = w_resync_inactive;
634 return 1;
635 }
636
637 size = BM_BLOCK_SIZE;
638
639 if (drbd_try_rs_begin_io(mdev, sector)) {
640 mdev->ov_position = sector;
641 goto requeue;
642 }
643
644 if (sector + (size>>9) > capacity)
645 size = (capacity-sector)<<9;
646
647 inc_rs_pending(mdev);
648 if (!drbd_send_ov_request(mdev, sector, size)) {
649 dec_rs_pending(mdev);
650 return 0;
651 }
652 sector += BM_SECT_PER_BIT;
653 }
654 mdev->ov_position = sector;
655
656 requeue:
657 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
658 return 1;
659}
660
661
/* Worker callback: frees the work object @w, calls ov_oos_print() and
 * then finishes the resync.  @cancel is ignored.  Always returns 1.
 */
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* w is freed right here, before anything else happens */
	kfree(w);

	ov_oos_print(mdev);
	drbd_resync_finished(mdev);

	return 1;
}
670
/* Worker callback: frees the work object @w and calls
 * drbd_resync_finished().  @cancel is ignored.  Always returns 1.
 */
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* w is freed right here, before anything else happens */
	kfree(w);
	drbd_resync_finished(mdev);

	return 1;
}
679
680int drbd_resync_finished(struct drbd_conf *mdev)
681{
682 unsigned long db, dt, dbdt;
683 unsigned long n_oos;
684 union drbd_state os, ns;
685 struct drbd_work *w;
686 char *khelper_cmd = NULL;
687
688 /* Remove all elements from the resync LRU. Since future actions
689 * might set bits in the (main) bitmap, then the entries in the
690 * resync LRU would be wrong. */
691 if (drbd_rs_del_all(mdev)) {
692 /* In case this is not possible now, most probably because
693 * there are P_RS_DATA_REPLY Packets lingering on the worker's
694 * queue (or even the read operations for those packets
695 * is not finished by now). Retry in 100ms. */
696
697 drbd_kick_lo(mdev);
698 __set_current_state(TASK_INTERRUPTIBLE);
699 schedule_timeout(HZ / 10);
700 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
701 if (w) {
702 w->cb = w_resync_finished;
703 drbd_queue_work(&mdev->data.work, w);
704 return 1;
705 }
706 dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
707 }
708
709 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
710 if (dt <= 0)
711 dt = 1;
712 db = mdev->rs_total;
713 dbdt = Bit2KB(db/dt);
714 mdev->rs_paused /= HZ;
715
716 if (!get_ldev(mdev))
717 goto out;
718
719 spin_lock_irq(&mdev->req_lock);
720 os = mdev->state;
721
722 /* This protects us against multiple calls (that can happen in the presence
723 of application IO), and against connectivity loss just before we arrive here. */
724 if (os.conn <= C_CONNECTED)
725 goto out_unlock;
726
727 ns = os;
728 ns.conn = C_CONNECTED;
729
730 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
731 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
732 "Online verify " : "Resync",
733 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
734
735 n_oos = drbd_bm_total_weight(mdev);
736
737 if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
738 if (n_oos) {
739 dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
740 n_oos, Bit2KB(1));
741 khelper_cmd = "out-of-sync";
742 }
743 } else {
744 D_ASSERT((n_oos - mdev->rs_failed) == 0);
745
746 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
747 khelper_cmd = "after-resync-target";
748
749 if (mdev->csums_tfm && mdev->rs_total) {
750 const unsigned long s = mdev->rs_same_csum;
751 const unsigned long t = mdev->rs_total;
752 const int ratio =
753 (t == 0) ? 0 :
754 (t < 100000) ? ((s*100)/t) : (s/(t/100));
755 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
756 "transferred %luK total %luK\n",
757 ratio,
758 Bit2KB(mdev->rs_same_csum),
759 Bit2KB(mdev->rs_total - mdev->rs_same_csum),
760 Bit2KB(mdev->rs_total));
761 }
762 }
763
764 if (mdev->rs_failed) {
765 dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
766
767 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
768 ns.disk = D_INCONSISTENT;
769 ns.pdsk = D_UP_TO_DATE;
770 } else {
771 ns.disk = D_UP_TO_DATE;
772 ns.pdsk = D_INCONSISTENT;
773 }
774 } else {
775 ns.disk = D_UP_TO_DATE;
776 ns.pdsk = D_UP_TO_DATE;
777
778 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
779 if (mdev->p_uuid) {
780 int i;
781 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
782 _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
783 drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
784 _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
785 } else {
786 dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
787 }
788 }
789
790 drbd_uuid_set_bm(mdev, 0UL);
791
792 if (mdev->p_uuid) {
793 /* Now the two UUID sets are equal, update what we
794 * know of the peer. */
795 int i;
796 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
797 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
798 }
799 }
800
801 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
802out_unlock:
803 spin_unlock_irq(&mdev->req_lock);
804 put_ldev(mdev);
805out:
806 mdev->rs_total = 0;
807 mdev->rs_failed = 0;
808 mdev->rs_paused = 0;
809 mdev->ov_start_sector = 0;
810
811 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
812 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
813 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
814 }
815
816 if (khelper_cmd)
817 drbd_khelper(mdev, khelper_cmd);
818
819 return 1;
820}
821
822/* helper */
823static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
824{
825 if (drbd_bio_has_active_page(e->private_bio)) {
826 /* This might happen if sendpage() has not finished */
827 spin_lock_irq(&mdev->req_lock);
828 list_add_tail(&e->w.list, &mdev->net_ee);
829 spin_unlock_irq(&mdev->req_lock);
830 } else
831 drbd_free_ee(mdev, e);
832}
833
834/**
835 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
836 * @mdev: DRBD device.
837 * @w: work object.
838 * @cancel: The connection will be closed anyways
839 */
840int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
841{
842 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
843 int ok;
844
845 if (unlikely(cancel)) {
846 drbd_free_ee(mdev, e);
847 dec_unacked(mdev);
848 return 1;
849 }
850
851 if (likely(drbd_bio_uptodate(e->private_bio))) {
852 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
853 } else {
854 if (__ratelimit(&drbd_ratelimit_state))
855 dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
856 (unsigned long long)e->sector);
857
858 ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
859 }
860
861 dec_unacked(mdev);
862
863 move_to_net_ee_or_free(mdev, e);
864
865 if (unlikely(!ok))
866 dev_err(DEV, "drbd_send_block() failed\n");
867 return ok;
868}
869
870/**
871 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
872 * @mdev: DRBD device.
873 * @w: work object.
874 * @cancel: The connection will be closed anyways
875 */
876int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
877{
878 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
879 int ok;
880
881 if (unlikely(cancel)) {
882 drbd_free_ee(mdev, e);
883 dec_unacked(mdev);
884 return 1;
885 }
886
887 if (get_ldev_if_state(mdev, D_FAILED)) {
888 drbd_rs_complete_io(mdev, e->sector);
889 put_ldev(mdev);
890 }
891
892 if (likely(drbd_bio_uptodate(e->private_bio))) {
893 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
894 inc_rs_pending(mdev);
895 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
896 } else {
897 if (__ratelimit(&drbd_ratelimit_state))
898 dev_err(DEV, "Not sending RSDataReply, "
899 "partner DISKLESS!\n");
900 ok = 1;
901 }
902 } else {
903 if (__ratelimit(&drbd_ratelimit_state))
904 dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
905 (unsigned long long)e->sector);
906
907 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
908
909 /* update resync data with failure */
910 drbd_rs_failed_io(mdev, e->sector, e->size);
911 }
912
913 dec_unacked(mdev);
914
915 move_to_net_ee_or_free(mdev, e);
916
917 if (unlikely(!ok))
918 dev_err(DEV, "drbd_send_block() failed\n");
919 return ok;
920}
921
922int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
923{
924 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
925 struct digest_info *di;
926 int digest_size;
927 void *digest = NULL;
928 int ok, eq = 0;
929
930 if (unlikely(cancel)) {
931 drbd_free_ee(mdev, e);
932 dec_unacked(mdev);
933 return 1;
934 }
935
936 drbd_rs_complete_io(mdev, e->sector);
937
938 di = (struct digest_info *)(unsigned long)e->block_id;
939
940 if (likely(drbd_bio_uptodate(e->private_bio))) {
941 /* quick hack to try to avoid a race against reconfiguration.
942 * a real fix would be much more involved,
943 * introducing more locking mechanisms */
944 if (mdev->csums_tfm) {
945 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
946 D_ASSERT(digest_size == di->digest_size);
947 digest = kmalloc(digest_size, GFP_NOIO);
948 }
949 if (digest) {
950 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
951 eq = !memcmp(digest, di->digest, digest_size);
952 kfree(digest);
953 }
954
955 if (eq) {
956 drbd_set_in_sync(mdev, e->sector, e->size);
957 mdev->rs_same_csum++;
958 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
959 } else {
960 inc_rs_pending(mdev);
961 e->block_id = ID_SYNCER;
962 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
963 }
964 } else {
965 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
966 if (__ratelimit(&drbd_ratelimit_state))
967 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
968 }
969
970 dec_unacked(mdev);
971
972 kfree(di);
973
974 move_to_net_ee_or_free(mdev, e);
975
976 if (unlikely(!ok))
977 dev_err(DEV, "drbd_send_block/ack() failed\n");
978 return ok;
979}
980
981int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
982{
983 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
984 int digest_size;
985 void *digest;
986 int ok = 1;
987
988 if (unlikely(cancel))
989 goto out;
990
991 if (unlikely(!drbd_bio_uptodate(e->private_bio)))
992 goto out;
993
994 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
995 /* FIXME if this allocation fails, online verify will not terminate! */
996 digest = kmalloc(digest_size, GFP_NOIO);
997 if (digest) {
998 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
999 inc_rs_pending(mdev);
1000 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
1001 digest, digest_size, P_OV_REPLY);
1002 if (!ok)
1003 dec_rs_pending(mdev);
1004 kfree(digest);
1005 }
1006
1007out:
1008 drbd_free_ee(mdev, e);
1009
1010 dec_unacked(mdev);
1011
1012 return ok;
1013}
1014
1015void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1016{
1017 if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
1018 mdev->ov_last_oos_size += size>>9;
1019 } else {
1020 mdev->ov_last_oos_start = sector;
1021 mdev->ov_last_oos_size = size>>9;
1022 }
1023 drbd_set_out_of_sync(mdev, sector, size);
1024 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1025}
1026
1027int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1028{
1029 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1030 struct digest_info *di;
1031 int digest_size;
1032 void *digest;
1033 int ok, eq = 0;
1034
1035 if (unlikely(cancel)) {
1036 drbd_free_ee(mdev, e);
1037 dec_unacked(mdev);
1038 return 1;
1039 }
1040
1041 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1042 * the resync lru has been cleaned up already */
1043 drbd_rs_complete_io(mdev, e->sector);
1044
1045 di = (struct digest_info *)(unsigned long)e->block_id;
1046
1047 if (likely(drbd_bio_uptodate(e->private_bio))) {
1048 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1049 digest = kmalloc(digest_size, GFP_NOIO);
1050 if (digest) {
1051 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
1052
1053 D_ASSERT(digest_size == di->digest_size);
1054 eq = !memcmp(digest, di->digest, digest_size);
1055 kfree(digest);
1056 }
1057 } else {
1058 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1059 if (__ratelimit(&drbd_ratelimit_state))
1060 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1061 }
1062
1063 dec_unacked(mdev);
1064
1065 kfree(di);
1066
1067 if (!eq)
1068 drbd_ov_oos_found(mdev, e->sector, e->size);
1069 else
1070 ov_oos_print(mdev);
1071
1072 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
1073 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1074
1075 drbd_free_ee(mdev, e);
1076
1077 if (--mdev->ov_left == 0) {
1078 ov_oos_print(mdev);
1079 drbd_resync_finished(mdev);
1080 }
1081
1082 return ok;
1083}
1084
1085int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1086{
1087 struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1088 complete(&b->done);
1089 return 1;
1090}
1091
1092int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1093{
1094 struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
1095 struct p_barrier *p = &mdev->data.sbuf.barrier;
1096 int ok = 1;
1097
1098 /* really avoid racing with tl_clear. w.cb may have been referenced
1099 * just before it was reassigned and re-queued, so double check that.
1100 * actually, this race was harmless, since we only try to send the
1101 * barrier packet here, and otherwise do nothing with the object.
1102 * but compare with the head of w_clear_epoch */
1103 spin_lock_irq(&mdev->req_lock);
1104 if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
1105 cancel = 1;
1106 spin_unlock_irq(&mdev->req_lock);
1107 if (cancel)
1108 return 1;
1109
1110 if (!drbd_get_data_sock(mdev))
1111 return 0;
1112 p->barrier = b->br_number;
1113 /* inc_ap_pending was done where this was queued.
1114 * dec_ap_pending will be done in got_BarrierAck
1115 * or (on connection loss) in w_clear_epoch. */
1116 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1117 (struct p_header *)p, sizeof(*p), 0);
1118 drbd_put_data_sock(mdev);
1119
1120 return ok;
1121}
1122
1123int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1124{
1125 if (cancel)
1126 return 1;
1127 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1128}
1129
1130/**
1131 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1132 * @mdev: DRBD device.
1133 * @w: work object.
1134 * @cancel: The connection will be closed anyways
1135 */
1136int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1137{
1138 struct drbd_request *req = container_of(w, struct drbd_request, w);
1139 int ok;
1140
1141 if (unlikely(cancel)) {
1142 req_mod(req, send_canceled);
1143 return 1;
1144 }
1145
1146 ok = drbd_send_dblock(mdev, req);
1147 req_mod(req, ok ? handed_over_to_network : send_failed);
1148
1149 return ok;
1150}
1151
1152/**
1153 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1154 * @mdev: DRBD device.
1155 * @w: work object.
1156 * @cancel: The connection will be closed anyways
1157 */
1158int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1159{
1160 struct drbd_request *req = container_of(w, struct drbd_request, w);
1161 int ok;
1162
1163 if (unlikely(cancel)) {
1164 req_mod(req, send_canceled);
1165 return 1;
1166 }
1167
1168 ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
1169 (unsigned long)req);
1170
1171 if (!ok) {
1172 /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
1173 * so this is probably redundant */
1174 if (mdev->state.conn >= C_CONNECTED)
1175 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
1176 }
1177 req_mod(req, ok ? handed_over_to_network : send_failed);
1178
1179 return ok;
1180}
1181
1182static int _drbd_may_sync_now(struct drbd_conf *mdev)
1183{
1184 struct drbd_conf *odev = mdev;
1185
1186 while (1) {
1187 if (odev->sync_conf.after == -1)
1188 return 1;
1189 odev = minor_to_mdev(odev->sync_conf.after);
1190 ERR_IF(!odev) return 1;
1191 if ((odev->state.conn >= C_SYNC_SOURCE &&
1192 odev->state.conn <= C_PAUSED_SYNC_T) ||
1193 odev->state.aftr_isp || odev->state.peer_isp ||
1194 odev->state.user_isp)
1195 return 0;
1196 }
1197}
1198
1199/**
1200 * _drbd_pause_after() - Pause resync on all devices that may not resync now
1201 * @mdev: DRBD device.
1202 *
1203 * Called from process context only (admin command and after_state_ch).
1204 */
1205static int _drbd_pause_after(struct drbd_conf *mdev)
1206{
1207 struct drbd_conf *odev;
1208 int i, rv = 0;
1209
1210 for (i = 0; i < minor_count; i++) {
1211 odev = minor_to_mdev(i);
1212 if (!odev)
1213 continue;
1214 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1215 continue;
1216 if (!_drbd_may_sync_now(odev))
1217 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1218 != SS_NOTHING_TO_DO);
1219 }
1220
1221 return rv;
1222}
1223
1224/**
1225 * _drbd_resume_next() - Resume resync on all devices that may resync now
1226 * @mdev: DRBD device.
1227 *
1228 * Called from process context only (admin command and worker).
1229 */
1230static int _drbd_resume_next(struct drbd_conf *mdev)
1231{
1232 struct drbd_conf *odev;
1233 int i, rv = 0;
1234
1235 for (i = 0; i < minor_count; i++) {
1236 odev = minor_to_mdev(i);
1237 if (!odev)
1238 continue;
1239 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1240 continue;
1241 if (odev->state.aftr_isp) {
1242 if (_drbd_may_sync_now(odev))
1243 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1244 CS_HARD, NULL)
1245 != SS_NOTHING_TO_DO) ;
1246 }
1247 }
1248 return rv;
1249}
1250
1251void resume_next_sg(struct drbd_conf *mdev)
1252{
1253 write_lock_irq(&global_state_lock);
1254 _drbd_resume_next(mdev);
1255 write_unlock_irq(&global_state_lock);
1256}
1257
1258void suspend_other_sg(struct drbd_conf *mdev)
1259{
1260 write_lock_irq(&global_state_lock);
1261 _drbd_pause_after(mdev);
1262 write_unlock_irq(&global_state_lock);
1263}
1264
1265static int sync_after_error(struct drbd_conf *mdev, int o_minor)
1266{
1267 struct drbd_conf *odev;
1268
1269 if (o_minor == -1)
1270 return NO_ERROR;
1271 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
1272 return ERR_SYNC_AFTER;
1273
1274 /* check for loops */
1275 odev = minor_to_mdev(o_minor);
1276 while (1) {
1277 if (odev == mdev)
1278 return ERR_SYNC_AFTER_CYCLE;
1279
1280 /* dependency chain ends here, no cycles. */
1281 if (odev->sync_conf.after == -1)
1282 return NO_ERROR;
1283
1284 /* follow the dependency chain */
1285 odev = minor_to_mdev(odev->sync_conf.after);
1286 }
1287}
1288
1289int drbd_alter_sa(struct drbd_conf *mdev, int na)
1290{
1291 int changes;
1292 int retcode;
1293
1294 write_lock_irq(&global_state_lock);
1295 retcode = sync_after_error(mdev, na);
1296 if (retcode == NO_ERROR) {
1297 mdev->sync_conf.after = na;
1298 do {
1299 changes = _drbd_pause_after(mdev);
1300 changes |= _drbd_resume_next(mdev);
1301 } while (changes);
1302 }
1303 write_unlock_irq(&global_state_lock);
1304 return retcode;
1305}
1306
1307/**
1308 * drbd_start_resync() - Start the resync process
1309 * @mdev: DRBD device.
1310 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1311 *
1312 * This function might bring you directly into one of the
1313 * C_PAUSED_SYNC_* states.
1314 */
1315void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1316{
1317 union drbd_state ns;
1318 int r;
1319
1320 if (mdev->state.conn >= C_SYNC_SOURCE) {
1321 dev_err(DEV, "Resync already running!\n");
1322 return;
1323 }
1324
1325 trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n",
1326 side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource");
1327
1328 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1329 drbd_rs_cancel_all(mdev);
1330
1331 if (side == C_SYNC_TARGET) {
1332 /* Since application IO was locked out during C_WF_BITMAP_T and
1333 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1334 we check that we might make the data inconsistent. */
1335 r = drbd_khelper(mdev, "before-resync-target");
1336 r = (r >> 8) & 0xff;
1337 if (r > 0) {
1338 dev_info(DEV, "before-resync-target handler returned %d, "
1339 "dropping connection.\n", r);
1340 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1341 return;
1342 }
1343 }
1344
1345 drbd_state_lock(mdev);
1346
1347 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1348 drbd_state_unlock(mdev);
1349 return;
1350 }
1351
1352 if (side == C_SYNC_TARGET) {
1353 mdev->bm_resync_fo = 0;
1354 } else /* side == C_SYNC_SOURCE */ {
1355 u64 uuid;
1356
1357 get_random_bytes(&uuid, sizeof(u64));
1358 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1359 drbd_send_sync_uuid(mdev, uuid);
1360
1361 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1362 }
1363
1364 write_lock_irq(&global_state_lock);
1365 ns = mdev->state;
1366
1367 ns.aftr_isp = !_drbd_may_sync_now(mdev);
1368
1369 ns.conn = side;
1370
1371 if (side == C_SYNC_TARGET)
1372 ns.disk = D_INCONSISTENT;
1373 else /* side == C_SYNC_SOURCE */
1374 ns.pdsk = D_INCONSISTENT;
1375
1376 r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1377 ns = mdev->state;
1378
1379 if (ns.conn < C_CONNECTED)
1380 r = SS_UNKNOWN_ERROR;
1381
1382 if (r == SS_SUCCESS) {
1383 mdev->rs_total =
1384 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
1385 mdev->rs_failed = 0;
1386 mdev->rs_paused = 0;
1387 mdev->rs_start =
1388 mdev->rs_mark_time = jiffies;
1389 mdev->rs_same_csum = 0;
1390 _drbd_pause_after(mdev);
1391 }
1392 write_unlock_irq(&global_state_lock);
1393 drbd_state_unlock(mdev);
1394 put_ldev(mdev);
1395
1396 if (r == SS_SUCCESS) {
1397 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1398 drbd_conn_str(ns.conn),
1399 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1400 (unsigned long) mdev->rs_total);
1401
1402 if (mdev->rs_total == 0) {
1403 /* Peer still reachable? Beware of failing before-resync-target handlers! */
1404 request_ping(mdev);
1405 __set_current_state(TASK_INTERRUPTIBLE);
1406 schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */
1407 drbd_resync_finished(mdev);
1408 return;
1409 }
1410
1411 /* ns.conn may already be != mdev->state.conn,
1412 * we may have been paused in between, or become paused until
1413 * the timer triggers.
1414 * No matter, that is handled in resync_timer_fn() */
1415 if (ns.conn == C_SYNC_TARGET)
1416 mod_timer(&mdev->resync_timer, jiffies);
1417
1418 drbd_md_sync(mdev);
1419 }
1420}
1421
/*
 * drbd_worker() - main function of the per-device worker thread
 * @thi: thread descriptor; thi->mdev is the device this worker serves.
 *
 * While the thread state is Running, one work item is taken from
 * mdev->data.work.q per successful down() of mdev->data.work.s and its
 * callback is invoked with cancel = (state.conn < C_CONNECTED).  If a
 * callback returns 0 while state.conn >= C_CONNECTED, the connection is
 * forced to C_NETWORK_FAILURE.
 *
 * When no work is immediately available (down_trylock() fails), the data
 * socket is uncorked before sleeping and corked again after wake-up,
 * unless net_conf->no_cork is set.
 *
 * After the loop, all remaining work items are run with cancel = 1, the
 * semaphore is re-initialized to 0, the receiver thread is stopped,
 * drbd_mdev_cleanup() is called, DEVICE_DYING and CONFIG_PENDING are
 * cleared and waiters on state_wait are woken.
 *
 * Always returns 0.
 */
1422 int drbd_worker(struct drbd_thread *thi)
1423 {
1424 struct drbd_conf *mdev = thi->mdev;
1425 struct drbd_work *w = NULL;
1426 LIST_HEAD(work_list);
1427 int intr = 0, i;
1428
1429 sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
1430
1431 while (get_t_state(thi) == Running) {
1432 drbd_thread_current_set_cpu(mdev);
1433
/* nothing queued right now: uncork, sleep on the semaphore, cork again */
1434 if (down_trylock(&mdev->data.work.s)) {
1435 mutex_lock(&mdev->data.mutex);
1436 if (mdev->data.socket && !mdev->net_conf->no_cork)
1437 drbd_tcp_uncork(mdev->data.socket);
1438 mutex_unlock(&mdev->data.mutex);
1439
1440 intr = down_interruptible(&mdev->data.work.s);
1441
1442 mutex_lock(&mdev->data.mutex);
1443 if (mdev->data.socket && !mdev->net_conf->no_cork)
1444 drbd_tcp_cork(mdev->data.socket);
1445 mutex_unlock(&mdev->data.mutex);
1446 }
1447
/* the sleep was interrupted: flush signals; leave the loop unless
 * the thread is (unexpectedly) still supposed to be Running */
1448 if (intr) {
1449 D_ASSERT(intr == -EINTR);
1450 flush_signals(current);
1451 ERR_IF (get_t_state(thi) == Running)
1452 continue;
1453 break;
1454 }
1455
1456 if (get_t_state(thi) != Running)
1457 break;
1458 /* With this break, we have done a down() but not consumed
1459 the entry from the list. The cleanup code takes care of
1460 this... */
1461
1462 w = NULL;
1463 spin_lock_irq(&mdev->data.work.q_lock);
1464 ERR_IF(list_empty(&mdev->data.work.q)) {
1465 /* something terribly wrong in our logic.
1466 * we were able to down() the semaphore,
1467 * but the list is empty... doh.
1468 *
1469 * what is the best thing to do now?
1470 * try again from scratch, restarting the receiver,
1471 * asender, whatnot? could break even more ugly,
1472 * e.g. when we are primary, but no good local data.
1473 *
1474 * I'll try to get away just starting over this loop.
1475 */
1476 spin_unlock_irq(&mdev->data.work.q_lock);
1477 continue;
1478 }
1479 w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
1480 list_del_init(&w->list);
1481 spin_unlock_irq(&mdev->data.work.q_lock);
1482
/* run the callback; it is told to cancel when we are not connected */
1483 if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
1484 /* dev_warn(DEV, "worker: a callback failed! \n"); */
1485 if (mdev->state.conn >= C_CONNECTED)
1486 drbd_force_state(mdev,
1487 NS(conn, C_NETWORK_FAILURE));
1488 }
1489 }
1490 D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
1491 D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
1492
/* drain: move queued work to a private list under the lock, then run
 * every callback with cancel = 1 outside the lock; repeat until empty */
1493 spin_lock_irq(&mdev->data.work.q_lock);
1494 i = 0;
1495 while (!list_empty(&mdev->data.work.q)) {
1496 list_splice_init(&mdev->data.work.q, &work_list);
1497 spin_unlock_irq(&mdev->data.work.q_lock);
1498
1499 while (!list_empty(&work_list)) {
1500 w = list_entry(work_list.next, struct drbd_work, list);
1501 list_del_init(&w->list);
1502 w->cb(mdev, w, 1);
1503 i++; /* dead debugging code */
1504 }
1505
1506 spin_lock_irq(&mdev->data.work.q_lock);
1507 }
1508 sema_init(&mdev->data.work.s, 0);
1509 /* DANGEROUS race: if someone did queue his work within the spinlock,
1510 * but up() ed outside the spinlock, we could get an up() on the
1511 * semaphore without corresponding list entry.
1512 * So don't do that.
1513 */
1514 spin_unlock_irq(&mdev->data.work.q_lock);
1515
1516 D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
1517 /* _drbd_set_state only uses stop_nowait.
1518 * wait here for the Exiting receiver. */
1519 drbd_thread_stop(&mdev->receiver);
1520 drbd_mdev_cleanup(mdev);
1521
1522 dev_info(DEV, "worker terminated\n");
1523
1524 clear_bit(DEVICE_DYING, &mdev->flags);
1525 clear_bit(CONFIG_PENDING, &mdev->flags);
1526 wake_up(&mdev->state_wait);
1527
1528 return 0;
1529}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
new file mode 100644
index 000000000000..f93fa111ce50
--- /dev/null
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -0,0 +1,91 @@
1#ifndef _DRBD_WRAPPERS_H
2#define _DRBD_WRAPPERS_H
3
4#include <linux/ctype.h>
5#include <linux/mm.h>
6
7/* see get_sb_bdev and bd_claim */
8extern char *drbd_sec_holder;
9
10/* sets the number of 512 byte sectors of our virtual device */
11static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
12 sector_t size)
13{
14 /* set_capacity(mdev->this_bdev->bd_disk, size); */
15 set_capacity(mdev->vdisk, size);
16 mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
17}
18
19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20
21static inline int drbd_bio_has_active_page(struct bio *bio)
22{
23 struct bio_vec *bvec;
24 int i;
25
26 __bio_for_each_segment(bvec, bio, i, 0) {
27 if (page_count(bvec->bv_page) > 1)
28 return 1;
29 }
30
31 return 0;
32}
33
34/* bi_end_io handlers */
35extern void drbd_md_io_complete(struct bio *bio, int error);
36extern void drbd_endio_read_sec(struct bio *bio, int error);
37extern void drbd_endio_write_sec(struct bio *bio, int error);
38extern void drbd_endio_pri(struct bio *bio, int error);
39
40/*
41 * used to submit our private bio
42 */
43static inline void drbd_generic_make_request(struct drbd_conf *mdev,
44 int fault_type, struct bio *bio)
45{
46 __release(local);
47 if (!bio->bi_bdev) {
48 printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
49 "bio->bi_bdev == NULL\n",
50 mdev_to_minor(mdev));
51 dump_stack();
52 bio_endio(bio, -ENODEV);
53 return;
54 }
55
56 if (FAULT_ACTIVE(mdev, fault_type))
57 bio_endio(bio, -EIO);
58 else
59 generic_make_request(bio);
60}
61
62static inline void drbd_plug_device(struct drbd_conf *mdev)
63{
64 struct request_queue *q;
65 q = bdev_get_queue(mdev->this_bdev);
66
67 spin_lock_irq(q->queue_lock);
68
69/* XXX the check on !blk_queue_plugged is redundant,
70 * implicitly checked in blk_plug_device */
71
72 if (!blk_queue_plugged(q)) {
73 blk_plug_device(q);
74 del_timer(&q->unplug_timer);
75 /* unplugging should not happen automatically... */
76 }
77 spin_unlock_irq(q->queue_lock);
78}
79
80static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
81{
82 return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
83 == CRYPTO_ALG_TYPE_HASH;
84}
85
86#ifndef __CHECKER__
87# undef __cond_lock
88# define __cond_lock(x,c) (c)
89#endif
90
91#endif
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
new file mode 100644
index 000000000000..69dc711f37b3
--- /dev/null
+++ b/include/linux/drbd.h
@@ -0,0 +1,349 @@
1/*
2 drbd.h
3 Kernel module for 2.6.x Kernels
4
5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6
7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8 Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9 Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10
11 drbd is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 drbd is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with drbd; see the file COPYING. If not, write to
23 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24
25*/
26#ifndef DRBD_H
27#define DRBD_H
28#include <linux/connector.h>
29#include <asm/types.h>
30
31#ifdef __KERNEL__
32#include <linux/types.h>
33#include <asm/byteorder.h>
34#else
35#include <sys/types.h>
36#include <sys/wait.h>
37#include <limits.h>
38
39/* Although the Linux source code makes a difference between
40 generic endianness and the bitfields' endianness, there is no
41 architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
42 does not match the generic endianness. */
43
44#if __BYTE_ORDER == __LITTLE_ENDIAN
45#define __LITTLE_ENDIAN_BITFIELD
46#elif __BYTE_ORDER == __BIG_ENDIAN
47#define __BIG_ENDIAN_BITFIELD
48#else
49# error "sorry, weird endianness on this box"
50#endif
51
52#endif
53
54
55extern const char *drbd_buildtag(void);
56#define REL_VERSION "8.3.3rc2"
57#define API_VERSION 88
58#define PRO_VERSION_MIN 86
59#define PRO_VERSION_MAX 91
60
61
62enum drbd_io_error_p {
63 EP_PASS_ON, /* FIXME should this better be named "Ignore"? */
64 EP_CALL_HELPER,
65 EP_DETACH
66};
67
68enum drbd_fencing_p {
69 FP_DONT_CARE,
70 FP_RESOURCE,
71 FP_STONITH
72};
73
74enum drbd_disconnect_p {
75 DP_RECONNECT,
76 DP_DROP_NET_CONF,
77 DP_FREEZE_IO
78};
79
80enum drbd_after_sb_p {
81 ASB_DISCONNECT,
82 ASB_DISCARD_YOUNGER_PRI,
83 ASB_DISCARD_OLDER_PRI,
84 ASB_DISCARD_ZERO_CHG,
85 ASB_DISCARD_LEAST_CHG,
86 ASB_DISCARD_LOCAL,
87 ASB_DISCARD_REMOTE,
88 ASB_CONSENSUS,
89 ASB_DISCARD_SECONDARY,
90 ASB_CALL_HELPER,
91 ASB_VIOLENTLY
92};
93
94/* KEEP the order, do not delete or insert. Only append. */
95enum drbd_ret_codes {
96 ERR_CODE_BASE = 100,
97 NO_ERROR = 101,
98 ERR_LOCAL_ADDR = 102,
99 ERR_PEER_ADDR = 103,
100 ERR_OPEN_DISK = 104,
101 ERR_OPEN_MD_DISK = 105,
102 ERR_DISK_NOT_BDEV = 107,
103 ERR_MD_NOT_BDEV = 108,
104 ERR_DISK_TO_SMALL = 111,
105 ERR_MD_DISK_TO_SMALL = 112,
106 ERR_BDCLAIM_DISK = 114,
107 ERR_BDCLAIM_MD_DISK = 115,
108 ERR_MD_IDX_INVALID = 116,
109 ERR_IO_MD_DISK = 118,
110 ERR_MD_INVALID = 119,
111 ERR_AUTH_ALG = 120,
112 ERR_AUTH_ALG_ND = 121,
113 ERR_NOMEM = 122,
114 ERR_DISCARD = 123,
115 ERR_DISK_CONFIGURED = 124,
116 ERR_NET_CONFIGURED = 125,
117 ERR_MANDATORY_TAG = 126,
118 ERR_MINOR_INVALID = 127,
119 ERR_INTR = 129, /* EINTR */
120 ERR_RESIZE_RESYNC = 130,
121 ERR_NO_PRIMARY = 131,
122 ERR_SYNC_AFTER = 132,
123 ERR_SYNC_AFTER_CYCLE = 133,
124 ERR_PAUSE_IS_SET = 134,
125 ERR_PAUSE_IS_CLEAR = 135,
126 ERR_PACKET_NR = 137,
127 ERR_NO_DISK = 138,
128 ERR_NOT_PROTO_C = 139,
129 ERR_NOMEM_BITMAP = 140,
130 ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
131 ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
132 ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
133 ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
134 ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
135 ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
136 ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
137 ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
138 ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
139 ERR_DATA_NOT_CURRENT = 150,
140 ERR_CONNECTED = 151, /* DRBD 8.3 only */
141
142 /* insert new ones above this line */
143 AFTER_LAST_ERR_CODE
144};
145
146#define DRBD_PROT_A 1
147#define DRBD_PROT_B 2
148#define DRBD_PROT_C 3
149
150enum drbd_role {
151 R_UNKNOWN = 0,
152 R_PRIMARY = 1, /* role */
153 R_SECONDARY = 2, /* role */
154 R_MASK = 3,
155};
156
157/* The order of these constants is important.
158 * The lower ones (<C_WF_REPORT_PARAMS) indicate
159 * that there is no socket!
160 * >=C_WF_REPORT_PARAMS ==> There is a socket
161 */
162enum drbd_conns {
163 C_STANDALONE,
164 C_DISCONNECTING, /* Temporal state on the way to StandAlone. */
165 C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
166
167 /* These temporal states are all used on the way
168 * from >= C_CONNECTED to Unconnected.
169 * The 'disconnect reason' states
170 * I do not allow changing between them. */
171 C_TIMEOUT,
172 C_BROKEN_PIPE,
173 C_NETWORK_FAILURE,
174 C_PROTOCOL_ERROR,
175 C_TEAR_DOWN,
176
177 C_WF_CONNECTION,
178 C_WF_REPORT_PARAMS, /* we have a socket */
179 C_CONNECTED, /* we have introduced each other */
180 C_STARTING_SYNC_S, /* starting full sync by admin request. */
181 C_STARTING_SYNC_T, /* starting full sync by admin request. */
182 C_WF_BITMAP_S,
183 C_WF_BITMAP_T,
184 C_WF_SYNC_UUID,
185
186 /* All SyncStates are tested with this comparison
187 * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
188 C_SYNC_SOURCE,
189 C_SYNC_TARGET,
190 C_VERIFY_S,
191 C_VERIFY_T,
192 C_PAUSED_SYNC_S,
193 C_PAUSED_SYNC_T,
194 C_MASK = 31
195};
196
197enum drbd_disk_state {
198 D_DISKLESS,
199 D_ATTACHING, /* In the process of reading the meta-data */
200 D_FAILED, /* Becomes D_DISKLESS as soon as we told the peer */
201 /* when >= D_FAILED it is legal to access mdev->bc */
202 D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
203 D_INCONSISTENT,
204 D_OUTDATED,
205 D_UNKNOWN, /* Only used for the peer, never for myself */
206 D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
207 D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */
208 D_MASK = 15
209};
210
211union drbd_state {
212/* According to gcc's docs:
213 * the order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1)
214 * is determined by the ABI.
215 * Pointed out by Maxim Uvarov <muvarov@ru.mvista.com>:
216 * even though we transmit as "cpu_to_be32(state)",
217 * the offsets of the bitfields still need to be swapped
218 * on different endianness.
219 */
220 struct {
221#if defined(__LITTLE_ENDIAN_BITFIELD)
222 unsigned role:2 ; /* 3/4 primary/secondary/unknown */
223 unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
224 unsigned conn:5 ; /* 17/32 cstates */
225 unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
226 unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
227 unsigned susp:1 ; /* 2/2 IO suspended no/yes */
228 unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
229 unsigned peer_isp:1 ;
230 unsigned user_isp:1 ;
231 unsigned _pad:11; /* 0 unused */
232#elif defined(__BIG_ENDIAN_BITFIELD)
233 unsigned _pad:11; /* 0 unused */
234 unsigned user_isp:1 ;
235 unsigned peer_isp:1 ;
236 unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
237 unsigned susp:1 ; /* 2/2 IO suspended no/yes */
238 unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
239 unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
240 unsigned conn:5 ; /* 17/32 cstates */
241 unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
242 unsigned role:2 ; /* 3/4 primary/secondary/unknown */
243#else
244# error "this endianess is not supported"
245#endif
246 };
247 unsigned int i;
248};
249
250enum drbd_state_ret_codes {
251 SS_CW_NO_NEED = 4,
252 SS_CW_SUCCESS = 3,
253 SS_NOTHING_TO_DO = 2,
254 SS_SUCCESS = 1,
255 SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
256 SS_TWO_PRIMARIES = -1,
257 SS_NO_UP_TO_DATE_DISK = -2,
258 SS_NO_LOCAL_DISK = -4,
259 SS_NO_REMOTE_DISK = -5,
260 SS_CONNECTED_OUTDATES = -6,
261 SS_PRIMARY_NOP = -7,
262 SS_RESYNC_RUNNING = -8,
263 SS_ALREADY_STANDALONE = -9,
264 SS_CW_FAILED_BY_PEER = -10,
265 SS_IS_DISKLESS = -11,
266 SS_DEVICE_IN_USE = -12,
267 SS_NO_NET_CONFIG = -13,
268 SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
269 SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
270 SS_LOWER_THAN_OUTDATED = -16,
271 SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
272 SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
273 SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
274 SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
275};
276
277/* from drbd_strings.c */
278extern const char *drbd_conn_str(enum drbd_conns);
279extern const char *drbd_role_str(enum drbd_role);
280extern const char *drbd_disk_str(enum drbd_disk_state);
281extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
282
283#define SHARED_SECRET_MAX 64
284
285#define MDF_CONSISTENT (1 << 0)
286#define MDF_PRIMARY_IND (1 << 1)
287#define MDF_CONNECTED_IND (1 << 2)
288#define MDF_FULL_SYNC (1 << 3)
289#define MDF_WAS_UP_TO_DATE (1 << 4)
290#define MDF_PEER_OUT_DATED (1 << 5)
291#define MDF_CRASHED_PRIMARY (1 << 6)
292
293enum drbd_uuid_index {
294 UI_CURRENT,
295 UI_BITMAP,
296 UI_HISTORY_START,
297 UI_HISTORY_END,
298 UI_SIZE, /* nl-packet: number of dirty bits */
299 UI_FLAGS, /* nl-packet: flags */
300 UI_EXTENDED_SIZE /* Everything. */
301};
302
303enum drbd_timeout_flag {
304 UT_DEFAULT = 0,
305 UT_DEGRADED = 1,
306 UT_PEER_OUTDATED = 2,
307};
308
309#define UUID_JUST_CREATED ((__u64)4)
310
311#define DRBD_MAGIC 0x83740267
312#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
313
314/* these are of type "int" */
315#define DRBD_MD_INDEX_INTERNAL -1
316#define DRBD_MD_INDEX_FLEX_EXT -2
317#define DRBD_MD_INDEX_FLEX_INT -3
318
319/* Start of the new netlink/connector stuff */
320
321#define DRBD_NL_CREATE_DEVICE 0x01
322#define DRBD_NL_SET_DEFAULTS 0x02
323
324/* The following line should be moved over to linux/connector.h
325 * when the time comes */
326#ifndef CN_IDX_DRBD
327# define CN_IDX_DRBD 0x4
328/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
329#endif
330#define CN_VAL_DRBD 0x1
331
332/* For searching a vacant cn_idx value */
333#define CN_IDX_STEP 6977
334
335struct drbd_nl_cfg_req {
336 int packet_type;
337 unsigned int drbd_minor;
338 int flags;
339 unsigned short tag_list[];
340};
341
342struct drbd_nl_cfg_reply {
343 int packet_type;
344 unsigned int minor;
345 int ret_code; /* enum ret_code or set_st_err_t */
346 unsigned short tag_list[]; /* only used with get_* calls */
347};
348
349#endif
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
new file mode 100644
index 000000000000..9d067ce46960
--- /dev/null
+++ b/include/linux/drbd_limits.h
@@ -0,0 +1,137 @@
1/*
2 drbd_limits.h
3 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
4*/
5
6/*
7 * Our current limitations.
8 * Some of them are hard limits,
9 * some of them are arbitrary range limits, that make it easier to provide
10 * feedback about nonsense settings for certain configurable values.
11 */
12
13#ifndef DRBD_LIMITS_H
14#define DRBD_LIMITS_H 1
15
16#define DEBUG_RANGE_CHECK 0
17
18#define DRBD_MINOR_COUNT_MIN 1
19#define DRBD_MINOR_COUNT_MAX 255
20
21#define DRBD_DIALOG_REFRESH_MIN 0
22#define DRBD_DIALOG_REFRESH_MAX 600
23
24/* valid port number */
25#define DRBD_PORT_MIN 1
26#define DRBD_PORT_MAX 0xffff
27
28/* startup { */
29 /* if you want more than 3.4 days, disable */
30#define DRBD_WFC_TIMEOUT_MIN 0
31#define DRBD_WFC_TIMEOUT_MAX 300000
32#define DRBD_WFC_TIMEOUT_DEF 0
33
34#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
35#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
36#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
37
38#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
39#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
40#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
41/* }*/
42
43/* net { */
44 /* timeout, unit centi seconds
45 * more than one minute timeout is not useful */
46#define DRBD_TIMEOUT_MIN 1
47#define DRBD_TIMEOUT_MAX 600
48#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
49
50 /* active connection retries when C_WF_CONNECTION */
51#define DRBD_CONNECT_INT_MIN 1
52#define DRBD_CONNECT_INT_MAX 120
53#define DRBD_CONNECT_INT_DEF 10 /* seconds */
54
55 /* keep-alive probes when idle */
56#define DRBD_PING_INT_MIN 1
57#define DRBD_PING_INT_MAX 120
58#define DRBD_PING_INT_DEF 10
59
60 /* timeout for the ping packets.*/
61#define DRBD_PING_TIMEO_MIN 1
62#define DRBD_PING_TIMEO_MAX 100
63#define DRBD_PING_TIMEO_DEF 5
64
65 /* max number of write requests between write barriers */
66#define DRBD_MAX_EPOCH_SIZE_MIN 1
67#define DRBD_MAX_EPOCH_SIZE_MAX 20000
68#define DRBD_MAX_EPOCH_SIZE_DEF 2048
69
70 /* I don't think that a tcp send buffer of more than 10M is useful */
71#define DRBD_SNDBUF_SIZE_MIN 0
72#define DRBD_SNDBUF_SIZE_MAX (10<<20)
73#define DRBD_SNDBUF_SIZE_DEF (2*65535)
74
75#define DRBD_RCVBUF_SIZE_MIN 0
76#define DRBD_RCVBUF_SIZE_MAX (10<<20)
77#define DRBD_RCVBUF_SIZE_DEF (2*65535)
78
79 /* @4k PageSize -> 128kB - 512MB */
80#define DRBD_MAX_BUFFERS_MIN 32
81#define DRBD_MAX_BUFFERS_MAX 131072
82#define DRBD_MAX_BUFFERS_DEF 2048
83
84 /* @4k PageSize -> 4kB - 512MB */
85#define DRBD_UNPLUG_WATERMARK_MIN 1
86#define DRBD_UNPLUG_WATERMARK_MAX 131072
87#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
88
89 /* 0 is disabled.
90 * 200 should be more than enough even for very short timeouts */
91#define DRBD_KO_COUNT_MIN 0
92#define DRBD_KO_COUNT_MAX 200
93#define DRBD_KO_COUNT_DEF 0
94/* } */
95
96/* syncer { */
97 /* FIXME allow rate to be zero? */
98#define DRBD_RATE_MIN 1
99/* channel bonding 10 GbE, or other hardware */
100#define DRBD_RATE_MAX (4 << 20)
101#define DRBD_RATE_DEF 250 /* kb/second */
102
103 /* less than 7 would hit performance unnecessarily.
104 * 3833 is the largest prime that still does fit
105 * into 64 sectors of activity log */
106#define DRBD_AL_EXTENTS_MIN 7
107#define DRBD_AL_EXTENTS_MAX 3833
108#define DRBD_AL_EXTENTS_DEF 127
109
110#define DRBD_AFTER_MIN -1
111#define DRBD_AFTER_MAX 255
112#define DRBD_AFTER_DEF -1
113
114/* } */
115
116/* drbdsetup XY resize -d Z
117 * you are free to reduce the device size to nothing, if you want to.
118 * the upper limit with 64bit kernel, enough ram and flexible meta data
119 * is 16 TB, currently. */
120/* DRBD_MAX_SECTORS */
121#define DRBD_DISK_SIZE_SECT_MIN 0
122#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
123#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
124
125#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
126#define DRBD_FENCING_DEF FP_DONT_CARE
127#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
128#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
129#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
130#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
131
132#define DRBD_MAX_BIO_BVECS_MIN 0
133#define DRBD_MAX_BIO_BVECS_MAX 128
134#define DRBD_MAX_BIO_BVECS_DEF 0
135
136#undef RANGE
137#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
new file mode 100644
index 000000000000..db5721ad50d1
--- /dev/null
+++ b/include/linux/drbd_nl.h
@@ -0,0 +1,137 @@
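1/*
2 NL_PACKET( name, number,
3 NL_<TYPE>( pn, pr, member ) with <TYPE> one of INTEGER, INT64, BIT, STRING (NL_STRING takes the maximum length as a fourth argument)
4 ...
5 )
6
7 A pn (tag number) that has been used once may never be reissued.
8*/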
9
10#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
11#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
12#endif
13
14NL_PACKET(primary, 1,
15 NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
16)
17
18NL_PACKET(secondary, 2, )
19
20NL_PACKET(disk_conf, 3,
21 NL_INT64( 2, T_MAY_IGNORE, disk_size)
22 NL_STRING( 3, T_MANDATORY, backing_dev, 128)
23 NL_STRING( 4, T_MANDATORY, meta_dev, 128)
24 NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
25 NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
26 NL_INTEGER( 7, T_MAY_IGNORE, fencing)
27 NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
28 NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
29 NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
30 /* 55 max_bio_size was available in 8.2.6rc2 */
31 NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
32 NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
33 NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
34)
35
36NL_PACKET(detach, 4, )
37
38NL_PACKET(net_conf, 5,
39 NL_STRING( 8, T_MANDATORY, my_addr, 128)
40 NL_STRING( 9, T_MANDATORY, peer_addr, 128)
41 NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
42 NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
43 NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
44 NL_INTEGER( 14, T_MAY_IGNORE, timeout)
45 NL_INTEGER( 15, T_MANDATORY, wire_protocol)
46 NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
47 NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
48 NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
49 NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
50 NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
51 NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
52 NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
53 NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
54 NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
55 NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
56 NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
57 NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
58 NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
59 /* 59 addr_family was available in GIT, never released */
60 NL_BIT( 60, T_MANDATORY, mind_af)
61 NL_BIT( 27, T_MAY_IGNORE, want_lose)
62 NL_BIT( 28, T_MAY_IGNORE, two_primaries)
63 NL_BIT( 41, T_MAY_IGNORE, always_asbp)
64 NL_BIT( 61, T_MAY_IGNORE, no_cork)
65 NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
66)
67
68NL_PACKET(disconnect, 6, )
69
70NL_PACKET(resize, 7,
71 NL_INT64( 29, T_MAY_IGNORE, resize_size)
72)
73
74NL_PACKET(syncer_conf, 8,
75 NL_INTEGER( 30, T_MAY_IGNORE, rate)
76 NL_INTEGER( 31, T_MAY_IGNORE, after)
77 NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
78 NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
79 NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
80 NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
81 NL_BIT( 65, T_MAY_IGNORE, use_rle)
82)
83
84NL_PACKET(invalidate, 9, )
85NL_PACKET(invalidate_peer, 10, )
86NL_PACKET(pause_sync, 11, )
87NL_PACKET(resume_sync, 12, )
88NL_PACKET(suspend_io, 13, )
89NL_PACKET(resume_io, 14, )
90NL_PACKET(outdate, 15, )
91NL_PACKET(get_config, 16, )
92NL_PACKET(get_state, 17,
93 NL_INTEGER( 33, T_MAY_IGNORE, state_i)
94)
95
96NL_PACKET(get_uuids, 18,
97 NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
98 NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
99)
100
101NL_PACKET(get_timeout_flag, 19,
102 NL_BIT( 36, T_MAY_IGNORE, use_degraded)
103)
104
105NL_PACKET(call_helper, 20,
106 NL_STRING( 38, T_MAY_IGNORE, helper, 32)
107)
108
109/* Tag nr 42 already allocated in drbd-8.1 development. */
110
111NL_PACKET(sync_progress, 23,
112 NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
113)
114
115NL_PACKET(dump_ee, 24,
116 NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
117 NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
118 NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
119 NL_INT64( 48, T_MAY_IGNORE, ee_sector)
120 NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
121 NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
122)
123
124NL_PACKET(start_ov, 25,
125 NL_INT64( 66, T_MAY_IGNORE, start_sector)
126)
127
128NL_PACKET(new_c_uuid, 26,
129 NL_BIT( 63, T_MANDATORY, clear_bm)
130)
131
132#undef NL_PACKET
133#undef NL_INTEGER
134#undef NL_INT64
135#undef NL_BIT
136#undef NL_STRING
137
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
new file mode 100644
index 000000000000..fcdff8410e99
--- /dev/null
+++ b/include/linux/drbd_tag_magic.h
@@ -0,0 +1,83 @@
1#ifndef DRBD_TAG_MAGIC_H
2#define DRBD_TAG_MAGIC_H
3
4#define TT_END 0
5#define TT_REMOVED 0xE000
6
/* declare packet_type enums
 *
 * drbd_nl.h is an X-macro list.  With the definitions below, every
 * NL_PACKET(name, number, ...) in it expands to the enumerator
 * "P_<name> = number," while the per-field macros expand to nothing.
 * drbd_nl.h #undefs all five macros at its end, so each expansion in this
 * header has to define them anew before including it.
 */
enum packet_types {
#define NL_PACKET(name, number, fields) P_ ## name = number,
#define NL_INTEGER(pn, pr, member)
#define NL_INT64(pn, pr, member)
#define NL_BIT(pn, pr, member)
#define NL_STRING(pn, pr, member, len)
#include "drbd_nl.h"
 /* no explicit value: one more than the last packet number listed */
 P_nl_after_last_packet,
};
17
18/* These struct are used to deduce the size of the tag lists: */
19#define NL_PACKET(name, number, fields) \
20 struct name ## _tag_len_struct { fields };
21#define NL_INTEGER(pn, pr, member) \
22 int member; int tag_and_len ## member;
23#define NL_INT64(pn, pr, member) \
24 __u64 member; int tag_and_len ## member;
25#define NL_BIT(pn, pr, member) \
26 unsigned char member:1; int tag_and_len ## member;
27#define NL_STRING(pn, pr, member, len) \
28 unsigned char member[len]; int member ## _len; \
29 int tag_and_len ## member;
30#include "linux/drbd_nl.h"
31
/* declare tag-list-sizes
 *
 * One entry per NL_PACKET in drbd_nl.h, in the order the packets are listed
 * there.  Each entry is the constant 2 plus, for every field of the packet,
 * 4 plus the payload size of the field (4 for an integer, 8 for an int64,
 * 1 for a bit, len for a string).
 * NOTE(review): presumably the 4 accounts for a 16 bit tag plus a 16 bit
 * length, and the 2 for the terminating TT_END tag -- confirm against the
 * tag list encoder.
 * NOTE(review): the initializer is positional, not designated; as drbd_nl.h
 * skips some packet numbers, the array index is not the packet number.
 */
static const int tag_list_sizes[] = {
#define NL_PACKET(name, number, fields) 2 fields ,
#define NL_INTEGER(pn, pr, member) + 4 + 4
#define NL_INT64(pn, pr, member) + 4 + 8
#define NL_BIT(pn, pr, member) + 4 + 1
#define NL_STRING(pn, pr, member, len) + 4 + (len)
#include "drbd_nl.h"
};
41
/* Layout of a 16 bit tag value:
 *   bits 15..14 (TT_MASK)     tag type
 *   bit  13     (T_MANDATORY) set if processing of the tag is mandatory
 *   bits 12..0  (TN_MASK)     tag number
 */
/* The two highest bits are used for the tag type */
#define TT_MASK 0xC000
#define TT_INTEGER 0x0000
#define TT_INT64 0x4000
#define TT_BIT 0x8000
#define TT_STRING 0xC000
/* The next bit indicates if processing of the tag is mandatory */
#define T_MANDATORY 0x2000
#define T_MAY_IGNORE 0x0000
#define TN_MASK 0x1fff
/* The remaining 13 bits are used to enumerate the tags */

/* extract the type bits resp. the tag number from a tag value */
#define tag_type(T) ((T) & TT_MASK)
#define tag_number(T) ((T) & TN_MASK)
56
/* declare tag enums
 *
 * NL_PACKET expands to just its field list here, so the enum gets one
 * enumerator T_<member> per field of any packet in drbd_nl.h.  Its value is
 * the tag number (pn) ORed with the type bits of the field and with the
 * "pr" argument (T_MANDATORY or T_MAY_IGNORE in drbd_nl.h).
 */
#define NL_PACKET(name, number, fields) fields
enum drbd_tags {
#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
#include "drbd_nl.h"
};
66
/* Description of a single tag, see tag_descriptions[] */
struct tag {
	const char *name;	/* member name, as a string */
	int type_n_flags;	/* TT_* type bits | T_MANDATORY or T_MAY_IGNORE */
	int max_len;		/* size of the payload in bytes; for strings the maximum */
};
72
/* declare tag names
 *
 * Table of struct tag, indexed by tag number (pn) via designated
 * initializers; tag numbers not used in drbd_nl.h leave zero-filled
 * entries (name == NULL).
 * NOTE(review): NL_BIT records sizeof(int) as max_len here, whereas
 * tag_list_sizes[] accounts 1 byte per bit -- confirm which one the
 * encoder/decoder expects.
 */
#define NL_PACKET(name, number, fields) fields
static const struct tag tag_descriptions[] = {
#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
#include "drbd_nl.h"
};
82
83#endif
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
new file mode 100644
index 000000000000..3a2b2d9b0472
--- /dev/null
+++ b/include/linux/lru_cache.h
@@ -0,0 +1,294 @@
1/*
2 lru_cache.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#ifndef LRU_CACHE_H
27#define LRU_CACHE_H
28
29#include <linux/list.h>
30#include <linux/slab.h>
31#include <linux/bitops.h>
32#include <linux/string.h> /* for memset */
33#include <linux/seq_file.h>
34
35/*
36This header file (and its .c file; kernel-doc of functions see there)
37 define a helper framework to easily keep track of index:label associations,
38 and changes to an "active set" of objects, as well as pending transactions,
39 to persistently record those changes.
40
41 We use an LRU policy if it is necessary to "cool down" a region currently in
42 the active set before we can "heat" a previously unused region.
43
44 Because of this latter property, it is called "lru_cache".
45 As it actually Tracks Objects in an Active SeT, we could also call it
46 toast (incidentally that is what may happen to the data on the
47 backend storage upon next resync, if we don't get it right).
48
49What for?
50
51We replicate IO (more or less synchronously) to local and remote disk.
52
53For crash recovery after replication node failure,
54 we need to resync all regions that have been target of in-flight WRITE IO
55 (in use, or "hot", regions), as we don't know whether or not those WRITEs have
56 made it to stable storage.
57
58 To avoid a "full resync", we need to persistently track these regions.
59
60 This is known as "write intent log", and can be implemented as on-disk
61 (coarse or fine grained) bitmap, or other meta data.
62
63 To avoid the overhead of frequent extra writes to this meta data area,
64 usually the condition is softened to regions that _may_ have been target of
65 in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
66 bitmap, trading frequency of meta data transactions against amount of
67 (possibly unnecessary) resync traffic.
68
69 If we set a hard limit on the area that may be "hot" at any given time, we
70 limit the amount of resync traffic needed for crash recovery.
71
72For recovery after replication link failure,
73 we need to resync all blocks that have been changed on the other replica
74 in the mean time, or, if both replica have been changed independently [*],
75 all blocks that have been changed on either replica in the mean time.
76 [*] usually as a result of a cluster split-brain and insufficient protection.
77 but there are valid use cases to do this on purpose.
78
79 Tracking those blocks can be implemented as "dirty bitmap".
80 Having it fine-grained reduces the amount of resync traffic.
81 It should also be persistent, to allow for reboots (or crashes)
82 while the replication link is down.
83
84There are various possible implementations for persistently storing
85write intent log information, three of which are mentioned here.
86
87"Chunk dirtying"
88 The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
89 To reduce the frequency of bitmap updates for write-intent log purposes,
90 one could dirty "chunks" (of some size) at a time of the (fine grained)
91 on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
92 possible, flushing it to disk again when a previously "hot" (and on-disk
93 dirtied as full chunk) area "cools down" again (no IO in flight anymore,
94 and none expected in the near future either).
95
96"Explicit (coarse) write intent bitmap"
97 Another implementation could choose a (probably coarse) explicit bitmap,
98 for write-intent log purposes, additionally to the fine grained dirty bitmap.
99
100"Activity log"
101 Yet another implementation may keep track of the hot regions, by starting
102 with an empty set, and writing down a journal of region numbers that have
103 become "hot", or have "cooled down" again.
104
105 To be able to use a ring buffer for this journal of changes to the active
106 set, we not only record the actual changes to that set, but also record the
107 not changing members of the set in a round robin fashion. To do so, we use a
108 fixed (but configurable) number of slots which we can identify by index, and
109 associate region numbers (labels) with these indices.
110 For each transaction recording a change to the active set, we record the
111 change itself (index: -old_label, +new_label), and which index is associated
112 with which label (index: current_label) within a certain sliding window that
113 is moved further over the available indices with each such transaction.
114
115 Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
116 accurately reconstruct the active set.
117
118 Sufficiently large depends only on maximum number of active objects, and the
119 size of the sliding window recording "index: current_label" associations within
120 each transaction.
121
122 This is what we call the "activity log".
123
124 Currently we need one activity log transaction per single label change, which
125 does not give much benefit over the "dirty chunks of bitmap" approach, other
126 than potentially less seeks.
127
128 We plan to change the transaction format to support multiple changes per
129 transaction, which then would reduce several (disjoint, "random") updates to
130 the bitmap into one transaction to the activity log ring buffer.
131*/
132
133/* this defines an element in a tracked set
134 * .colision is for hash table lookup.
135 * When we process a new IO request, we know its sector, thus can deduce the
136 * region number (label) easily. To do the label -> object lookup without a
137 * full list walk, we use a simple hash table.
138 *
139 * .list is on one of three lists:
140 * in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
141 * lru: unused but ready to be reused or recycled
142 * (ts_refcnt == 0, lc_number != LC_FREE),
143 * free: unused but ready to be recycled
144 * (ts_refcnt == 0, lc_number == LC_FREE),
145 *
146 * an element is said to be "in the active set",
147 * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
148 *
149 * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
150 * (total memory usage 2 pages), and up to 3833 elements on the act_log
151 * lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages.
152 *
153 * We usually do not actually free these objects again, but only "recycle"
154 * them, as the change "index: -old_label, +LC_FREE" would need a transaction
155 * as well. Which also means that using a kmem_cache to allocate the objects
156 * from wastes some resources.
157 * But it avoids high order page allocations in kmalloc.
158 */
159struct lc_element {
160 struct hlist_node colision;
161 struct list_head list; /* LRU list or free list */
162 unsigned refcnt;
163 /* back "pointer" into ts_cache->element[index],
164 * for paranoia, and for "ts_element_to_index" */
165 unsigned lc_index;
166 /* if we want to track a larger set of objects,
167 * it needs to become arch independend u64 */
168 unsigned lc_number;
169
170 /* special label when on free list */
171#define LC_FREE (~0U)
172};
173
174struct lru_cache {
175 /* the least recently used item is kept at lru->prev */
176 struct list_head lru;
177 struct list_head free;
178 struct list_head in_use;
179
180 /* the pre-created kmem cache to allocate the objects from */
181 struct kmem_cache *lc_cache;
182
183 /* size of tracked objects, used to memset(,0,) them in lc_reset */
184 size_t element_size;
185 /* offset of struct lc_element member in the tracked object */
186 size_t element_off;
187
188 /* number of elements (indices) */
189 unsigned int nr_elements;
190 /* Arbitrary limit on maximum tracked objects. Practical limit is much
191 * lower due to allocation failures, probably. For typical use cases,
192 * nr_elements should be a few thousand at most.
193 * This also limits the maximum value of ts_element.ts_index, allowing the
194 * 8 high bits of .ts_index to be overloaded with flags in the future. */
195#define LC_MAX_ACTIVE (1<<24)
196
197 /* statistics */
198 unsigned used; /* number of lelements currently on in_use list */
199 unsigned long hits, misses, starving, dirty, changed;
200
201 /* see below: flag-bits for lru_cache */
202 unsigned long flags;
203
204 /* when changing the label of an index element */
205 unsigned int new_number;
206
207 /* for paranoia when changing the label of an index element */
208 struct lc_element *changing_element;
209
210 void *lc_private;
211 const char *name;
212
213 /* nr_elements there */
214 struct hlist_head *lc_slot;
215 struct lc_element **lc_element;
216};
217
218
/* flag-bits for lru_cache: bit numbers within lru_cache.flags */
enum {
	/* debugging aid, to catch concurrent access early.
	 * user needs to guarantee exclusive access by proper locking! */
	__LC_PARANOIA = 0,

	/* if we need to change the set, but currently there is a changing
	 * transaction pending, we are "dirty", and must defer further
	 * changing requests */
	__LC_DIRTY = 1,

	/* if we need to change the set, but currently there is no free nor
	 * unused element available, we are "starving", and must not give out
	 * further references, to guarantee that eventually some refcnt will
	 * drop to zero and we will be able to make progress again, changing
	 * the set, writing the transaction.
	 * if the statistics say we are frequently starving,
	 * nr_elements is too small. */
	__LC_STARVING = 2,
};

/* the corresponding bit masks */
#define LC_PARANOIA (1 << __LC_PARANOIA)
#define LC_DIRTY    (1 << __LC_DIRTY)
#define LC_STARVING (1 << __LC_STARVING)
240
241extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
242 unsigned e_count, size_t e_size, size_t e_off);
243extern void lc_reset(struct lru_cache *lc);
244extern void lc_destroy(struct lru_cache *lc);
245extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
246extern void lc_del(struct lru_cache *lc, struct lc_element *element);
247
248extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
249extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
250extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
251extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
252extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
253
254struct seq_file;
255extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
256
257extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
258 void (*detail) (struct seq_file *, struct lc_element *));
259
260/**
261 * lc_try_lock - can be used to stop lc_get() from changing the tracked set
262 * @lc: the lru cache to operate on
263 *
264 * Note that the reference counts and order on the active and lru lists may
265 * still change. Returns true if we aquired the lock.
266 */
267static inline int lc_try_lock(struct lru_cache *lc)
268{
269 return !test_and_set_bit(__LC_DIRTY, &lc->flags);
270}
271
/**
 * lc_unlock - unlock @lc, allow lc_get() to change the set again
 * @lc: the lru cache to operate on
 *
 * Counterpart of lc_try_lock(): clears the __LC_DIRTY flag again.
 */
static inline void lc_unlock(struct lru_cache *lc)
{
 clear_bit(__LC_DIRTY, &lc->flags);
 /* memory barrier following the clear_bit() */
 smp_mb__after_clear_bit();
}
281
282static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
283{
284 struct lc_element *e = lc_find(lc, enr);
285 return e && e->refcnt;
286}
287
/* get from the embedded struct lc_element (ptr) to the tracked object of
 * the given type that contains it as the given member */
#define lc_entry(ptr, type, member) \
 container_of(ptr, type, member)

/* conversion between an element and its index; defined in lru_cache.c */
extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
293
294#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index bb1326d3839c..1cfe51628e1b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -200,4 +200,7 @@ config NLATTR
200config GENERIC_ATOMIC64 200config GENERIC_ATOMIC64
201 bool 201 bool
202 202
203config LRU_CACHE
204 tristate
205
203endmenu 206endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277eff9d..347ad8db29d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
91 91
92obj-$(CONFIG_NLATTR) += nlattr.o 92obj-$(CONFIG_NLATTR) += nlattr.o
93 93
94obj-$(CONFIG_LRU_CACHE) += lru_cache.o
95
94obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o 96obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
95 97
96obj-$(CONFIG_GENERIC_CSUM) += checksum.o 98obj-$(CONFIG_GENERIC_CSUM) += checksum.o
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
new file mode 100644
index 000000000000..270de9d31b8c
--- /dev/null
+++ b/lib/lru_cache.c
@@ -0,0 +1,560 @@
1/*
2 lru_cache.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/bitops.h>
28#include <linux/slab.h>
29#include <linux/string.h> /* for memset */
30#include <linux/seq_file.h> /* for seq_printf */
31#include <linux/lru_cache.h>
32
33MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
34 "Lars Ellenberg <lars@linbit.com>");
35MODULE_DESCRIPTION("lru_cache - Track sets of hot objects");
36MODULE_LICENSE("GPL");
37
/*
 * Developer aid only: catches concurrent access, i.e. lack of locking on
 * the user's part.  Operates on the variable "lc" of the calling function
 * and must be paired with RETURN() on every exit path.
 */
#define PARANOIA_ENTRY()						\
	do {								\
		BUG_ON(!lc);						\
		BUG_ON(!lc->nr_elements);				\
		BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags));	\
	} while (0)

/* Drop the paranoia flag taken by PARANOIA_ENTRY(), then return x (if any) */
#define RETURN(x...)							\
	do {								\
		clear_bit(__LC_PARANOIA, &lc->flags);			\
		smp_mb__after_clear_bit();				\
		return x ;						\
	} while (0)

/* BUG() if e is not one of the elements tracked by lc */
#define PARANOIA_LC_ELEMENT(lc, e)					\
	do {								\
		struct lru_cache *lc_ = (lc);				\
		struct lc_element *e_ = (e);				\
		unsigned idx_ = e_->lc_index;				\
		BUG_ON(idx_ >= lc_->nr_elements);			\
		BUG_ON(lc_->lc_element[idx_] != e_);			\
	} while (0)
57
58/**
59 * lc_create - prepares to track objects in an active set
60 * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
61 * @e_count: number of elements allowed to be active simultaneously
62 * @e_size: size of the tracked objects
63 * @e_off: offset to the &struct lc_element member in a tracked object
64 *
65 * Returns a pointer to a newly initialized struct lru_cache on success,
66 * or NULL on (allocation) failure.
67 */
68struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
69 unsigned e_count, size_t e_size, size_t e_off)
70{
71 struct hlist_head *slot = NULL;
72 struct lc_element **element = NULL;
73 struct lru_cache *lc;
74 struct lc_element *e;
75 unsigned cache_obj_size = kmem_cache_size(cache);
76 unsigned i;
77
78 WARN_ON(cache_obj_size < e_size);
79 if (cache_obj_size < e_size)
80 return NULL;
81
82 /* e_count too big; would probably fail the allocation below anyways.
83 * for typical use cases, e_count should be few thousand at most. */
84 if (e_count > LC_MAX_ACTIVE)
85 return NULL;
86
87 slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL);
88 if (!slot)
89 goto out_fail;
90 element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL);
91 if (!element)
92 goto out_fail;
93
94 lc = kzalloc(sizeof(*lc), GFP_KERNEL);
95 if (!lc)
96 goto out_fail;
97
98 INIT_LIST_HEAD(&lc->in_use);
99 INIT_LIST_HEAD(&lc->lru);
100 INIT_LIST_HEAD(&lc->free);
101
102 lc->name = name;
103 lc->element_size = e_size;
104 lc->element_off = e_off;
105 lc->nr_elements = e_count;
106 lc->new_number = LC_FREE;
107 lc->lc_cache = cache;
108 lc->lc_element = element;
109 lc->lc_slot = slot;
110
111 /* preallocate all objects */
112 for (i = 0; i < e_count; i++) {
113 void *p = kmem_cache_alloc(cache, GFP_KERNEL);
114 if (!p)
115 break;
116 memset(p, 0, lc->element_size);
117 e = p + e_off;
118 e->lc_index = i;
119 e->lc_number = LC_FREE;
120 list_add(&e->list, &lc->free);
121 element[i] = e;
122 }
123 if (i == e_count)
124 return lc;
125
126 /* else: could not allocate all elements, give up */
127 for (i--; i; i--) {
128 void *p = element[i];
129 kmem_cache_free(cache, p - e_off);
130 }
131 kfree(lc);
132out_fail:
133 kfree(element);
134 kfree(slot);
135 return NULL;
136}
137
138void lc_free_by_index(struct lru_cache *lc, unsigned i)
139{
140 void *p = lc->lc_element[i];
141 WARN_ON(!p);
142 if (p) {
143 p -= lc->element_off;
144 kmem_cache_free(lc->lc_cache, p);
145 }
146}
147
148/**
149 * lc_destroy - frees memory allocated by lc_create()
150 * @lc: the lru cache to destroy
151 */
152void lc_destroy(struct lru_cache *lc)
153{
154 unsigned i;
155 if (!lc)
156 return;
157 for (i = 0; i < lc->nr_elements; i++)
158 lc_free_by_index(lc, i);
159 kfree(lc->lc_element);
160 kfree(lc->lc_slot);
161 kfree(lc);
162}
163
164/**
165 * lc_reset - does a full reset for @lc and the hash table slots.
166 * @lc: the lru cache to operate on
167 *
168 * It is roughly the equivalent of re-allocating a fresh lru_cache object,
169 * basically a short cut to lc_destroy(lc); lc = lc_create(...);
170 */
171void lc_reset(struct lru_cache *lc)
172{
173 unsigned i;
174
175 INIT_LIST_HEAD(&lc->in_use);
176 INIT_LIST_HEAD(&lc->lru);
177 INIT_LIST_HEAD(&lc->free);
178 lc->used = 0;
179 lc->hits = 0;
180 lc->misses = 0;
181 lc->starving = 0;
182 lc->dirty = 0;
183 lc->changed = 0;
184 lc->flags = 0;
185 lc->changing_element = NULL;
186 lc->new_number = LC_FREE;
187 memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
188
189 for (i = 0; i < lc->nr_elements; i++) {
190 struct lc_element *e = lc->lc_element[i];
191 void *p = e;
192 p -= lc->element_off;
193 memset(p, 0, lc->element_size);
194 /* re-init it */
195 e->lc_index = i;
196 e->lc_number = LC_FREE;
197 list_add(&e->list, &lc->free);
198 }
199}
200
201/**
202 * lc_seq_printf_stats - print stats about @lc into @seq
203 * @seq: the seq_file to print into
204 * @lc: the lru cache to print statistics of
205 */
206size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
207{
208 /* NOTE:
209 * total calls to lc_get are
210 * (starving + hits + misses)
211 * misses include "dirty" count (update from an other thread in
212 * progress) and "changed", when this in fact lead to an successful
213 * update of the cache.
214 */
215 return seq_printf(seq, "\t%s: used:%u/%u "
216 "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
217 lc->name, lc->used, lc->nr_elements,
218 lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
219}
220
221static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
222{
223 return lc->lc_slot + (enr % lc->nr_elements);
224}
225
226
227/**
228 * lc_find - find element by label, if present in the hash table
229 * @lc: The lru_cache object
230 * @enr: element number
231 *
232 * Returns the pointer to an element, if the element with the requested
233 * "label" or element number is present in the hash table,
234 * or NULL if not found. Does not change the refcnt.
235 */
236struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
237{
238 struct hlist_node *n;
239 struct lc_element *e;
240
241 BUG_ON(!lc);
242 BUG_ON(!lc->nr_elements);
243 hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
244 if (e->lc_number == enr)
245 return e;
246 }
247 return NULL;
248}
249
250/* returned element will be "recycled" immediately */
251static struct lc_element *lc_evict(struct lru_cache *lc)
252{
253 struct list_head *n;
254 struct lc_element *e;
255
256 if (list_empty(&lc->lru))
257 return NULL;
258
259 n = lc->lru.prev;
260 e = list_entry(n, struct lc_element, list);
261
262 PARANOIA_LC_ELEMENT(lc, e);
263
264 list_del(&e->list);
265 hlist_del(&e->colision);
266 return e;
267}
268
269/**
270 * lc_del - removes an element from the cache
271 * @lc: The lru_cache object
272 * @e: The element to remove
273 *
274 * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list,
275 * sets @e->enr to %LC_FREE.
276 */
277void lc_del(struct lru_cache *lc, struct lc_element *e)
278{
279 PARANOIA_ENTRY();
280 PARANOIA_LC_ELEMENT(lc, e);
281 BUG_ON(e->refcnt);
282
283 e->lc_number = LC_FREE;
284 hlist_del_init(&e->colision);
285 list_move(&e->list, &lc->free);
286 RETURN();
287}
288
289static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
290{
291 struct list_head *n;
292
293 if (list_empty(&lc->free))
294 return lc_evict(lc);
295
296 n = lc->free.next;
297 list_del(n);
298 return list_entry(n, struct lc_element, list);
299}
300
301static int lc_unused_element_available(struct lru_cache *lc)
302{
303 if (!list_empty(&lc->free))
304 return 1; /* something on the free list */
305 if (!list_empty(&lc->lru))
306 return 1; /* something to evict */
307
308 return 0;
309}
310
311
312/**
313 * lc_get - get element by label, maybe change the active set
314 * @lc: the lru cache to operate on
315 * @enr: the label to look up
316 *
317 * Finds an element in the cache, increases its usage count,
318 * "touches" and returns it.
319 *
320 * In case the requested number is not present, it needs to be added to the
321 * cache. Therefore it is possible that an other element becomes evicted from
322 * the cache. In either case, the user is notified so he is able to e.g. keep
323 * a persistent log of the cache changes, and therefore the objects in use.
324 *
325 * Return values:
326 * NULL
327 * The cache was marked %LC_STARVING,
328 * or the requested label was not in the active set
329 * and a changing transaction is still pending (@lc was marked %LC_DIRTY).
330 * Or no unused or free element could be recycled (@lc will be marked as
331 * %LC_STARVING, blocking further lc_get() operations).
332 *
333 * pointer to the element with the REQUESTED element number.
334 * In this case, it can be used right away
335 *
336 * pointer to an UNUSED element with some different element number,
337 * where that different number may also be %LC_FREE.
338 *
339 * In this case, the cache is marked %LC_DIRTY (blocking further changes),
340 * and the returned element pointer is removed from the lru list and
341 * hash collision chains. The user now should do whatever housekeeping
342 * is necessary.
343 * Then he must call lc_changed(lc,element_pointer), to finish
344 * the change.
345 *
346 * NOTE: The user needs to check the lc_number on EACH use, so he recognizes
347 * any cache set change.
348 */
349struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
350{
351 struct lc_element *e;
352
353 PARANOIA_ENTRY();
354 if (lc->flags & LC_STARVING) {
355 ++lc->starving;
356 RETURN(NULL);
357 }
358
359 e = lc_find(lc, enr);
360 if (e) {
361 ++lc->hits;
362 if (e->refcnt++ == 0)
363 lc->used++;
364 list_move(&e->list, &lc->in_use); /* Not evictable... */
365 RETURN(e);
366 }
367
368 ++lc->misses;
369
370 /* In case there is nothing available and we can not kick out
371 * the LRU element, we have to wait ...
372 */
373 if (!lc_unused_element_available(lc)) {
374 __set_bit(__LC_STARVING, &lc->flags);
375 RETURN(NULL);
376 }
377
378 /* it was not present in the active set.
379 * we are going to recycle an unused (or even "free") element.
380 * user may need to commit a transaction to record that change.
381 * we serialize on flags & TF_DIRTY */
382 if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
383 ++lc->dirty;
384 RETURN(NULL);
385 }
386
387 e = lc_get_unused_element(lc);
388 BUG_ON(!e);
389
390 clear_bit(__LC_STARVING, &lc->flags);
391 BUG_ON(++e->refcnt != 1);
392 lc->used++;
393
394 lc->changing_element = e;
395 lc->new_number = enr;
396
397 RETURN(e);
398}
399
400/* similar to lc_get,
401 * but only gets a new reference on an existing element.
402 * you either get the requested element, or NULL.
403 * will be consolidated into one function.
404 */
405struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
406{
407 struct lc_element *e;
408
409 PARANOIA_ENTRY();
410 if (lc->flags & LC_STARVING) {
411 ++lc->starving;
412 RETURN(NULL);
413 }
414
415 e = lc_find(lc, enr);
416 if (e) {
417 ++lc->hits;
418 if (e->refcnt++ == 0)
419 lc->used++;
420 list_move(&e->list, &lc->in_use); /* Not evictable... */
421 }
422 RETURN(e);
423}
424
425/**
426 * lc_changed - tell @lc that the change has been recorded
427 * @lc: the lru cache to operate on
428 * @e: the element pending label change
429 */
430void lc_changed(struct lru_cache *lc, struct lc_element *e)
431{
432 PARANOIA_ENTRY();
433 BUG_ON(e != lc->changing_element);
434 PARANOIA_LC_ELEMENT(lc, e);
435 ++lc->changed;
436 e->lc_number = lc->new_number;
437 list_add(&e->list, &lc->in_use);
438 hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
439 lc->changing_element = NULL;
440 lc->new_number = LC_FREE;
441 clear_bit(__LC_DIRTY, &lc->flags);
442 smp_mb__after_clear_bit();
443 RETURN();
444}
445
446
447/**
448 * lc_put - give up refcnt of @e
449 * @lc: the lru cache to operate on
450 * @e: the element to put
451 *
452 * If refcnt reaches zero, the element is moved to the lru list,
453 * and a %LC_STARVING (if set) is cleared.
454 * Returns the new (post-decrement) refcnt.
455 */
456unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
457{
458 PARANOIA_ENTRY();
459 PARANOIA_LC_ELEMENT(lc, e);
460 BUG_ON(e->refcnt == 0);
461 BUG_ON(e == lc->changing_element);
462 if (--e->refcnt == 0) {
463 /* move it to the front of LRU. */
464 list_move(&e->list, &lc->lru);
465 lc->used--;
466 clear_bit(__LC_STARVING, &lc->flags);
467 smp_mb__after_clear_bit();
468 }
469 RETURN(e->refcnt);
470}
471
472/**
473 * lc_element_by_index
474 * @lc: the lru cache to operate on
475 * @i: the index of the element to return
476 */
477struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
478{
479 BUG_ON(i >= lc->nr_elements);
480 BUG_ON(lc->lc_element[i] == NULL);
481 BUG_ON(lc->lc_element[i]->lc_index != i);
482 return lc->lc_element[i];
483}
484
485/**
486 * lc_index_of
487 * @lc: the lru cache to operate on
488 * @e: the element to query for its index position in lc->element
489 */
490unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
491{
492 PARANOIA_LC_ELEMENT(lc, e);
493 return e->lc_index;
494}
495
496/**
497 * lc_set - associate index with label
498 * @lc: the lru cache to operate on
499 * @enr: the label to set
500 * @index: the element index to associate label with.
501 *
502 * Used to initialize the active set to some previously recorded state.
503 */
504void lc_set(struct lru_cache *lc, unsigned int enr, int index)
505{
506 struct lc_element *e;
507
508 if (index < 0 || index >= lc->nr_elements)
509 return;
510
511 e = lc_element_by_index(lc, index);
512 e->lc_number = enr;
513
514 hlist_del_init(&e->colision);
515 hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
516 list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
517}
518
519/**
520 * lc_dump - Dump a complete LRU cache to seq in textual form.
521 * @lc: the lru cache to operate on
522 * @seq: the &struct seq_file pointer to seq_printf into
523 * @utext: user supplied "heading" or other info
524 * @detail: function pointer the user may provide to dump further details
525 * of the object the lc_element is embedded in.
526 */
527void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
528 void (*detail) (struct seq_file *, struct lc_element *))
529{
530 unsigned int nr_elements = lc->nr_elements;
531 struct lc_element *e;
532 int i;
533
534 seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
535 for (i = 0; i < nr_elements; i++) {
536 e = lc_element_by_index(lc, i);
537 if (e->lc_number == LC_FREE) {
538 seq_printf(seq, "\t%2d: FREE\n", i);
539 } else {
540 seq_printf(seq, "\t%2d: %4u %4u ", i,
541 e->lc_number, e->refcnt);
542 detail(seq, e);
543 }
544 }
545}
546
/*
 * Symbols exported for use outside of this file.
 * The grouping below is by name only and is meant as a reading aid.
 */

/* creating, resetting, destroying a cache */
EXPORT_SYMBOL(lc_create);
EXPORT_SYMBOL(lc_reset);
EXPORT_SYMBOL(lc_destroy);

/* looking up elements and handling their references */
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_changed);
EXPORT_SYMBOL(lc_put);

/* manipulating single elements */
EXPORT_SYMBOL(lc_set);
EXPORT_SYMBOL(lc_del);

/* translating between elements and indices */
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);

/* seq_file output */
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);