aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPhilipp Reisner <philipp.reisner@linbit.com>2009-11-03 04:59:10 -0500
committerPhilipp Reisner <philipp.reisner@linbit.com>2009-11-03 04:59:10 -0500
commit59131d8e0ae91f2e94909e0795923c4c7ee7eb8c (patch)
tree118e958961f1a5c2b64783ef239540397c176831
parent012abeea669ea49636cf952d13298bb68654146a (diff)
parent4f570f995f68ef77aae7e5a441222f59232f2d0e (diff)
Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block into for-2.6.33
-rw-r--r--Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg588
-rw-r--r--Documentation/blockdev/drbd/DRBD-data-packets.svg459
-rw-r--r--Documentation/blockdev/drbd/README.txt16
-rw-r--r--Documentation/blockdev/drbd/conn-states-8.dot18
-rw-r--r--Documentation/blockdev/drbd/disk-states-8.dot16
-rw-r--r--Documentation/blockdev/drbd/drbd-connection-state-overview.dot85
-rw-r--r--Documentation/blockdev/drbd/node-states-8.dot14
-rw-r--r--MAINTAINERS13
-rw-r--r--block/Kconfig.iosched26
-rw-r--r--block/Makefile1
-rw-r--r--block/as-iosched.c1520
-rw-r--r--block/cfq-iosched.c262
-rw-r--r--block/elevator.c10
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/drbd/Kconfig71
-rw-r--r--drivers/block/drbd/Makefile5
-rw-r--r--drivers/block/drbd/drbd_actlog.c1424
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1327
-rw-r--r--drivers/block/drbd/drbd_int.h2252
-rw-r--r--drivers/block/drbd/drbd_main.c3700
-rw-r--r--drivers/block/drbd/drbd_nl.c2360
-rw-r--r--drivers/block/drbd/drbd_proc.c265
-rw-r--r--drivers/block/drbd/drbd_receiver.c4427
-rw-r--r--drivers/block/drbd/drbd_req.c1120
-rw-r--r--drivers/block/drbd/drbd_req.h326
-rw-r--r--drivers/block/drbd/drbd_strings.c113
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c1512
-rw-r--r--drivers/block/drbd/drbd_wrappers.h91
-rw-r--r--fs/aio.c62
-rw-r--r--fs/block_dev.c12
-rw-r--r--fs/direct-io.c10
-rw-r--r--include/linux/backing-dev.h13
-rw-r--r--include/linux/bio.h8
-rw-r--r--include/linux/blkdev.h13
-rw-r--r--include/linux/drbd.h350
-rw-r--r--include/linux/drbd_limits.h137
-rw-r--r--include/linux/drbd_nl.h137
-rw-r--r--include/linux/drbd_tag_magic.h83
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/iocontext.h5
-rw-r--r--include/linux/lru_cache.h294
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile2
-rw-r--r--lib/lru_cache.c560
46 files changed, 22432 insertions, 1634 deletions
diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
new file mode 100644
index 000000000000..f87cfa0dc2fb
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
@@ -0,0 +1,588 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3<svg
4 xmlns:svg="http://www.w3.org/2000/svg"
5 xmlns="http://www.w3.org/2000/svg"
6 version="1.0"
7 width="210mm"
8 height="297mm"
9 viewBox="0 0 21000 29700"
10 id="svg2"
11 style="fill-rule:evenodd">
12 <defs
13 id="defs4" />
14 <g
15 id="Default"
16 style="visibility:visible">
17 <desc
18 id="desc180">Master slide</desc>
19 </g>
20 <path
21 d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
22 id="path193"
23 style="fill:#008000;visibility:visible" />
24 <path
25 d="M 11999,7801 L 11999,8361"
26 id="path197"
27 style="fill:none;stroke:#008000;visibility:visible" />
28 <path
29 d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
30 id="path209"
31 style="fill:#008000;visibility:visible" />
32 <path
33 d="M 7999,9601 L 7999,10161"
34 id="path213"
35 style="fill:none;stroke:#008000;visibility:visible" />
36 <path
37 d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
38 id="path225"
39 style="fill:#008000;visibility:visible" />
40 <path
41 d="M 7999,7001 L 11764,7754"
42 id="path229"
43 style="fill:none;stroke:#008000;visibility:visible" />
44 <g
45 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
46 id="g245"
47 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
48 <text
49 id="text247">
50 <tspan
51 x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
52 y="9284"
53 id="tspan249">RSDataReply</tspan>
54 </text>
55 </g>
56 <path
57 d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
58 id="path259"
59 style="fill:#008000;visibility:visible" />
60 <path
61 d="M 11999,9001 L 8236,9565"
62 id="path263"
63 style="fill:none;stroke:#008000;visibility:visible" />
64 <g
65 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
66 id="g279"
67 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
68 <text
69 id="text281">
70 <tspan
71 x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
72 y="7023"
73 id="tspan283">CsumRSRequest</tspan>
74 </text>
75 </g>
76 <text
77 id="text297"
78 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
79 <tspan
80 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
81 y="5707"
82 id="tspan299">w_make_resync_request()</tspan>
83 </text>
84 <text
85 id="text313"
86 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
87 <tspan
88 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
89 y="7806"
90 id="tspan315">receive_DataRequest()</tspan>
91 </text>
92 <text
93 id="text329"
94 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
95 <tspan
96 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
97 y="8606"
98 id="tspan331">drbd_endio_read_sec()</tspan>
99 </text>
100 <text
101 id="text345"
102 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
103 <tspan
104 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
105 y="9007"
106 id="tspan347">w_e_end_csum_rs_req()</tspan>
107 </text>
108 <text
109 id="text361"
110 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
111 <tspan
112 x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
113 y="9507"
114 id="tspan363">receive_RSDataReply()</tspan>
115 </text>
116 <text
117 id="text377"
118 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
119 <tspan
120 x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
121 y="10407"
122 id="tspan379">drbd_endio_write_sec()</tspan>
123 </text>
124 <text
125 id="text393"
126 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
127 <tspan
128 x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
129 y="10907"
130 id="tspan395">e_end_resync_block()</tspan>
131 </text>
132 <path
133 d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
134 id="path405"
135 style="fill:#000080;visibility:visible" />
136 <path
137 d="M 7999,10801 L 11764,11554"
138 id="path409"
139 style="fill:none;stroke:#000080;visibility:visible" />
140 <g
141 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
142 id="g425"
143 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
144 <text
145 id="text427">
146 <tspan
147 x="9320 9621 9726 9798 9887 10065 10277 10438"
148 y="10943"
149 id="tspan429">WriteAck</tspan>
150 </text>
151 </g>
152 <text
153 id="text443"
154 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
155 <tspan
156 x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
157 y="11559"
158 id="tspan445">got_BlockAck()</tspan>
159 </text>
160 <text
161 id="text459"
162 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
163 <tspan
164 x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
165 y="4877"
166 id="tspan461">Checksum based Resync, case not in sync</tspan>
167 </text>
168 <text
169 id="text475"
170 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
171 <tspan
172 x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
173 y="2806"
174 id="tspan477">DRBD-8.3 data flow</tspan>
175 </text>
176 <text
177 id="text491"
178 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
179 <tspan
180 x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
181 y="7005"
182 id="tspan493">w_e_send_csum()</tspan>
183 </text>
184 <path
185 d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
186 id="path503"
187 style="fill:#008000;visibility:visible" />
188 <path
189 d="M 11999,16801 L 11999,17361"
190 id="path507"
191 style="fill:none;stroke:#008000;visibility:visible" />
192 <path
193 d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
194 id="path519"
195 style="fill:#008000;visibility:visible" />
196 <path
197 d="M 7999,16001 L 11764,16754"
198 id="path523"
199 style="fill:none;stroke:#008000;visibility:visible" />
200 <g
201 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
202 id="g539"
203 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
204 <text
205 id="text541">
206 <tspan
207 x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
208 y="18265"
209 id="tspan543">RSIsInSync</tspan>
210 </text>
211 </g>
212 <path
213 d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
214 id="path553"
215 style="fill:#000080;visibility:visible" />
216 <path
217 d="M 11999,18001 L 8236,18565"
218 id="path557"
219 style="fill:none;stroke:#000080;visibility:visible" />
220 <g
221 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
222 id="g573"
223 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
224 <text
225 id="text575">
226 <tspan
227 x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
228 y="16023"
229 id="tspan577">CsumRSRequest</tspan>
230 </text>
231 </g>
232 <text
233 id="text591"
234 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
235 <tspan
236 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
237 y="16806"
238 id="tspan593">receive_DataRequest()</tspan>
239 </text>
240 <text
241 id="text607"
242 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
243 <tspan
244 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
245 y="17606"
246 id="tspan609">drbd_endio_read_sec()</tspan>
247 </text>
248 <text
249 id="text623"
250 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
251 <tspan
252 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
253 y="18007"
254 id="tspan625">w_e_end_csum_rs_req()</tspan>
255 </text>
256 <text
257 id="text639"
258 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
259 <tspan
260 x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
261 y="18507"
262 id="tspan641">got_IsInSync()</tspan>
263 </text>
264 <text
265 id="text655"
266 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
267 <tspan
268 x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
269 y="13877"
270 id="tspan657">Checksum based Resync, case in sync</tspan>
271 </text>
272 <path
273 d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
274 id="path667"
275 style="fill:#008000;visibility:visible" />
276 <path
277 d="M 12000,23801 L 12000,24361"
278 id="path671"
279 style="fill:none;stroke:#008000;visibility:visible" />
280 <path
281 d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
282 id="path683"
283 style="fill:#008000;visibility:visible" />
284 <path
285 d="M 8000,25601 L 8000,26161"
286 id="path687"
287 style="fill:none;stroke:#008000;visibility:visible" />
288 <path
289 d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
290 id="path699"
291 style="fill:#008000;visibility:visible" />
292 <path
293 d="M 8000,23001 L 11765,23754"
294 id="path703"
295 style="fill:none;stroke:#008000;visibility:visible" />
296 <g
297 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
298 id="g719"
299 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
300 <text
301 id="text721">
302 <tspan
303 x="9464 9710 9921 10150 10328 10505 10577"
304 y="25236"
305 id="tspan723">OVReply</tspan>
306 </text>
307 </g>
308 <path
309 d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
310 id="path733"
311 style="fill:#008000;visibility:visible" />
312 <path
313 d="M 12000,25001 L 8237,25565"
314 id="path737"
315 style="fill:none;stroke:#008000;visibility:visible" />
316 <g
317 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
318 id="g753"
319 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
320 <text
321 id="text755">
322 <tspan
323 x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
324 y="23106"
325 id="tspan757">OVRequest</tspan>
326 </text>
327 </g>
328 <text
329 id="text771"
330 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
331 <tspan
332 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
333 y="23806"
334 id="tspan773">receive_OVRequest()</tspan>
335 </text>
336 <text
337 id="text787"
338 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
339 <tspan
340 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
341 y="24606"
342 id="tspan789">drbd_endio_read_sec()</tspan>
343 </text>
344 <text
345 id="text803"
346 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
347 <tspan
348 x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
349 y="25007"
350 id="tspan805">w_e_end_ov_req()</tspan>
351 </text>
352 <text
353 id="text819"
354 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
355 <tspan
356 x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
357 y="25507"
358 id="tspan821">receive_OVReply()</tspan>
359 </text>
360 <text
361 id="text835"
362 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
363 <tspan
364 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
365 y="26407"
366 id="tspan837">drbd_endio_read_sec()</tspan>
367 </text>
368 <text
369 id="text851"
370 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
371 <tspan
372 x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
373 y="26907"
374 id="tspan853">w_e_end_ov_reply()</tspan>
375 </text>
376 <path
377 d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
378 id="path863"
379 style="fill:#000080;visibility:visible" />
380 <path
381 d="M 8000,26801 L 11765,27554"
382 id="path867"
383 style="fill:none;stroke:#000080;visibility:visible" />
384 <g
385 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
386 id="g883"
387 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
388 <text
389 id="text885">
390 <tspan
391 x="9279 9525 9736 9965 10143 10303 10481 10553"
392 y="26935"
393 id="tspan887">OVResult</tspan>
394 </text>
395 </g>
396 <text
397 id="text901"
398 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
399 <tspan
400 x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
401 y="27559"
402 id="tspan903">got_OVResult()</tspan>
403 </text>
404 <text
405 id="text917"
406 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
407 <tspan
408 x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
409 y="21877"
410 id="tspan919">Online verify</tspan>
411 </text>
412 <text
413 id="text933"
414 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
415 <tspan
416 x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
417 y="23005"
418 id="tspan935">w_make_ov_request()</tspan>
419 </text>
420 <path
421 d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
422 id="path945"
423 style="fill:#008000;visibility:visible" />
424 <path
425 d="M 8000,5700 L 8000,6260"
426 id="path949"
427 style="fill:none;stroke:#008000;visibility:visible" />
428 <path
429 d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
430 id="path961"
431 style="fill:none;stroke:#000000;visibility:visible" />
432 <path
433 d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
434 id="path973"
435 style="fill:none;stroke:#000000;visibility:visible" />
436 <path
437 d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
438 id="path985"
439 style="fill:none;stroke:#000000;visibility:visible" />
440 <text
441 id="text1001"
442 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
443 <tspan
444 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
445 y="6506"
446 id="tspan1003">drbd_endio_read_sec()</tspan>
447 </text>
448 <text
449 id="text1017"
450 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
451 <tspan
452 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
453 y="14708"
454 id="tspan1019">w_make_resync_request()</tspan>
455 </text>
456 <text
457 id="text1033"
458 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
459 <tspan
460 x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
461 y="16006"
462 id="tspan1035">w_e_send_csum()</tspan>
463 </text>
464 <path
465 d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
466 id="path1045"
467 style="fill:#008000;visibility:visible" />
468 <path
469 d="M 8000,14701 L 8000,15261"
470 id="path1049"
471 style="fill:none;stroke:#008000;visibility:visible" />
472 <text
473 id="text1065"
474 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
475 <tspan
476 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
477 y="15507"
478 id="tspan1067">drbd_endio_read_sec()</tspan>
479 </text>
480 <path
481 d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
482 id="path1077"
483 style="fill:none;stroke:#000000;visibility:visible" />
484 <path
485 d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
486 id="path1089"
487 style="fill:none;stroke:#000000;visibility:visible" />
488 <path
489 d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
490 id="path1101"
491 style="fill:none;stroke:#000000;visibility:visible" />
492 <text
493 id="text1117"
494 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
495 <tspan
496 x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
497 y="5402"
498 id="tspan1119">rs_begin_io()</tspan>
499 </text>
500 <text
501 id="text1133"
502 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
503 <tspan
504 x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
505 y="14402"
506 id="tspan1135">rs_begin_io()</tspan>
507 </text>
508 <text
509 id="text1149"
510 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
511 <tspan
512 x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
513 y="22602"
514 id="tspan1151">rs_begin_io()</tspan>
515 </text>
516 <text
517 id="text1165"
518 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
519 <tspan
520 x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
521 y="11302"
522 id="tspan1167">rs_complete_io()</tspan>
523 </text>
524 <text
525 id="text1181"
526 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
527 <tspan
528 x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
529 y="18931"
530 id="tspan1183">rs_complete_io()</tspan>
531 </text>
532 <text
533 id="text1197"
534 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
535 <tspan
536 x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
537 y="27231"
538 id="tspan1199">rs_complete_io()</tspan>
539 </text>
540 <text
541 id="text1213"
542 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
543 <tspan
544 x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
545 y="7402"
546 id="tspan1215">rs_begin_io()</tspan>
547 </text>
548 <text
549 id="text1229"
550 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
551 <tspan
552 x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
553 y="16331"
554 id="tspan1231">rs_begin_io()</tspan>
555 </text>
556 <text
557 id="text1245"
558 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
559 <tspan
560 x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
561 y="23302"
562 id="tspan1247">rs_begin_io()</tspan>
563 </text>
564 <text
565 id="text1261"
566 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
567 <tspan
568 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
569 y="9302"
570 id="tspan1263">rs_complete_io()</tspan>
571 </text>
572 <text
573 id="text1277"
574 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
575 <tspan
576 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
577 y="18331"
578 id="tspan1279">rs_complete_io()</tspan>
579 </text>
580 <text
581 id="text1293"
582 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
583 <tspan
584 x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
585 y="25302"
586 id="tspan1295">rs_complete_io()</tspan>
587 </text>
588</svg>
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg
new file mode 100644
index 000000000000..48a1e2165fec
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-data-packets.svg
@@ -0,0 +1,459 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3<svg
4 xmlns:svg="http://www.w3.org/2000/svg"
5 xmlns="http://www.w3.org/2000/svg"
6 version="1.0"
7 width="210mm"
8 height="297mm"
9 viewBox="0 0 21000 29700"
10 id="svg2"
11 style="fill-rule:evenodd">
12 <defs
13 id="defs4" />
14 <g
15 id="Default"
16 style="visibility:visible">
17 <desc
18 id="desc176">Master slide</desc>
19 </g>
20 <path
21 d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
22 id="path189"
23 style="fill:#008000;visibility:visible" />
24 <path
25 d="M 11999,18801 L 11999,19361"
26 id="path193"
27 style="fill:none;stroke:#008000;visibility:visible" />
28 <path
29 d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
30 id="path205"
31 style="fill:#008000;visibility:visible" />
32 <path
33 d="M 7999,20601 L 7999,21161"
34 id="path209"
35 style="fill:none;stroke:#008000;visibility:visible" />
36 <path
37 d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
38 id="path221"
39 style="fill:#008000;visibility:visible" />
40 <path
41 d="M 7999,18001 L 11764,18754"
42 id="path225"
43 style="fill:none;stroke:#008000;visibility:visible" />
44 <text
45 x="-3023.845"
46 y="1106.8124"
47 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
48 id="text243"
49 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
50 <tspan
51 x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
52 y="21390.812"
53 id="tspan245">RSDataReply</tspan>
54 </text>
55 <path
56 d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
57 id="path255"
58 style="fill:#008000;visibility:visible" />
59 <path
60 d="M 11999,20001 L 8236,20565"
61 id="path259"
62 style="fill:none;stroke:#008000;visibility:visible" />
63 <text
64 x="3502.5356"
65 y="-2184.6621"
66 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
67 id="text277"
68 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
69 <tspan
70 x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
71 y="15854.338"
72 id="tspan279">RSDataRequest</tspan>
73 </text>
74 <text
75 id="text293"
76 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
77 <tspan
78 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
79 y="17807"
80 id="tspan295">w_make_resync_request()</tspan>
81 </text>
82 <text
83 id="text309"
84 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
85 <tspan
86 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
87 y="18806"
88 id="tspan311">receive_DataRequest()</tspan>
89 </text>
90 <text
91 id="text325"
92 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
93 <tspan
94 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
95 y="19606"
96 id="tspan327">drbd_endio_read_sec()</tspan>
97 </text>
98 <text
99 id="text341"
100 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
101 <tspan
102 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
103 y="20007"
104 id="tspan343">w_e_end_rsdata_req()</tspan>
105 </text>
106 <text
107 id="text357"
108 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
109 <tspan
110 x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
111 y="20507"
112 id="tspan359">receive_RSDataReply()</tspan>
113 </text>
114 <text
115 id="text373"
116 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
117 <tspan
118 x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
119 y="21407"
120 id="tspan375">drbd_endio_write_sec()</tspan>
121 </text>
122 <text
123 id="text389"
124 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
125 <tspan
126 x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
127 y="21907"
128 id="tspan391">e_end_resync_block()</tspan>
129 </text>
130 <path
131 d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
132 id="path401"
133 style="fill:#000080;visibility:visible" />
134 <path
135 d="M 7999,21801 L 11764,22554"
136 id="path405"
137 style="fill:none;stroke:#000080;visibility:visible" />
138 <text
139 x="4290.3008"
140 y="-2369.6162"
141 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
142 id="text423"
143 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
144 <tspan
145 x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
146 y="19573.385"
147 id="tspan425">WriteAck</tspan>
148 </text>
149 <text
150 id="text439"
151 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
152 <tspan
153 x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
154 y="22559"
155 id="tspan441">got_BlockAck()</tspan>
156 </text>
157 <text
158 id="text455"
159 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
160 <tspan
161 x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
162 y="16877"
163 id="tspan457">Resync blocks, 4-32K</tspan>
164 </text>
165 <path
166 d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
167 id="path467"
168 style="fill:#008000;visibility:visible" />
169 <path
170 d="M 12000,6801 L 12000,7361"
171 id="path471"
172 style="fill:none;stroke:#008000;visibility:visible" />
173 <path
174 d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
175 id="path483"
176 style="fill:#008000;visibility:visible" />
177 <path
178 d="M 8000,6001 L 11765,6754"
179 id="path487"
180 style="fill:none;stroke:#008000;visibility:visible" />
181 <text
182 x="-1288.1796"
183 y="1279.7666"
184 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
185 id="text505"
186 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
187 <tspan
188 x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
189 y="9516.7666"
190 id="tspan507">WriteAck</tspan>
191 </text>
192 <path
193 d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
194 id="path517"
195 style="fill:#000080;visibility:visible" />
196 <path
197 d="M 12000,8001 L 8237,8565"
198 id="path521"
199 style="fill:none;stroke:#000080;visibility:visible" />
200 <text
201 x="1065.6655"
202 y="-2097.7664"
203 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
204 id="text539"
205 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
206 <tspan
207 x="10682.666 10911.666 11088.666 11177.666"
208 y="4107.2339"
209 id="tspan541">Data</tspan>
210 </text>
211 <text
212 id="text555"
213 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
214 <tspan
215 x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
216 y="5505"
217 id="tspan557">drbd_make_request()</tspan>
218 </text>
219 <text
220 id="text571"
221 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
222 <tspan
223 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
224 y="6806"
225 id="tspan573">receive_Data()</tspan>
226 </text>
227 <text
228 id="text587"
229 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
230 <tspan
231 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
232 y="7606"
233 id="tspan589">drbd_endio_write_sec()</tspan>
234 </text>
235 <text
236 id="text603"
237 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
238 <tspan
239 x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
240 y="8007"
241 id="tspan605">e_end_block()</tspan>
242 </text>
243 <text
244 id="text619"
245 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
246 <tspan
247 x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
248 y="8606"
249 id="tspan621">got_BlockAck()</tspan>
250 </text>
251 <text
252 id="text635"
253 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
254 <tspan
255 x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
256 y="4877"
257 id="tspan637">Regular mirrored write, 512-32K</tspan>
258 </text>
259 <text
260 id="text651"
261 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
262 <tspan
263 x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
264 y="6003"
265 id="tspan653">w_send_dblock()</tspan>
266 </text>
267 <path
268 d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
269 id="path663"
270 style="fill:#008000;visibility:visible" />
271 <path
272 d="M 8000,6000 L 8000,6560"
273 id="path667"
274 style="fill:none;stroke:#008000;visibility:visible" />
275 <text
276 id="text683"
277 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
278 <tspan
279 x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
280 y="6905"
281 id="tspan685">drbd_endio_write_pri()</tspan>
282 </text>
283 <path
284 d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
285 id="path695"
286 style="fill:#008000;visibility:visible" />
287 <path
288 d="M 12000,12802 L 12000,13362"
289 id="path699"
290 style="fill:none;stroke:#008000;visibility:visible" />
291 <path
292 d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
293 id="path711"
294 style="fill:#008000;visibility:visible" />
295 <path
296 d="M 8000,12002 L 11765,12755"
297 id="path715"
298 style="fill:none;stroke:#008000;visibility:visible" />
299 <text
300 x="-2155.5266"
301 y="1201.5964"
302 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
303 id="text733"
304 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
305 <tspan
306 x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
307 y="15454.597"
308 id="tspan735">DataReply</tspan>
309 </text>
310 <path
311 d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
312 id="path745"
313 style="fill:#008000;visibility:visible" />
314 <path
315 d="M 12000,14002 L 8237,14566"
316 id="path749"
317 style="fill:none;stroke:#008000;visibility:visible" />
318 <text
319 x="2280.3804"
320 y="-2103.2141"
321 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
322 id="text767"
323 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
324 <tspan
325 x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
326 y="9981.7861"
327 id="tspan769">DataRequest</tspan>
328 </text>
329 <text
330 id="text783"
331 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
332 <tspan
333 x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
334 y="11506"
335 id="tspan785">drbd_make_request()</tspan>
336 </text>
337 <text
338 id="text799"
339 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
340 <tspan
341 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
342 y="12807"
343 id="tspan801">receive_DataRequest()</tspan>
344 </text>
345 <text
346 id="text815"
347 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
348 <tspan
349 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
350 y="13607"
351 id="tspan817">drbd_endio_read_sec()</tspan>
352 </text>
353 <text
354 id="text831"
355 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
356 <tspan
357 x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
358 y="14008"
359 id="tspan833">w_e_end_data_req()</tspan>
360 </text>
361 <g
362 id="g835"
363 style="visibility:visible">
364 <desc
365 id="desc837">Drawing</desc>
366 <text
367 id="text847"
368 style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
369 <tspan
370 x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
371 y="14607"
372 id="tspan849">receive_DataReply()</tspan>
373 </text>
374 </g>
375 <text
376 id="text863"
377 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
378 <tspan
379 x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
380 y="10878"
381 id="tspan865">Diskless read, 512-32K</tspan>
382 </text>
383 <text
384 id="text879"
385 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
386 <tspan
387 x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
388 y="12004"
389 id="tspan881">w_send_read_req()</tspan>
390 </text>
391 <text
392 id="text895"
393 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
394 <tspan
395 x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
396 y="2806"
397 id="tspan897">DRBD 8 data flow</tspan>
398 </text>
399 <path
400 d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
401 id="path907"
402 style="fill:none;stroke:#000000;visibility:visible" />
403 <path
404 d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
405 id="path919"
406 style="fill:none;stroke:#000000;visibility:visible" />
407 <path
408 d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
409 id="path931"
410 style="fill:none;stroke:#000000;visibility:visible" />
411 <text
412 id="text947"
413 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
414 <tspan
415 x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
416 y="5202"
417 id="tspan949">al_begin_io()</tspan>
418 </text>
419 <text
420 id="text963"
421 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
422 <tspan
423 x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
424 y="7331"
425 id="tspan965">al_complete_io()</tspan>
426 </text>
427 <text
428 id="text979"
429 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
430 <tspan
431 x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
432 y="17431"
433 id="tspan981">rs_begin_io()</tspan>
434 </text>
435 <text
436 id="text995"
437 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
438 <tspan
439 x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
440 y="22331"
441 id="tspan997">rs_complete_io()</tspan>
442 </text>
443 <text
444 id="text1011"
445 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
446 <tspan
447 x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
448 y="18402"
449 id="tspan1013">rs_begin_io()</tspan>
450 </text>
451 <text
452 id="text1027"
453 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
454 <tspan
455 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
456 y="20331"
457 id="tspan1029">rs_complete_io()</tspan>
458 </text>
459</svg>
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
new file mode 100644
index 000000000000..627b0a1bf35e
--- /dev/null
+++ b/Documentation/blockdev/drbd/README.txt
@@ -0,0 +1,16 @@
1Description
2
3 DRBD is a shared-nothing, synchronously replicated block device. It
4 is designed to serve as a building block for high availability
5 clusters and in this context, is a "drop-in" replacement for shared
6 storage. Simplistically, you could see it as a network RAID 1.
7
8 Please visit http://www.drbd.org to find out more.
9
10The files included here are intended to help understand the implementation
11
12DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
13 relate some functions and write packets to each other.
14
15conn-states-8.dot, disk-states-8.dot, node-states-8.dot
16 The subgraphs of DRBD's state transitions
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot
new file mode 100644
index 000000000000..025e8cf5e64a
--- /dev/null
+++ b/Documentation/blockdev/drbd/conn-states-8.dot
@@ -0,0 +1,18 @@
1digraph conn_states {
2 StandAllone -> WFConnection [ label = "ioctl_set_net()" ]
3 WFConnection -> Unconnected [ label = "unable to bind()" ]
4 WFConnection -> WFReportParams [ label = "in connect() after accept" ]
5 WFReportParams -> StandAllone [ label = "checks in receive_param()" ]
6 WFReportParams -> Connected [ label = "in receive_param()" ]
7 WFReportParams -> WFBitMapS [ label = "sync_handshake()" ]
8 WFReportParams -> WFBitMapT [ label = "sync_handshake()" ]
9 WFBitMapS -> SyncSource [ label = "receive_bitmap()" ]
10 WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ]
11 SyncSource -> Connected
12 SyncTarget -> Connected
13 SyncSource -> PausedSyncS
14 SyncTarget -> PausedSyncT
15 PausedSyncS -> SyncSource
16 PausedSyncT -> SyncTarget
17 Connected -> WFConnection [ label = "* on network error" ]
18}
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot
new file mode 100644
index 000000000000..d06cfb46fb98
--- /dev/null
+++ b/Documentation/blockdev/drbd/disk-states-8.dot
@@ -0,0 +1,16 @@
1digraph disk_states {
2 Diskless -> Inconsistent [ label = "ioctl_set_disk()" ]
3 Diskless -> Consistent [ label = "ioctl_set_disk()" ]
4 Diskless -> Outdated [ label = "ioctl_set_disk()" ]
5 Consistent -> Outdated [ label = "receive_param()" ]
6 Consistent -> UpToDate [ label = "receive_param()" ]
7 Consistent -> Inconsistent [ label = "start resync" ]
8 Outdated -> Inconsistent [ label = "start resync" ]
9 UpToDate -> Inconsistent [ label = "ioctl_replicate" ]
10 Inconsistent -> UpToDate [ label = "resync completed" ]
11 Consistent -> Failed [ label = "io completion error" ]
12 Outdated -> Failed [ label = "io completion error" ]
13 UpToDate -> Failed [ label = "io completion error" ]
14 Inconsistent -> Failed [ label = "io completion error" ]
15 Failed -> Diskless [ label = "sending notify to peer" ]
16}
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
new file mode 100644
index 000000000000..6d9cf0a7b11d
--- /dev/null
+++ b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
@@ -0,0 +1,85 @@
1// vim: set sw=2 sts=2 :
2digraph {
3 rankdir=BT
4 bgcolor=white
5
6 node [shape=plaintext]
7 node [fontcolor=black]
8
9 StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]
10
11 node [fontcolor=lightgray]
12
13 Unconnected [ label=Unconnected ]
14
15 CommTrouble [ shape=record,
16 label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
17
18 node [fontcolor=gray]
19
20 subgraph cluster_try_connect {
21 label="try to connect, handshake"
22 rank=max
23 WFConnection [ label=WFConnection ]
24 WFReportParams [ label=WFReportParams ]
25 }
26
27 TearDown [ label=TearDown ]
28
29 Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
30
31 node [fontcolor=lightblue]
32
33 StartingSyncS [ label=StartingSyncS ]
34 StartingSyncT [ label=StartingSyncT ]
35
36 subgraph cluster_bitmap_exchange {
37 node [fontcolor=red]
38 fontcolor=red
39 label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
40
41 WFBitMapT [ label=WFBitMapT ]
42 WFSyncUUID [ label=WFSyncUUID ]
43 WFBitMapS [ label=WFBitMapS ]
44 }
45
46 node [fontcolor=blue]
47
48 cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
49
50 node [shape=box,fontcolor=black]
51
52 // drbdadm [label="drbdadm connect"]
53 // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
54 // comm_error [label="communication trouble"]
55
56 //
57 // edges
58 // --------------------------------------
59
60 StandAlone -> Unconnected [ label="drbdadm connect" ]
61 Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
62 Unconnected -> WFConnection [ label="receiver thread is started" ]
63 WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]
64
65 WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
66 WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
67
68 WFReportParams -> WFBitMapS
69 WFReportParams -> WFBitMapT
70 WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
71
72 WFBitMapS -> cluster_resync:S
73 WFSyncUUID -> cluster_resync:T
74
75 edge [color=green]
76 cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ]
77
78 edge [color=red]
79 WFReportParams -> CommTrouble
80 Connected -> CommTrouble
81 cluster_resync:any -> CommTrouble
82 edge [color=black]
83 CommTrouble -> Unconnected [label="receiver thread is stopped" ]
84
85}
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot
new file mode 100644
index 000000000000..4a2b00c23547
--- /dev/null
+++ b/Documentation/blockdev/drbd/node-states-8.dot
@@ -0,0 +1,14 @@
1digraph node_states {
2 Secondary -> Primary [ label = "ioctl_set_state()" ]
3 Primary -> Secondary [ label = "ioctl_set_state()" ]
4}
5
6digraph peer_states {
7 Secondary -> Primary [ label = "recv state packet" ]
8 Primary -> Secondary [ label = "recv state packet" ]
9 Primary -> Unknown [ label = "connection lost" ]
10 Secondary -> Unknown [ label = "connection lost" ]
11 Unknown -> Primary [ label = "connected" ]
12 Unknown -> Secondary [ label = "connected" ]
13}
14
diff --git a/MAINTAINERS b/MAINTAINERS
index 88241154f4ce..ab095be9c2de 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1790,6 +1790,19 @@ S: Maintained
1790F: drivers/scsi/dpt* 1790F: drivers/scsi/dpt*
1791F: drivers/scsi/dpt/ 1791F: drivers/scsi/dpt/
1792 1792
1793DRBD DRIVER
1794P: Philipp Reisner
1795P: Lars Ellenberg
1796M: drbd-dev@lists.linbit.com
1797L: drbd-user@lists.linbit.com
1798W: http://www.drbd.org
1799T: git git://git.drbd.org/linux-2.6-drbd.git drbd
1800T: git git://git.drbd.org/drbd-8.3.git
1801S: Supported
1802F: drivers/block/drbd/
1803F: lib/lru_cache.c
1804F: Documentation/blockdev/drbd/
1805
1793DRIVER CORE, KOBJECTS, AND SYSFS 1806DRIVER CORE, KOBJECTS, AND SYSFS
1794M: Greg Kroah-Hartman <gregkh@suse.de> 1807M: Greg Kroah-Hartman <gregkh@suse.de>
1795T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ 1808T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc88770..8bd105115a69 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -12,24 +12,14 @@ config IOSCHED_NOOP
12 that do their own scheduling and require only minimal assistance from 12 that do their own scheduling and require only minimal assistance from
13 the kernel. 13 the kernel.
14 14
15config IOSCHED_AS
16 tristate "Anticipatory I/O scheduler"
17 default y
18 ---help---
19 The anticipatory I/O scheduler is generally a good choice for most
20 environments, but is quite large and complex when compared to the
21 deadline I/O scheduler, it can also be slower in some cases
22 especially some database loads.
23
24config IOSCHED_DEADLINE 15config IOSCHED_DEADLINE
25 tristate "Deadline I/O scheduler" 16 tristate "Deadline I/O scheduler"
26 default y 17 default y
27 ---help--- 18 ---help---
28 The deadline I/O scheduler is simple and compact, and is often as 19 The deadline I/O scheduler is simple and compact. It will provide
29 good as the anticipatory I/O scheduler, and in some database 20 CSCAN service with FIFO expiration of requests, switching to
30 workloads, better. In the case of a single process performing I/O to 21 a new point in the service tree and doing a batch of IO from there
31 a disk at any one time, its behaviour is almost identical to the 22 in case of expiry.
32 anticipatory I/O scheduler and so is a good choice.
33 23
34config IOSCHED_CFQ 24config IOSCHED_CFQ
35 tristate "CFQ I/O scheduler" 25 tristate "CFQ I/O scheduler"
@@ -37,7 +27,9 @@ config IOSCHED_CFQ
37 ---help--- 27 ---help---
38 The CFQ I/O scheduler tries to distribute bandwidth equally 28 The CFQ I/O scheduler tries to distribute bandwidth equally
39 among all processes in the system. It should provide a fair 29 among all processes in the system. It should provide a fair
40 working environment, suitable for desktop systems. 30 and low latency working environment, suitable for both desktop
31 and server systems.
32
41 This is the default I/O scheduler. 33 This is the default I/O scheduler.
42 34
43choice 35choice
@@ -47,9 +39,6 @@ choice
47 Select the I/O scheduler which will be used by default for all 39 Select the I/O scheduler which will be used by default for all
48 block devices. 40 block devices.
49 41
50 config DEFAULT_AS
51 bool "Anticipatory" if IOSCHED_AS=y
52
53 config DEFAULT_DEADLINE 42 config DEFAULT_DEADLINE
54 bool "Deadline" if IOSCHED_DEADLINE=y 43 bool "Deadline" if IOSCHED_DEADLINE=y
55 44
@@ -63,7 +52,6 @@ endchoice
63 52
64config DEFAULT_IOSCHED 53config DEFAULT_IOSCHED
65 string 54 string
66 default "anticipatory" if DEFAULT_AS
67 default "deadline" if DEFAULT_DEADLINE 55 default "deadline" if DEFAULT_DEADLINE
68 default "cfq" if DEFAULT_CFQ 56 default "cfq" if DEFAULT_CFQ
69 default "noop" if DEFAULT_NOOP 57 default "noop" if DEFAULT_NOOP
diff --git a/block/Makefile b/block/Makefile
index ba74ca6bfa14..7914108952f2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,6 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
12obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 12obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 13obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
15 14
diff --git a/block/as-iosched.c b/block/as-iosched.c
deleted file mode 100644
index ce8ba57c6557..000000000000
--- a/block/as-iosched.c
+++ /dev/null
@@ -1,1520 +0,0 @@
1/*
2 * Anticipatory & deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 * Nick Piggin <nickpiggin@yahoo.com.au>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/bio.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/init.h>
16#include <linux/compiler.h>
17#include <linux/rbtree.h>
18#include <linux/interrupt.h>
19
20/*
21 * See Documentation/block/as-iosched.txt
22 */
23
24/*
25 * max time before a read is submitted.
26 */
27#define default_read_expire (HZ / 8)
28
29/*
30 * ditto for writes, these limits are not hard, even
31 * if the disk is capable of satisfying them.
32 */
33#define default_write_expire (HZ / 4)
34
35/*
36 * read_batch_expire describes how long we will allow a stream of reads to
37 * persist before looking to see whether it is time to switch over to writes.
38 */
39#define default_read_batch_expire (HZ / 2)
40
41/*
42 * write_batch_expire describes how long we want a stream of writes to run for.
43 * This is not a hard limit, but a target we set for the auto-tuning thingy.
44 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
45 * a short amount of time...
46 */
47#define default_write_batch_expire (HZ / 8)
48
49/*
50 * max time we may wait to anticipate a read (default around 6ms)
51 */
52#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
53
54/*
55 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
56 * however huge values tend to interfere and not decay fast enough. A program
57 * might be in a non-io phase of operation. Waiting on user input for example,
58 * or doing a lengthy computation. A small penalty can be justified there, and
59 * will still catch out those processes that constantly have large thinktimes.
60 */
61#define MAX_THINKTIME (HZ/50UL)
62
/* Bit numbers used in as_io_context.state */
enum as_io_states {
	AS_TASK_RUNNING = 0,	/* the process has not exited */
	AS_TASK_IOSTARTED,	/* the process has started some IO */
	AS_TASK_IORUNNING,	/* the process has completed some IO */
};
69
/* State of the anticipation state machine kept in as_data.antic_status */
enum anticipation_status {
	ANTIC_OFF = 0,		/* normal operation, not anticipating */
	ANTIC_WAIT_REQ,		/* the last read has not completed yet */
	ANTIC_WAIT_NEXT,	/*
				 * the last read has completed and we are
				 * anticipating a request relative to it
				 */
	ANTIC_FINISHED,		/*
				 * anticipation is over: a candidate was
				 * found or we timed out
				 */
};
78
79struct as_data {
80 /*
81 * run time data
82 */
83
84 struct request_queue *q; /* the "owner" queue */
85
86 /*
87 * requests (as_rq s) are present on both sort_list and fifo_list
88 */
89 struct rb_root sort_list[2];
90 struct list_head fifo_list[2];
91
92 struct request *next_rq[2]; /* next in sort order */
93 sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
94
95 unsigned long exit_prob; /* probability a task will exit while
96 being waited on */
97 unsigned long exit_no_coop; /* probablility an exited task will
98 not be part of a later cooperating
99 request */
100 unsigned long new_ttime_total; /* mean thinktime on new proc */
101 unsigned long new_ttime_mean;
102 u64 new_seek_total; /* mean seek on new proc */
103 sector_t new_seek_mean;
104
105 unsigned long current_batch_expires;
106 unsigned long last_check_fifo[2];
107 int changed_batch; /* 1: waiting for old batch to end */
108 int new_batch; /* 1: waiting on first read complete */
109 int batch_data_dir; /* current batch SYNC / ASYNC */
110 int write_batch_count; /* max # of reqs in a write batch */
111 int current_write_count; /* how many requests left this batch */
112 int write_batch_idled; /* has the write batch gone idle? */
113
114 enum anticipation_status antic_status;
115 unsigned long antic_start; /* jiffies: when it started */
116 struct timer_list antic_timer; /* anticipatory scheduling timer */
117 struct work_struct antic_work; /* Deferred unplugging */
118 struct io_context *io_context; /* Identify the expected process */
119 int ioc_finished; /* IO associated with io_context is finished */
120 int nr_dispatched;
121
122 /*
123 * settings that change how the i/o scheduler behaves
124 */
125 unsigned long fifo_expire[2];
126 unsigned long batch_expire[2];
127 unsigned long antic_expire;
128};
129
/*
 * Per-request scheduler state. It is stored in rq->elevator_private2 and
 * accessed through RQ_STATE() / RQ_SET_STATE().
 */
enum arq_state {
	AS_RQ_NEW = 0,		/* new: not referenced and not on any lists */
	AS_RQ_QUEUED,		/* in the request queue, owned by the scheduler */
	AS_RQ_DISPATCHED,	/* on the dispatch list, owned by the driver */
	AS_RQ_PRESCHED,		/*
				 * debug poisoning for requests being used
				 * when they shouldn't be (this value through
				 * AS_RQ_POSTSCHED)
				 */
	AS_RQ_REMOVED,
	AS_RQ_MERGED,
	AS_RQ_POSTSCHED,
};
144
145#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private)
146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
148
149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock);
152
153static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
154static void as_antic_stop(struct as_data *ad);
155
156/*
157 * IO Context helper functions
158 */
159
160/* Called to deallocate the as_io_context */
161static void free_as_io_context(struct as_io_context *aic)
162{
163 kfree(aic);
164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) {
166 /*
167 * AS scheduler is exiting, grab exit lock and check
168 * the pending io context count. If it hits zero,
169 * complete ioc_gone and set it back to NULL.
170 */
171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone);
174 ioc_gone = NULL;
175 }
176 spin_unlock(&ioc_gone_lock);
177 }
178}
179
180static void as_trim(struct io_context *ioc)
181{
182 spin_lock_irq(&ioc->lock);
183 if (ioc->aic)
184 free_as_io_context(ioc->aic);
185 ioc->aic = NULL;
186 spin_unlock_irq(&ioc->lock);
187}
188
189/* Called when the task exits */
190static void exit_as_io_context(struct as_io_context *aic)
191{
192 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
193 clear_bit(AS_TASK_RUNNING, &aic->state);
194}
195
196static struct as_io_context *alloc_as_io_context(void)
197{
198 struct as_io_context *ret;
199
200 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
201 if (ret) {
202 ret->dtor = free_as_io_context;
203 ret->exit = exit_as_io_context;
204 ret->state = 1 << AS_TASK_RUNNING;
205 atomic_set(&ret->nr_queued, 0);
206 atomic_set(&ret->nr_dispatched, 0);
207 spin_lock_init(&ret->lock);
208 ret->ttime_total = 0;
209 ret->ttime_samples = 0;
210 ret->ttime_mean = 0;
211 ret->seek_total = 0;
212 ret->seek_samples = 0;
213 ret->seek_mean = 0;
214 elv_ioc_count_inc(as_ioc_count);
215 }
216
217 return ret;
218}
219
220/*
221 * If the current task has no AS IO context then create one and initialise it.
222 * Then take a ref on the task's io context and return it.
223 */
224static struct io_context *as_get_io_context(int node)
225{
226 struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
227 if (ioc && !ioc->aic) {
228 ioc->aic = alloc_as_io_context();
229 if (!ioc->aic) {
230 put_io_context(ioc);
231 ioc = NULL;
232 }
233 }
234 return ioc;
235}
236
237static void as_put_io_context(struct request *rq)
238{
239 struct as_io_context *aic;
240
241 if (unlikely(!RQ_IOC(rq)))
242 return;
243
244 aic = RQ_IOC(rq)->aic;
245
246 if (rq_is_sync(rq) && aic) {
247 unsigned long flags;
248
249 spin_lock_irqsave(&aic->lock, flags);
250 set_bit(AS_TASK_IORUNNING, &aic->state);
251 aic->last_end_request = jiffies;
252 spin_unlock_irqrestore(&aic->lock, flags);
253 }
254
255 put_io_context(RQ_IOC(rq));
256}
257
258/*
259 * rb tree support functions
260 */
261#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))])
262
/*
 * Insert @rq into the sort tree of its direction. Whenever elv_rb_add()
 * reports an alias, that request is pushed to the dispatch queue,
 * anticipation is stopped, and the insertion is retried.
 */
static void as_add_rq_rb(struct as_data *ad, struct request *rq)
{
	struct request *alias;

	for (;;) {
		alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq);
		if (likely(!alias))
			break;

		as_move_to_dispatch(ad, alias);
		as_antic_stop(ad);
	}
}
272
/* Remove @rq from the sort tree of its direction. */
static inline void as_del_rq_rb(struct as_data *ad, struct request *rq)
{
	struct rb_root *root = RQ_RB_ROOT(ad, rq);

	elv_rb_del(root, rq);
}
277
278/*
279 * IO Scheduler proper
280 */
281
282#define MAXBACK (1024 * 1024) /*
283 * Maximum distance the disk will go backward
284 * for a request.
285 */
286
287#define BACK_PENALTY 2
288
289/*
290 * as_choose_req selects the preferred one of two requests of the same data_dir
291 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
292 */
293static struct request *
294as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
295{
296 int data_dir;
297 sector_t last, s1, s2, d1, d2;
298 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
299 const sector_t maxback = MAXBACK;
300
301 if (rq1 == NULL || rq1 == rq2)
302 return rq2;
303 if (rq2 == NULL)
304 return rq1;
305
306 data_dir = rq_is_sync(rq1);
307
308 last = ad->last_sector[data_dir];
309 s1 = blk_rq_pos(rq1);
310 s2 = blk_rq_pos(rq2);
311
312 BUG_ON(data_dir != rq_is_sync(rq2));
313
314 /*
315 * Strict one way elevator _except_ in the case where we allow
316 * short backward seeks which are biased as twice the cost of a
317 * similar forward seek.
318 */
319 if (s1 >= last)
320 d1 = s1 - last;
321 else if (s1+maxback >= last)
322 d1 = (last - s1)*BACK_PENALTY;
323 else {
324 r1_wrap = 1;
325 d1 = 0; /* shut up, gcc */
326 }
327
328 if (s2 >= last)
329 d2 = s2 - last;
330 else if (s2+maxback >= last)
331 d2 = (last - s2)*BACK_PENALTY;
332 else {
333 r2_wrap = 1;
334 d2 = 0;
335 }
336
337 /* Found required data */
338 if (!r1_wrap && r2_wrap)
339 return rq1;
340 else if (!r2_wrap && r1_wrap)
341 return rq2;
342 else if (r1_wrap && r2_wrap) {
343 /* both behind the head */
344 if (s1 <= s2)
345 return rq1;
346 else
347 return rq2;
348 }
349
350 /* Both requests in front of the head */
351 if (d1 < d2)
352 return rq1;
353 else if (d2 < d1)
354 return rq2;
355 else {
356 if (s1 >= s2)
357 return rq1;
358 else
359 return rq2;
360 }
361}
362
363/*
364 * as_find_next_rq finds the next request after @prev in elevator order.
365 * this with as_choose_req form the basis for how the scheduler chooses
366 * what request to process next. Anticipation works on top of this.
367 */
368static struct request *
369as_find_next_rq(struct as_data *ad, struct request *last)
370{
371 struct rb_node *rbnext = rb_next(&last->rb_node);
372 struct rb_node *rbprev = rb_prev(&last->rb_node);
373 struct request *next = NULL, *prev = NULL;
374
375 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
376
377 if (rbprev)
378 prev = rb_entry_rq(rbprev);
379
380 if (rbnext)
381 next = rb_entry_rq(rbnext);
382 else {
383 const int data_dir = rq_is_sync(last);
384
385 rbnext = rb_first(&ad->sort_list[data_dir]);
386 if (rbnext && rbnext != &last->rb_node)
387 next = rb_entry_rq(rbnext);
388 }
389
390 return as_choose_req(ad, next, prev);
391}
392
393/*
394 * anticipatory scheduling functions follow
395 */
396
397/*
398 * as_antic_expired tells us when we have anticipated too long.
399 * The funny "absolute difference" math on the elapsed time is to handle
400 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
401 */
402static int as_antic_expired(struct as_data *ad)
403{
404 long delta_jif;
405
406 delta_jif = jiffies - ad->antic_start;
407 if (unlikely(delta_jif < 0))
408 delta_jif = -delta_jif;
409 if (delta_jif < ad->antic_expire)
410 return 0;
411
412 return 1;
413}
414
415/*
416 * as_antic_waitnext starts anticipating that a nice request will soon be
417 * submitted. See also as_antic_waitreq
418 */
419static void as_antic_waitnext(struct as_data *ad)
420{
421 unsigned long timeout;
422
423 BUG_ON(ad->antic_status != ANTIC_OFF
424 && ad->antic_status != ANTIC_WAIT_REQ);
425
426 timeout = ad->antic_start + ad->antic_expire;
427
428 mod_timer(&ad->antic_timer, timeout);
429
430 ad->antic_status = ANTIC_WAIT_NEXT;
431}
432
433/*
434 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
435 * until the request that we're anticipating on has finished. This means we
436 * are timing from when the candidate process wakes up hopefully.
437 */
438static void as_antic_waitreq(struct as_data *ad)
439{
440 BUG_ON(ad->antic_status == ANTIC_FINISHED);
441 if (ad->antic_status == ANTIC_OFF) {
442 if (!ad->io_context || ad->ioc_finished)
443 as_antic_waitnext(ad);
444 else
445 ad->antic_status = ANTIC_WAIT_REQ;
446 }
447}
448
449/*
450 * This is called directly by the functions in this file to stop anticipation.
451 * We kill the timer and schedule a call to the request_fn asap.
452 */
453static void as_antic_stop(struct as_data *ad)
454{
455 int status = ad->antic_status;
456
457 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
458 if (status == ANTIC_WAIT_NEXT)
459 del_timer(&ad->antic_timer);
460 ad->antic_status = ANTIC_FINISHED;
461 /* see as_work_handler */
462 kblockd_schedule_work(ad->q, &ad->antic_work);
463 }
464}
465
466/*
467 * as_antic_timeout is the timer function set by as_antic_waitnext.
468 */
469static void as_antic_timeout(unsigned long data)
470{
471 struct request_queue *q = (struct request_queue *)data;
472 struct as_data *ad = q->elevator->elevator_data;
473 unsigned long flags;
474
475 spin_lock_irqsave(q->queue_lock, flags);
476 if (ad->antic_status == ANTIC_WAIT_REQ
477 || ad->antic_status == ANTIC_WAIT_NEXT) {
478 struct as_io_context *aic;
479 spin_lock(&ad->io_context->lock);
480 aic = ad->io_context->aic;
481
482 ad->antic_status = ANTIC_FINISHED;
483 kblockd_schedule_work(q, &ad->antic_work);
484
485 if (aic->ttime_samples == 0) {
486 /* process anticipated on has exited or timed out*/
487 ad->exit_prob = (7*ad->exit_prob + 256)/8;
488 }
489 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
490 /* process not "saved" by a cooperating request */
491 ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
492 }
493 spin_unlock(&ad->io_context->lock);
494 }
495 spin_unlock_irqrestore(q->queue_lock, flags);
496}
497
498static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
499 unsigned long ttime)
500{
501 /* fixed point: 1.0 == 1<<8 */
502 if (aic->ttime_samples == 0) {
503 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
504 ad->new_ttime_mean = ad->new_ttime_total / 256;
505
506 ad->exit_prob = (7*ad->exit_prob)/8;
507 }
508 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
509 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
510 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
511}
512
513static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
514 sector_t sdist)
515{
516 u64 total;
517
518 if (aic->seek_samples == 0) {
519 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
520 ad->new_seek_mean = ad->new_seek_total / 256;
521 }
522
523 /*
524 * Don't allow the seek distance to get too large from the
525 * odd fragment, pagein, etc
526 */
527 if (aic->seek_samples <= 60) /* second&third seek */
528 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
529 else
530 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
531
532 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
533 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
534 total = aic->seek_total + (aic->seek_samples/2);
535 do_div(total, aic->seek_samples);
536 aic->seek_mean = (sector_t)total;
537}
538
539/*
540 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
541 * updates @aic->ttime_mean based on that. It is called when a new
542 * request is queued.
543 */
544static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
545 struct request *rq)
546{
547 int data_dir = rq_is_sync(rq);
548 unsigned long thinktime = 0;
549 sector_t seek_dist;
550
551 if (aic == NULL)
552 return;
553
554 if (data_dir == BLK_RW_SYNC) {
555 unsigned long in_flight = atomic_read(&aic->nr_queued)
556 + atomic_read(&aic->nr_dispatched);
557 spin_lock(&aic->lock);
558 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
559 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
560 /* Calculate read -> read thinktime */
561 if (test_bit(AS_TASK_IORUNNING, &aic->state)
562 && in_flight == 0) {
563 thinktime = jiffies - aic->last_end_request;
564 thinktime = min(thinktime, MAX_THINKTIME-1);
565 }
566 as_update_thinktime(ad, aic, thinktime);
567
568 /* Calculate read -> read seek distance */
569 if (aic->last_request_pos < blk_rq_pos(rq))
570 seek_dist = blk_rq_pos(rq) -
571 aic->last_request_pos;
572 else
573 seek_dist = aic->last_request_pos -
574 blk_rq_pos(rq);
575 as_update_seekdist(ad, aic, seek_dist);
576 }
577 aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
578 set_bit(AS_TASK_IOSTARTED, &aic->state);
579 spin_unlock(&aic->lock);
580 }
581}
582
583/*
584 * as_close_req decides if one request is considered "close" to the
585 * previous one issued.
586 */
587static int as_close_req(struct as_data *ad, struct as_io_context *aic,
588 struct request *rq)
589{
590 unsigned long delay; /* jiffies */
591 sector_t last = ad->last_sector[ad->batch_data_dir];
592 sector_t next = blk_rq_pos(rq);
593 sector_t delta; /* acceptable close offset (in sectors) */
594 sector_t s;
595
596 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
597 delay = 0;
598 else
599 delay = jiffies - ad->antic_start;
600
601 if (delay == 0)
602 delta = 8192;
603 else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire)
604 delta = 8192 << delay;
605 else
606 return 1;
607
608 if ((last <= next + (delta>>1)) && (next <= last + delta))
609 return 1;
610
611 if (last < next)
612 s = next - last;
613 else
614 s = last - next;
615
616 if (aic->seek_samples == 0) {
617 /*
618 * Process has just started IO. Use past statistics to
619 * gauge success possibility
620 */
621 if (ad->new_seek_mean > s) {
622 /* this request is better than what we're expecting */
623 return 1;
624 }
625
626 } else {
627 if (aic->seek_mean > s) {
628 /* this request is better than what we're expecting */
629 return 1;
630 }
631 }
632
633 return 0;
634}
635
636/*
637 * as_can_break_anticipation returns true if we have been anticipating this
638 * request.
639 *
640 * It also returns true if the process against which we are anticipating
641 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
642 * dispatch it ASAP, because we know that application will not be submitting
643 * any new reads.
644 *
645 * If the task which has submitted the request has exited, break anticipation.
646 *
647 * If this task has queued some other IO, do not enter enticipation.
648 */
649static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
650{
651 struct io_context *ioc;
652 struct as_io_context *aic;
653
654 ioc = ad->io_context;
655 BUG_ON(!ioc);
656 spin_lock(&ioc->lock);
657
658 if (rq && ioc == RQ_IOC(rq)) {
659 /* request from same process */
660 spin_unlock(&ioc->lock);
661 return 1;
662 }
663
664 if (ad->ioc_finished && as_antic_expired(ad)) {
665 /*
666 * In this situation status should really be FINISHED,
667 * however the timer hasn't had the chance to run yet.
668 */
669 spin_unlock(&ioc->lock);
670 return 1;
671 }
672
673 aic = ioc->aic;
674 if (!aic) {
675 spin_unlock(&ioc->lock);
676 return 0;
677 }
678
679 if (atomic_read(&aic->nr_queued) > 0) {
680 /* process has more requests queued */
681 spin_unlock(&ioc->lock);
682 return 1;
683 }
684
685 if (atomic_read(&aic->nr_dispatched) > 0) {
686 /* process has more requests dispatched */
687 spin_unlock(&ioc->lock);
688 return 1;
689 }
690
691 if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
692 /*
693 * Found a close request that is not one of ours.
694 *
695 * This makes close requests from another process update
696 * our IO history. Is generally useful when there are
697 * two or more cooperating processes working in the same
698 * area.
699 */
700 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
701 if (aic->ttime_samples == 0)
702 ad->exit_prob = (7*ad->exit_prob + 256)/8;
703
704 ad->exit_no_coop = (7*ad->exit_no_coop)/8;
705 }
706
707 as_update_iohist(ad, aic, rq);
708 spin_unlock(&ioc->lock);
709 return 1;
710 }
711
712 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
713 /* process anticipated on has exited */
714 if (aic->ttime_samples == 0)
715 ad->exit_prob = (7*ad->exit_prob + 256)/8;
716
717 if (ad->exit_no_coop > 128) {
718 spin_unlock(&ioc->lock);
719 return 1;
720 }
721 }
722
723 if (aic->ttime_samples == 0) {
724 if (ad->new_ttime_mean > ad->antic_expire) {
725 spin_unlock(&ioc->lock);
726 return 1;
727 }
728 if (ad->exit_prob * ad->exit_no_coop > 128*256) {
729 spin_unlock(&ioc->lock);
730 return 1;
731 }
732 } else if (aic->ttime_mean > ad->antic_expire) {
733 /* the process thinks too much between requests */
734 spin_unlock(&ioc->lock);
735 return 1;
736 }
737 spin_unlock(&ioc->lock);
738 return 0;
739}
740
741/*
742 * as_can_anticipate indicates whether we should either run rq
743 * or keep anticipating a better request.
744 */
745static int as_can_anticipate(struct as_data *ad, struct request *rq)
746{
747#if 0 /* disable for now, we need to check tag level as well */
748 /*
749 * SSD device without seek penalty, disable idling
750 */
751 if (blk_queue_nonrot(ad->q)) axman
752 return 0;
753#endif
754
755 if (!ad->io_context)
756 /*
757 * Last request submitted was a write
758 */
759 return 0;
760
761 if (ad->antic_status == ANTIC_FINISHED)
762 /*
763 * Don't restart if we have just finished. Run the next request
764 */
765 return 0;
766
767 if (as_can_break_anticipation(ad, rq))
768 /*
769 * This request is a good candidate. Don't keep anticipating,
770 * run it.
771 */
772 return 0;
773
774 /*
775 * OK from here, we haven't finished, and don't have a decent request!
776 * Status is either ANTIC_OFF so start waiting,
777 * ANTIC_WAIT_REQ so continue waiting for request to finish
778 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
779 */
780
781 return 1;
782}
783
784/*
785 * as_update_rq must be called whenever a request (rq) is added to
786 * the sort_list. This function keeps caches up to date, and checks if the
787 * request might be one we are "anticipating"
788 */
789static void as_update_rq(struct as_data *ad, struct request *rq)
790{
791 const int data_dir = rq_is_sync(rq);
792
793 /* keep the next_rq cache up to date */
794 ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);
795
796 /*
797 * have we been anticipating this request?
798 * or does it come from the same process as the one we are anticipating
799 * for?
800 */
801 if (ad->antic_status == ANTIC_WAIT_REQ
802 || ad->antic_status == ANTIC_WAIT_NEXT) {
803 if (as_can_break_anticipation(ad, rq))
804 as_antic_stop(ad);
805 }
806}
807
808/*
809 * Gathers timings and resizes the write batch automatically
810 */
811static void update_write_batch(struct as_data *ad)
812{
813 unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
814 long write_time;
815
816 write_time = (jiffies - ad->current_batch_expires) + batch;
817 if (write_time < 0)
818 write_time = 0;
819
820 if (write_time > batch && !ad->write_batch_idled) {
821 if (write_time > batch * 3)
822 ad->write_batch_count /= 2;
823 else
824 ad->write_batch_count--;
825 } else if (write_time < batch && ad->current_write_count == 0) {
826 if (batch > write_time * 3)
827 ad->write_batch_count *= 2;
828 else
829 ad->write_batch_count++;
830 }
831
832 if (ad->write_batch_count < 1)
833 ad->write_batch_count = 1;
834}
835
836/*
837 * as_completed_request is to be called when a request has completed and
838 * returned something to the requesting process, be it an error or data.
839 */
840static void as_completed_request(struct request_queue *q, struct request *rq)
841{
842 struct as_data *ad = q->elevator->elevator_data;
843
844 WARN_ON(!list_empty(&rq->queuelist));
845
846 if (RQ_STATE(rq) != AS_RQ_REMOVED) {
847 WARN(1, "rq->state %d\n", RQ_STATE(rq));
848 goto out;
849 }
850
851 if (ad->changed_batch && ad->nr_dispatched == 1) {
852 ad->current_batch_expires = jiffies +
853 ad->batch_expire[ad->batch_data_dir];
854 kblockd_schedule_work(q, &ad->antic_work);
855 ad->changed_batch = 0;
856
857 if (ad->batch_data_dir == BLK_RW_SYNC)
858 ad->new_batch = 1;
859 }
860 WARN_ON(ad->nr_dispatched == 0);
861 ad->nr_dispatched--;
862
863 /*
864 * Start counting the batch from when a request of that direction is
865 * actually serviced. This should help devices with big TCQ windows
866 * and writeback caches
867 */
868 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
869 update_write_batch(ad);
870 ad->current_batch_expires = jiffies +
871 ad->batch_expire[BLK_RW_SYNC];
872 ad->new_batch = 0;
873 }
874
875 if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
876 ad->antic_start = jiffies;
877 ad->ioc_finished = 1;
878 if (ad->antic_status == ANTIC_WAIT_REQ) {
879 /*
880 * We were waiting on this request, now anticipate
881 * the next one
882 */
883 as_antic_waitnext(ad);
884 }
885 }
886
887 as_put_io_context(rq);
888out:
889 RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
890}
891
892/*
893 * as_remove_queued_request removes a request from the pre dispatch queue
894 * without updating refcounts. It is expected the caller will drop the
895 * reference unless it replaces the request at somepart of the elevator
896 * (ie. the dispatch queue)
897 */
898static void as_remove_queued_request(struct request_queue *q,
899 struct request *rq)
900{
901 const int data_dir = rq_is_sync(rq);
902 struct as_data *ad = q->elevator->elevator_data;
903 struct io_context *ioc;
904
905 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
906
907 ioc = RQ_IOC(rq);
908 if (ioc && ioc->aic) {
909 BUG_ON(!atomic_read(&ioc->aic->nr_queued));
910 atomic_dec(&ioc->aic->nr_queued);
911 }
912
913 /*
914 * Update the "next_rq" cache if we are about to remove its
915 * entry
916 */
917 if (ad->next_rq[data_dir] == rq)
918 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
919
920 rq_fifo_clear(rq);
921 as_del_rq_rb(ad, rq);
922}
923
924/*
925 * as_fifo_expired returns 0 if there are no expired requests on the fifo,
926 * 1 otherwise. It is ratelimited so that we only perform the check once per
927 * `fifo_expire' interval. Otherwise a large number of expired requests
928 * would create a hopeless seekstorm.
929 *
930 * See as_antic_expired comment.
931 */
932static int as_fifo_expired(struct as_data *ad, int adir)
933{
934 struct request *rq;
935 long delta_jif;
936
937 delta_jif = jiffies - ad->last_check_fifo[adir];
938 if (unlikely(delta_jif < 0))
939 delta_jif = -delta_jif;
940 if (delta_jif < ad->fifo_expire[adir])
941 return 0;
942
943 ad->last_check_fifo[adir] = jiffies;
944
945 if (list_empty(&ad->fifo_list[adir]))
946 return 0;
947
948 rq = rq_entry_fifo(ad->fifo_list[adir].next);
949
950 return time_after(jiffies, rq_fifo_time(rq));
951}
952
953/*
954 * as_batch_expired returns true if the current batch has expired. A batch
955 * is a set of reads or a set of writes.
956 */
957static inline int as_batch_expired(struct as_data *ad)
958{
959 if (ad->changed_batch || ad->new_batch)
960 return 0;
961
962 if (ad->batch_data_dir == BLK_RW_SYNC)
963 /* TODO! add a check so a complete fifo gets written? */
964 return time_after(jiffies, ad->current_batch_expires);
965
966 return time_after(jiffies, ad->current_batch_expires)
967 || ad->current_write_count == 0;
968}
969
970/*
971 * move an entry to dispatch queue
972 */
973static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
974{
975 const int data_dir = rq_is_sync(rq);
976
977 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
978
979 as_antic_stop(ad);
980 ad->antic_status = ANTIC_OFF;
981
982 /*
983 * This has to be set in order to be correctly updated by
984 * as_find_next_rq
985 */
986 ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
987
988 if (data_dir == BLK_RW_SYNC) {
989 struct io_context *ioc = RQ_IOC(rq);
990 /* In case we have to anticipate after this */
991 copy_io_context(&ad->io_context, &ioc);
992 } else {
993 if (ad->io_context) {
994 put_io_context(ad->io_context);
995 ad->io_context = NULL;
996 }
997
998 if (ad->current_write_count != 0)
999 ad->current_write_count--;
1000 }
1001 ad->ioc_finished = 0;
1002
1003 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
1004
1005 /*
1006 * take it off the sort and fifo list, add to dispatch queue
1007 */
1008 as_remove_queued_request(ad->q, rq);
1009 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
1010
1011 elv_dispatch_sort(ad->q, rq);
1012
1013 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1014 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1015 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1016 ad->nr_dispatched++;
1017}
1018
1019/*
1020 * as_dispatch_request selects the best request according to
1021 * read/write expire, batch expire, etc, and moves it to the dispatch
1022 * queue. Returns 1 if a request was found, 0 otherwise.
1023 */
/*
 * @q:     queue whose elevator_data is this scheduler's as_data.
 * @force: when non-zero, every queued request of both directions is moved
 *         to the dispatch queue and the number moved is returned instead
 *         of 0/1.
 */
1024static int as_dispatch_request(struct request_queue *q, int force)
1025{
1026 struct as_data *ad = q->elevator->elevator_data;
/* whether any sync (read) / async (write) requests are queued */
1027 const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1028 const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
1029 struct request *rq;
1030
1031 if (unlikely(force)) {
1032 /*
1033 * Forced dispatch, accounting is useless. Reset
1034 * accounting states and dump fifo_lists. Note that
1035 * batch_data_dir is reset to BLK_RW_SYNC to avoid
1036 * screwing write batch accounting as write batch
1037 * accounting occurs on W->R transition.
1038 */
1039 int dispatched = 0;
1040
1041 ad->batch_data_dir = BLK_RW_SYNC;
1042 ad->changed_batch = 0;
1043 ad->new_batch = 0;
1044
/* as_move_to_dispatch() advances next_rq[], so these loops drain the tree */
1045 while (ad->next_rq[BLK_RW_SYNC]) {
1046 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
1047 dispatched++;
1048 }
1049 ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
1050
1051 while (ad->next_rq[BLK_RW_ASYNC]) {
1052 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
1053 dispatched++;
1054 }
1055 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1056
1057 return dispatched;
1058 }
1059
1060 /* Signal that the write batch was uncontended, so we can't time it */
1061 if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
1062 if (ad->current_write_count == 0 || !writes)
1063 ad->write_batch_idled = 1;
1064 }
1065
/*
 * Nothing to dispatch when both fifos are empty, while anticipation is in
 * progress, or while a batch switch is still pending.
 */
1066 if (!(reads || writes)
1067 || ad->antic_status == ANTIC_WAIT_REQ
1068 || ad->antic_status == ANTIC_WAIT_NEXT
1069 || ad->changed_batch)
1070 return 0;
1071
1072 if (!(reads && writes && as_batch_expired(ad))) {
1073 /*
1074 * batch is still running or no reads or no writes
1075 */
1076 rq = ad->next_rq[ad->batch_data_dir];
1077
/* anticipation is only considered for read batches with antic_expire != 0 */
1078 if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
1079 if (as_fifo_expired(ad, BLK_RW_SYNC))
1080 goto fifo_expired;
1081
1082 if (as_can_anticipate(ad, rq)) {
1083 as_antic_waitreq(ad);
1084 return 0;
1085 }
1086 }
1087
1088 if (rq) {
1089 /* we have a "next request" */
1090 if (reads && !writes)
1091 ad->current_batch_expires =
1092 jiffies + ad->batch_expire[BLK_RW_SYNC];
1093 goto dispatch_request;
1094 }
1095 }
1096
1097 /*
1098 * at this point we are not running a batch. select the appropriate
1099 * data direction (read / write)
1100 */
1101
1102 if (reads) {
1103 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
1104
1105 if (writes && ad->batch_data_dir == BLK_RW_SYNC)
1106 /*
1107 * Last batch was a read, switch to writes
1108 */
1109 goto dispatch_writes;
1110
1111 if (ad->batch_data_dir == BLK_RW_ASYNC) {
1112 WARN_ON(ad->new_batch);
1113 ad->changed_batch = 1;
1114 }
1115 ad->batch_data_dir = BLK_RW_SYNC;
/* a new read batch starts with the request at the head of the read fifo */
1116 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
1117 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1118 goto dispatch_request;
1119 }
1120
1121 /*
1122 * the last batch was a read
1123 */
1124
1125 if (writes) {
/* also entered by goto from the read branch above, with reads pending */
1126dispatch_writes:
1127 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
1128
1129 if (ad->batch_data_dir == BLK_RW_SYNC) {
1130 ad->changed_batch = 1;
1131
1132 /*
1133 * new_batch might be 1 when the queue runs out of
1134 * reads. A subsequent submission of a write might
1135 * cause a change of batch before the read is finished.
1136 */
1137 ad->new_batch = 0;
1138 }
1139 ad->batch_data_dir = BLK_RW_ASYNC;
/* refill the request budget of the write batch */
1140 ad->current_write_count = ad->write_batch_count;
1141 ad->write_batch_idled = 0;
1142 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
1143 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1144 goto dispatch_request;
1145 }
1146
/* unreachable: reads or writes was non-zero, and both branches above jump away */
1147 BUG();
1148 return 0;
1149
1150dispatch_request:
1151 /*
1152 * If a request has expired, service it.
1153 */
1154
1155 if (as_fifo_expired(ad, ad->batch_data_dir)) {
/* also entered by goto from the running-read-batch path above */
1156fifo_expired:
1157 rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1158 }
1159
1160 if (ad->changed_batch) {
1161 WARN_ON(ad->new_batch);
1162
/*
 * Requests of the previous batch are still dispatched: hold off.
 * as_completed_request() clears changed_batch and schedules antic_work
 * when the last of them completes.
 */
1163 if (ad->nr_dispatched)
1164 return 0;
1165
1166 if (ad->batch_data_dir == BLK_RW_ASYNC)
1167 ad->current_batch_expires = jiffies +
1168 ad->batch_expire[BLK_RW_ASYNC];
1169 else
1170 ad->new_batch = 1;
1171
1172 ad->changed_batch = 0;
1173 }
1174
1175 /*
1176 * rq is the selected appropriate request.
1177 */
1178 as_move_to_dispatch(ad, rq);
1179
1180 return 1;
1181}
1182
1183/*
1184 * add rq to rbtree and fifo
1185 */
1186static void as_add_request(struct request_queue *q, struct request *rq)
1187{
1188 struct as_data *ad = q->elevator->elevator_data;
1189 int data_dir;
1190
1191 RQ_SET_STATE(rq, AS_RQ_NEW);
1192
1193 data_dir = rq_is_sync(rq);
1194
1195 rq->elevator_private = as_get_io_context(q->node);
1196
1197 if (RQ_IOC(rq)) {
1198 as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
1199 atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
1200 }
1201
1202 as_add_rq_rb(ad, rq);
1203
1204 /*
1205 * set expire time and add to fifo list
1206 */
1207 rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
1208 list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);
1209
1210 as_update_rq(ad, rq); /* keep state machine up to date */
1211 RQ_SET_STATE(rq, AS_RQ_QUEUED);
1212}
1213
1214static void as_activate_request(struct request_queue *q, struct request *rq)
1215{
1216 WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
1217 RQ_SET_STATE(rq, AS_RQ_REMOVED);
1218 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1219 atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
1220}
1221
1222static void as_deactivate_request(struct request_queue *q, struct request *rq)
1223{
1224 WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
1225 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1226 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1227 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1228}
1229
1230/*
1231 * as_queue_empty tells us if there are requests left in the device. It may
1232 * not be the case that a driver can get the next request even if the queue
1233 * is not empty - it is used in the block layer to check for plugging and
1234 * merging opportunities
1235 */
1236static int as_queue_empty(struct request_queue *q)
1237{
1238 struct as_data *ad = q->elevator->elevator_data;
1239
1240 return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
1241 && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1242}
1243
1244static int
1245as_merge(struct request_queue *q, struct request **req, struct bio *bio)
1246{
1247 struct as_data *ad = q->elevator->elevator_data;
1248 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1249 struct request *__rq;
1250
1251 /*
1252 * check for front merge
1253 */
1254 __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key);
1255 if (__rq && elv_rq_merge_ok(__rq, bio)) {
1256 *req = __rq;
1257 return ELEVATOR_FRONT_MERGE;
1258 }
1259
1260 return ELEVATOR_NO_MERGE;
1261}
1262
1263static void as_merged_request(struct request_queue *q, struct request *req,
1264 int type)
1265{
1266 struct as_data *ad = q->elevator->elevator_data;
1267
1268 /*
1269 * if the merge was a front merge, we need to reposition request
1270 */
1271 if (type == ELEVATOR_FRONT_MERGE) {
1272 as_del_rq_rb(ad, req);
1273 as_add_rq_rb(ad, req);
1274 /*
1275 * Note! At this stage of this and the next function, our next
1276 * request may not be optimal - eg the request may have "grown"
1277 * behind the disk head. We currently don't bother adjusting.
1278 */
1279 }
1280}
1281
1282static void as_merged_requests(struct request_queue *q, struct request *req,
1283 struct request *next)
1284{
1285 /*
1286 * if next expires before rq, assign its expire time to arq
1287 * and move into next position (next will be deleted) in fifo
1288 */
1289 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
1290 if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
1291 list_move(&req->queuelist, &next->queuelist);
1292 rq_set_fifo_time(req, rq_fifo_time(next));
1293 }
1294 }
1295
1296 /*
1297 * kill knowledge of next, this one is a goner
1298 */
1299 as_remove_queued_request(q, next);
1300 as_put_io_context(next);
1301
1302 RQ_SET_STATE(next, AS_RQ_MERGED);
1303}
1304
1305/*
1306 * This is executed in a "deferred" process context, by kblockd. It calls the
1307 * driver's request_fn so the driver can submit that request.
1308 *
1309 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1310 * state before calling, and don't rely on any state over calls.
1311 *
1312 * FIXME! dispatch queue is not a queue at all!
1313 */
1314static void as_work_handler(struct work_struct *work)
1315{
1316 struct as_data *ad = container_of(work, struct as_data, antic_work);
1317
1318 blk_run_queue(ad->q);
1319}
1320
1321static int as_may_queue(struct request_queue *q, int rw)
1322{
1323 int ret = ELV_MQUEUE_MAY;
1324 struct as_data *ad = q->elevator->elevator_data;
1325 struct io_context *ioc;
1326 if (ad->antic_status == ANTIC_WAIT_REQ ||
1327 ad->antic_status == ANTIC_WAIT_NEXT) {
1328 ioc = as_get_io_context(q->node);
1329 if (ad->io_context == ioc)
1330 ret = ELV_MQUEUE_MUST;
1331 put_io_context(ioc);
1332 }
1333
1334 return ret;
1335}
1336
1337static void as_exit_queue(struct elevator_queue *e)
1338{
1339 struct as_data *ad = e->elevator_data;
1340
1341 del_timer_sync(&ad->antic_timer);
1342 cancel_work_sync(&ad->antic_work);
1343
1344 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
1345 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
1346
1347 put_io_context(ad->io_context);
1348 kfree(ad);
1349}
1350
1351/*
1352 * initialize elevator private data (as_data).
1353 */
1354static void *as_init_queue(struct request_queue *q)
1355{
1356 struct as_data *ad;
1357
1358 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node);
1359 if (!ad)
1360 return NULL;
1361
1362 ad->q = q; /* Identify what queue the data belongs to */
1363
1364 /* anticipatory scheduling helpers */
1365 ad->antic_timer.function = as_antic_timeout;
1366 ad->antic_timer.data = (unsigned long)q;
1367 init_timer(&ad->antic_timer);
1368 INIT_WORK(&ad->antic_work, as_work_handler);
1369
1370 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
1371 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
1372 ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
1373 ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
1374 ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
1375 ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
1376 ad->antic_expire = default_antic_expire;
1377 ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
1378 ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
1379
1380 ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
1381 ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
1382 if (ad->write_batch_count < 2)
1383 ad->write_batch_count = 2;
1384
1385 return ad;
1386}
1387
1388/*
1389 * sysfs parts below
1390 */
1391
/*
 * Format an unsigned tunable value as a decimal number followed by a
 * newline into @page. Returns the number of characters written.
 *
 * The value is unsigned, so it is printed with %u; %d would render values
 * above INT_MAX as negative numbers.
 */
static ssize_t
as_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
1397
1398static ssize_t
1399as_var_store(unsigned long *var, const char *page, size_t count)
1400{
1401 char *p = (char *) page;
1402
1403 *var = simple_strtoul(p, &p, 10);
1404 return count;
1405}
1406
1407static ssize_t est_time_show(struct elevator_queue *e, char *page)
1408{
1409 struct as_data *ad = e->elevator_data;
1410 int pos = 0;
1411
1412 pos += sprintf(page+pos, "%lu %% exit probability\n",
1413 100*ad->exit_prob/256);
1414 pos += sprintf(page+pos, "%lu %% probability of exiting without a "
1415 "cooperating process submitting IO\n",
1416 100*ad->exit_no_coop/256);
1417 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1418 pos += sprintf(page+pos, "%llu sectors new seek distance\n",
1419 (unsigned long long)ad->new_seek_mean);
1420
1421 return pos;
1422}
1423
1424#define SHOW_FUNCTION(__FUNC, __VAR) \
1425static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1426{ \
1427 struct as_data *ad = e->elevator_data; \
1428 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1429}
1430SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
1431SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
1432SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1433SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
1434SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
1435#undef SHOW_FUNCTION
1436
1437#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1438static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
1439{ \
1440 struct as_data *ad = e->elevator_data; \
1441 int ret = as_var_store(__PTR, (page), count); \
1442 if (*(__PTR) < (MIN)) \
1443 *(__PTR) = (MIN); \
1444 else if (*(__PTR) > (MAX)) \
1445 *(__PTR) = (MAX); \
1446 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1447 return ret; \
1448}
1449STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
1450STORE_FUNCTION(as_write_expire_store,
1451 &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
1452STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1453STORE_FUNCTION(as_read_batch_expire_store,
1454 &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
1455STORE_FUNCTION(as_write_batch_expire_store,
1456 &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
1457#undef STORE_FUNCTION
1458
1459#define AS_ATTR(name) \
1460 __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
1461
1462static struct elv_fs_entry as_attrs[] = {
1463 __ATTR_RO(est_time),
1464 AS_ATTR(read_expire),
1465 AS_ATTR(write_expire),
1466 AS_ATTR(antic_expire),
1467 AS_ATTR(read_batch_expire),
1468 AS_ATTR(write_batch_expire),
1469 __ATTR_NULL
1470};
1471
1472static struct elevator_type iosched_as = {
1473 .ops = {
1474 .elevator_merge_fn = as_merge,
1475 .elevator_merged_fn = as_merged_request,
1476 .elevator_merge_req_fn = as_merged_requests,
1477 .elevator_dispatch_fn = as_dispatch_request,
1478 .elevator_add_req_fn = as_add_request,
1479 .elevator_activate_req_fn = as_activate_request,
1480 .elevator_deactivate_req_fn = as_deactivate_request,
1481 .elevator_queue_empty_fn = as_queue_empty,
1482 .elevator_completed_req_fn = as_completed_request,
1483 .elevator_former_req_fn = elv_rb_former_request,
1484 .elevator_latter_req_fn = elv_rb_latter_request,
1485 .elevator_may_queue_fn = as_may_queue,
1486 .elevator_init_fn = as_init_queue,
1487 .elevator_exit_fn = as_exit_queue,
1488 .trim = as_trim,
1489 },
1490
1491 .elevator_attrs = as_attrs,
1492 .elevator_name = "anticipatory",
1493 .elevator_owner = THIS_MODULE,
1494};
1495
1496static int __init as_init(void)
1497{
1498 elv_register(&iosched_as);
1499
1500 return 0;
1501}
1502
1503static void __exit as_exit(void)
1504{
1505 DECLARE_COMPLETION_ONSTACK(all_gone);
1506 elv_unregister(&iosched_as);
1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb();
1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone);
1512 synchronize_rcu();
1513}
1514
1515module_init(as_init);
1516module_exit(as_exit);
1517
1518MODULE_AUTHOR("Nick Piggin");
1519MODULE_LICENSE("GPL");
1520MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 069a61017c02..757010d8fb7a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -38,6 +38,12 @@ static int cfq_slice_idle = HZ / 125;
38 */ 38 */
39#define CFQ_MIN_TT (2) 39#define CFQ_MIN_TT (2)
40 40
41/*
42 * Allow merged cfqqs to perform this amount of seeky I/O before
43 * deciding to break the queues up again.
44 */
45#define CFQQ_COOP_TOUT (HZ)
46
41#define CFQ_SLICE_SCALE (5) 47#define CFQ_SLICE_SCALE (5)
42#define CFQ_HW_QUEUE_MIN (5) 48#define CFQ_HW_QUEUE_MIN (5)
43 49
@@ -112,7 +118,15 @@ struct cfq_queue {
112 unsigned short ioprio, org_ioprio; 118 unsigned short ioprio, org_ioprio;
113 unsigned short ioprio_class, org_ioprio_class; 119 unsigned short ioprio_class, org_ioprio_class;
114 120
121 unsigned int seek_samples;
122 u64 seek_total;
123 sector_t seek_mean;
124 sector_t last_request_pos;
125 unsigned long seeky_start;
126
115 pid_t pid; 127 pid_t pid;
128
129 struct cfq_queue *new_cfqq;
116}; 130};
117 131
118/* 132/*
@@ -195,7 +209,7 @@ enum cfqq_state_flags {
195 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 209 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
196 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 210 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
197 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 211 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
198 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 212 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
199}; 213};
200 214
201#define CFQ_CFQQ_FNS(name) \ 215#define CFQ_CFQQ_FNS(name) \
@@ -943,11 +957,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
943static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 957static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
944 struct cfq_queue *cfqq) 958 struct cfq_queue *cfqq)
945{ 959{
946 if (!cfqq) { 960 if (!cfqq)
947 cfqq = cfq_get_next_queue(cfqd); 961 cfqq = cfq_get_next_queue(cfqd);
948 if (cfqq)
949 cfq_clear_cfqq_coop(cfqq);
950 }
951 962
952 __cfq_set_active_queue(cfqd, cfqq); 963 __cfq_set_active_queue(cfqd, cfqq);
953 return cfqq; 964 return cfqq;
@@ -962,16 +973,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
962 return cfqd->last_position - blk_rq_pos(rq); 973 return cfqd->last_position - blk_rq_pos(rq);
963} 974}
964 975
965#define CIC_SEEK_THR 8 * 1024 976#define CFQQ_SEEK_THR 8 * 1024
966#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) 977#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
967 978
968static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) 979static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
980 struct request *rq)
969{ 981{
970 struct cfq_io_context *cic = cfqd->active_cic; 982 sector_t sdist = cfqq->seek_mean;
971 sector_t sdist = cic->seek_mean;
972 983
973 if (!sample_valid(cic->seek_samples)) 984 if (!sample_valid(cfqq->seek_samples))
974 sdist = CIC_SEEK_THR; 985 sdist = CFQQ_SEEK_THR;
975 986
976 return cfq_dist_from_last(cfqd, rq) <= sdist; 987 return cfq_dist_from_last(cfqd, rq) <= sdist;
977} 988}
@@ -1000,7 +1011,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1000 * will contain the closest sector. 1011 * will contain the closest sector.
1001 */ 1012 */
1002 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1013 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1003 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1014 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1004 return __cfqq; 1015 return __cfqq;
1005 1016
1006 if (blk_rq_pos(__cfqq->next_rq) < sector) 1017 if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1011,7 +1022,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1011 return NULL; 1022 return NULL;
1012 1023
1013 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1024 __cfqq = rb_entry(node, struct cfq_queue, p_node);
1014 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1025 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1015 return __cfqq; 1026 return __cfqq;
1016 1027
1017 return NULL; 1028 return NULL;
@@ -1028,16 +1039,13 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1028 * assumption. 1039 * assumption.
1029 */ 1040 */
1030static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1041static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1031 struct cfq_queue *cur_cfqq, 1042 struct cfq_queue *cur_cfqq)
1032 bool probe)
1033{ 1043{
1034 struct cfq_queue *cfqq; 1044 struct cfq_queue *cfqq;
1035 1045
1036 /* 1046 if (!cfq_cfqq_sync(cur_cfqq))
1037 * A valid cfq_io_context is necessary to compare requests against 1047 return NULL;
1038 * the seek_mean of the current cfqq. 1048 if (CFQQ_SEEKY(cur_cfqq))
1039 */
1040 if (!cfqd->active_cic)
1041 return NULL; 1049 return NULL;
1042 1050
1043 /* 1051 /*
@@ -1049,11 +1057,14 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1049 if (!cfqq) 1057 if (!cfqq)
1050 return NULL; 1058 return NULL;
1051 1059
1052 if (cfq_cfqq_coop(cfqq)) 1060 /*
1061 * It only makes sense to merge sync queues.
1062 */
1063 if (!cfq_cfqq_sync(cfqq))
1064 return NULL;
1065 if (CFQQ_SEEKY(cfqq))
1053 return NULL; 1066 return NULL;
1054 1067
1055 if (!probe)
1056 cfq_mark_cfqq_coop(cfqq);
1057 return cfqq; 1068 return cfqq;
1058} 1069}
1059 1070
@@ -1110,7 +1121,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1110 * seeks. so allow a little bit of time for him to submit a new rq 1121 * seeks. so allow a little bit of time for him to submit a new rq
1111 */ 1122 */
1112 sl = cfqd->cfq_slice_idle; 1123 sl = cfqd->cfq_slice_idle;
1113 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 1124 if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))
1114 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1125 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
1115 1126
1116 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1127 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
@@ -1170,6 +1181,52 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1170} 1181}
1171 1182
1172/* 1183/*
1184 * Must be called with the queue_lock held.
1185 */
1186static int cfqq_process_refs(struct cfq_queue *cfqq)
1187{
1188 int process_refs, io_refs;
1189
1190 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1191 process_refs = atomic_read(&cfqq->ref) - io_refs;
1192 BUG_ON(process_refs < 0);
1193 return process_refs;
1194}
1195
1196static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1197{
1198 int process_refs, new_process_refs;
1199 struct cfq_queue *__cfqq;
1200
1201 /* Avoid a circular list and skip interim queue merges */
1202 while ((__cfqq = new_cfqq->new_cfqq)) {
1203 if (__cfqq == cfqq)
1204 return;
1205 new_cfqq = __cfqq;
1206 }
1207
1208 process_refs = cfqq_process_refs(cfqq);
1209 /*
1210 * If the process for the cfqq has gone away, there is no
1211 * sense in merging the queues.
1212 */
1213 if (process_refs == 0)
1214 return;
1215
1216 /*
1217 * Merge in the direction of the lesser amount of work.
1218 */
1219 new_process_refs = cfqq_process_refs(new_cfqq);
1220 if (new_process_refs >= process_refs) {
1221 cfqq->new_cfqq = new_cfqq;
1222 atomic_add(process_refs, &new_cfqq->ref);
1223 } else {
1224 new_cfqq->new_cfqq = cfqq;
1225 atomic_add(new_process_refs, &cfqq->ref);
1226 }
1227}
1228
1229/*
1173 * Select a queue for service. If we have a current active queue, 1230 * Select a queue for service. If we have a current active queue,
1174 * check whether to continue servicing it, or retrieve and set a new one. 1231 * check whether to continue servicing it, or retrieve and set a new one.
1175 */ 1232 */
@@ -1198,11 +1255,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1198 * If another queue has a request waiting within our mean seek 1255 * If another queue has a request waiting within our mean seek
1199 * distance, let it run. The expire code will check for close 1256 * distance, let it run. The expire code will check for close
1200 * cooperators and put the close queue at the front of the service 1257 * cooperators and put the close queue at the front of the service
1201 * tree. 1258 * tree. If possible, merge the expiring queue with the new cfqq.
1202 */ 1259 */
1203 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 1260 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
1204 if (new_cfqq) 1261 if (new_cfqq) {
1262 if (!cfqq->new_cfqq)
1263 cfq_setup_merge(cfqq, new_cfqq);
1205 goto expire; 1264 goto expire;
1265 }
1206 1266
1207 /* 1267 /*
1208 * No requests pending. If the active queue still has requests in 1268 * No requests pending. If the active queue still has requests in
@@ -1513,11 +1573,29 @@ static void cfq_free_io_context(struct io_context *ioc)
1513 1573
1514static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1574static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1515{ 1575{
1576 struct cfq_queue *__cfqq, *next;
1577
1516 if (unlikely(cfqq == cfqd->active_queue)) { 1578 if (unlikely(cfqq == cfqd->active_queue)) {
1517 __cfq_slice_expired(cfqd, cfqq, 0); 1579 __cfq_slice_expired(cfqd, cfqq, 0);
1518 cfq_schedule_dispatch(cfqd); 1580 cfq_schedule_dispatch(cfqd);
1519 } 1581 }
1520 1582
1583 /*
1584 * If this queue was scheduled to merge with another queue, be
1585 * sure to drop the reference taken on that queue (and others in
1586 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
1587 */
1588 __cfqq = cfqq->new_cfqq;
1589 while (__cfqq) {
1590 if (__cfqq == cfqq) {
1591 WARN(1, "cfqq->new_cfqq loop detected\n");
1592 break;
1593 }
1594 next = __cfqq->new_cfqq;
1595 cfq_put_queue(__cfqq);
1596 __cfqq = next;
1597 }
1598
1521 cfq_put_queue(cfqq); 1599 cfq_put_queue(cfqq);
1522} 1600}
1523 1601
@@ -1947,33 +2025,46 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1947} 2025}
1948 2026
1949static void 2027static void
1950cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 2028cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1951 struct request *rq) 2029 struct request *rq)
1952{ 2030{
1953 sector_t sdist; 2031 sector_t sdist;
1954 u64 total; 2032 u64 total;
1955 2033
1956 if (!cic->last_request_pos) 2034 if (!cfqq->last_request_pos)
1957 sdist = 0; 2035 sdist = 0;
1958 else if (cic->last_request_pos < blk_rq_pos(rq)) 2036 else if (cfqq->last_request_pos < blk_rq_pos(rq))
1959 sdist = blk_rq_pos(rq) - cic->last_request_pos; 2037 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
1960 else 2038 else
1961 sdist = cic->last_request_pos - blk_rq_pos(rq); 2039 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
1962 2040
1963 /* 2041 /*
1964 * Don't allow the seek distance to get too large from the 2042 * Don't allow the seek distance to get too large from the
1965 * odd fragment, pagein, etc 2043 * odd fragment, pagein, etc
1966 */ 2044 */
1967 if (cic->seek_samples <= 60) /* second&third seek */ 2045 if (cfqq->seek_samples <= 60) /* second&third seek */
1968 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 2046 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
1969 else 2047 else
1970 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 2048 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
2049
2050 cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
2051 cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
2052 total = cfqq->seek_total + (cfqq->seek_samples/2);
2053 do_div(total, cfqq->seek_samples);
2054 cfqq->seek_mean = (sector_t)total;
1971 2055
1972 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 2056 /*
1973 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 2057 * If this cfqq is shared between multiple processes, check to
1974 total = cic->seek_total + (cic->seek_samples/2); 2058 * make sure that those processes are still issuing I/Os within
1975 do_div(total, cic->seek_samples); 2059 * the mean seek distance. If not, it may be time to break the
1976 cic->seek_mean = (sector_t)total; 2060 * queues apart again.
2061 */
2062 if (cfq_cfqq_coop(cfqq)) {
2063 if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
2064 cfqq->seeky_start = jiffies;
2065 else if (!CFQQ_SEEKY(cfqq))
2066 cfqq->seeky_start = 0;
2067 }
1977} 2068}
1978 2069
1979/* 2070/*
@@ -1995,11 +2086,11 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1995 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 2086 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
1996 2087
1997 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 2088 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
1998 (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) 2089 (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq)))
1999 enable_idle = 0; 2090 enable_idle = 0;
2000 else if (sample_valid(cic->ttime_samples)) { 2091 else if (sample_valid(cic->ttime_samples)) {
2001 unsigned int slice_idle = cfqd->cfq_slice_idle; 2092 unsigned int slice_idle = cfqd->cfq_slice_idle;
2002 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 2093 if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))
2003 slice_idle = msecs_to_jiffies(CFQ_MIN_TT); 2094 slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
2004 if (cic->ttime_mean > slice_idle) 2095 if (cic->ttime_mean > slice_idle)
2005 enable_idle = 0; 2096 enable_idle = 0;
@@ -2066,7 +2157,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2066 * if this request is as-good as one we would expect from the 2157 * if this request is as-good as one we would expect from the
2067 * current cfqq, let it preempt 2158 * current cfqq, let it preempt
2068 */ 2159 */
2069 if (cfq_rq_close(cfqd, rq)) 2160 if (cfq_rq_close(cfqd, cfqq, rq))
2070 return true; 2161 return true;
2071 2162
2072 return false; 2163 return false;
@@ -2108,10 +2199,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2108 cfqq->meta_pending++; 2199 cfqq->meta_pending++;
2109 2200
2110 cfq_update_io_thinktime(cfqd, cic); 2201 cfq_update_io_thinktime(cfqd, cic);
2111 cfq_update_io_seektime(cfqd, cic, rq); 2202 cfq_update_io_seektime(cfqd, cfqq, rq);
2112 cfq_update_idle_window(cfqd, cfqq, cic); 2203 cfq_update_idle_window(cfqd, cfqq, cic);
2113 2204
2114 cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 2205 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2115 2206
2116 if (cfqq == cfqd->active_queue) { 2207 if (cfqq == cfqd->active_queue) {
2117 /* 2208 /*
@@ -2166,6 +2257,8 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2166 */ 2257 */
2167static void cfq_update_hw_tag(struct cfq_data *cfqd) 2258static void cfq_update_hw_tag(struct cfq_data *cfqd)
2168{ 2259{
2260 struct cfq_queue *cfqq = cfqd->active_queue;
2261
2169 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) 2262 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
2170 cfqd->rq_in_driver_peak = rq_in_driver(cfqd); 2263 cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
2171 2264
@@ -2173,6 +2266,16 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
2173 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 2266 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
2174 return; 2267 return;
2175 2268
2269 /*
2270 * If active queue hasn't enough requests and can idle, cfq might not
2271 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
2272 * case
2273 */
2274 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
2275 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
2276 CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
2277 return;
2278
2176 if (cfqd->hw_tag_samples++ < 50) 2279 if (cfqd->hw_tag_samples++ < 50)
2177 return; 2280 return;
2178 2281
@@ -2230,7 +2333,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2230 */ 2333 */
2231 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2334 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2232 cfq_slice_expired(cfqd, 1); 2335 cfq_slice_expired(cfqd, 1);
2233 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 2336 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) &&
2234 sync && !rq_noidle(rq)) 2337 sync && !rq_noidle(rq))
2235 cfq_arm_slice_timer(cfqd); 2338 cfq_arm_slice_timer(cfqd);
2236 } 2339 }
@@ -2256,12 +2359,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
2256 cfqq->ioprio = IOPRIO_NORM; 2359 cfqq->ioprio = IOPRIO_NORM;
2257 } else { 2360 } else {
2258 /* 2361 /*
2259 * check if we need to unboost the queue 2362 * unboost the queue (if needed)
2260 */ 2363 */
2261 if (cfqq->ioprio_class != cfqq->org_ioprio_class) 2364 cfqq->ioprio_class = cfqq->org_ioprio_class;
2262 cfqq->ioprio_class = cfqq->org_ioprio_class; 2365 cfqq->ioprio = cfqq->org_ioprio;
2263 if (cfqq->ioprio != cfqq->org_ioprio)
2264 cfqq->ioprio = cfqq->org_ioprio;
2265 } 2366 }
2266} 2367}
2267 2368
@@ -2325,6 +2426,43 @@ static void cfq_put_request(struct request *rq)
2325 } 2426 }
2326} 2427}
2327 2428
2429static struct cfq_queue *
2430cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
2431 struct cfq_queue *cfqq)
2432{
2433 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
2434 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
2435 cfq_mark_cfqq_coop(cfqq->new_cfqq);
2436 cfq_put_queue(cfqq);
2437 return cic_to_cfqq(cic, 1);
2438}
2439
2440static int should_split_cfqq(struct cfq_queue *cfqq)
2441{
2442 if (cfqq->seeky_start &&
2443 time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
2444 return 1;
2445 return 0;
2446}
2447
2448/*
2449 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
2450 * was the last process referring to said cfqq.
2451 */
2452static struct cfq_queue *
2453split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
2454{
2455 if (cfqq_process_refs(cfqq) == 1) {
2456 cfqq->seeky_start = 0;
2457 cfqq->pid = current->pid;
2458 cfq_clear_cfqq_coop(cfqq);
2459 return cfqq;
2460 }
2461
2462 cic_set_cfqq(cic, NULL, 1);
2463 cfq_put_queue(cfqq);
2464 return NULL;
2465}
2328/* 2466/*
2329 * Allocate cfq data structures associated with this request. 2467 * Allocate cfq data structures associated with this request.
2330 */ 2468 */
@@ -2347,10 +2485,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2347 if (!cic) 2485 if (!cic)
2348 goto queue_fail; 2486 goto queue_fail;
2349 2487
2488new_queue:
2350 cfqq = cic_to_cfqq(cic, is_sync); 2489 cfqq = cic_to_cfqq(cic, is_sync);
2351 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2490 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2352 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 2491 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2353 cic_set_cfqq(cic, cfqq, is_sync); 2492 cic_set_cfqq(cic, cfqq, is_sync);
2493 } else {
2494 /*
2495 * If the queue was seeky for too long, break it apart.
2496 */
2497 if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
2498 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
2499 cfqq = split_cfqq(cic, cfqq);
2500 if (!cfqq)
2501 goto new_queue;
2502 }
2503
2504 /*
2505 * Check to see if this queue is scheduled to merge with
2506 * another, closely cooperating queue. The merging of
2507 * queues happens here as it must be done in process context.
2508 * The reference on new_cfqq was taken in merge_cfqqs.
2509 */
2510 if (cfqq->new_cfqq)
2511 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
2354 } 2512 }
2355 2513
2356 cfqq->allocated[rw]++; 2514 cfqq->allocated[rw]++;
diff --git a/block/elevator.c b/block/elevator.c
index a847046c6e53..9ad5ccc4c5ee 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
154 154
155 spin_unlock(&elv_list_lock); 155 spin_unlock(&elv_list_lock);
156 156
157 if (!strcmp(name, "anticipatory")) 157 sprintf(elv, "%s-iosched", name);
158 sprintf(elv, "as-iosched");
159 else
160 sprintf(elv, "%s-iosched", name);
161 158
162 request_module("%s", elv); 159 request_module("%s", elv);
163 spin_lock(&elv_list_lock); 160 spin_lock(&elv_list_lock);
@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
193 * Be backwards-compatible with previous kernels, so users 190 * Be backwards-compatible with previous kernels, so users
194 * won't get the wrong elevator. 191 * won't get the wrong elevator.
195 */ 192 */
196 if (!strcmp(str, "as")) 193 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
197 strcpy(chosen_elevator, "anticipatory");
198 else
199 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
200 return 1; 194 return 1;
201} 195}
202 196
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
271 instead, which can be configured to be on-disk compatible with the 271 instead, which can be configured to be on-disk compatible with the
272 cryptoloop device. 272 cryptoloop device.
273 273
274source "drivers/block/drbd/Kconfig"
275
274config BLK_DEV_NBD 276config BLK_DEV_NBD
275 tristate "Network block device support" 277 tristate "Network block device support"
276 depends on NET 278 depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
36obj-$(CONFIG_BLK_DEV_HD) += hd.o 36obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
39 40
40swim_mod-objs := swim.o swim_asm.o 41swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000000..f4acd04ebeef
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,71 @@
1#
2# DRBD device driver configuration
3#
4
5comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
6 depends on !PROC_FS || !INET || !CONNECTOR
7
8config BLK_DEV_DRBD
9 tristate "DRBD Distributed Replicated Block Device support"
10 depends on PROC_FS && INET && CONNECTOR
11 select LRU_CACHE
12 default n
13 help
14
15 NOTE: In order to authenticate connections you have to select
16 CRYPTO_HMAC and a hash function as well.
17
18 DRBD is a shared-nothing, synchronously replicated block device. It
19 is designed to serve as a building block for high availability
20 clusters and in this context, is a "drop-in" replacement for shared
21 storage. Simplistically, you could see it as a network RAID 1.
22
23 Each minor device has a role, which can be 'primary' or 'secondary'.
24 On the node with the primary device the application is supposed to
25 run and to access the device (/dev/drbdX). Every write is sent to
26 the local 'lower level block device' and, across the network, to the
27 node with the device in 'secondary' state. The secondary device
28 simply writes the data to its lower level block device.
29
30 DRBD can also be used in dual-Primary mode (device writable on both
31 nodes), which means it can exhibit shared disk semantics in a
32 shared-nothing cluster. Needless to say, on top of dual-Primary
33 DRBD, a cluster file system is necessary to maintain
34 cache coherency.
35
36 For automatic failover you need a cluster manager (e.g. heartbeat).
37 See also: http://www.drbd.org/, http://www.linux-ha.org
38
39 If unsure, say N.
40
41config DRBD_FAULT_INJECTION
42 bool "DRBD fault injection"
43 depends on BLK_DEV_DRBD
44 help
45
46 Say Y here if you want to simulate IO errors, in order to test DRBD's
47 behavior.
48
49 The actual simulation of IO errors is done by writing 3 values to
50 /sys/module/drbd/parameters/
51
52 enable_faults: bitmask of...
53 1 meta data write
54 2 read
55 4 resync data write
56 8 read
57 16 data write
58 32 data read
59 64 read ahead
60 128 kmalloc of bitmap
61 256 allocation of EE (epoch_entries)
62
63 fault_devs: bitmask of minor numbers
64 fault_rate: frequency in percent
65
66 Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
67 echo 16 > /sys/module/drbd/parameters/enable_faults
68 echo 1 > /sys/module/drbd/parameters/fault_devs
69 echo 5 > /sys/module/drbd/parameters/fault_rate
70
71 If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000000..0d3f337ff5ff
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,5 @@
1drbd-y := drbd_bitmap.o drbd_proc.o
2drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
3drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
4
5obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000000..17956ff6a08d
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1424 @@
1/*
2 drbd_actlog.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/slab.h>
27#include <linux/drbd.h>
28#include "drbd_int.h"
29#include "drbd_wrappers.h"
30
31/* We maintain a trivial check sum in our on disk activity log.
32 * With that we can ensure correct operation even when the storage
33 * device might do a partial (last) sector write while losing power.
 *
 * One activity log transaction as it is written to the meta data device.
 * All fields are stored big-endian (see w_al_write_transaction()).
 * updates[0] describes the slot that was just changed; the remaining
 * AL_EXTENTS_PT entries carry a rolling window over the other slots,
 * padded with LC_FREE entries.  xor_sum is the XOR of all extent numbers
 * in updates[], computed on the CPU-order values.
34 */
35struct __packed al_transaction {
36 u32 magic; /* DRBD_MAGIC */
37 u32 tr_number; /* transaction sequence number */
38 struct __packed {
39 u32 pos; /* index of the slot in the activity log */
40 u32 extent; } updates[1 + AL_EXTENTS_PT]; /* extent number held by that slot */
41 u32 xor_sum;
42};
43
/* Work item asking the worker to write one on-disk bitmap sector
 * (handled by w_update_odbm(), which also frees the item). */
44struct update_odbm_work {
45 struct drbd_work w;
46 unsigned int enr; /* passed to drbd_bm_write_sect() */
47};
48
/* Work item used by drbd_al_begin_io() to have the worker write an
 * activity log transaction (w_al_write_transaction()); the submitter
 * sleeps on @event until the worker is done. */
49struct update_al_work {
50 struct drbd_work w;
51 struct lc_element *al_ext; /* the activity log slot being changed */
52 struct completion event; /* completed by the worker */
53 unsigned int enr; /* extent number the slot changes to */
54 /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
55 unsigned int old_enr;
56};
57
/* Shared context of the bios submitted by drbd_al_to_on_disk_bm();
 * updated from atodb_endio(). */
58struct drbd_atodb_wait {
59 atomic_t count; /* bios not yet completed */
60 struct completion io_done; /* completed when count drops to zero */
61 struct drbd_conf *mdev;
62 int error; /* first error reported by any of the bios, 0 if none */
63};
64
65
66int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
67
68static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
69 struct drbd_backing_dev *bdev,
70 struct page *page, sector_t sector,
71 int rw, int size)
72{
73 struct bio *bio;
74 struct drbd_md_io md_io;
75 int ok;
76
77 md_io.mdev = mdev;
78 init_completion(&md_io.event);
79 md_io.error = 0;
80
81 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
82 rw |= (1 << BIO_RW_BARRIER);
83 rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
84
85 retry:
86 bio = bio_alloc(GFP_NOIO, 1);
87 bio->bi_bdev = bdev->md_bdev;
88 bio->bi_sector = sector;
89 ok = (bio_add_page(bio, page, size, 0) == size);
90 if (!ok)
91 goto out;
92 bio->bi_private = &md_io;
93 bio->bi_end_io = drbd_md_io_complete;
94 bio->bi_rw = rw;
95
96 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
97 bio_endio(bio, -EIO);
98 else
99 submit_bio(rw, bio);
100 wait_for_completion(&md_io.event);
101 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
102
103 /* check for unsupported barrier op.
104 * would rather check on EOPNOTSUPP, but that is not reliable.
105 * don't try again for ANY return value != 0 */
106 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
107 /* Try again with no barrier */
108 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
109 set_bit(MD_NO_BARRIER, &mdev->flags);
110 rw &= ~(1 << BIO_RW_BARRIER);
111 bio_put(bio);
112 goto retry;
113 }
114 out:
115 bio_put(bio);
116 return ok;
117}
118
119int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
120 sector_t sector, int rw)
121{
122 int logical_block_size, mask, ok;
123 int offset = 0;
124 struct page *iop = mdev->md_io_page;
125
126 D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
127
128 BUG_ON(!bdev->md_bdev);
129
130 logical_block_size = bdev_logical_block_size(bdev->md_bdev);
131 if (logical_block_size == 0)
132 logical_block_size = MD_SECTOR_SIZE;
133
134 /* in case logical_block_size != 512 [ s390 only? ] */
135 if (logical_block_size != MD_SECTOR_SIZE) {
136 mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
137 D_ASSERT(mask == 1 || mask == 3 || mask == 7);
138 D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
139 offset = sector & mask;
140 sector = sector & ~mask;
141 iop = mdev->md_io_tmpp;
142
143 if (rw & WRITE) {
144 /* these are GFP_KERNEL pages, pre-allocated
145 * on device initialization */
146 void *p = page_address(mdev->md_io_page);
147 void *hp = page_address(mdev->md_io_tmpp);
148
149 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
150 READ, logical_block_size);
151
152 if (unlikely(!ok)) {
153 dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
154 "READ [logical_block_size!=512]) failed!\n",
155 (unsigned long long)sector);
156 return 0;
157 }
158
159 memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
160 }
161 }
162
163 if (sector < drbd_md_first_sector(bdev) ||
164 sector > drbd_md_last_sector(bdev))
165 dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
166 current->comm, current->pid, __func__,
167 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
168
169 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
170 if (unlikely(!ok)) {
171 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
172 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
173 return 0;
174 }
175
176 if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
177 void *p = page_address(mdev->md_io_page);
178 void *hp = page_address(mdev->md_io_tmpp);
179
180 memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
181 }
182
183 return ok;
184}
185
186static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
187{
188 struct lc_element *al_ext;
189 struct lc_element *tmp;
190 unsigned long al_flags = 0;
191
192 spin_lock_irq(&mdev->al_lock);
193 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
194 if (unlikely(tmp != NULL)) {
195 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
196 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
197 spin_unlock_irq(&mdev->al_lock);
198 return NULL;
199 }
200 }
201 al_ext = lc_get(mdev->act_log, enr);
202 al_flags = mdev->act_log->flags;
203 spin_unlock_irq(&mdev->al_lock);
204
205 /*
206 if (!al_ext) {
207 if (al_flags & LC_STARVING)
208 dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
209 if (al_flags & LC_DIRTY)
210 dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
211 }
212 */
213
214 return al_ext;
215}
216
217void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
218{
219 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
220 struct lc_element *al_ext;
221 struct update_al_work al_work;
222
223 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
224
225 wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
226
227 if (al_ext->lc_number != enr) {
228 /* drbd_al_write_transaction(mdev,al_ext,enr);
229 * recurses into generic_make_request(), which
230 * disallows recursion, bios being serialized on the
231 * current->bio_tail list now.
232 * we have to delegate updates to the activity log
233 * to the worker thread. */
234 init_completion(&al_work.event);
235 al_work.al_ext = al_ext;
236 al_work.enr = enr;
237 al_work.old_enr = al_ext->lc_number;
238 al_work.w.cb = w_al_write_transaction;
239 drbd_queue_work_front(&mdev->data.work, &al_work.w);
240 wait_for_completion(&al_work.event);
241
242 mdev->al_writ_cnt++;
243
244 spin_lock_irq(&mdev->al_lock);
245 lc_changed(mdev->act_log, al_ext);
246 spin_unlock_irq(&mdev->al_lock);
247 wake_up(&mdev->al_wait);
248 }
249}
250
251void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
252{
253 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
254 struct lc_element *extent;
255 unsigned long flags;
256
257 spin_lock_irqsave(&mdev->al_lock, flags);
258
259 extent = lc_find(mdev->act_log, enr);
260
261 if (!extent) {
262 spin_unlock_irqrestore(&mdev->al_lock, flags);
263 dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
264 return;
265 }
266
267 if (lc_put(mdev->act_log, extent) == 0)
268 wake_up(&mdev->al_wait);
269
270 spin_unlock_irqrestore(&mdev->al_lock, flags);
271}
272
273int
274w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
275{
276 struct update_al_work *aw = container_of(w, struct update_al_work, w);
277 struct lc_element *updated = aw->al_ext;
278 const unsigned int new_enr = aw->enr;
279 const unsigned int evicted = aw->old_enr;
280 struct al_transaction *buffer;
281 sector_t sector;
282 int i, n, mx;
283 unsigned int extent_nr;
284 u32 xor_sum = 0;
285
286 if (!get_ldev(mdev)) {
287 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
288 complete(&((struct update_al_work *)w)->event);
289 return 1;
290 }
291 /* do we have to do a bitmap write, first?
292 * TODO reduce maximum latency:
293 * submit both bios, then wait for both,
294 * instead of doing two synchronous sector writes. */
295 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
296 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
297
298 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
299 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
300
301 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
302 buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
303
304 n = lc_index_of(mdev->act_log, updated);
305
306 buffer->updates[0].pos = cpu_to_be32(n);
307 buffer->updates[0].extent = cpu_to_be32(new_enr);
308
309 xor_sum ^= new_enr;
310
311 mx = min_t(int, AL_EXTENTS_PT,
312 mdev->act_log->nr_elements - mdev->al_tr_cycle);
313 for (i = 0; i < mx; i++) {
314 unsigned idx = mdev->al_tr_cycle + i;
315 extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
316 buffer->updates[i+1].pos = cpu_to_be32(idx);
317 buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
318 xor_sum ^= extent_nr;
319 }
320 for (; i < AL_EXTENTS_PT; i++) {
321 buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
322 buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
323 xor_sum ^= LC_FREE;
324 }
325 mdev->al_tr_cycle += AL_EXTENTS_PT;
326 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
327 mdev->al_tr_cycle = 0;
328
329 buffer->xor_sum = cpu_to_be32(xor_sum);
330
331 sector = mdev->ldev->md.md_offset
332 + mdev->ldev->md.al_offset + mdev->al_tr_pos;
333
334 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
335 drbd_chk_io_error(mdev, 1, TRUE);
336
337 if (++mdev->al_tr_pos >
338 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
339 mdev->al_tr_pos = 0;
340
341 D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
342 mdev->al_tr_number++;
343
344 mutex_unlock(&mdev->md_io_mutex);
345
346 complete(&((struct update_al_work *)w)->event);
347 put_ldev(mdev);
348
349 return 1;
350}
351
352/**
353 * drbd_al_read_tr() - Read a single transaction from the on disk activity log
354 * @mdev: DRBD device.
355 * @bdev: Block device to read form.
356 * @b: pointer to an al_transaction.
357 * @index: On disk slot of the transaction to read.
358 *
359 * Returns -1 on IO error, 0 on checksum error and 1 upon success.
360 */
361static int drbd_al_read_tr(struct drbd_conf *mdev,
362 struct drbd_backing_dev *bdev,
363 struct al_transaction *b,
364 int index)
365{
366 sector_t sector;
367 int rv, i;
368 u32 xor_sum = 0;
369
370 sector = bdev->md.md_offset + bdev->md.al_offset + index;
371
372 /* Dont process error normally,
373 * as this is done before disk is attached! */
374 if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
375 return -1;
376
377 rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
378
379 for (i = 0; i < AL_EXTENTS_PT + 1; i++)
380 xor_sum ^= be32_to_cpu(b->updates[i].extent);
381 rv &= (xor_sum == be32_to_cpu(b->xor_sum));
382
383 return rv;
384}
385
386/**
387 * drbd_al_read_log() - Restores the activity log from its on disk representation.
388 * @mdev: DRBD device.
389 * @bdev: Block device to read form.
390 *
391 * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
392 */
393int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
394{
395 struct al_transaction *buffer;
396 int i;
397 int rv;
398 int mx;
399 int active_extents = 0;
400 int transactions = 0;
401 int found_valid = 0;
402 int from = 0;
403 int to = 0;
404 u32 from_tnr = 0;
405 u32 to_tnr = 0;
406 u32 cnr;
407
408 mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
409
410 /* lock out all other meta data io for now,
411 * and make sure the page is mapped.
412 */
413 mutex_lock(&mdev->md_io_mutex);
414 buffer = page_address(mdev->md_io_page);
415
416 /* Find the valid transaction in the log */
417 for (i = 0; i <= mx; i++) {
418 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
419 if (rv == 0)
420 continue;
421 if (rv == -1) {
422 mutex_unlock(&mdev->md_io_mutex);
423 return 0;
424 }
425 cnr = be32_to_cpu(buffer->tr_number);
426
427 if (++found_valid == 1) {
428 from = i;
429 to = i;
430 from_tnr = cnr;
431 to_tnr = cnr;
432 continue;
433 }
434 if ((int)cnr - (int)from_tnr < 0) {
435 D_ASSERT(from_tnr - cnr + i - from == mx+1);
436 from = i;
437 from_tnr = cnr;
438 }
439 if ((int)cnr - (int)to_tnr > 0) {
440 D_ASSERT(cnr - to_tnr == i - to);
441 to = i;
442 to_tnr = cnr;
443 }
444 }
445
446 if (!found_valid) {
447 dev_warn(DEV, "No usable activity log found.\n");
448 mutex_unlock(&mdev->md_io_mutex);
449 return 1;
450 }
451
452 /* Read the valid transactions.
453 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
454 i = from;
455 while (1) {
456 int j, pos;
457 unsigned int extent_nr;
458 unsigned int trn;
459
460 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
461 ERR_IF(rv == 0) goto cancel;
462 if (rv == -1) {
463 mutex_unlock(&mdev->md_io_mutex);
464 return 0;
465 }
466
467 trn = be32_to_cpu(buffer->tr_number);
468
469 spin_lock_irq(&mdev->al_lock);
470
471 /* This loop runs backwards because in the cyclic
472 elements there might be an old version of the
473 updated element (in slot 0). So the element in slot 0
474 can overwrite old versions. */
475 for (j = AL_EXTENTS_PT; j >= 0; j--) {
476 pos = be32_to_cpu(buffer->updates[j].pos);
477 extent_nr = be32_to_cpu(buffer->updates[j].extent);
478
479 if (extent_nr == LC_FREE)
480 continue;
481
482 lc_set(mdev->act_log, extent_nr, pos);
483 active_extents++;
484 }
485 spin_unlock_irq(&mdev->al_lock);
486
487 transactions++;
488
489cancel:
490 if (i == to)
491 break;
492 i++;
493 if (i > mx)
494 i = 0;
495 }
496
497 mdev->al_tr_number = to_tnr+1;
498 mdev->al_tr_pos = to;
499 if (++mdev->al_tr_pos >
500 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
501 mdev->al_tr_pos = 0;
502
503 /* ok, we are done with it */
504 mutex_unlock(&mdev->md_io_mutex);
505
506 dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
507 transactions, active_extents);
508
509 return 1;
510}
511
512static void atodb_endio(struct bio *bio, int error)
513{
514 struct drbd_atodb_wait *wc = bio->bi_private;
515 struct drbd_conf *mdev = wc->mdev;
516 struct page *page;
517 int uptodate = bio_flagged(bio, BIO_UPTODATE);
518
519 /* strange behavior of some lower level drivers...
520 * fail the request by clearing the uptodate flag,
521 * but do not return any error?! */
522 if (!error && !uptodate)
523 error = -EIO;
524
525 drbd_chk_io_error(mdev, error, TRUE);
526 if (error && wc->error == 0)
527 wc->error = error;
528
529 if (atomic_dec_and_test(&wc->count))
530 complete(&wc->io_done);
531
532 page = bio->bi_io_vec[0].bv_page;
533 put_page(page);
534 bio_put(bio);
535 mdev->bm_writ_cnt++;
536 put_ldev(mdev);
537}
538
539#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
540/* activity log to on disk bitmap -- prepare bio unless that sector
541 * is already covered by previously prepared bios */
542static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
543 struct bio **bios,
544 unsigned int enr,
545 struct drbd_atodb_wait *wc) __must_hold(local)
546{
547 struct bio *bio;
548 struct page *page;
549 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
550 + mdev->ldev->md.bm_offset;
551 unsigned int page_offset = PAGE_SIZE;
552 int offset;
553 int i = 0;
554 int err = -ENOMEM;
555
556 /* Check if that enr is already covered by an already created bio.
557 * Caution, bios[] is not NULL terminated,
558 * but only initialized to all NULL.
559 * For completely scattered activity log,
560 * the last invocation iterates over all bios,
561 * and finds the last NULL entry.
562 */
563 while ((bio = bios[i])) {
564 if (bio->bi_sector == on_disk_sector)
565 return 0;
566 i++;
567 }
568 /* bios[i] == NULL, the next not yet used slot */
569
570 /* GFP_KERNEL, we are not in the write-out path */
571 bio = bio_alloc(GFP_KERNEL, 1);
572 if (bio == NULL)
573 return -ENOMEM;
574
575 if (i > 0) {
576 const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
577 page_offset = prev_bv->bv_offset + prev_bv->bv_len;
578 page = prev_bv->bv_page;
579 }
580 if (page_offset == PAGE_SIZE) {
581 page = alloc_page(__GFP_HIGHMEM);
582 if (page == NULL)
583 goto out_bio_put;
584 page_offset = 0;
585 } else {
586 get_page(page);
587 }
588
589 offset = S2W(enr);
590 drbd_bm_get_lel(mdev, offset,
591 min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
592 kmap(page) + page_offset);
593 kunmap(page);
594
595 bio->bi_private = wc;
596 bio->bi_end_io = atodb_endio;
597 bio->bi_bdev = mdev->ldev->md_bdev;
598 bio->bi_sector = on_disk_sector;
599
600 if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
601 goto out_put_page;
602
603 atomic_inc(&wc->count);
604 /* we already know that we may do this...
605 * get_ldev_if_state(mdev,D_ATTACHING);
606 * just get the extra reference, so that the local_cnt reflects
607 * the number of pending IO requests DRBD at its backing device.
608 */
609 atomic_inc(&mdev->local_cnt);
610
611 bios[i] = bio;
612
613 return 0;
614
615out_put_page:
616 err = -EINVAL;
617 put_page(page);
618out_bio_put:
619 bio_put(bio);
620 return err;
621}
622
623/**
624 * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
625 * @mdev: DRBD device.
626 *
627 * Called when we detach (unconfigure) local storage,
628 * or when we go from R_PRIMARY to R_SECONDARY role.
629 */
630void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
631{
632 int i, nr_elements;
633 unsigned int enr;
634 struct bio **bios;
635 struct drbd_atodb_wait wc;
636
637 ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
638 return; /* sorry, I don't have any act_log etc... */
639
640 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
641
642 nr_elements = mdev->act_log->nr_elements;
643
644 /* GFP_KERNEL, we are not in anyone's write-out path */
645 bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
646 if (!bios)
647 goto submit_one_by_one;
648
649 atomic_set(&wc.count, 0);
650 init_completion(&wc.io_done);
651 wc.mdev = mdev;
652 wc.error = 0;
653
654 for (i = 0; i < nr_elements; i++) {
655 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
656 if (enr == LC_FREE)
657 continue;
658 /* next statement also does atomic_inc wc.count and local_cnt */
659 if (atodb_prepare_unless_covered(mdev, bios,
660 enr/AL_EXT_PER_BM_SECT,
661 &wc))
662 goto free_bios_submit_one_by_one;
663 }
664
665 /* unnecessary optimization? */
666 lc_unlock(mdev->act_log);
667 wake_up(&mdev->al_wait);
668
669 /* all prepared, submit them */
670 for (i = 0; i < nr_elements; i++) {
671 if (bios[i] == NULL)
672 break;
673 if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
674 bios[i]->bi_rw = WRITE;
675 bio_endio(bios[i], -EIO);
676 } else {
677 submit_bio(WRITE, bios[i]);
678 }
679 }
680
681 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
682
683 /* always (try to) flush bitmap to stable storage */
684 drbd_md_flush(mdev);
685
686 /* In case we did not submit a single IO do not wait for
687 * them to complete. ( Because we would wait forever here. )
688 *
689 * In case we had IOs and they are already complete, there
690 * is not point in waiting anyways.
691 * Therefore this if () ... */
692 if (atomic_read(&wc.count))
693 wait_for_completion(&wc.io_done);
694
695 put_ldev(mdev);
696
697 kfree(bios);
698 return;
699
700 free_bios_submit_one_by_one:
701 /* free everything by calling the endio callback directly. */
702 for (i = 0; i < nr_elements && bios[i]; i++)
703 bio_endio(bios[i], 0);
704
705 kfree(bios);
706
707 submit_one_by_one:
708 dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
709
710 for (i = 0; i < mdev->act_log->nr_elements; i++) {
711 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
712 if (enr == LC_FREE)
713 continue;
714 /* Really slow: if we have al-extents 16..19 active,
715 * sector 4 will be written four times! Synchronous! */
716 drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
717 }
718
719 lc_unlock(mdev->act_log);
720 wake_up(&mdev->al_wait);
721 put_ldev(mdev);
722}
723
724/**
725 * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
726 * @mdev: DRBD device.
727 */
728void drbd_al_apply_to_bm(struct drbd_conf *mdev)
729{
730 unsigned int enr;
731 unsigned long add = 0;
732 char ppb[10];
733 int i;
734
735 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
736
737 for (i = 0; i < mdev->act_log->nr_elements; i++) {
738 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
739 if (enr == LC_FREE)
740 continue;
741 add += drbd_bm_ALe_set_all(mdev, enr);
742 }
743
744 lc_unlock(mdev->act_log);
745 wake_up(&mdev->al_wait);
746
747 dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
748 ppsize(ppb, Bit2KB(add)));
749}
750
751static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
752{
753 int rv;
754
755 spin_lock_irq(&mdev->al_lock);
756 rv = (al_ext->refcnt == 0);
757 if (likely(rv))
758 lc_del(mdev->act_log, al_ext);
759 spin_unlock_irq(&mdev->al_lock);
760
761 return rv;
762}
763
764/**
765 * drbd_al_shrink() - Removes all active extents form the activity log
766 * @mdev: DRBD device.
767 *
768 * Removes all active extents form the activity log, waiting until
769 * the reference count of each entry dropped to 0 first, of course.
770 *
771 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
772 */
773void drbd_al_shrink(struct drbd_conf *mdev)
774{
775 struct lc_element *al_ext;
776 int i;
777
778 D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
779
780 for (i = 0; i < mdev->act_log->nr_elements; i++) {
781 al_ext = lc_element_by_index(mdev->act_log, i);
782 if (al_ext->lc_number == LC_FREE)
783 continue;
784 wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
785 }
786
787 wake_up(&mdev->al_wait);
788}
789
790static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
791{
792 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
793
794 if (!get_ldev(mdev)) {
795 if (__ratelimit(&drbd_ratelimit_state))
796 dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
797 kfree(udw);
798 return 1;
799 }
800
801 drbd_bm_write_sect(mdev, udw->enr);
802 put_ldev(mdev);
803
804 kfree(udw);
805
806 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
807 switch (mdev->state.conn) {
808 case C_SYNC_SOURCE: case C_SYNC_TARGET:
809 case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
810 drbd_resync_finished(mdev);
811 default:
812 /* nothing to do */
813 break;
814 }
815 }
816 drbd_bcast_sync_progress(mdev);
817
818 return 1;
819}
820
821
822/* ATTENTION. The AL's extents are 4MB each, while the extents in the
823 * resync LRU-cache are 16MB each.
824 * The caller of this function has to hold an get_ldev() reference.
825 *
826 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
827 */
828static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
829 int count, int success)
830{
831 struct lc_element *e;
832 struct update_odbm_work *udw;
833
834 unsigned int enr;
835
836 D_ASSERT(atomic_read(&mdev->local_cnt));
837
838 /* I simply assume that a sector/size pair never crosses
839 * a 16 MB extent border. (Currently this is true...) */
840 enr = BM_SECT_TO_EXT(sector);
841
842 e = lc_get(mdev->resync, enr);
843 if (e) {
844 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
845 if (ext->lce.lc_number == enr) {
846 if (success)
847 ext->rs_left -= count;
848 else
849 ext->rs_failed += count;
850 if (ext->rs_left < ext->rs_failed) {
851 dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
852 "rs_failed=%d count=%d\n",
853 (unsigned long long)sector,
854 ext->lce.lc_number, ext->rs_left,
855 ext->rs_failed, count);
856 dump_stack();
857
858 lc_put(mdev->resync, &ext->lce);
859 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
860 return;
861 }
862 } else {
863 /* Normally this element should be in the cache,
864 * since drbd_rs_begin_io() pulled it already in.
865 *
866 * But maybe an application write finished, and we set
867 * something outside the resync lru_cache in sync.
868 */
869 int rs_left = drbd_bm_e_weight(mdev, enr);
870 if (ext->flags != 0) {
871 dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
872 " -> %d[%u;00]\n",
873 ext->lce.lc_number, ext->rs_left,
874 ext->flags, enr, rs_left);
875 ext->flags = 0;
876 }
877 if (ext->rs_failed) {
878 dev_warn(DEV, "Kicking resync_lru element enr=%u "
879 "out with rs_failed=%d\n",
880 ext->lce.lc_number, ext->rs_failed);
881 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
882 }
883 ext->rs_left = rs_left;
884 ext->rs_failed = success ? 0 : count;
885 lc_changed(mdev->resync, &ext->lce);
886 }
887 lc_put(mdev->resync, &ext->lce);
888 /* no race, we are within the al_lock! */
889
890 if (ext->rs_left == ext->rs_failed) {
891 ext->rs_failed = 0;
892
893 udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
894 if (udw) {
895 udw->enr = ext->lce.lc_number;
896 udw->w.cb = w_update_odbm;
897 drbd_queue_work_front(&mdev->data.work, &udw->w);
898 } else {
899 dev_warn(DEV, "Could not kmalloc an udw\n");
900 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
901 }
902 }
903 } else {
904 dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
905 mdev->resync_locked,
906 mdev->resync->nr_elements,
907 mdev->resync->flags);
908 }
909}
910
911/* clear the bit corresponding to the piece of storage in question:
912 * size byte of data starting from sector. Only clear a bits of the affected
913 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
914 *
915 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
916 *
917 */
918void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
919 const char *file, const unsigned int line)
920{
921 /* Is called from worker and receiver context _only_ */
922 unsigned long sbnr, ebnr, lbnr;
923 unsigned long count = 0;
924 sector_t esector, nr_sectors;
925 int wake_up = 0;
926 unsigned long flags;
927
928 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
929 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
930 (unsigned long long)sector, size);
931 return;
932 }
933 nr_sectors = drbd_get_capacity(mdev->this_bdev);
934 esector = sector + (size >> 9) - 1;
935
936 ERR_IF(sector >= nr_sectors) return;
937 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
938
939 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
940
941 /* we clear it (in sync).
942 * round up start sector, round down end sector. we make sure we only
943 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
944 if (unlikely(esector < BM_SECT_PER_BIT-1))
945 return;
946 if (unlikely(esector == (nr_sectors-1)))
947 ebnr = lbnr;
948 else
949 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
950 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
951
952 if (sbnr > ebnr)
953 return;
954
955 /*
956 * ok, (capacity & 7) != 0 sometimes, but who cares...
957 * we count rs_{total,left} in bits, not sectors.
958 */
959 spin_lock_irqsave(&mdev->al_lock, flags);
960 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
961 if (count) {
962 /* we need the lock for drbd_try_clear_on_disk_bm */
963 if (jiffies - mdev->rs_mark_time > HZ*10) {
964 /* should be rolling marks,
965 * but we estimate only anyways. */
966 if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
967 mdev->state.conn != C_PAUSED_SYNC_T &&
968 mdev->state.conn != C_PAUSED_SYNC_S) {
969 mdev->rs_mark_time = jiffies;
970 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
971 }
972 }
973 if (get_ldev(mdev)) {
974 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
975 put_ldev(mdev);
976 }
977 /* just wake_up unconditional now, various lc_chaged(),
978 * lc_put() in drbd_try_clear_on_disk_bm(). */
979 wake_up = 1;
980 }
981 spin_unlock_irqrestore(&mdev->al_lock, flags);
982 if (wake_up)
983 wake_up(&mdev->al_wait);
984}
985
986/*
987 * this is intended to set one request worth of data out of sync.
988 * affects at least 1 bit,
989 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
990 *
991 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
992 * so this can be _any_ process.
993 */
994void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
995 const char *file, const unsigned int line)
996{
997 unsigned long sbnr, ebnr, lbnr, flags;
998 sector_t esector, nr_sectors;
999 unsigned int enr, count;
1000 struct lc_element *e;
1001
1002 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1003 dev_err(DEV, "sector: %llus, size: %d\n",
1004 (unsigned long long)sector, size);
1005 return;
1006 }
1007
1008 if (!get_ldev(mdev))
1009 return; /* no disk, no metadata, no bitmap to set bits in */
1010
1011 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1012 esector = sector + (size >> 9) - 1;
1013
1014 ERR_IF(sector >= nr_sectors)
1015 goto out;
1016 ERR_IF(esector >= nr_sectors)
1017 esector = (nr_sectors-1);
1018
1019 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1020
1021 /* we set it out of sync,
1022 * we do not need to round anything here */
1023 sbnr = BM_SECT_TO_BIT(sector);
1024 ebnr = BM_SECT_TO_BIT(esector);
1025
1026 /* ok, (capacity & 7) != 0 sometimes, but who cares...
1027 * we count rs_{total,left} in bits, not sectors. */
1028 spin_lock_irqsave(&mdev->al_lock, flags);
1029 count = drbd_bm_set_bits(mdev, sbnr, ebnr);
1030
1031 enr = BM_SECT_TO_EXT(sector);
1032 e = lc_find(mdev->resync, enr);
1033 if (e)
1034 lc_entry(e, struct bm_extent, lce)->rs_left += count;
1035 spin_unlock_irqrestore(&mdev->al_lock, flags);
1036
1037out:
1038 put_ldev(mdev);
1039}
1040
1041static
1042struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
1043{
1044 struct lc_element *e;
1045 struct bm_extent *bm_ext;
1046 int wakeup = 0;
1047 unsigned long rs_flags;
1048
1049 spin_lock_irq(&mdev->al_lock);
1050 if (mdev->resync_locked > mdev->resync->nr_elements/2) {
1051 spin_unlock_irq(&mdev->al_lock);
1052 return NULL;
1053 }
1054 e = lc_get(mdev->resync, enr);
1055 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1056 if (bm_ext) {
1057 if (bm_ext->lce.lc_number != enr) {
1058 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1059 bm_ext->rs_failed = 0;
1060 lc_changed(mdev->resync, &bm_ext->lce);
1061 wakeup = 1;
1062 }
1063 if (bm_ext->lce.refcnt == 1)
1064 mdev->resync_locked++;
1065 set_bit(BME_NO_WRITES, &bm_ext->flags);
1066 }
1067 rs_flags = mdev->resync->flags;
1068 spin_unlock_irq(&mdev->al_lock);
1069 if (wakeup)
1070 wake_up(&mdev->al_wait);
1071
1072 if (!bm_ext) {
1073 if (rs_flags & LC_STARVING)
1074 dev_warn(DEV, "Have to wait for element"
1075 " (resync LRU too small?)\n");
1076 BUG_ON(rs_flags & LC_DIRTY);
1077 }
1078
1079 return bm_ext;
1080}
1081
1082static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
1083{
1084 struct lc_element *al_ext;
1085 int rv = 0;
1086
1087 spin_lock_irq(&mdev->al_lock);
1088 if (unlikely(enr == mdev->act_log->new_number))
1089 rv = 1;
1090 else {
1091 al_ext = lc_find(mdev->act_log, enr);
1092 if (al_ext) {
1093 if (al_ext->refcnt)
1094 rv = 1;
1095 }
1096 }
1097 spin_unlock_irq(&mdev->al_lock);
1098
1099 /*
1100 if (unlikely(rv)) {
1101 dev_info(DEV, "Delaying sync read until app's write is done\n");
1102 }
1103 */
1104 return rv;
1105}
1106
1107/**
1108 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
1109 * @mdev: DRBD device.
1110 * @sector: The sector number.
1111 *
1112 * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
1113 */
1114int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1115{
1116 unsigned int enr = BM_SECT_TO_EXT(sector);
1117 struct bm_extent *bm_ext;
1118 int i, sig;
1119
1120 sig = wait_event_interruptible(mdev->al_wait,
1121 (bm_ext = _bme_get(mdev, enr)));
1122 if (sig)
1123 return 0;
1124
1125 if (test_bit(BME_LOCKED, &bm_ext->flags))
1126 return 1;
1127
1128 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1129 sig = wait_event_interruptible(mdev->al_wait,
1130 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
1131 if (sig) {
1132 spin_lock_irq(&mdev->al_lock);
1133 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1134 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1135 mdev->resync_locked--;
1136 wake_up(&mdev->al_wait);
1137 }
1138 spin_unlock_irq(&mdev->al_lock);
1139 return 0;
1140 }
1141 }
1142
1143 set_bit(BME_LOCKED, &bm_ext->flags);
1144
1145 return 1;
1146}
1147
1148/**
1149 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
1150 * @mdev: DRBD device.
1151 * @sector: The sector number.
1152 *
1153 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
1154 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
1155 * if there is still application IO going on in this area.
1156 */
1157int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1158{
1159 unsigned int enr = BM_SECT_TO_EXT(sector);
1160 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
1161 struct lc_element *e;
1162 struct bm_extent *bm_ext;
1163 int i;
1164
1165 spin_lock_irq(&mdev->al_lock);
1166 if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
1167 /* in case you have very heavy scattered io, it may
1168 * stall the syncer undefined if we give up the ref count
1169 * when we try again and requeue.
1170 *
1171 * if we don't give up the refcount, but the next time
1172 * we are scheduled this extent has been "synced" by new
1173 * application writes, we'd miss the lc_put on the
1174 * extent we keep the refcount on.
1175 * so we remembered which extent we had to try again, and
1176 * if the next requested one is something else, we do
1177 * the lc_put here...
1178 * we also have to wake_up
1179 */
1180 e = lc_find(mdev->resync, mdev->resync_wenr);
1181 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1182 if (bm_ext) {
1183 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1184 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1185 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1186 mdev->resync_wenr = LC_FREE;
1187 if (lc_put(mdev->resync, &bm_ext->lce) == 0)
1188 mdev->resync_locked--;
1189 wake_up(&mdev->al_wait);
1190 } else {
1191 dev_alert(DEV, "LOGIC BUG\n");
1192 }
1193 }
1194 /* TRY. */
1195 e = lc_try_get(mdev->resync, enr);
1196 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1197 if (bm_ext) {
1198 if (test_bit(BME_LOCKED, &bm_ext->flags))
1199 goto proceed;
1200 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
1201 mdev->resync_locked++;
1202 } else {
1203 /* we did set the BME_NO_WRITES,
1204 * but then could not set BME_LOCKED,
1205 * so we tried again.
1206 * drop the extra reference. */
1207 bm_ext->lce.refcnt--;
1208 D_ASSERT(bm_ext->lce.refcnt > 0);
1209 }
1210 goto check_al;
1211 } else {
1212 /* do we rather want to try later? */
1213 if (mdev->resync_locked > mdev->resync->nr_elements-3)
1214 goto try_again;
1215 /* Do or do not. There is no try. -- Yoda */
1216 e = lc_get(mdev->resync, enr);
1217 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1218 if (!bm_ext) {
1219 const unsigned long rs_flags = mdev->resync->flags;
1220 if (rs_flags & LC_STARVING)
1221 dev_warn(DEV, "Have to wait for element"
1222 " (resync LRU too small?)\n");
1223 BUG_ON(rs_flags & LC_DIRTY);
1224 goto try_again;
1225 }
1226 if (bm_ext->lce.lc_number != enr) {
1227 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1228 bm_ext->rs_failed = 0;
1229 lc_changed(mdev->resync, &bm_ext->lce);
1230 wake_up(&mdev->al_wait);
1231 D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
1232 }
1233 set_bit(BME_NO_WRITES, &bm_ext->flags);
1234 D_ASSERT(bm_ext->lce.refcnt == 1);
1235 mdev->resync_locked++;
1236 goto check_al;
1237 }
1238check_al:
1239 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1240 if (unlikely(al_enr+i == mdev->act_log->new_number))
1241 goto try_again;
1242 if (lc_is_used(mdev->act_log, al_enr+i))
1243 goto try_again;
1244 }
1245 set_bit(BME_LOCKED, &bm_ext->flags);
1246proceed:
1247 mdev->resync_wenr = LC_FREE;
1248 spin_unlock_irq(&mdev->al_lock);
1249 return 0;
1250
1251try_again:
1252 if (bm_ext)
1253 mdev->resync_wenr = enr;
1254 spin_unlock_irq(&mdev->al_lock);
1255 return -EAGAIN;
1256}
1257
1258void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
1259{
1260 unsigned int enr = BM_SECT_TO_EXT(sector);
1261 struct lc_element *e;
1262 struct bm_extent *bm_ext;
1263 unsigned long flags;
1264
1265 spin_lock_irqsave(&mdev->al_lock, flags);
1266 e = lc_find(mdev->resync, enr);
1267 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1268 if (!bm_ext) {
1269 spin_unlock_irqrestore(&mdev->al_lock, flags);
1270 if (__ratelimit(&drbd_ratelimit_state))
1271 dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
1272 return;
1273 }
1274
1275 if (bm_ext->lce.refcnt == 0) {
1276 spin_unlock_irqrestore(&mdev->al_lock, flags);
1277 dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
1278 "but refcnt is 0!?\n",
1279 (unsigned long long)sector, enr);
1280 return;
1281 }
1282
1283 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1284 clear_bit(BME_LOCKED, &bm_ext->flags);
1285 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1286 mdev->resync_locked--;
1287 wake_up(&mdev->al_wait);
1288 }
1289
1290 spin_unlock_irqrestore(&mdev->al_lock, flags);
1291}
1292
1293/**
1294 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
1295 * @mdev: DRBD device.
1296 */
1297void drbd_rs_cancel_all(struct drbd_conf *mdev)
1298{
1299 spin_lock_irq(&mdev->al_lock);
1300
1301 if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
1302 lc_reset(mdev->resync);
1303 put_ldev(mdev);
1304 }
1305 mdev->resync_locked = 0;
1306 mdev->resync_wenr = LC_FREE;
1307 spin_unlock_irq(&mdev->al_lock);
1308 wake_up(&mdev->al_wait);
1309}
1310
1311/**
1312 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
1313 * @mdev: DRBD device.
1314 *
1315 * Returns 0 upon success, -EAGAIN if at least one reference count was
1316 * not zero.
1317 */
1318int drbd_rs_del_all(struct drbd_conf *mdev)
1319{
1320 struct lc_element *e;
1321 struct bm_extent *bm_ext;
1322 int i;
1323
1324 spin_lock_irq(&mdev->al_lock);
1325
1326 if (get_ldev_if_state(mdev, D_FAILED)) {
1327 /* ok, ->resync is there. */
1328 for (i = 0; i < mdev->resync->nr_elements; i++) {
1329 e = lc_element_by_index(mdev->resync, i);
1330 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1331 if (bm_ext->lce.lc_number == LC_FREE)
1332 continue;
1333 if (bm_ext->lce.lc_number == mdev->resync_wenr) {
1334 dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
1335 " got 'synced' by application io\n",
1336 mdev->resync_wenr);
1337 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1338 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1339 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1340 mdev->resync_wenr = LC_FREE;
1341 lc_put(mdev->resync, &bm_ext->lce);
1342 }
1343 if (bm_ext->lce.refcnt != 0) {
1344 dev_info(DEV, "Retrying drbd_rs_del_all() later. "
1345 "refcnt=%d\n", bm_ext->lce.refcnt);
1346 put_ldev(mdev);
1347 spin_unlock_irq(&mdev->al_lock);
1348 return -EAGAIN;
1349 }
1350 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1351 D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
1352 lc_del(mdev->resync, &bm_ext->lce);
1353 }
1354 D_ASSERT(mdev->resync->used == 0);
1355 put_ldev(mdev);
1356 }
1357 spin_unlock_irq(&mdev->al_lock);
1358
1359 return 0;
1360}
1361
1362/**
1363 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
1364 * @mdev: DRBD device.
1365 * @sector: The sector number.
1366 * @size: Size of failed IO operation, in byte.
1367 */
1368void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1369{
1370 /* Is called from worker and receiver context _only_ */
1371 unsigned long sbnr, ebnr, lbnr;
1372 unsigned long count;
1373 sector_t esector, nr_sectors;
1374 int wake_up = 0;
1375
1376 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1377 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1378 (unsigned long long)sector, size);
1379 return;
1380 }
1381 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1382 esector = sector + (size >> 9) - 1;
1383
1384 ERR_IF(sector >= nr_sectors) return;
1385 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
1386
1387 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1388
1389 /*
1390 * round up start sector, round down end sector. we make sure we only
1391 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
1392 if (unlikely(esector < BM_SECT_PER_BIT-1))
1393 return;
1394 if (unlikely(esector == (nr_sectors-1)))
1395 ebnr = lbnr;
1396 else
1397 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
1398 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
1399
1400 if (sbnr > ebnr)
1401 return;
1402
1403 /*
1404 * ok, (capacity & 7) != 0 sometimes, but who cares...
1405 * we count rs_{total,left} in bits, not sectors.
1406 */
1407 spin_lock_irq(&mdev->al_lock);
1408 count = drbd_bm_count_bits(mdev, sbnr, ebnr);
1409 if (count) {
1410 mdev->rs_failed += count;
1411
1412 if (get_ldev(mdev)) {
1413 drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
1414 put_ldev(mdev);
1415 }
1416
1417 /* just wake_up unconditional now, various lc_chaged(),
1418 * lc_put() in drbd_try_clear_on_disk_bm(). */
1419 wake_up = 1;
1420 }
1421 spin_unlock_irq(&mdev->al_lock);
1422 if (wake_up)
1423 wake_up(&mdev->al_wait);
1424}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000000..b61057e77882
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1327 @@
1/*
2 drbd_bitmap.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/bitops.h>
26#include <linux/vmalloc.h>
27#include <linux/string.h>
28#include <linux/drbd.h>
29#include <asm/kmap_types.h>
30#include "drbd_int.h"
31
32/* OPAQUE outside this file!
33 * interface defined in drbd_int.h
34
35 * convention:
36 * function name drbd_bm_... => used elsewhere, "public".
37 * function name bm_... => internal to implementation, "private".
38
39 * Note that since find_first_bit returns int, at the current granularity of
40 * the bitmap (4KB per bit), this implementation "only" supports up to
41 * 1<<(32+12) == 16 TB...
42 */
43
44/*
45 * NOTE
46 * Access to the *bm_pages is protected by bm_lock.
47 * It is safe to read the other members within the lock.
48 *
49 * drbd_bm_set_bits is called from bio_endio callbacks,
50 * We may be called with irq already disabled,
51 * so we need spin_lock_irqsave().
52 * And we need the kmap_atomic.
53 */
/*
 * State of the bitmap of one DRBD device; allocated by drbd_bm_init() and
 * stored in mdev->bitmap.  The bitmap words are not one contiguous array:
 * they live in the pages of bm_pages[] and are reached by mapping one page
 * at a time.
 */
54struct drbd_bitmap {
55 struct page **bm_pages; /* array of bm_number_of_pages page pointers */
56 spinlock_t bm_lock;
57 /* WARNING unsigned long bm_*:
58 * 32bit number of bit offset is just enough for 512 MB bitmap.
59 * it will blow up if we make the bitmap bigger...
60 * not that it makes much sense to have a bitmap that large,
61 * rather change the granularity to 16k or 64k or something.
62 * (that implies other problems, however...)
63 */
64 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
65 unsigned long bm_bits; /* number of bits, derived from the capacity in drbd_bm_resize() */
66 size_t bm_words; /* number of long words; bm_bits rounded up to a multiple of 64 bits */
67 size_t bm_number_of_pages; /* number of entries in bm_pages */
68 sector_t bm_dev_capacity; /* capacity last passed to drbd_bm_resize() */
69 struct semaphore bm_change; /* serializes resize operations */
70
/* NOTE(review): bm_async_io and bm_io_wait are not used in the part of the
 * file visible here; presumably they track in-flight bitmap IO -- confirm. */
71 atomic_t bm_async_io;
72 wait_queue_head_t bm_io_wait;
73
74 unsigned long bm_flags; /* BM_LOCKED, BM_MD_IO_ERROR, BM_P_VMALLOCED bits */
75
76 /* debugging aid, in case we are still racy somewhere */
77 char *bm_why; /* reason string passed to drbd_bm_lock() */
78 struct task_struct *bm_task; /* task that called drbd_bm_lock() */
79};
80
81/* definition of bits in bm_flags */
82#define BM_LOCKED 0 /* set by drbd_bm_lock(), cleared by drbd_bm_unlock() */
83#define BM_MD_IO_ERROR 1 /* NOTE(review): not used in the visible part of the file; presumably flags a meta data IO error -- confirm */
84#define BM_P_VMALLOCED 2 /* the bm_pages pointer array came from vmalloc(), not kmalloc() */
85
86static int bm_is_locked(struct drbd_bitmap *b)
87{
88 return test_bit(BM_LOCKED, &b->bm_flags);
89}
90
91#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
92static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
93{
94 struct drbd_bitmap *b = mdev->bitmap;
95 if (!__ratelimit(&drbd_ratelimit_state))
96 return;
97 dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
98 current == mdev->receiver.task ? "receiver" :
99 current == mdev->asender.task ? "asender" :
100 current == mdev->worker.task ? "worker" : current->comm,
101 func, b->bm_why ?: "?",
102 b->bm_task == mdev->receiver.task ? "receiver" :
103 b->bm_task == mdev->asender.task ? "asender" :
104 b->bm_task == mdev->worker.task ? "worker" : "?");
105}
106
107void drbd_bm_lock(struct drbd_conf *mdev, char *why)
108{
109 struct drbd_bitmap *b = mdev->bitmap;
110 int trylock_failed;
111
112 if (!b) {
113 dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
114 return;
115 }
116
117 trylock_failed = down_trylock(&b->bm_change);
118
119 if (trylock_failed) {
120 dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
121 current == mdev->receiver.task ? "receiver" :
122 current == mdev->asender.task ? "asender" :
123 current == mdev->worker.task ? "worker" : current->comm,
124 why, b->bm_why ?: "?",
125 b->bm_task == mdev->receiver.task ? "receiver" :
126 b->bm_task == mdev->asender.task ? "asender" :
127 b->bm_task == mdev->worker.task ? "worker" : "?");
128 down(&b->bm_change);
129 }
130 if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
131 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
132
133 b->bm_why = why;
134 b->bm_task = current;
135}
136
137void drbd_bm_unlock(struct drbd_conf *mdev)
138{
139 struct drbd_bitmap *b = mdev->bitmap;
140 if (!b) {
141 dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
142 return;
143 }
144
145 if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
146 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
147
148 b->bm_why = NULL;
149 b->bm_task = NULL;
150 up(&b->bm_change);
151}
152
153/* word offset to long pointer */
154static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
155{
156 struct page *page;
157 unsigned long page_nr;
158
159 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
160 page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
161 BUG_ON(page_nr >= b->bm_number_of_pages);
162 page = b->bm_pages[page_nr];
163
164 return (unsigned long *) kmap_atomic(page, km);
165}
166
167static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
168{
169 return __bm_map_paddr(b, offset, KM_IRQ1);
170}
171
172static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
173{
174 kunmap_atomic(p_addr, km);
175};
176
177static void bm_unmap(unsigned long *p_addr)
178{
179 return __bm_unmap(p_addr, KM_IRQ1);
180}
181
182/* long word offset of _bitmap_ sector */
183#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
184/* word offset from start of bitmap to word number _in_page_
185 * modulo longs per page
186#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
187 hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
188 so do it explicitly:
189 */
190#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
191
192/* Long words per page */
193#define LWPP (PAGE_SIZE/sizeof(long))
194
195/*
196 * actually most functions herein should take a struct drbd_bitmap*, not a
197 * struct drbd_conf*, but for the debug macros I like to have the mdev around
198 * to be able to report device specific.
199 */
200
201static void bm_free_pages(struct page **pages, unsigned long number)
202{
203 unsigned long i;
204 if (!pages)
205 return;
206
207 for (i = 0; i < number; i++) {
208 if (!pages[i]) {
209 printk(KERN_ALERT "drbd: bm_free_pages tried to free "
210 "a NULL pointer; i=%lu n=%lu\n",
211 i, number);
212 continue;
213 }
214 __free_page(pages[i]);
215 pages[i] = NULL;
216 }
217}
218
/* Free @ptr with vfree() if @v is non-zero, with kfree() otherwise. */
static void bm_vk_free(void *ptr, int v)
{
	if (!v) {
		kfree(ptr);
		return;
	}
	vfree(ptr);
}
226
227/*
228 * "have" and "want" are NUMBER OF PAGES.
229 */
230static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
231{
232 struct page **old_pages = b->bm_pages;
233 struct page **new_pages, *page;
234 unsigned int i, bytes, vmalloced = 0;
235 unsigned long have = b->bm_number_of_pages;
236
237 BUG_ON(have == 0 && old_pages != NULL);
238 BUG_ON(have != 0 && old_pages == NULL);
239
240 if (have == want)
241 return old_pages;
242
243 /* Trying kmalloc first, falling back to vmalloc.
244 * GFP_KERNEL is ok, as this is done when a lower level disk is
245 * "attached" to the drbd. Context is receiver thread or cqueue
246 * thread. As we have no disk yet, we are not in the IO path,
247 * not even the IO path of the peer. */
248 bytes = sizeof(struct page *)*want;
249 new_pages = kmalloc(bytes, GFP_KERNEL);
250 if (!new_pages) {
251 new_pages = vmalloc(bytes);
252 if (!new_pages)
253 return NULL;
254 vmalloced = 1;
255 }
256
257 memset(new_pages, 0, bytes);
258 if (want >= have) {
259 for (i = 0; i < have; i++)
260 new_pages[i] = old_pages[i];
261 for (; i < want; i++) {
262 page = alloc_page(GFP_HIGHUSER);
263 if (!page) {
264 bm_free_pages(new_pages + have, i - have);
265 bm_vk_free(new_pages, vmalloced);
266 return NULL;
267 }
268 new_pages[i] = page;
269 }
270 } else {
271 for (i = 0; i < want; i++)
272 new_pages[i] = old_pages[i];
273 /* NOT HERE, we are outside the spinlock!
274 bm_free_pages(old_pages + want, have - want);
275 */
276 }
277
278 if (vmalloced)
279 set_bit(BM_P_VMALLOCED, &b->bm_flags);
280 else
281 clear_bit(BM_P_VMALLOCED, &b->bm_flags);
282
283 return new_pages;
284}
285
286/*
287 * called on driver init only. TODO call when a device is created.
288 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
289 */
290int drbd_bm_init(struct drbd_conf *mdev)
291{
292 struct drbd_bitmap *b = mdev->bitmap;
293 WARN_ON(b != NULL);
294 b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
295 if (!b)
296 return -ENOMEM;
297 spin_lock_init(&b->bm_lock);
298 init_MUTEX(&b->bm_change);
299 init_waitqueue_head(&b->bm_io_wait);
300
301 mdev->bitmap = b;
302
303 return 0;
304}
305
306sector_t drbd_bm_capacity(struct drbd_conf *mdev)
307{
308 ERR_IF(!mdev->bitmap) return 0;
309 return mdev->bitmap->bm_dev_capacity;
310}
311
312/* called on driver unload. TODO: call when a device is destroyed.
313 */
314void drbd_bm_cleanup(struct drbd_conf *mdev)
315{
316 ERR_IF (!mdev->bitmap) return;
317 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
318 bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
319 kfree(mdev->bitmap);
320 mdev->bitmap = NULL;
321}
322
323/*
324 * since (b->bm_bits % BITS_PER_LONG) != 0,
325 * this masks out the remaining bits.
326 * Returns the number of bits cleared.
327 */
328static int bm_clear_surplus(struct drbd_bitmap *b)
329{
330 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
331 size_t w = b->bm_bits >> LN2_BPL;
332 int cleared = 0;
333 unsigned long *p_addr, *bm;
334
335 p_addr = bm_map_paddr(b, w);
336 bm = p_addr + MLPP(w);
337 if (w < b->bm_words) {
338 cleared = hweight_long(*bm & ~mask);
339 *bm &= mask;
340 w++; bm++;
341 }
342
343 if (w < b->bm_words) {
344 cleared += hweight_long(*bm);
345 *bm = 0;
346 }
347 bm_unmap(p_addr);
348 return cleared;
349}
350
351static void bm_set_surplus(struct drbd_bitmap *b)
352{
353 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
354 size_t w = b->bm_bits >> LN2_BPL;
355 unsigned long *p_addr, *bm;
356
357 p_addr = bm_map_paddr(b, w);
358 bm = p_addr + MLPP(w);
359 if (w < b->bm_words) {
360 *bm |= ~mask;
361 bm++; w++;
362 }
363
364 if (w < b->bm_words) {
365 *bm = ~(0UL);
366 }
367 bm_unmap(p_addr);
368}
369
370static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
371{
372 unsigned long *p_addr, *bm, offset = 0;
373 unsigned long bits = 0;
374 unsigned long i, do_now;
375
376 while (offset < b->bm_words) {
377 i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
378 p_addr = __bm_map_paddr(b, offset, KM_USER0);
379 bm = p_addr + MLPP(offset);
380 while (i--) {
381#ifndef __LITTLE_ENDIAN
382 if (swap_endian)
383 *bm = lel_to_cpu(*bm);
384#endif
385 bits += hweight_long(*bm++);
386 }
387 __bm_unmap(p_addr, KM_USER0);
388 offset += do_now;
389 cond_resched();
390 }
391
392 return bits;
393}
394
/* Count the set bits of the bitmap without any endianness conversion. */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
	const int swap_endian = 0;

	return __bm_count_bits(b, swap_endian);
}
399
/* Count the set bits of the bitmap, requesting the endianness conversion. */
static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
	const int swap_endian = 1;

	return __bm_count_bits(b, swap_endian);
}
404
405/* offset and len in long words.*/
406static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
407{
408 unsigned long *p_addr, *bm;
409 size_t do_now, end;
410
411#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
412
413 end = offset + len;
414
415 if (end > b->bm_words) {
416 printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
417 return;
418 }
419
420 while (offset < end) {
421 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
422 p_addr = bm_map_paddr(b, offset);
423 bm = p_addr + MLPP(offset);
424 if (bm+do_now > p_addr + LWPP) {
425 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
426 p_addr, bm, (int)do_now);
427 break; /* breaks to after catch_oob_access_end() only! */
428 }
429 memset(bm, c, do_now * sizeof(long));
430 bm_unmap(p_addr);
431 offset += do_now;
432 }
433}
434
435/*
436 * make sure the bitmap has enough room for the attached storage,
437 * if necessary, resize.
438 * called whenever we may have changed the device size.
439 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
440 * In case this is actually a resize, we copy the old bitmap into the new one.
441 * Otherwise, the bitmap is initialized to all bits set.
442 */
443int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
444{
445 struct drbd_bitmap *b = mdev->bitmap;
446 unsigned long bits, words, owords, obits, *p_addr, *bm;
447 unsigned long want, have, onpages; /* number of pages */
448 struct page **npages, **opages = NULL;
449 int err = 0, growing;
450 int opages_vmalloced;
451
452 ERR_IF(!b) return -ENOMEM;
453
454 drbd_bm_lock(mdev, "resize");
455
456 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
457 (unsigned long long)capacity);
458
459 if (capacity == b->bm_dev_capacity)
460 goto out;
461
462 opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
463
464 if (capacity == 0) {
465 spin_lock_irq(&b->bm_lock);
466 opages = b->bm_pages;
467 onpages = b->bm_number_of_pages;
468 owords = b->bm_words;
469 b->bm_pages = NULL;
470 b->bm_number_of_pages =
471 b->bm_set =
472 b->bm_bits =
473 b->bm_words =
474 b->bm_dev_capacity = 0;
475 spin_unlock_irq(&b->bm_lock);
476 bm_free_pages(opages, onpages);
477 bm_vk_free(opages, opages_vmalloced);
478 goto out;
479 }
480 bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
481
482 /* if we would use
483 words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
484 a 32bit host could present the wrong number of words
485 to a 64bit host.
486 */
487 words = ALIGN(bits, 64) >> LN2_BPL;
488
489 if (get_ldev(mdev)) {
490 D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
491 put_ldev(mdev);
492 }
493
494 /* one extra long to catch off by one errors */
495 want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
496 have = b->bm_number_of_pages;
497 if (want == have) {
498 D_ASSERT(b->bm_pages != NULL);
499 npages = b->bm_pages;
500 } else {
501 if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
502 npages = NULL;
503 else
504 npages = bm_realloc_pages(b, want);
505 }
506
507 if (!npages) {
508 err = -ENOMEM;
509 goto out;
510 }
511
512 spin_lock_irq(&b->bm_lock);
513 opages = b->bm_pages;
514 owords = b->bm_words;
515 obits = b->bm_bits;
516
517 growing = bits > obits;
518 if (opages)
519 bm_set_surplus(b);
520
521 b->bm_pages = npages;
522 b->bm_number_of_pages = want;
523 b->bm_bits = bits;
524 b->bm_words = words;
525 b->bm_dev_capacity = capacity;
526
527 if (growing) {
528 bm_memset(b, owords, 0xff, words-owords);
529 b->bm_set += bits - obits;
530 }
531
532 if (want < have) {
533 /* implicit: (opages != NULL) && (opages != npages) */
534 bm_free_pages(opages + want, have - want);
535 }
536
537 p_addr = bm_map_paddr(b, words);
538 bm = p_addr + MLPP(words);
539 *bm = DRBD_MAGIC;
540 bm_unmap(p_addr);
541
542 (void)bm_clear_surplus(b);
543
544 spin_unlock_irq(&b->bm_lock);
545 if (opages != npages)
546 bm_vk_free(opages, opages_vmalloced);
547 if (!growing)
548 b->bm_set = bm_count_bits(b);
549 dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
550
551 out:
552 drbd_bm_unlock(mdev);
553 return err;
554}
555
556/* inherently racy:
557 * if not protected by other means, return value may be out of date when
558 * leaving this function...
559 * we still need to lock it, since it is important that this returns
560 * bm_set == 0 precisely.
561 *
562 * maybe bm_set should be atomic_t ?
563 */
564static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
565{
566 struct drbd_bitmap *b = mdev->bitmap;
567 unsigned long s;
568 unsigned long flags;
569
570 ERR_IF(!b) return 0;
571 ERR_IF(!b->bm_pages) return 0;
572
573 spin_lock_irqsave(&b->bm_lock, flags);
574 s = b->bm_set;
575 spin_unlock_irqrestore(&b->bm_lock, flags);
576
577 return s;
578}
579
580unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
581{
582 unsigned long s;
583 /* if I don't have a disk, I don't know about out-of-sync status */
584 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
585 return 0;
586 s = _drbd_bm_total_weight(mdev);
587 put_ldev(mdev);
588 return s;
589}
590
591size_t drbd_bm_words(struct drbd_conf *mdev)
592{
593 struct drbd_bitmap *b = mdev->bitmap;
594 ERR_IF(!b) return 0;
595 ERR_IF(!b->bm_pages) return 0;
596
597 return b->bm_words;
598}
599
600unsigned long drbd_bm_bits(struct drbd_conf *mdev)
601{
602 struct drbd_bitmap *b = mdev->bitmap;
603 ERR_IF(!b) return 0;
604
605 return b->bm_bits;
606}
607
608/* merge number words from buffer into the bitmap starting at offset.
609 * buffer[i] is expected to be little endian unsigned long.
610 * bitmap must be locked by drbd_bm_lock.
611 * currently only used from receive_bitmap.
612 */
613void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
614 unsigned long *buffer)
615{
616 struct drbd_bitmap *b = mdev->bitmap;
617 unsigned long *p_addr, *bm;
618 unsigned long word, bits;
619 size_t end, do_now;
620
621 end = offset + number;
622
623 ERR_IF(!b) return;
624 ERR_IF(!b->bm_pages) return;
625 if (number == 0)
626 return;
627 WARN_ON(offset >= b->bm_words);
628 WARN_ON(end > b->bm_words);
629
630 spin_lock_irq(&b->bm_lock);
631 while (offset < end) {
632 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
633 p_addr = bm_map_paddr(b, offset);
634 bm = p_addr + MLPP(offset);
635 offset += do_now;
636 while (do_now--) {
637 bits = hweight_long(*bm);
638 word = *bm | lel_to_cpu(*buffer++);
639 *bm++ = word;
640 b->bm_set += hweight_long(word) - bits;
641 }
642 bm_unmap(p_addr);
643 }
644 /* with 32bit <-> 64bit cross-platform connect
645 * this is only correct for current usage,
646 * where we _know_ that we are 64 bit aligned,
647 * and know that this function is used in this way, too...
648 */
649 if (end == b->bm_words)
650 b->bm_set -= bm_clear_surplus(b);
651
652 spin_unlock_irq(&b->bm_lock);
653}
654
655/* copy number words from the bitmap starting at offset into the buffer.
656 * buffer[i] will be little endian unsigned long.
657 */
658void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
659 unsigned long *buffer)
660{
661 struct drbd_bitmap *b = mdev->bitmap;
662 unsigned long *p_addr, *bm;
663 size_t end, do_now;
664
665 end = offset + number;
666
667 ERR_IF(!b) return;
668 ERR_IF(!b->bm_pages) return;
669
670 spin_lock_irq(&b->bm_lock);
671 if ((offset >= b->bm_words) ||
672 (end > b->bm_words) ||
673 (number <= 0))
674 dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
675 (unsigned long) offset,
676 (unsigned long) number,
677 (unsigned long) b->bm_words);
678 else {
679 while (offset < end) {
680 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
681 p_addr = bm_map_paddr(b, offset);
682 bm = p_addr + MLPP(offset);
683 offset += do_now;
684 while (do_now--)
685 *buffer++ = cpu_to_lel(*bm++);
686 bm_unmap(p_addr);
687 }
688 }
689 spin_unlock_irq(&b->bm_lock);
690}
691
692/* set all bits in the bitmap */
693void drbd_bm_set_all(struct drbd_conf *mdev)
694{
695 struct drbd_bitmap *b = mdev->bitmap;
696 ERR_IF(!b) return;
697 ERR_IF(!b->bm_pages) return;
698
699 spin_lock_irq(&b->bm_lock);
700 bm_memset(b, 0, 0xff, b->bm_words);
701 (void)bm_clear_surplus(b);
702 b->bm_set = b->bm_bits;
703 spin_unlock_irq(&b->bm_lock);
704}
705
706/* clear all bits in the bitmap */
707void drbd_bm_clear_all(struct drbd_conf *mdev)
708{
709 struct drbd_bitmap *b = mdev->bitmap;
710 ERR_IF(!b) return;
711 ERR_IF(!b->bm_pages) return;
712
713 spin_lock_irq(&b->bm_lock);
714 bm_memset(b, 0, 0, b->bm_words);
715 b->bm_set = 0;
716 spin_unlock_irq(&b->bm_lock);
717}
718
719static void bm_async_io_complete(struct bio *bio, int error)
720{
721 struct drbd_bitmap *b = bio->bi_private;
722 int uptodate = bio_flagged(bio, BIO_UPTODATE);
723
724
725 /* strange behavior of some lower level drivers...
726 * fail the request by clearing the uptodate flag,
727 * but do not return any error?!
728 * do we want to WARN() on this? */
729 if (!error && !uptodate)
730 error = -EIO;
731
732 if (error) {
733 /* doh. what now?
734 * for now, set all bits, and flag MD_IO_ERROR */
735 __set_bit(BM_MD_IO_ERROR, &b->bm_flags);
736 }
737 if (atomic_dec_and_test(&b->bm_async_io))
738 wake_up(&b->bm_io_wait);
739
740 bio_put(bio);
741}
742
743static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
744{
745 /* we are process context. we always get a bio */
746 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
747 unsigned int len;
748 sector_t on_disk_sector =
749 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
750 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
751
752 /* this might happen with very small
753 * flexible external meta data device */
754 len = min_t(unsigned int, PAGE_SIZE,
755 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
756
757 bio->bi_bdev = mdev->ldev->md_bdev;
758 bio->bi_sector = on_disk_sector;
759 bio_add_page(bio, b->bm_pages[page_nr], len, 0);
760 bio->bi_private = b;
761 bio->bi_end_io = bm_async_io_complete;
762
763 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
764 bio->bi_rw |= rw;
765 bio_endio(bio, -EIO);
766 } else {
767 submit_bio(rw, bio);
768 }
769}
770
771# if defined(__LITTLE_ENDIAN)
772 /* nothing to do, on disk == in memory */
773# define bm_cpu_to_lel(x) ((void)0)
774# else
775void bm_cpu_to_lel(struct drbd_bitmap *b)
776{
777 /* need to cpu_to_lel all the pages ...
778 * this may be optimized by using
779 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
780 * the following is still not optimal, but better than nothing */
781 unsigned int i;
782 unsigned long *p_addr, *bm;
783 if (b->bm_set == 0) {
784 /* no page at all; avoid swap if all is 0 */
785 i = b->bm_number_of_pages;
786 } else if (b->bm_set == b->bm_bits) {
787 /* only the last page */
788 i = b->bm_number_of_pages - 1;
789 } else {
790 /* all pages */
791 i = 0;
792 }
793 for (; i < b->bm_number_of_pages; i++) {
794 p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
795 for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
796 *bm = cpu_to_lel(*bm);
797 kunmap_atomic(p_addr, KM_USER0);
798 }
799}
800# endif
801/* lel_to_cpu == cpu_to_lel */
802# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
803
804/*
805 * bm_rw: read/write the whole bitmap from/to its on disk location.
806 */
807static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
808{
809 struct drbd_bitmap *b = mdev->bitmap;
810 /* sector_t sector; */
811 int bm_words, num_pages, i;
812 unsigned long now;
813 char ppb[10];
814 int err = 0;
815
816 WARN_ON(!bm_is_locked(b));
817
818 /* no spinlock here, the drbd_bm_lock should be enough! */
819
820 bm_words = drbd_bm_words(mdev);
821 num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
822
823 /* on disk bitmap is little endian */
824 if (rw == WRITE)
825 bm_cpu_to_lel(b);
826
827 now = jiffies;
828 atomic_set(&b->bm_async_io, num_pages);
829 __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
830
831 /* let the layers below us try to merge these bios... */
832 for (i = 0; i < num_pages; i++)
833 bm_page_io_async(mdev, b, i, rw);
834
835 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
836 wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
837
838 if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
839 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
840 drbd_chk_io_error(mdev, 1, TRUE);
841 err = -EIO;
842 }
843
844 now = jiffies;
845 if (rw == WRITE) {
846 /* swap back endianness */
847 bm_lel_to_cpu(b);
848 /* flush bitmap to stable storage */
849 drbd_md_flush(mdev);
850 } else /* rw == READ */ {
851 /* just read, if necessary adjust endianness */
852 b->bm_set = bm_count_bits_swap_endian(b);
853 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
854 jiffies - now);
855 }
856 now = b->bm_set;
857
858 dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
859 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
860
861 return err;
862}
863
864/**
865 * drbd_bm_read() - Read the whole bitmap from its on disk location.
866 * @mdev: DRBD device.
867 */
868int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
869{
870 return bm_rw(mdev, READ);
871}
872
873/**
874 * drbd_bm_write() - Write the whole bitmap to its on disk location.
875 * @mdev: DRBD device.
876 */
877int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
878{
879 return bm_rw(mdev, WRITE);
880}
881
882/**
883 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
884 * @mdev: DRBD device.
885 * @enr: Extent number in the resync lru (happens to be sector offset)
886 *
887 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
888 * by a single sector write. Therefore enr == sector offset from the
889 * start of the bitmap.
890 */
891int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
892{
893 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
894 + mdev->ldev->md.bm_offset;
895 int bm_words, num_words, offset;
896 int err = 0;
897
898 mutex_lock(&mdev->md_io_mutex);
899 bm_words = drbd_bm_words(mdev);
900 offset = S2W(enr); /* word offset into bitmap */
901 num_words = min(S2W(1), bm_words - offset);
902 if (num_words < S2W(1))
903 memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
904 drbd_bm_get_lel(mdev, offset, num_words,
905 page_address(mdev->md_io_page));
906 if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
907 int i;
908 err = -EIO;
909 dev_err(DEV, "IO ERROR writing bitmap sector %lu "
910 "(meta-disk sector %llus)\n",
911 enr, (unsigned long long)on_disk_sector);
912 drbd_chk_io_error(mdev, 1, TRUE);
913 for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
914 drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
915 }
916 mdev->bm_writ_cnt++;
917 mutex_unlock(&mdev->md_io_mutex);
918 return err;
919}
920
921/* NOTE
922 * find_first_bit returns int, we return unsigned long.
923 * should not make much difference anyways, but ...
924 *
925 * this returns a bit number, NOT a sector!
926 */
927#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
928static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
929 const int find_zero_bit, const enum km_type km)
930{
931 struct drbd_bitmap *b = mdev->bitmap;
932 unsigned long i = -1UL;
933 unsigned long *p_addr;
934 unsigned long bit_offset; /* bit offset of the mapped page. */
935
936 if (bm_fo > b->bm_bits) {
937 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
938 } else {
939 while (bm_fo < b->bm_bits) {
940 unsigned long offset;
941 bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
942 offset = bit_offset >> LN2_BPL; /* word offset of the page */
943 p_addr = __bm_map_paddr(b, offset, km);
944
945 if (find_zero_bit)
946 i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
947 else
948 i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
949
950 __bm_unmap(p_addr, km);
951 if (i < PAGE_SIZE*8) {
952 i = bit_offset + i;
953 if (i >= b->bm_bits)
954 break;
955 goto found;
956 }
957 bm_fo = bit_offset + PAGE_SIZE*8;
958 }
959 i = -1UL;
960 }
961 found:
962 return i;
963}
964
965static unsigned long bm_find_next(struct drbd_conf *mdev,
966 unsigned long bm_fo, const int find_zero_bit)
967{
968 struct drbd_bitmap *b = mdev->bitmap;
969 unsigned long i = -1UL;
970
971 ERR_IF(!b) return i;
972 ERR_IF(!b->bm_pages) return i;
973
974 spin_lock_irq(&b->bm_lock);
975 if (bm_is_locked(b))
976 bm_print_lock_info(mdev);
977
978 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
979
980 spin_unlock_irq(&b->bm_lock);
981 return i;
982}
983
/* Next set bit at or after @bm_fo, see bm_find_next(). */
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	const int find_zero_bit = 0;

	return bm_find_next(mdev, bm_fo, find_zero_bit);
}
988
989#if 0
990/* not yet needed for anything. */
991unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
992{
993 return bm_find_next(mdev, bm_fo, 1);
994}
995#endif
996
997/* does not spin_lock_irqsave.
998 * you must take drbd_bm_lock() first */
999unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1000{
1001 /* WARN_ON(!bm_is_locked(mdev)); */
1002 return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1003}
1004
1005unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1006{
1007 /* WARN_ON(!bm_is_locked(mdev)); */
1008 return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1009}
1010
1011/* returns number of bits actually changed.
1012 * for val != 0, we change 0 -> 1, return code positive
1013 * for val == 0, we change 1 -> 0, return code negative
1014 * wants bitnr, not sector.
1015 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1016 * Must hold bitmap lock already. */
1017int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1018 unsigned long e, int val, const enum km_type km)
1019{
1020 struct drbd_bitmap *b = mdev->bitmap;
1021 unsigned long *p_addr = NULL;
1022 unsigned long bitnr;
1023 unsigned long last_page_nr = -1UL;
1024 int c = 0;
1025
1026 if (e >= b->bm_bits) {
1027 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1028 s, e, b->bm_bits);
1029 e = b->bm_bits ? b->bm_bits -1 : 0;
1030 }
1031 for (bitnr = s; bitnr <= e; bitnr++) {
1032 unsigned long offset = bitnr>>LN2_BPL;
1033 unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
1034 if (page_nr != last_page_nr) {
1035 if (p_addr)
1036 __bm_unmap(p_addr, km);
1037 p_addr = __bm_map_paddr(b, offset, km);
1038 last_page_nr = page_nr;
1039 }
1040 if (val)
1041 c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
1042 else
1043 c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
1044 }
1045 if (p_addr)
1046 __bm_unmap(p_addr, km);
1047 b->bm_set += c;
1048 return c;
1049}
1050
1051/* returns number of bits actually changed.
1052 * for val != 0, we change 0 -> 1, return code positive
1053 * for val == 0, we change 1 -> 0, return code negative
1054 * wants bitnr, not sector */
1055int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1056 const unsigned long e, int val)
1057{
1058 unsigned long flags;
1059 struct drbd_bitmap *b = mdev->bitmap;
1060 int c = 0;
1061
1062 ERR_IF(!b) return 1;
1063 ERR_IF(!b->bm_pages) return 0;
1064
1065 spin_lock_irqsave(&b->bm_lock, flags);
1066 if (bm_is_locked(b))
1067 bm_print_lock_info(mdev);
1068
1069 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
1070
1071 spin_unlock_irqrestore(&b->bm_lock, flags);
1072 return c;
1073}
1074
/* Set bits s..e (inclusive); returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	const int val = 1;

	return bm_change_bits_to(mdev, s, e, val);
}
1080
/* Clear bits s..e (inclusive); returns number of bits changed 1 -> 0,
 * i.e. the negated result of bm_change_bits_to() */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	const int val = 0;

	return -bm_change_bits_to(mdev, s, e, val);
}
1086
1087/* sets all bits in full words,
1088 * from first_word up to, but not including, last_word */
1089static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1090 int page_nr, int first_word, int last_word)
1091{
1092 int i;
1093 int bits;
1094 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
1095 for (i = first_word; i < last_word; i++) {
1096 bits = hweight_long(paddr[i]);
1097 paddr[i] = ~0UL;
1098 b->bm_set += BITS_PER_LONG - bits;
1099 }
1100 kunmap_atomic(paddr, KM_USER0);
1101}
1102
1103/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
1104 * You must first drbd_bm_lock().
1105 * Can be called to set the whole bitmap in one go.
1106 * Sets bits from s to e _inclusive_. */
1107void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1108{
1109 /* First set_bit from the first bit (s)
1110 * up to the next long boundary (sl),
1111 * then assign full words up to the last long boundary (el),
1112 * then set_bit up to and including the last bit (e).
1113 *
1114 * Do not use memset, because we must account for changes,
1115 * so we need to loop over the words with hweight() anyways.
1116 */
1117 unsigned long sl = ALIGN(s,BITS_PER_LONG);
1118 unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1119 int first_page;
1120 int last_page;
1121 int page_nr;
1122 int first_word;
1123 int last_word;
1124
1125 if (e - s <= 3*BITS_PER_LONG) {
1126 /* don't bother; el and sl may even be wrong. */
1127 __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
1128 return;
1129 }
1130
1131 /* difference is large enough that we can trust sl and el */
1132
1133 /* bits filling the current long */
1134 if (sl)
1135 __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
1136
1137 first_page = sl >> (3 + PAGE_SHIFT);
1138 last_page = el >> (3 + PAGE_SHIFT);
1139
1140 /* MLPP: modulo longs per page */
1141 /* LWPP: long words per page */
1142 first_word = MLPP(sl >> LN2_BPL);
1143 last_word = LWPP;
1144
1145 /* first and full pages, unless first page == last page */
1146 for (page_nr = first_page; page_nr < last_page; page_nr++) {
1147 bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1148 cond_resched();
1149 first_word = 0;
1150 }
1151
1152 /* last page (respectively only page, for first page == last page) */
1153 last_word = MLPP(el >> LN2_BPL);
1154 bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
1155
1156 /* possibly trailing bits.
1157 * example: (e & 63) == 63, el will be e+1.
1158 * if that even was the very last bit,
1159 * it would trigger an assert in __bm_change_bits_to()
1160 */
1161 if (el <= e)
1162 __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
1163}
1164
1165/* returns bit state
1166 * wants bitnr, NOT sector.
1167 * inherently racy... area needs to be locked by means of {al,rs}_lru
1168 * 1 ... bit set
1169 * 0 ... bit not set
1170 * -1 ... first out of bounds access, stop testing for bits!
1171 */
1172int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1173{
1174 unsigned long flags;
1175 struct drbd_bitmap *b = mdev->bitmap;
1176 unsigned long *p_addr;
1177 int i;
1178
1179 ERR_IF(!b) return 0;
1180 ERR_IF(!b->bm_pages) return 0;
1181
1182 spin_lock_irqsave(&b->bm_lock, flags);
1183 if (bm_is_locked(b))
1184 bm_print_lock_info(mdev);
1185 if (bitnr < b->bm_bits) {
1186 unsigned long offset = bitnr>>LN2_BPL;
1187 p_addr = bm_map_paddr(b, offset);
1188 i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
1189 bm_unmap(p_addr);
1190 } else if (bitnr == b->bm_bits) {
1191 i = -1;
1192 } else { /* (bitnr > b->bm_bits) */
1193 dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1194 i = 0;
1195 }
1196
1197 spin_unlock_irqrestore(&b->bm_lock, flags);
1198 return i;
1199}
1200
1201/* returns number of bits set in the range [s, e] */
1202int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1203{
1204 unsigned long flags;
1205 struct drbd_bitmap *b = mdev->bitmap;
1206 unsigned long *p_addr = NULL, page_nr = -1;
1207 unsigned long bitnr;
1208 int c = 0;
1209 size_t w;
1210
1211 /* If this is called without a bitmap, that is a bug. But just to be
1212 * robust in case we screwed up elsewhere, in that case pretend there
1213 * was one dirty bit in the requested area, so we won't try to do a
1214 * local read there (no bitmap probably implies no disk) */
1215 ERR_IF(!b) return 1;
1216 ERR_IF(!b->bm_pages) return 1;
1217
1218 spin_lock_irqsave(&b->bm_lock, flags);
1219 if (bm_is_locked(b))
1220 bm_print_lock_info(mdev);
1221 for (bitnr = s; bitnr <= e; bitnr++) {
1222 w = bitnr >> LN2_BPL;
1223 if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
1224 page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
1225 if (p_addr)
1226 bm_unmap(p_addr);
1227 p_addr = bm_map_paddr(b, w);
1228 }
1229 ERR_IF (bitnr >= b->bm_bits) {
1230 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1231 } else {
1232 c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1233 }
1234 }
1235 if (p_addr)
1236 bm_unmap(p_addr);
1237 spin_unlock_irqrestore(&b->bm_lock, flags);
1238 return c;
1239}
1240
1241
1242/* inherently racy...
1243 * return value may be already out-of-date when this function returns.
1244 * but the general usage is that this is only use during a cstate when bits are
1245 * only cleared, not set, and typically only care for the case when the return
1246 * value is zero, or we already "locked" this "bitmap extent" by other means.
1247 *
1248 * enr is bm-extent number, since we chose to name one sector (512 bytes)
1249 * worth of the bitmap a "bitmap extent".
1250 *
1251 * TODO
1252 * I think since we use it like a reference count, we should use the real
1253 * reference count of some bitmap extent element from some lru instead...
1254 *
1255 */
1256int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1257{
1258 struct drbd_bitmap *b = mdev->bitmap;
1259 int count, s, e;
1260 unsigned long flags;
1261 unsigned long *p_addr, *bm;
1262
1263 ERR_IF(!b) return 0;
1264 ERR_IF(!b->bm_pages) return 0;
1265
1266 spin_lock_irqsave(&b->bm_lock, flags);
1267 if (bm_is_locked(b))
1268 bm_print_lock_info(mdev);
1269
1270 s = S2W(enr);
1271 e = min((size_t)S2W(enr+1), b->bm_words);
1272 count = 0;
1273 if (s < b->bm_words) {
1274 int n = e-s;
1275 p_addr = bm_map_paddr(b, s);
1276 bm = p_addr + MLPP(s);
1277 while (n--)
1278 count += hweight_long(*bm++);
1279 bm_unmap(p_addr);
1280 } else {
1281 dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1282 }
1283 spin_unlock_irqrestore(&b->bm_lock, flags);
1284 return count;
1285}
1286
1287/* set all bits covered by the AL-extent al_enr */
1288unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1289{
1290 struct drbd_bitmap *b = mdev->bitmap;
1291 unsigned long *p_addr, *bm;
1292 unsigned long weight;
1293 int count, s, e, i, do_now;
1294 ERR_IF(!b) return 0;
1295 ERR_IF(!b->bm_pages) return 0;
1296
1297 spin_lock_irq(&b->bm_lock);
1298 if (bm_is_locked(b))
1299 bm_print_lock_info(mdev);
1300 weight = b->bm_set;
1301
1302 s = al_enr * BM_WORDS_PER_AL_EXT;
1303 e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
1304 /* assert that s and e are on the same page */
1305 D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
1306 == s >> (PAGE_SHIFT - LN2_BPL + 3));
1307 count = 0;
1308 if (s < b->bm_words) {
1309 i = do_now = e-s;
1310 p_addr = bm_map_paddr(b, s);
1311 bm = p_addr + MLPP(s);
1312 while (i--) {
1313 count += hweight_long(*bm);
1314 *bm = -1UL;
1315 bm++;
1316 }
1317 bm_unmap(p_addr);
1318 b->bm_set += do_now*BITS_PER_LONG - count;
1319 if (e == b->bm_words)
1320 b->bm_set -= bm_clear_surplus(b);
1321 } else {
1322 dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
1323 }
1324 weight = b->bm_set - weight;
1325 spin_unlock_irq(&b->bm_lock);
1326 return weight;
1327}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000000..2312d782fe99
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2252 @@
1/*
2 drbd_int.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#ifndef _DRBD_INT_H
27#define _DRBD_INT_H
28
29#include <linux/compiler.h>
30#include <linux/types.h>
31#include <linux/version.h>
32#include <linux/list.h>
33#include <linux/sched.h>
34#include <linux/bitops.h>
35#include <linux/slab.h>
36#include <linux/crypto.h>
37#include <linux/ratelimit.h>
38#include <linux/tcp.h>
39#include <linux/mutex.h>
40#include <linux/major.h>
41#include <linux/blkdev.h>
42#include <linux/genhd.h>
43#include <net/tcp.h>
44#include <linux/lru_cache.h>
45
46#ifdef __CHECKER__
47# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
48# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
49# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
50# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
51#else
52# define __protected_by(x)
53# define __protected_read_by(x)
54# define __protected_write_by(x)
55# define __must_hold(x)
56#endif
57
58#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
59
60/* module parameter, defined in drbd_main.c */
61extern unsigned int minor_count;
62extern int disable_sendpage;
63extern int allow_oos;
64extern unsigned int cn_idx;
65
66#ifdef CONFIG_DRBD_FAULT_INJECTION
67extern int enable_faults;
68extern int fault_rate;
69extern int fault_devs;
70#endif
71
72extern char usermode_helper[];
73
74
75#ifndef TRUE
76#define TRUE 1
77#endif
78#ifndef FALSE
79#define FALSE 0
80#endif
81
82/* I don't remember why XCPU ...
83 * This is used to wake the asender,
84 * and to interrupt sending the sending task
85 * on disconnect.
86 */
87#define DRBD_SIG SIGXCPU
88
89/* This is used to stop/restart our threads.
90 * Cannot use SIGTERM nor SIGKILL, since these
91 * are sent out by init on runlevel changes
92 * I choose SIGHUP for now.
93 */
94#define DRBD_SIGKILL SIGHUP
95
96/* All EEs on the free list should have ID_VACANT (== 0)
97 * freshly allocated EEs get !ID_VACANT (== 1)
98 * so if it says "cannot dereference null pointer at address 0x00000001",
99 * it is most likely one of these :( */
100
101#define ID_IN_SYNC (4711ULL)
102#define ID_OUT_OF_SYNC (4712ULL)
103
104#define ID_SYNCER (-1ULL)
105#define ID_VACANT 0
106#define is_syncer_block_id(id) ((id) == ID_SYNCER)
107
108struct drbd_conf;
109
110
/* to shorten dev_warn(DEV, "msg"); and relatives statements.
 * Expects a variable named mdev to be in scope at the point of use. */
#define DEV (disk_to_dev(mdev->vdisk))

/* Soft assertion: logs the failed expression with file and line, nothing
 * else.  Wrapped in do { } while (0) so that it is one single statement
 * and a following "else" cannot accidentally bind to the inner "if". */
#define D_ASSERT(exp)	do { \
	if (!(exp)) \
		dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
	} while (0)

/* Conditional prefix: "ERR_IF(cond) statement" runs the statement if cond
 * is true, after logging function, expression, file and line.
 * This one intentionally stays an open "if": callers follow it with the
 * statement or block to execute, and possibly an "else". */
#define ERR_IF(exp) if (({ \
	int _b = (exp) != 0; \
	if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \
		__func__, #exp, __FILE__, __LINE__); \
	 _b; \
	}))
123
124/* Defines to control fault insertion */
125enum {
126 DRBD_FAULT_MD_WR = 0, /* meta data write */
127 DRBD_FAULT_MD_RD = 1, /* read */
128 DRBD_FAULT_RS_WR = 2, /* resync */
129 DRBD_FAULT_RS_RD = 3,
130 DRBD_FAULT_DT_WR = 4, /* data */
131 DRBD_FAULT_DT_RD = 5,
132 DRBD_FAULT_DT_RA = 6, /* data read ahead */
133 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
134 DRBD_FAULT_AL_EE = 8, /* alloc ee */
135
136 DRBD_FAULT_MAX,
137};
138
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);

/* Returns 1 if a fault of the given DRBD_FAULT_* type is to be injected
 * now: fault_rate must be non-zero, the type's bit must be set in
 * enable_faults, and _drbd_insert_fault() must agree.  0 otherwise. */
static inline int
drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
	if (!fault_rate)
		return 0;
	if (!(enable_faults & (1<<type)))
		return 0;
	return _drbd_insert_fault(mdev, type) != 0;
}
#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))

#else
/* fault injection not configured: never active */
#define FAULT_ACTIVE(_m, _t) (0)
#endif
153
154/* integer division, round _UP_ to the next integer */
155#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
156/* usual integer division */
157#define div_floor(A, B) ((A)/(B))
158
159/* drbd_meta-data.c (still in drbd_main.c) */
160/* 4th incarnation of the disk layout. */
161#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
162
163extern struct drbd_conf **minor_table;
164extern struct ratelimit_state drbd_ratelimit_state;
165
/* on the wire */
enum drbd_packets {
	/* receiver (data socket) */
	P_DATA		      = 0x00,
	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
	P_BARRIER	      = 0x03,
	P_BITMAP	      = 0x04,
	P_BECOME_SYNC_TARGET  = 0x05,
	P_BECOME_SYNC_SOURCE  = 0x06,
	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
	P_SYNC_PARAM	      = 0x0a,
	P_PROTOCOL	      = 0x0b,
	P_UUIDS		      = 0x0c,
	P_SIZES		      = 0x0d,
	P_STATE		      = 0x0e,
	P_SYNC_UUID	      = 0x0f,
	P_AUTH_CHALLENGE      = 0x10,
	P_AUTH_RESPONSE	      = 0x11,
	P_STATE_CHG_REQ	      = 0x12,

	/* asender (meta socket) */
	P_PING		      = 0x13,
	P_PING_ACK	      = 0x14,
	P_RECV_ACK	      = 0x15, /* Used in protocol B */
	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
	P_BARRIER_ACK	      = 0x1c,
	P_STATE_CHG_REPLY     = 0x1d,

	/* "new" commands, no longer fitting into the ordering scheme above */

	P_OV_REQUEST	      = 0x1e, /* data socket */
	P_OV_REPLY	      = 0x1f,
	P_OV_RESULT	      = 0x20, /* meta socket */
	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */

	P_MAX_CMD	      = 0x25,
	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
	P_MAX_OPT_CMD	      = 0x101,

	/* special command ids for handshake */

	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */

	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
};

/*
 * Human readable name of a packet type, for log messages.
 *
 * The three handshake ids are handled explicitly; every other value at or
 * above P_MAX_CMD yields "Unknown".  Each command below P_MAX_CMD has an
 * entry in the table, so the result is never NULL.  When adding a command
 * to enum drbd_packets, add its name here as well.
 */
static inline const char *cmdname(enum drbd_packets cmd)
{
	/* THINK may need to become several global tables
	 * when we want to support more than
	 * one PRO_VERSION */
	static const char *cmdnames[] = {
		[P_DATA]	        = "Data",
		[P_DATA_REPLY]	        = "DataReply",
		[P_RS_DATA_REPLY]       = "RSDataReply",
		[P_BARRIER]	        = "Barrier",
		[P_BITMAP]	        = "ReportBitMap",
		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
		[P_UNPLUG_REMOTE]       = "UnplugRemote",
		[P_DATA_REQUEST]        = "DataRequest",
		[P_RS_DATA_REQUEST]     = "RSDataRequest",
		[P_SYNC_PARAM]	        = "SyncParam",
		[P_SYNC_PARAM89]        = "SyncParam89",
		[P_PROTOCOL]            = "ReportProtocol",
		[P_UUIDS]	        = "ReportUUIDs",
		[P_SIZES]	        = "ReportSizes",
		[P_STATE]	        = "ReportState",
		[P_SYNC_UUID]           = "ReportSyncUUID",
		[P_AUTH_CHALLENGE]      = "AuthChallenge",
		[P_AUTH_RESPONSE]       = "AuthResponse",
		[P_PING]	        = "Ping",
		[P_PING_ACK]	        = "PingAck",
		[P_RECV_ACK]	        = "RecvAck",
		[P_WRITE_ACK]	        = "WriteAck",
		[P_RS_WRITE_ACK]        = "RSWriteAck",
		[P_DISCARD_ACK]         = "DiscardAck",
		[P_NEG_ACK]	        = "NegAck",
		[P_NEG_DREPLY]	        = "NegDReply",
		[P_NEG_RS_DREPLY]       = "NegRSDReply",
		[P_BARRIER_ACK]	        = "BarrierAck",
		[P_STATE_CHG_REQ]       = "StateChgRequest",
		[P_STATE_CHG_REPLY]     = "StateChgReply",
		[P_OV_REQUEST]          = "OVRequest",
		[P_OV_REPLY]            = "OVReply",
		[P_OV_RESULT]           = "OVResult",
		/* these three used to be missing, which made cmdname()
		 * return NULL for perfectly valid commands */
		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
		[P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
		[P_COMPRESSED_BITMAP]   = "CBitmap",
		[P_MAX_CMD]	        = NULL,
	};

	if (cmd == P_HAND_SHAKE_M)
		return "HandShakeM";
	if (cmd == P_HAND_SHAKE_S)
		return "HandShakeS";
	if (cmd == P_HAND_SHAKE)
		return "HandShake";
	if (cmd >= P_MAX_CMD)
		return "Unknown";
	return cmdnames[cmd];
}
277
278/* for sending/receiving the bitmap,
279 * possibly in some encoding scheme */
/* Bookkeeping for one bitmap transfer: the bitmap dimensions,
 * the current position within it, and per-packet-type statistics. */
struct bm_xfer_ctx {
	/* "const"
	 * stores total bits and long words
	 * of the bitmap, so we don't need to
	 * call the accessor functions over and again. */
	unsigned long bm_bits;
	unsigned long bm_words;
	/* during xfer, current position within the bitmap */
	unsigned long bit_offset;
	unsigned long word_offset;

	/* statistics; index: (h->command == P_BITMAP),
	 * i.e. slot 1 counts P_BITMAP packets, slot 0 everything else */
	unsigned packets[2];
	unsigned bytes[2];
};
295
296extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
297 const char *direction, struct bm_xfer_ctx *c);
298
299static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
300{
301 /* word_offset counts "native long words" (32 or 64 bit),
302 * aligned at 64 bit.
303 * Encoded packet may end at an unaligned bit offset.
304 * In case a fallback clear text packet is transmitted in
305 * between, we adjust this offset back to the last 64bit
306 * aligned "native long word", which makes coding and decoding
307 * the plain text bitmap much more convenient. */
308#if BITS_PER_LONG == 64
309 c->word_offset = c->bit_offset >> 6;
310#elif BITS_PER_LONG == 32
311 c->word_offset = c->bit_offset >> 5;
312 c->word_offset &= ~(1UL);
313#else
314# error "unsupported BITS_PER_LONG"
315#endif
316}
317
318#ifndef __packed
319#define __packed __attribute__((packed))
320#endif
321
/* This is the layout for a packet on the wire.
 * The byteorder is the network byte order.
 *     (except block_id and barrier fields.
 *	these are pointers to local structs
 *	and have no relevance for the partner,
 *	which just echoes them as received.)
 *
 * NOTE that the payload starts at a long aligned offset,
 * regardless of 32 or 64 bit arch!
 *
 * Every other p_* packet struct in this file that has a header embeds
 * this struct as its first member.
 */
struct p_header {
	u32	  magic;
	u16	  command;
	u16	  length;	/* bytes of data after this header */
	u8	  payload[0];	/* zero-length marker for the start of the payload */
} __packed;
/* 8 bytes (4 + 2 + 2). packet FIXED for the next century! */
339
340/*
341 * short commands, packets without payload, plain p_header:
342 * P_PING
343 * P_PING_ACK
344 * P_BECOME_SYNC_TARGET
345 * P_BECOME_SYNC_SOURCE
346 * P_UNPLUG_REMOTE
347 */
348
349/*
350 * commands with out-of-struct payload:
351 * P_BITMAP (no additional fields)
352 * P_DATA, P_DATA_REPLY (see p_data)
353 * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
354 */
355
356/* these defines must not be changed without changing the protocol version */
357#define DP_HARDBARRIER 1
358#define DP_RW_SYNC 2
359#define DP_MAY_SET_IN_SYNC 4
360
/* Header that precedes an out-of-struct data payload
 * (P_DATA, P_DATA_REPLY according to the comment listing such commands). */
struct p_data {
	struct p_header head;
	u64	    sector;    /* 64 bits sector number */
	u64	    block_id;  /* to identify the request in protocol B&C */
	u32	    seq_num;
	u32	    dp_flags;  /* presumably a mask of the DP_* defines -- confirm */
} __packed;
368
369/*
370 * commands which share a struct:
371 * p_block_ack:
372 * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
373 * P_DISCARD_ACK (proto C, two-primaries conflict detection)
374 * p_block_req:
375 * P_DATA_REQUEST, P_RS_DATA_REQUEST
376 */
/* Acknowledgement for a block; shared by several ack commands
 * (P_RECV_ACK, P_WRITE_ACK, P_DISCARD_ACK per the comment above). */
struct p_block_ack {
	struct p_header head;
	u64	    sector;
	u64	    block_id;
	u32	    blksize;
	u32	    seq_num;
} __packed;
384
385
/* Request for a block; same leading fields as p_block_ack,
 * with the last word used only as padding. */
struct p_block_req {
	struct p_header head;
	u64 sector;
	u64 block_id;
	u32 blksize;
	u32 pad;	/* to multiple of 8 Byte */
} __packed;
393
394/*
395 * commands with their own struct for additional fields:
396 * P_HAND_SHAKE
397 * P_BARRIER
398 * P_BARRIER_ACK
399 * P_SYNC_PARAM
400 * ReportParams
401 */
402
/* Handshake packet: advertises a protocol version range and feature flags. */
struct p_handshake {
	struct p_header head;	/* 8 bytes */
	u32 protocol_min;
	u32 feature_flags;
	u32 protocol_max;

	/* should be more than enough for future enhancements
	 * for now, feature_flags and the reserverd array shall be zero.
	 */

	u32 _pad;
	u64 reserverd[7];	/* sic: field name is misspelled, kept as is */
} __packed;
/* 80 bytes (8 + 3*4 + 4 + 7*8), FIXED for the next century */
417
/* Barrier packet carrying an opaque barrier number. */
struct p_barrier {
	struct p_header head;
	u32 barrier;	/* barrier number _handle_ only */
	u32 pad;	/* to multiple of 8 Byte */
} __packed;
423
/* Barrier acknowledgement: a barrier number plus a set size.
 * NOTE(review): set_size presumably counts the requests covered by the
 * barrier -- confirm against tl_release(). */
struct p_barrier_ack {
	struct p_header head;
	u32 barrier;
	u32 set_size;
} __packed;
429
/* Sync parameter packet: a rate, optionally followed by an algorithm name. */
struct p_rs_param {
	struct p_header head;
	u32 rate;

	      /* Since protocol version 88 and higher. */
	char verify_alg[0];	/* zero-length marker; the name follows the struct */
} __packed;
437
/* Sync parameter packet with two fixed-size algorithm name fields. */
struct p_rs_param_89 {
	struct p_header head;
	u32 rate;
        /* protocol version 89: */
	char verify_alg[SHARED_SECRET_MAX];
	char csums_alg[SHARED_SECRET_MAX];
} __packed;
445
/* Protocol settings packet: six 32 bit settings, optionally followed
 * by an algorithm name. */
struct p_protocol {
	struct p_header head;
	u32 protocol;
	u32 after_sb_0p;
	u32 after_sb_1p;
	u32 after_sb_2p;
	u32 want_lose;
	u32 two_primaries;

              /* Since protocol version 87 and higher. */
	char integrity_alg[0];	/* zero-length marker; the name follows the struct */

} __packed;
459
/* Packet carrying an array of UI_EXTENDED_SIZE 64 bit UUIDs. */
struct p_uuids {
	struct p_header head;
	u64 uuid[UI_EXTENDED_SIZE];
} __packed;
464
/* Packet carrying a single 64 bit UUID. */
struct p_rs_uuid {
	struct p_header head;
	u64	    uuid;
} __packed;
469
/* Packet reporting device sizes and two queue properties. */
struct p_sizes {
	struct p_header head;
	u64	    d_size;  /* size of disk */
	u64	    u_size;  /* user requested size */
	u64	    c_size;  /* current exported size */
	u32	    max_segment_size;  /* Maximal size of a BIO */
	u32	    queue_order_type;
} __packed;
478
/* Packet carrying one 32 bit state word. */
struct p_state {
	struct p_header head;
	u32	    state;
} __packed;
483
/* State change request: a mask and a value.
 * NOTE(review): presumably mask selects the state bits that val
 * replaces -- confirm against the receiver. */
struct p_req_state {
	struct p_header head;
	u32	    mask;
	u32	    val;
} __packed;
489
/* Reply to a state change request: a single return code. */
struct p_req_state_reply {
	struct p_header head;
	u32	    retcode;
} __packed;
494
/* Parameter block without a p_header member.
 * NOTE(review): the name suggests a legacy (drbd 0.6) layout -- confirm. */
struct p_drbd06_param {
	u64	  size;
	u32	  state;
	u32	  blksize;
	u32	  protocol;
	u32	  version;
	u32	  gen_cnt[5];
	u32	  bit_map_gen[5];
} __packed;
504
/* Discard packet: identifies a request by block_id and sequence number. */
struct p_discard {
	struct p_header head;
	u64	    block_id;
	u32	    seq_num;
	u32	    pad;	/* unused filler word */
} __packed;
511
/* Valid values for the encoding field
 * (low four bits of p_compressed_bm.encoding).
 * Bump proto version when changing this. */
enum drbd_bitmap_code {
	/* RLE_VLI_Bytes = 0,
	 * and other bit variants had been defined during
	 * algorithm evaluation. */
	RLE_VLI_Bits = 2,
};
520
/* Compressed bitmap packet: one byte of encoding metadata followed by
 * the encoded bit stream.  Use the DCBP_* accessors for 'encoding'. */
struct p_compressed_bm {
	struct p_header head;
	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
	 * (encoding & 0x80): polarity (set/unset) of first runlength
	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
	 * used to pad up to head.length bytes
	 */
	u8 encoding;

	u8 code[0];	/* zero-length marker for the start of the encoded data */
} __packed;
532
533/* DCBP: Drbd Compressed Bitmap Packet ... */
534static inline enum drbd_bitmap_code
535DCBP_get_code(struct p_compressed_bm *p)
536{
537 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
538}
539
540static inline void
541DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
542{
543 BUG_ON(code & ~0xf);
544 p->encoding = (p->encoding & ~0xf) | code;
545}
546
547static inline int
548DCBP_get_start(struct p_compressed_bm *p)
549{
550 return (p->encoding & 0x80) != 0;
551}
552
553static inline void
554DCBP_set_start(struct p_compressed_bm *p, int set)
555{
556 p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
557}
558
559static inline int
560DCBP_get_pad_bits(struct p_compressed_bm *p)
561{
562 return (p->encoding >> 4) & 0x7;
563}
564
565static inline void
566DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
567{
568 BUG_ON(n & ~0x7);
569 p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
570}
571
/* one bitmap packet, including the p_header,
 * should fit within one _architecture independent_ page.
 * so we need to use the fixed size 4KiB page size
 * most architectures have used for a long time.
 */
#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
/* drbd_send_bitmap / receive_bitmap would break horribly */
#error "PAGE_SIZE too small"
#endif
584
/* Overlay of the fixed-size packet structs, so one buffer of this type
 * can hold any of them (see sbuf/rbuf in struct drbd_socket). */
union p_polymorph {
        struct p_header          header;
        struct p_handshake       handshake;
        struct p_data            data;
        struct p_block_ack       block_ack;
        struct p_barrier         barrier;
        struct p_barrier_ack     barrier_ack;
        struct p_rs_param_89     rs_param_89;
        struct p_protocol        protocol;
        struct p_sizes           sizes;
        struct p_uuids           uuids;
        struct p_state           state;
        struct p_req_state       req_state;
        struct p_req_state_reply req_state_reply;
        struct p_block_req       block_req;
} __packed;
601
602/**********************************************************************/
/* Lifecycle states of a struct drbd_thread (stored in its t_state). */
enum drbd_thread_state {
	None,
	Running,
	Exiting,
	Restarting
};
609
/* Descriptor for one DRBD kernel thread (drbd_conf holds three:
 * receiver, worker and asender). */
struct drbd_thread {
	spinlock_t t_lock;	/* presumably protects t_state/task -- confirm */
	struct task_struct *task;
	struct completion stop;
	enum drbd_thread_state t_state;
	int (*function) (struct drbd_thread *);	/* thread main function */
	struct drbd_conf *mdev;	/* owning device */
	int reset_cpu_mask;
};
619
620static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
621{
622 /* THINK testing the t_state seems to be uncritical in all cases
623 * (but thread_{start,stop}), so we can read it *without* the lock.
624 * --lge */
625
626 smp_rmb();
627 return thi->t_state;
628}
629
630
631/*
632 * Having this as the first member of a struct provides sort of "inheritance".
633 * "derived" structs can be "drbd_queue_work()"ed.
634 * The callback should know and cast back to the descendant struct.
635 * drbd_request and drbd_epoch_entry are descendants of drbd_work.
636 */
struct drbd_work;
/* Work callback: receives the device, the work item and a cancel flag. */
typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
/* Generic queueable work item: a list link plus the callback to run. */
struct drbd_work {
	struct list_head list;
	drbd_work_cb cb;
};
643
644struct drbd_tl_epoch;
/* One request tracked in the transfer log.  Starts with a struct
 * drbd_work, so it can be queued as a work item. */
struct drbd_request {
	struct drbd_work w;
	struct drbd_conf *mdev;

	/* if local IO is not allowed, will be NULL.
	 * if local IO _is_ allowed, holds the locally submitted bio clone,
	 * or, after local IO completion, the ERR_PTR(error).
	 * see drbd_endio_pri(). */
	struct bio *private_bio;

	struct hlist_node colision;
	sector_t     sector;
	unsigned int size;
	unsigned int epoch; /* barrier_nr */

	/* barrier_nr: used to check on "completion" whether this req was in
	 * the current epoch, and we therefore have to close it,
	 * starting a new epoch...
	 */

	/* up to here, the struct layout is identical to drbd_epoch_entry;
	 * we might be able to use that to our advantage...  */

	struct list_head tl_requests; /* ring list in the transfer log */
	struct bio *master_bio;       /* master bio pointer */
	unsigned long rq_state; /* see comments above _req_mod() */
	int seq_num;
	unsigned long start_time;
};
674
/* One barrier in the transfer log: the requests issued before it and a
 * link to the next barrier.  Starts with a struct drbd_work. */
struct drbd_tl_epoch {
	struct drbd_work w;
	struct list_head requests; /* requests before */
	struct drbd_tl_epoch *next; /* pointer to the next barrier */
	unsigned int br_number;  /* the barriers identifier. */
	int n_req;	/* number of requests attached before this barrier */
};
682
683struct drbd_request;
684
/* These Tl_epoch_entries may be on one of the *_ee lists of
   struct drbd_conf:
   active_ee .. data packet being written
   sync_ee   .. syncer block being written
   done_ee   .. block written, need to send P_WRITE_ACK
   read_ee   .. [RS]P_DATA_REQUEST being read
   net_ee    .. zero-copy network send in progress
*/
691
/* One write epoch: list link, barrier number, request counters and
 * a flags word (bit numbers: see the DE_* enum). */
struct drbd_epoch {
	struct list_head list;
	unsigned int barrier_nr;
	atomic_t epoch_size; /* increased on every request added. */
	atomic_t active;     /* increased on every req. added, and dec on every finished. */
	unsigned long flags;
};
699
/* drbd_epoch flag bits (bit numbers, presumably for drbd_epoch.flags) */
enum {
	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
	DE_BARRIER_IN_NEXT_EPOCH_DONE,
	DE_CONTAINS_A_BARRIER,
	DE_HAVE_BARRIER_NUMBER,
	DE_IS_FINISHING,
};
708
/* Events reported for an epoch.  EV_CLEANUP has its own bit (32),
 * so it can be combined with one of the other values. */
enum epoch_event {
	EV_PUT,
	EV_GOT_BARRIER_NR,
	EV_BARRIER_DONE,
	EV_BECAME_LAST,
	EV_CLEANUP = 32, /* used as flag */
};
716
/* One entry on the *_ee lists of struct drbd_conf.  Starts with a
 * struct drbd_work, so it can be queued as a work item. */
struct drbd_epoch_entry {
	struct drbd_work    w;
	struct drbd_conf *mdev;
	struct bio *private_bio;
	struct hlist_node colision;
	sector_t sector;
	unsigned int size;
	struct drbd_epoch *epoch;

	/* up to here, the struct layout is identical to drbd_request;
	 * we might be able to use that to our advantage...  */

	unsigned int flags;	/* EE_* bits */
	u64    block_id;
};
732
/* A work item paired with a completion.
 * NOTE(review): presumably used to wait until the work queue has
 * processed everything queued before it -- confirm. */
struct drbd_wq_barrier {
	struct drbd_work w;
	struct completion done;
};
737
/* A digest buffer together with its size. */
struct digest_info {
	int digest_size;
	void *digest;
};
742
/* ee flag bits: the __EE_* enumerators are bit numbers,
 * the EE_* defines below are the corresponding masks. */
enum {
	__EE_CALL_AL_COMPLETE_IO,
	__EE_CONFLICT_PENDING,
	__EE_MAY_SET_IN_SYNC,
	__EE_IS_BARRIER,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_CONFLICT_PENDING    (1<<__EE_CONFLICT_PENDING)
#define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
754
/* global flag bits (bit numbers; presumably for drbd_conf.flags -- confirm) */
enum {
	CREATE_BARRIER,		/* next P_DATA is preceded by a P_BARRIER */
	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
	SEND_PING,		/* whether asender should send a ping asap */

	STOP_SYNC_TIMER,	/* tell timer to cancel itself */
	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
	MD_DIRTY,		/* current uuids and flags not yet on disk */
	DISCARD_CONCURRENT,	/* Set on one node, cleared on the peer! */
	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
	CLUSTER_ST_CHANGE,	/* Cluster wide state change going on... */
	CL_ST_CHG_SUCCESS,
	CL_ST_CHG_FAIL,
	CRASHED_PRIMARY,	/* This node was a crashed primary.
				 * Gets cleared when the state.conn
				 * goes into C_CONNECTED state. */
	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
	CONSIDER_RESYNC,

	MD_NO_BARRIER,		/* meta data device does not support barriers,
				   so don't even try */
	SUSPEND_IO,		/* suspend application io */
	BITMAP_IO,		/* suspend application io;
				   once no more io in flight, start bitmap io */
	BITMAP_IO_QUEUED,       /* Started bitmap IO */
	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
	NET_CONGESTED,		/* The data socket is congested */

	CONFIG_PENDING,		/* serialization of (re)configuration requests.
				 * if set, also prevents the device from dying */
	DEVICE_DYING,		/* device became unconfigured,
				 * but worker thread is still handling the cleanup.
				 * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed,
				 * while this is set. */
	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
				 * the peer, if it changed there as well. */
};
795
796struct drbd_bitmap; /* opaque for drbd_conf */
797
798/* TODO sort members for performance
799 * MAYBE group them further */
800
801/* THINK maybe we actually want to use the default "event/%s" worker threads
802 * or similar in linux 2.6, which uses per cpu data and threads.
803 *
804 * To be general, this might need a spin_lock member.
805 * For now, please use the mdev->req_lock to protect list_head,
806 * see drbd_queue_work below.
807 */
/* Queue of struct drbd_work items.
 * NOTE(review): this struct carries its own q_lock "to protect the
 * list"; check whether older advice to use mdev->req_lock for the list
 * still applies. */
struct drbd_work_queue {
	struct list_head q;
	struct semaphore s; /* producers up it, worker down()s it */
	spinlock_t q_lock;  /* to protect the list. */
};
813
/* One connection endpoint: a work queue, a mutex, the socket itself and
 * one send and one receive packet buffer.
 * drbd_get_data_sock() takes 'mutex' before 'socket' is examined. */
struct drbd_socket {
	struct drbd_work_queue work;
	struct mutex mutex;
	struct socket    *socket;	/* NULL when there is no socket */
	/* this way we get our
	 * send/receive buffers off the stack */
	union p_polymorph sbuf;
	union p_polymorph rbuf;
};
823
/* In-memory description of the meta data area: its location, the
 * stored size/UUID/flag values, and the relative offsets of the
 * activity log and bitmap areas. */
struct drbd_md {
	u64 md_offset;		/* sector offset to 'super' block */

	u64 la_size_sect;	/* last agreed size, unit sectors */
	u64 uuid[UI_SIZE];
	u64 device_uuid;
	u32 flags;
	u32 md_size_sect;

	s32 al_offset;	/* signed relative sector offset to al area */
	s32 bm_offset;	/* signed relative sector offset to bitmap */

	/* u32 al_nr_extents;	   important for restoring the AL
	 * is stored into  sync_conf.al_extents, which in turn
	 * gets applied to act_log->nr_elements
	 */
};
841
/* for sync_conf and other types...
 *
 * With these definitions each NL_PACKET(name, number, fields) expands to
 * "struct name { fields };" and each field macro to a member declaration;
 * the first two arguments of the field macros are ignored here.
 * NL_STRING additionally declares an int <member>_len.
 * NOTE(review): presumably linux/drbd_nl.h invokes these macros to
 * generate the configuration structs used below -- confirm. */
#define NL_PACKET(name, number, fields) struct name { fields };
#define NL_INTEGER(pn,pr,member) int member;
#define NL_INT64(pn,pr,member) __u64 member;
#define NL_BIT(pn,pr,member)   unsigned member:1;
#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
#include "linux/drbd_nl.h"
849
/* The local backing storage: data and meta data block devices, their
 * files, the meta data description and the disk configuration. */
struct drbd_backing_dev {
	struct block_device *backing_bdev;
	struct block_device *md_bdev;
	struct file *lo_file;
	struct file *md_file;
	struct drbd_md md;
	struct disk_conf dc; /* The user provided config... */
	sector_t known_size; /* last known size of that backing device */
};
859
/* Context for one meta data IO: the device, a completion and an
 * error value. */
struct drbd_md_io {
	struct drbd_conf *mdev;
	struct completion event;
	int error;
};
865
/* Work item describing a bitmap IO operation; the members mirror the
 * io_fn/done/why arguments of drbd_queue_bitmap_io(). */
struct bm_io_work {
	struct drbd_work w;
	char *why;
	int (*io_fn)(struct drbd_conf *mdev);
	void (*done)(struct drbd_conf *mdev, int rv);
};
872
/* Available write ordering methods (stored in drbd_conf.write_ordering). */
enum write_ordering_e {
	WO_none,
	WO_drain_io,
	WO_bdev_flush,
	WO_bio_barrier
};
879
/* Per-device state of one DRBD minor: configuration, sockets, threads,
 * counters, transfer log, resync/verify progress, epoch entry lists,
 * activity log and crypto transforms.  minor_to_mdev() looks these up
 * by minor number. */
struct drbd_conf {
	/* things that are stored as / read from meta data on disk */
	unsigned long flags;

	/* configured by drbdsetup */
	struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
	struct syncer_conf sync_conf;
	struct drbd_backing_dev *ldev __protected_by(local);

	sector_t p_size;     /* partner's disk size */
	struct request_queue *rq_queue;
	struct block_device *this_bdev;
	struct gendisk	    *vdisk;

	struct drbd_socket data; /* data/barrier/cstate/parameter packets */
	struct drbd_socket meta; /* ping/ack (metadata) packets */
	int agreed_pro_version;  /* actually used protocol version */
	unsigned long last_received; /* in jiffies, either socket */
	unsigned int ko_count;
	struct drbd_work  resync_work,
			  unplug_work,
			  md_sync_work;
	struct timer_list resync_timer;
	struct timer_list md_sync_timer;

	/* Used after attach while negotiating new disk state. */
	union drbd_state new_state_tmp;

	union drbd_state state;
	wait_queue_head_t misc_wait;
	wait_queue_head_t state_wait;  /* upon each state change. */
	unsigned int send_cnt;
	unsigned int recv_cnt;
	unsigned int read_cnt;
	unsigned int writ_cnt;
	unsigned int al_writ_cnt;
	unsigned int bm_writ_cnt;
	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
	atomic_t unacked_cnt;	 /* Need to send replies for */
	atomic_t local_cnt;	 /* Waiting for local completion */
	atomic_t net_cnt;	 /* Users of net_conf */
	spinlock_t req_lock;
	struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
	struct drbd_tl_epoch *newest_tle;
	struct drbd_tl_epoch *oldest_tle;
	struct list_head out_of_sequence_requests;
	struct hlist_head *tl_hash;
	unsigned int tl_hash_s;

	/* blocks to sync in this run [unit BM_BLOCK_SIZE] */
	unsigned long rs_total;
	/* number of sync IOs that failed in this run */
	unsigned long rs_failed;
	/* Syncer's start time [unit jiffies] */
	unsigned long rs_start;
	/* cumulated time in PausedSyncX state [unit jiffies] */
	unsigned long rs_paused;
	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
	unsigned long rs_mark_left;
	/* mark's time [unit jiffies] */
	unsigned long rs_mark_time;
	/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
	unsigned long rs_same_csum;

	/* where does the admin want us to start? (sector) */
	sector_t ov_start_sector;
	/* where are we now? (sector) */
	sector_t ov_position;
	/* Start sector of out of sync range (to merge printk reporting). */
	sector_t ov_last_oos_start;
	/* size of out-of-sync range in sectors. */
	sector_t ov_last_oos_size;
	unsigned long ov_left; /* in bits */
	struct crypto_hash *csums_tfm;
	struct crypto_hash *verify_tfm;

	struct drbd_thread receiver;
	struct drbd_thread worker;
	struct drbd_thread asender;
	struct drbd_bitmap *bitmap;
	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */

	/* Used to track operations of resync... */
	struct lru_cache *resync;
	/* Number of locked elements in resync LRU */
	unsigned int resync_locked;
	/* resync extent number waiting for application requests */
	unsigned int resync_wenr;

	int open_cnt;
	u64 *p_uuid;
	struct drbd_epoch *current_epoch;
	spinlock_t epoch_lock;
	unsigned int epochs;
	enum write_ordering_e write_ordering;
	struct list_head active_ee; /* IO in progress */
	struct list_head sync_ee;   /* IO in progress */
	struct list_head done_ee;   /* send ack */
	struct list_head read_ee;   /* IO in progress */
	struct list_head net_ee;    /* zero-copy network send in progress */
	struct hlist_head *ee_hash; /* is protected by req_lock! */
	unsigned int ee_hash_s;

	/* this one is protected by ee_lock, single thread */
	struct drbd_epoch_entry *last_write_w_barrier;

	int next_barrier_nr;
	struct hlist_head *app_reads_hash; /* is protected by req_lock */
	struct list_head resync_reads;
	atomic_t pp_in_use;
	wait_queue_head_t ee_wait;
	struct page *md_io_page;	/* one page buffer for md_io */
	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
	struct mutex md_io_mutex;	/* protects the md_io_buffer */
	spinlock_t al_lock;
	wait_queue_head_t al_wait;
	struct lru_cache *act_log;	/* activity log */
	unsigned int al_tr_number;
	int al_tr_cycle;
	int al_tr_pos;   /* position of the next transaction in the journal */
	struct crypto_hash *cram_hmac_tfm;
	struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
	struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
	void *int_dig_out;
	void *int_dig_in;
	void *int_dig_vv;
	wait_queue_head_t seq_wait;
	atomic_t packet_seq;
	unsigned int peer_seq;
	spinlock_t peer_seq_lock;
	unsigned int minor;
	unsigned long comm_bm_set; /* communicated number of set bits. */
	cpumask_var_t cpu_mask;
	struct bm_io_work bm_io_work;
	u64 ed_uuid; /* UUID of the exposed data */
	struct mutex state_mutex;
	char congestion_reason;  /* Why we were congested... */
};
1020
1021static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
1022{
1023 struct drbd_conf *mdev;
1024
1025 mdev = minor < minor_count ? minor_table[minor] : NULL;
1026
1027 return mdev;
1028}
1029
/* Return the minor number stored in @mdev. */
static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
{
	return mdev->minor;
}
1034
1035/* returns 1 if it was successfull,
1036 * returns 0 if there was no data socket.
1037 * so wherever you are going to use the data.socket, e.g. do
1038 * if (!drbd_get_data_sock(mdev))
1039 * return 0;
1040 * CODE();
1041 * drbd_put_data_sock(mdev);
1042 */
1043static inline int drbd_get_data_sock(struct drbd_conf *mdev)
1044{
1045 mutex_lock(&mdev->data.mutex);
1046 /* drbd_disconnect() could have called drbd_free_sock()
1047 * while we were waiting in down()... */
1048 if (unlikely(mdev->data.socket == NULL)) {
1049 mutex_unlock(&mdev->data.mutex);
1050 return 0;
1051 }
1052 return 1;
1053}
1054
/* Release mdev->data.mutex; the counterpart of a successful
 * drbd_get_data_sock(). */
static inline void drbd_put_data_sock(struct drbd_conf *mdev)
{
	mutex_unlock(&mdev->data.mutex);
}
1059
1060/*
1061 * function declarations
1062 *************************/
1063
1064/* drbd_main.c */
1065
/* Flag bits for the state change functions; each is a distinct bit.
 * CS_ORDERED is the combination of CS_WAIT_COMPLETE and CS_SERIALIZE. */
enum chg_state_flags {
	CS_HARD	= 1,
	CS_VERBOSE = 2,
	CS_WAIT_COMPLETE = 4,
	CS_SERIALIZE    = 8,
	CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
};
1073
1074extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1075extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
1076 union drbd_state mask, union drbd_state val);
1077extern void drbd_force_state(struct drbd_conf *, union drbd_state,
1078 union drbd_state);
1079extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
1080 union drbd_state, enum chg_state_flags);
1081extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
1082 enum chg_state_flags, struct completion *done);
1083extern void print_st_err(struct drbd_conf *, union drbd_state,
1084 union drbd_state, int);
1085extern int drbd_thread_start(struct drbd_thread *thi);
1086extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
1087#ifdef CONFIG_SMP
1088extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
1089extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
1090#else
1091#define drbd_thread_current_set_cpu(A) ({})
1092#define drbd_calc_cpu_mask(A) ({})
1093#endif
1094extern void drbd_free_resources(struct drbd_conf *mdev);
1095extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
1096 unsigned int set_size);
1097extern void tl_clear(struct drbd_conf *mdev);
1098extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
1099extern void drbd_free_sock(struct drbd_conf *mdev);
1100extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1101 void *buf, size_t size, unsigned msg_flags);
1102extern int drbd_send_protocol(struct drbd_conf *mdev);
1103extern int drbd_send_uuids(struct drbd_conf *mdev);
1104extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1105extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
1106extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
1107extern int _drbd_send_state(struct drbd_conf *mdev);
1108extern int drbd_send_state(struct drbd_conf *mdev);
1109extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1110 enum drbd_packets cmd, struct p_header *h,
1111 size_t size, unsigned msg_flags);
1112#define USE_DATA_SOCKET 1
1113#define USE_META_SOCKET 0
1114extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1115 enum drbd_packets cmd, struct p_header *h,
1116 size_t size);
1117extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
1118 char *data, size_t size);
1119extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
1120extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
1121 u32 set_size);
1122extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
1123 struct drbd_epoch_entry *e);
1124extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
1125 struct p_block_req *rp);
1126extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1127 struct p_data *dp);
1128extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1129 sector_t sector, int blksize, u64 block_id);
1130extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
1131 struct drbd_epoch_entry *e);
1132extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
1133extern int _drbd_send_barrier(struct drbd_conf *mdev,
1134 struct drbd_tl_epoch *barrier);
1135extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1136 sector_t sector, int size, u64 block_id);
1137extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
1138 sector_t sector,int size,
1139 void *digest, int digest_size,
1140 enum drbd_packets cmd);
1141extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
1142
1143extern int drbd_send_bitmap(struct drbd_conf *mdev);
1144extern int _drbd_send_bitmap(struct drbd_conf *mdev);
1145extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
1146extern void drbd_free_bc(struct drbd_backing_dev *ldev);
1147extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1148
1149/* drbd_meta-data.c (still in drbd_main.c) */
1150extern void drbd_md_sync(struct drbd_conf *mdev);
1151extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1152/* maybe define them below as inline? */
1153extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1154extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1155extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1156extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1157extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
1158extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
1159extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
1160extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
1161extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
1162extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1163 int (*io_fn)(struct drbd_conf *),
1164 void (*done)(struct drbd_conf *, int),
1165 char *why);
1166extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1167extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1168extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1169
1170
/* Meta data layout
   We reserve a 128MB Block (4k aligned)
   * either at the end of the backing device
   * or on a separate meta data device. */

#define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
/* The following numbers are sectors */
#define MD_AL_OFFSET 8	    /* 8 Sectors after start of meta area */
#define MD_AL_MAX_SIZE 64   /* = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage */
/* Allows up to about 3.8TB */
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)

/* Since the smallest IO unit is usually 512 byte */
#define MD_SECTOR_SHIFT	 9
#define MD_SECTOR_SIZE	 (1<<MD_SECTOR_SHIFT)

/* activity log */
#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
#define AL_EXTENT_SHIFT 22		 /* One extent represents 4M Storage */
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
1191
/* LN2_BPL is log2(BITS_PER_LONG); cpu_to_lel()/lel_to_cpu() convert a
 * native long to/from little endian using the matching 32 or 64 bit
 * helper. */
#if BITS_PER_LONG == 32
#define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A)
#define lel_to_cpu(A) le32_to_cpu(A)
#elif BITS_PER_LONG == 64
#define LN2_BPL 6
#define cpu_to_lel(A) cpu_to_le64(A)
#define lel_to_cpu(A) le64_to_cpu(A)
#else
#error "LN2 of BITS_PER_LONG unknown!"
#endif
1203
/* resync bitmap */
/* 16MB sized 'bitmap extent' to track syncer usage */
struct bm_extent {
	int rs_left; /* number of bits set (out of sync) in this extent. */
	int rs_failed; /* number of failed resync requests in this extent. */
	unsigned long flags;	/* BME_* bit numbers, see below */
	struct lc_element lce;
};

#define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
#define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
1215
1216/* drbd_bitmap.c */
1217/*
1218 * We need to store one bit for a block.
1219 * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
1220 * Bit 0 ==> local node thinks this block is binary identical on both nodes
1221 * Bit 1 ==> local node thinks this block needs to be synced.
1222 */
1223
1224#define BM_BLOCK_SHIFT 12 /* 4k per bit */
1225#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
1226/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
1227 * per sector of on disk bitmap */
1228#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
1229#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
1230
1231#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
1232#error "HAVE YOU FIXED drbdmeta AS WELL??"
1233#endif
1234
/*
 * Conversions between _storage_ sectors and bitmap bits:
 * one bit describes BM_SECT_PER_BIT storage sectors.
 */
#define BM_SECT_TO_BIT(x)	((x) >> (BM_BLOCK_SHIFT - 9))
#define BM_BIT_TO_SECT(x)	((sector_t)(x) << (BM_BLOCK_SHIFT - 9))
#define BM_SECT_PER_BIT		BM_BIT_TO_SECT(1)

/* number of bits ==> kilo bytes of storage they represent */
#define Bit2KB(bits)		((bits) << (BM_BLOCK_SHIFT - 10))

/*
 * _bitmap_ extent (resp. on disk bitmap sector) that holds the bit
 * for a given _storage_ sector
 */
#define BM_SECT_TO_EXT(x)	((x) >> (BM_EXT_SHIFT - 9))

/* number of _storage_ sectors covered by one bitmap sector */
#define BM_EXT_TO_SECT(x)	((sector_t)(x) << (BM_EXT_SHIFT - 9))
#define BM_SECT_PER_EXT		BM_EXT_TO_SECT(1)

/* one sector of the bitmap covers this many activity_log extents */
#define AL_EXT_PER_BM_SECT	(1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
#define BM_WORDS_PER_AL_EXT	(1 << (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT - LN2_BPL))

/* log2 of the number of bitmap blocks per bitmap extent, and its mask */
#define BM_BLOCKS_PER_BM_EXT_B		(BM_EXT_SHIFT - BM_BLOCK_SHIFT)
#define BM_BLOCKS_PER_BM_EXT_MASK	((1 << BM_BLOCKS_PER_BM_EXT_B) - 1)
1257
1258/* the extent in "PER_EXTENT" below is an activity log extent
1259 * we need that many (long words/bytes) to store the bitmap
1260 * of one AL_EXTENT_SIZE chunk of storage.
1261 * we can store the bitmap for that many AL_EXTENTS within
1262 * one sector of the _on_disk_ bitmap:
1263 * bit 0 bit 37 bit 38 bit (512*8)-1
1264 * ...|........|........|.. // ..|........|
1265 * sect. 0 `296 `304 ^(512*8*8)-1
1266 *
1267#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
1268#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
1269#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
1270 */
1271
/* largest sector count that fits into 32 bits */
#define DRBD_MAX_SECTORS_32	(0xffffffffLU)
/* sectors addressable through the bitmap part of the reserved meta data */
#define DRBD_MAX_SECTORS_BM \
	((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL << (BM_EXT_SHIFT - 9)))

#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
/* the bitmap area is the tighter of the two limits */
# define DRBD_MAX_SECTORS	DRBD_MAX_SECTORS_BM
# define DRBD_MAX_SECTORS_FLEX	DRBD_MAX_SECTORS_BM
#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
/* 32 bit longs without CONFIG_LBD: clip at the 32 bit limit */
# define DRBD_MAX_SECTORS	DRBD_MAX_SECTORS_32
# define DRBD_MAX_SECTORS_FLEX	DRBD_MAX_SECTORS_32
#else
# define DRBD_MAX_SECTORS	DRBD_MAX_SECTORS_BM
/* 16 TB in units of sectors */
# if BITS_PER_LONG == 32
/*
 * adjust by one page worth of bitmap,
 * so we won't wrap around in drbd_bm_find_next_bit.
 * you should use a 64bit OS for that much storage, anyways.
 */
#  define DRBD_MAX_SECTORS_FLEX	BM_BIT_TO_SECT(0xffff7fff)
# else
#  define DRBD_MAX_SECTORS_FLEX	BM_BIT_TO_SECT(0x1LU << 32)
# endif
#endif
1293
/*
 * Sector shift used by the "hash" functions of the tl_hash and ee_hash
 * tables.  With a value of 6, all IO within one 32K block ends up in the
 * same slot of the hash table.
 */
#define HT_SHIFT		6
#define DRBD_MAX_SEGMENT_SIZE	(1U << (9 + HT_SHIFT))

/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE		15
1302
1303extern int drbd_bm_init(struct drbd_conf *mdev);
1304extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
1305extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1306extern void drbd_bm_set_all(struct drbd_conf *mdev);
1307extern void drbd_bm_clear_all(struct drbd_conf *mdev);
1308extern int drbd_bm_set_bits(
1309 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1310extern int drbd_bm_clear_bits(
1311 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1312/* bm_set_bits variant for use while holding drbd_bm_lock */
1313extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
1314 const unsigned long s, const unsigned long e);
1315extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
1316extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1317extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
1318extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1319extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1320extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1321 unsigned long al_enr);
1322extern size_t drbd_bm_words(struct drbd_conf *mdev);
1323extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
1324extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
1325extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1326/* bm_find_next variants for use while you hold drbd_bm_lock() */
1327extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1328extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
1329extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
1330extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1331/* for receive_bitmap */
1332extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
1333 size_t number, unsigned long *buffer);
1334/* for _drbd_send_bitmap and drbd_bm_write_sect */
1335extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
1336 size_t number, unsigned long *buffer);
1337
1338extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
1339extern void drbd_bm_unlock(struct drbd_conf *mdev);
1340
1341extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1342/* drbd_main.c */
1343
1344extern struct kmem_cache *drbd_request_cache;
1345extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
1346extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
1347extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
1348extern mempool_t *drbd_request_mempool;
1349extern mempool_t *drbd_ee_mempool;
1350
1351extern struct page *drbd_pp_pool; /* drbd's page pool */
1352extern spinlock_t drbd_pp_lock;
1353extern int drbd_pp_vacant;
1354extern wait_queue_head_t drbd_pp_wait;
1355
1356extern rwlock_t global_state_lock;
1357
1358extern struct drbd_conf *drbd_new_device(unsigned int minor);
1359extern void drbd_free_mdev(struct drbd_conf *mdev);
1360
1361extern int proc_details;
1362
1363/* drbd_req */
1364extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
1365extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
1366extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
1367extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1368
1369
1370/* drbd_nl.c */
1371extern void drbd_suspend_io(struct drbd_conf *mdev);
1372extern void drbd_resume_io(struct drbd_conf *mdev);
1373extern char *ppsize(char *buf, unsigned long long size);
1374extern sector_t drbd_new_dev_size(struct drbd_conf *,
1375 struct drbd_backing_dev *);
1376enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1377extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
1378extern void resync_after_online_grow(struct drbd_conf *);
1379extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1380extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
1381 int force);
1382enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1383extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
1384
1385/* drbd_worker.c */
1386extern int drbd_worker(struct drbd_thread *thi);
1387extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
1388extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
1389extern void resume_next_sg(struct drbd_conf *mdev);
1390extern void suspend_other_sg(struct drbd_conf *mdev);
1391extern int drbd_resync_finished(struct drbd_conf *mdev);
1392/* maybe rather drbd_main.c ? */
1393extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1394 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1395extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1396
1397static inline void ov_oos_print(struct drbd_conf *mdev)
1398{
1399 if (mdev->ov_last_oos_size) {
1400 dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
1401 (unsigned long long)mdev->ov_last_oos_start,
1402 (unsigned long)mdev->ov_last_oos_size);
1403 }
1404 mdev->ov_last_oos_size=0;
1405}
1406
1407
void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);

/*
 * worker callbacks; all of them share the signature
 *	int cb(struct drbd_conf *, struct drbd_work *, int)
 */
int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
int w_io_error(struct drbd_conf *, struct drbd_work *, int);
int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);

void resync_timer_fn(unsigned long data);
1430
1431/* drbd_receiver.c */
1432extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
1433extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1434 u64 id,
1435 sector_t sector,
1436 unsigned int data_size,
1437 gfp_t gfp_mask) __must_hold(local);
1438extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
1439extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1440 struct list_head *head);
1441extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1442 struct list_head *head);
1443extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
1444extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
1445extern void drbd_flush_workqueue(struct drbd_conf *mdev);
1446
1447/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
1448 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
1449static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
1450 char __user *optval, int optlen)
1451{
1452 int err;
1453 if (level == SOL_SOCKET)
1454 err = sock_setsockopt(sock, level, optname, optval, optlen);
1455 else
1456 err = sock->ops->setsockopt(sock, level, optname, optval,
1457 optlen);
1458 return err;
1459}
1460
1461static inline void drbd_tcp_cork(struct socket *sock)
1462{
1463 int __user val = 1;
1464 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1465 (char __user *)&val, sizeof(val));
1466}
1467
1468static inline void drbd_tcp_uncork(struct socket *sock)
1469{
1470 int __user val = 0;
1471 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1472 (char __user *)&val, sizeof(val));
1473}
1474
1475static inline void drbd_tcp_nodelay(struct socket *sock)
1476{
1477 int __user val = 1;
1478 (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
1479 (char __user *)&val, sizeof(val));
1480}
1481
1482static inline void drbd_tcp_quickack(struct socket *sock)
1483{
1484 int __user val = 1;
1485 (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
1486 (char __user *)&val, sizeof(val));
1487}
1488
void drbd_bump_write_ordering(struct drbd_conf *mdev,
			      enum write_ordering_e wo);

/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
extern struct file_operations drbd_proc_fops;
/* string names for connection state and role values */
const char *drbd_conn_str(enum drbd_conns s);
const char *drbd_role_str(enum drbd_role s);
1496
1497/* drbd_actlog.c */
1498extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
1499extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
1500extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
1501extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1502extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1503extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
1504extern int drbd_rs_del_all(struct drbd_conf *mdev);
1505extern void drbd_rs_failed_io(struct drbd_conf *mdev,
1506 sector_t sector, int size);
1507extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
1508extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
1509 int size, const char *file, const unsigned int line);
1510#define drbd_set_in_sync(mdev, sector, size) \
1511 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
1512extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1513 int size, const char *file, const unsigned int line);
1514#define drbd_set_out_of_sync(mdev, sector, size) \
1515 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1516extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
1517extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
1518extern void drbd_al_shrink(struct drbd_conf *mdev);
1519
1520
1521/* drbd_nl.c */
1522
1523void drbd_nl_cleanup(void);
1524int __init drbd_nl_init(void);
1525void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
1526void drbd_bcast_sync_progress(struct drbd_conf *mdev);
1527void drbd_bcast_ee(struct drbd_conf *mdev,
1528 const char *reason, const int dgs,
1529 const char* seen_hash, const char* calc_hash,
1530 const struct drbd_epoch_entry* e);
1531
1532
/**
 * DOC: DRBD State macros
 *
 * These macros express state changes in an easily readable form.
 *
 * The NS macros expand to two values, a mask and a value, that can be
 * bit ored onto the current state as soon as the spinlock (req_lock)
 * was taken.
 *
 * The _NS macros are meant for state functions that get called with the
 * spinlock held. They expand directly to the new state value.
 *
 * Besides the basic forms NS() and _NS(), the variants _?NS[23] express
 * state changes that affect more than one aspect of the state.
 *
 * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
 * means that the network connection was established and that the peer
 * is in secondary role.
 */

/*
 * One <field>_MASK per state field; the NS macros build the name by
 * pasting "_MASK" onto the field name.
 */
#define role_MASK	R_MASK
#define peer_MASK	R_MASK
#define disk_MASK	D_MASK
#define pdsk_MASK	D_MASK
#define conn_MASK	C_MASK
#define susp_MASK	1
#define user_isp_MASK	1
#define aftr_isp_MASK	1
1559
/*
 * NS(), NS2(), NS3() - build a (mask, value) pair of union drbd_state
 *
 * Each macro expands to TWO comma separated expressions, so it has to be
 * used in the position of two consecutive function arguments:
 *  - the mask: all zero, except that every named field T is set to T_MASK
 *  - the value: all zero, except that every named field T is set to its S
 *
 * The temporaries use reserved-style names (__mask, __val) so that a state
 * expression S that itself refers to a caller variable named "mask" or
 * "val" is not captured by the temporary inside the statement expression.
 */
#define NS(T, S) \
	({ union drbd_state __mask; __mask.i = 0; __mask.T = T##_MASK; \
	   __mask; }), \
	({ union drbd_state __val; __val.i = 0; __val.T = (S); __val; })
#define NS2(T1, S1, T2, S2) \
	({ union drbd_state __mask; __mask.i = 0; __mask.T1 = T1##_MASK; \
	   __mask.T2 = T2##_MASK; __mask; }), \
	({ union drbd_state __val; __val.i = 0; __val.T1 = (S1); \
	   __val.T2 = (S2); __val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
	({ union drbd_state __mask; __mask.i = 0; __mask.T1 = T1##_MASK; \
	   __mask.T2 = T2##_MASK; __mask.T3 = T3##_MASK; __mask; }), \
	({ union drbd_state __val; __val.i = 0; __val.T1 = (S1); \
	   __val.T2 = (S2); __val.T3 = (S3); __val; })
1573
/*
 * _NS(), _NS2(), _NS3() - build a (device, new state) argument pair
 *
 * Each macro expands to TWO comma separated expressions: the device
 * expression D itself, followed by a copy of D->state in which every
 * named field T has been replaced by its S.
 *
 * D is parenthesized wherever it is used, so that any pointer expression
 * (e.g. &some_dev) works, not only a plain identifier. Note that D is
 * evaluated twice.
 */
#define _NS(D, T, S) \
	(D), ({ union drbd_state __ns; __ns.i = (D)->state.i; \
		__ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
	(D), ({ union drbd_state __ns; __ns.i = (D)->state.i; \
		__ns.T1 = (S1); __ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
	(D), ({ union drbd_state __ns; __ns.i = (D)->state.i; \
		__ns.T1 = (S1); __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
1582
1583/*
1584 * inline helper functions
1585 *************************/
1586
1587static inline void drbd_state_lock(struct drbd_conf *mdev)
1588{
1589 wait_event(mdev->misc_wait,
1590 !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
1591}
1592
1593static inline void drbd_state_unlock(struct drbd_conf *mdev)
1594{
1595 clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
1596 wake_up(&mdev->misc_wait);
1597}
1598
1599static inline int _drbd_set_state(struct drbd_conf *mdev,
1600 union drbd_state ns, enum chg_state_flags flags,
1601 struct completion *done)
1602{
1603 int rv;
1604
1605 read_lock(&global_state_lock);
1606 rv = __drbd_set_state(mdev, ns, flags, done);
1607 read_unlock(&global_state_lock);
1608
1609 return rv;
1610}
1611
1612/**
1613 * drbd_request_state() - Reqest a state change
1614 * @mdev: DRBD device.
1615 * @mask: mask of state bits to change.
1616 * @val: value of new state bits.
1617 *
1618 * This is the most graceful way of requesting a state change. It is verbose
1619 * quite verbose in case the state change is not possible, and all those
1620 * state changes are globally serialized.
1621 */
1622static inline int drbd_request_state(struct drbd_conf *mdev,
1623 union drbd_state mask,
1624 union drbd_state val)
1625{
1626 return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
1627}
1628
1629#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
1630static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
1631{
1632 switch (mdev->ldev->dc.on_io_error) {
1633 case EP_PASS_ON:
1634 if (!forcedetach) {
1635 if (printk_ratelimit())
1636 dev_err(DEV, "Local IO failed in %s."
1637 "Passing error on...\n", where);
1638 break;
1639 }
1640 /* NOTE fall through to detach case if forcedetach set */
1641 case EP_DETACH:
1642 case EP_CALL_HELPER:
1643 if (mdev->state.disk > D_FAILED) {
1644 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1645 dev_err(DEV, "Local IO failed in %s."
1646 "Detaching...\n", where);
1647 }
1648 break;
1649 }
1650}
1651
1652/**
1653 * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
1654 * @mdev: DRBD device.
1655 * @error: Error code passed to the IO completion callback
1656 * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
1657 *
1658 * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
1659 */
1660#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
1661static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
1662 int error, int forcedetach, const char *where)
1663{
1664 if (error) {
1665 unsigned long flags;
1666 spin_lock_irqsave(&mdev->req_lock, flags);
1667 __drbd_chk_io_error_(mdev, forcedetach, where);
1668 spin_unlock_irqrestore(&mdev->req_lock, flags);
1669 }
1670}
1671
1672
1673/**
1674 * drbd_md_first_sector() - Returns the first sector number of the meta data area
1675 * @bdev: Meta data block device.
1676 *
1677 * BTW, for internal meta data, this happens to be the maximum capacity
1678 * we could agree upon with our peer node.
1679 */
1680static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1681{
1682 switch (bdev->dc.meta_dev_idx) {
1683 case DRBD_MD_INDEX_INTERNAL:
1684 case DRBD_MD_INDEX_FLEX_INT:
1685 return bdev->md.md_offset + bdev->md.bm_offset;
1686 case DRBD_MD_INDEX_FLEX_EXT:
1687 default:
1688 return bdev->md.md_offset;
1689 }
1690}
1691
1692/**
1693 * drbd_md_last_sector() - Return the last sector number of the meta data area
1694 * @bdev: Meta data block device.
1695 */
1696static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1697{
1698 switch (bdev->dc.meta_dev_idx) {
1699 case DRBD_MD_INDEX_INTERNAL:
1700 case DRBD_MD_INDEX_FLEX_INT:
1701 return bdev->md.md_offset + MD_AL_OFFSET - 1;
1702 case DRBD_MD_INDEX_FLEX_EXT:
1703 default:
1704 return bdev->md.md_offset + bdev->md.md_size_sect;
1705 }
1706}
1707
1708/* Returns the number of 512 byte sectors of the device */
1709static inline sector_t drbd_get_capacity(struct block_device *bdev)
1710{
1711 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
1712 return bdev ? bdev->bd_inode->i_size >> 9 : 0;
1713}
1714
1715/**
1716 * drbd_get_max_capacity() - Returns the capacity we announce to out peer
1717 * @bdev: Meta data block device.
1718 *
1719 * returns the capacity we announce to out peer. we clip ourselves at the
1720 * various MAX_SECTORS, because if we don't, current implementation will
1721 * oops sooner or later
1722 */
1723static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1724{
1725 sector_t s;
1726 switch (bdev->dc.meta_dev_idx) {
1727 case DRBD_MD_INDEX_INTERNAL:
1728 case DRBD_MD_INDEX_FLEX_INT:
1729 s = drbd_get_capacity(bdev->backing_bdev)
1730 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1731 drbd_md_first_sector(bdev))
1732 : 0;
1733 break;
1734 case DRBD_MD_INDEX_FLEX_EXT:
1735 s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1736 drbd_get_capacity(bdev->backing_bdev));
1737 /* clip at maximum size the meta device can support */
1738 s = min_t(sector_t, s,
1739 BM_EXT_TO_SECT(bdev->md.md_size_sect
1740 - bdev->md.bm_offset));
1741 break;
1742 default:
1743 s = min_t(sector_t, DRBD_MAX_SECTORS,
1744 drbd_get_capacity(bdev->backing_bdev));
1745 }
1746 return s;
1747}
1748
1749/**
1750 * drbd_md_ss__() - Return the sector number of our meta data super block
1751 * @mdev: DRBD device.
1752 * @bdev: Meta data block device.
1753 */
1754static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
1755 struct drbd_backing_dev *bdev)
1756{
1757 switch (bdev->dc.meta_dev_idx) {
1758 default: /* external, some index */
1759 return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
1760 case DRBD_MD_INDEX_INTERNAL:
1761 /* with drbd08, internal meta data is always "flexible" */
1762 case DRBD_MD_INDEX_FLEX_INT:
1763 /* sizeof(struct md_on_disk_07) == 4k
1764 * position: last 4k aligned block of 4k size */
1765 if (!bdev->backing_bdev) {
1766 if (__ratelimit(&drbd_ratelimit_state)) {
1767 dev_err(DEV, "bdev->backing_bdev==NULL\n");
1768 dump_stack();
1769 }
1770 return 0;
1771 }
1772 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
1773 - MD_AL_OFFSET;
1774 case DRBD_MD_INDEX_FLEX_EXT:
1775 return 0;
1776 }
1777}
1778
1779static inline void
1780_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1781{
1782 list_add_tail(&w->list, &q->q);
1783 up(&q->s);
1784}
1785
1786static inline void
1787drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
1788{
1789 unsigned long flags;
1790 spin_lock_irqsave(&q->q_lock, flags);
1791 list_add(&w->list, &q->q);
1792 up(&q->s); /* within the spinlock,
1793 see comment near end of drbd_worker() */
1794 spin_unlock_irqrestore(&q->q_lock, flags);
1795}
1796
1797static inline void
1798drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1799{
1800 unsigned long flags;
1801 spin_lock_irqsave(&q->q_lock, flags);
1802 list_add_tail(&w->list, &q->q);
1803 up(&q->s); /* within the spinlock,
1804 see comment near end of drbd_worker() */
1805 spin_unlock_irqrestore(&q->q_lock, flags);
1806}
1807
1808static inline void wake_asender(struct drbd_conf *mdev)
1809{
1810 if (test_bit(SIGNAL_ASENDER, &mdev->flags))
1811 force_sig(DRBD_SIG, mdev->asender.task);
1812}
1813
1814static inline void request_ping(struct drbd_conf *mdev)
1815{
1816 set_bit(SEND_PING, &mdev->flags);
1817 wake_asender(mdev);
1818}
1819
1820static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
1821 enum drbd_packets cmd)
1822{
1823 struct p_header h;
1824 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
1825}
1826
1827static inline int drbd_send_ping(struct drbd_conf *mdev)
1828{
1829 struct p_header h;
1830 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
1831}
1832
1833static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1834{
1835 struct p_header h;
1836 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
1837}
1838
1839static inline void drbd_thread_stop(struct drbd_thread *thi)
1840{
1841 _drbd_thread_stop(thi, FALSE, TRUE);
1842}
1843
1844static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
1845{
1846 _drbd_thread_stop(thi, FALSE, FALSE);
1847}
1848
1849static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
1850{
1851 _drbd_thread_stop(thi, TRUE, FALSE);
1852}
1853
1854/* counts how many answer packets packets we expect from our peer,
1855 * for either explicit application requests,
1856 * or implicit barrier packets as necessary.
1857 * increased:
1858 * w_send_barrier
1859 * _req_mod(req, queue_for_net_write or queue_for_net_read);
1860 * it is much easier and equally valid to count what we queue for the
1861 * worker, even before it actually was queued or send.
1862 * (drbd_make_request_common; recovery path on read io-error)
1863 * decreased:
1864 * got_BarrierAck (respective tl_clear, tl_clear_barrier)
1865 * _req_mod(req, data_received)
1866 * [from receive_DataReply]
1867 * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
1868 * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
1869 * for some reason it is NOT decreased in got_NegAck,
1870 * but in the resulting cleanup code from report_params.
1871 * we should try to remember the reason for that...
1872 * _req_mod(req, send_failed or send_canceled)
1873 * _req_mod(req, connection_lost_while_pending)
1874 * [from tl_clear_barrier]
1875 */
1876static inline void inc_ap_pending(struct drbd_conf *mdev)
1877{
1878 atomic_inc(&mdev->ap_pending_cnt);
1879}
1880
/*
 * ERR_IF_CNT_IS_NEGATIVE() - complain if an atomic counter dropped below 0
 * @which: name of an atomic_t member of the variable "mdev", which has to
 *	   be in scope at the point of use.
 *
 * Logs the calling function, line, counter name and value via dev_err().
 *
 * Wrapped in do { } while (0) so that it behaves like a single statement:
 * the bare "if" it used to expand to would silently grab the "else" of an
 * enclosing if/else. The counter is read once, so the value that is
 * reported is the value that was tested.
 */
#define ERR_IF_CNT_IS_NEGATIVE(which)					\
	do {								\
		int __cnt = atomic_read(&mdev->which);			\
		if (__cnt < 0)						\
			dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
				__func__ , __LINE__ , __cnt);		\
	} while (0)
1886
/*
 * One expected answer packet less: decrement ap_pending_cnt, wake up
 * misc_wait when it reaches zero, and complain if it went negative.
 * The argument has to be a variable named mdev of type struct drbd_conf *.
 */
#define dec_ap_pending(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
			wake_up(&mdev->misc_wait);			\
		ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt);			\
	} while (0)
1892
1893/* counts how many resync-related answers we still expect from the peer
1894 * increase decrease
1895 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
1896 * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER)
1897 * (or P_NEG_ACK with ID_SYNCER)
1898 */
1899static inline void inc_rs_pending(struct drbd_conf *mdev)
1900{
1901 atomic_inc(&mdev->rs_pending_cnt);
1902}
1903
/*
 * Decrement rs_pending_cnt and complain if it went negative.
 * The argument has to be a variable named mdev of type struct drbd_conf *.
 */
#define dec_rs_pending(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_dec(&mdev->rs_pending_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt);			\
	} while (0)
1908
1909/* counts how many answers we still need to send to the peer.
1910 * increased on
1911 * receive_Data unless protocol A;
1912 * we need to send a P_RECV_ACK (proto B)
1913 * or P_WRITE_ACK (proto C)
1914 * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
1915 * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
1916 * receive_Barrier_* we need to send a P_BARRIER_ACK
1917 */
1918static inline void inc_unacked(struct drbd_conf *mdev)
1919{
1920 atomic_inc(&mdev->unacked_cnt);
1921}
1922
/*
 * Decrement unacked_cnt by one and complain if it went negative.
 * The argument has to be a variable named mdev of type struct drbd_conf *.
 */
#define dec_unacked(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_dec(&mdev->unacked_cnt);				\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);			\
	} while (0)
1927
/*
 * Subtract n from unacked_cnt and complain if it went negative.
 * The first argument has to be a variable named mdev of type
 * struct drbd_conf *.
 */
#define sub_unacked(mdev, n)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_sub(n, &mdev->unacked_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);			\
	} while (0)
1932
1933
1934static inline void put_net_conf(struct drbd_conf *mdev)
1935{
1936 if (atomic_dec_and_test(&mdev->net_cnt))
1937 wake_up(&mdev->misc_wait);
1938}
1939
1940/**
1941 * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
1942 * @mdev: DRBD device.
1943 *
1944 * You have to call put_net_conf() when finished working with mdev->net_conf.
1945 */
1946static inline int get_net_conf(struct drbd_conf *mdev)
1947{
1948 int have_net_conf;
1949
1950 atomic_inc(&mdev->net_cnt);
1951 have_net_conf = mdev->state.conn >= C_UNCONNECTED;
1952 if (!have_net_conf)
1953 put_net_conf(mdev);
1954 return have_net_conf;
1955}
1956
/**
 * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
 * @M: DRBD device.
 *
 * get_ldev() requires a disk state of at least D_INCONSISTENT;
 * get_ldev_if_state() lets the caller pick the minimum disk state MINS.
 * Both are wrapped in __cond_lock(local, ...).
 *
 * You have to call put_ldev() when finished working with mdev->ldev.
 */
#define get_ldev(M) \
	__cond_lock(local, _get_ldev_if_state(M, D_INCONSISTENT))
#define get_ldev_if_state(M, MINS) \
	__cond_lock(local, _get_ldev_if_state(M, MINS))
1965
1966static inline void put_ldev(struct drbd_conf *mdev)
1967{
1968 __release(local);
1969 if (atomic_dec_and_test(&mdev->local_cnt))
1970 wake_up(&mdev->misc_wait);
1971 D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
1972}
1973
1974#ifndef __CHECKER__
1975static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
1976{
1977 int io_allowed;
1978
1979 atomic_inc(&mdev->local_cnt);
1980 io_allowed = (mdev->state.disk >= mins);
1981 if (!io_allowed)
1982 put_ldev(mdev);
1983 return io_allowed;
1984}
1985#else
1986extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
1987#endif
1988
1989/* you must have an "get_ldev" reference */
1990static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
1991 unsigned long *bits_left, unsigned int *per_mil_done)
1992{
1993 /*
1994 * this is to break it at compile time when we change that
1995 * (we may feel 4TB maximum storage per drbd is not enough)
1996 */
1997 typecheck(unsigned long, mdev->rs_total);
1998
1999 /* note: both rs_total and rs_left are in bits, i.e. in
2000 * units of BM_BLOCK_SIZE.
2001 * for the percentage, we don't care. */
2002
2003 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2004 /* >> 10 to prevent overflow,
2005 * +1 to prevent division by zero */
2006 if (*bits_left > mdev->rs_total) {
2007 /* doh. maybe a logic bug somewhere.
2008 * may also be just a race condition
2009 * between this and a disconnect during sync.
2010 * for now, just prevent in-kernel buffer overflow.
2011 */
2012 smp_rmb();
2013 dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
2014 drbd_conn_str(mdev->state.conn),
2015 *bits_left, mdev->rs_total, mdev->rs_failed);
2016 *per_mil_done = 0;
2017 } else {
2018 /* make sure the calculation happens in long context */
2019 unsigned long tmp = 1000UL -
2020 (*bits_left >> 10)*1000UL
2021 / ((mdev->rs_total >> 10) + 1UL);
2022 *per_mil_done = tmp;
2023 }
2024}
2025
2026
2027/* this throttles on-the-fly application requests
2028 * according to max_buffers settings;
2029 * maybe re-implement using semaphores? */
2030static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
2031{
2032 int mxb = 1000000; /* arbitrary limit on open requests */
2033 if (get_net_conf(mdev)) {
2034 mxb = mdev->net_conf->max_buffers;
2035 put_net_conf(mdev);
2036 }
2037 return mxb;
2038}
2039
2040static inline int drbd_state_is_stable(union drbd_state s)
2041{
2042
2043 /* DO NOT add a default clause, we want the compiler to warn us
2044 * for any newly introduced state we may have forgotten to add here */
2045
2046 switch ((enum drbd_conns)s.conn) {
2047 /* new io only accepted when there is no connection, ... */
2048 case C_STANDALONE:
2049 case C_WF_CONNECTION:
2050 /* ... or there is a well established connection. */
2051 case C_CONNECTED:
2052 case C_SYNC_SOURCE:
2053 case C_SYNC_TARGET:
2054 case C_VERIFY_S:
2055 case C_VERIFY_T:
2056 case C_PAUSED_SYNC_S:
2057 case C_PAUSED_SYNC_T:
2058 /* maybe stable, look at the disk state */
2059 break;
2060
2061 /* no new io accepted during tansitional states
2062 * like handshake or teardown */
2063 case C_DISCONNECTING:
2064 case C_UNCONNECTED:
2065 case C_TIMEOUT:
2066 case C_BROKEN_PIPE:
2067 case C_NETWORK_FAILURE:
2068 case C_PROTOCOL_ERROR:
2069 case C_TEAR_DOWN:
2070 case C_WF_REPORT_PARAMS:
2071 case C_STARTING_SYNC_S:
2072 case C_STARTING_SYNC_T:
2073 case C_WF_BITMAP_S:
2074 case C_WF_BITMAP_T:
2075 case C_WF_SYNC_UUID:
2076 case C_MASK:
2077 /* not "stable" */
2078 return 0;
2079 }
2080
2081 switch ((enum drbd_disk_state)s.disk) {
2082 case D_DISKLESS:
2083 case D_INCONSISTENT:
2084 case D_OUTDATED:
2085 case D_CONSISTENT:
2086 case D_UP_TO_DATE:
2087 /* disk state is stable as well. */
2088 break;
2089
2090 /* no new io accepted during tansitional states */
2091 case D_ATTACHING:
2092 case D_FAILED:
2093 case D_NEGOTIATING:
2094 case D_UNKNOWN:
2095 case D_MASK:
2096 /* not "stable" */
2097 return 0;
2098 }
2099
2100 return 1;
2101}
2102
2103static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2104{
2105 int mxb = drbd_get_max_buffers(mdev);
2106
2107 if (mdev->state.susp)
2108 return 0;
2109 if (test_bit(SUSPEND_IO, &mdev->flags))
2110 return 0;
2111
2112 /* to avoid potential deadlock or bitmap corruption,
2113 * in various places, we only allow new application io
2114 * to start during "stable" states. */
2115
2116 /* no new io accepted when attaching or detaching the disk */
2117 if (!drbd_state_is_stable(mdev->state))
2118 return 0;
2119
2120 /* since some older kernels don't have atomic_add_unless,
2121 * and we are within the spinlock anyways, we have this workaround. */
2122 if (atomic_read(&mdev->ap_bio_cnt) > mxb)
2123 return 0;
2124 if (test_bit(BITMAP_IO, &mdev->flags))
2125 return 0;
2126 return 1;
2127}
2128
2129/* I'd like to use wait_event_lock_irq,
2130 * but I'm not sure when it got introduced,
2131 * and not sure when it has 3 or 4 arguments */
2132static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2133{
2134 /* compare with after_state_ch,
2135 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
2136 DEFINE_WAIT(wait);
2137
2138 /* we wait here
2139 * as long as the device is suspended
2140 * until the bitmap is no longer on the fly during connection
2141 * handshake as long as we would exeed the max_buffer limit.
2142 *
2143 * to avoid races with the reconnect code,
2144 * we need to atomic_inc within the spinlock. */
2145
2146 spin_lock_irq(&mdev->req_lock);
2147 while (!__inc_ap_bio_cond(mdev)) {
2148 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
2149 spin_unlock_irq(&mdev->req_lock);
2150 schedule();
2151 finish_wait(&mdev->misc_wait, &wait);
2152 spin_lock_irq(&mdev->req_lock);
2153 }
2154 atomic_add(one_or_two, &mdev->ap_bio_cnt);
2155 spin_unlock_irq(&mdev->req_lock);
2156}
2157
2158static inline void dec_ap_bio(struct drbd_conf *mdev)
2159{
2160 int mxb = drbd_get_max_buffers(mdev);
2161 int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
2162
2163 D_ASSERT(ap_bio >= 0);
2164 /* this currently does wake_up for every dec_ap_bio!
2165 * maybe rather introduce some type of hysteresis?
2166 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
2167 if (ap_bio < mxb)
2168 wake_up(&mdev->misc_wait);
2169 if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
2170 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
2171 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
2172 }
2173}
2174
2175static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
2176{
2177 mdev->ed_uuid = val;
2178}
2179
2180static inline int seq_cmp(u32 a, u32 b)
2181{
2182 /* we assume wrap around at 32bit.
2183 * for wrap around at 24bit (old atomic_t),
2184 * we'd have to
2185 * a <<= 8; b <<= 8;
2186 */
2187 return (s32)(a) - (s32)(b);
2188}
2189#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
2190#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
2191#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
2192#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
2193/* CAUTION: please no side effects in arguments! */
2194#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
2195
2196static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
2197{
2198 unsigned int m;
2199 spin_lock(&mdev->peer_seq_lock);
2200 m = seq_max(mdev->peer_seq, new_seq);
2201 mdev->peer_seq = m;
2202 spin_unlock(&mdev->peer_seq_lock);
2203 if (m == new_seq)
2204 wake_up(&mdev->seq_wait);
2205}
2206
2207static inline void drbd_update_congested(struct drbd_conf *mdev)
2208{
2209 struct sock *sk = mdev->data.socket->sk;
2210 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
2211 set_bit(NET_CONGESTED, &mdev->flags);
2212}
2213
#ifndef QUEUE_ORDERED_NONE
#define QUEUE_ORDERED_NONE 0
#endif

/*
 * drbd_queue_order_type() - queue ordering mode to use for @mdev
 *
 * sorry, we currently have no working implementation
 * of distributed TCQ stuff, so this is always QUEUE_ORDERED_NONE.
 */
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
	return QUEUE_ORDERED_NONE;
}
2223
2224static inline void drbd_blk_run_queue(struct request_queue *q)
2225{
2226 if (q && q->unplug_fn)
2227 q->unplug_fn(q);
2228}
2229
2230static inline void drbd_kick_lo(struct drbd_conf *mdev)
2231{
2232 if (get_ldev(mdev)) {
2233 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
2234 put_ldev(mdev);
2235 }
2236}
2237
2238static inline void drbd_md_flush(struct drbd_conf *mdev)
2239{
2240 int r;
2241
2242 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2243 return;
2244
2245 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
2246 if (r) {
2247 set_bit(MD_NO_BARRIER, &mdev->flags);
2248 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2249 }
2250}
2251
2252#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000000..11d8ff6016ac
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3700 @@
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/version.h>
31#include <linux/drbd.h>
32#include <asm/uaccess.h>
33#include <asm/types.h>
34#include <net/sock.h>
35#include <linux/ctype.h>
36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/file.h>
39#include <linux/proc_fs.h>
40#include <linux/init.h>
41#include <linux/mm.h>
42#include <linux/memcontrol.h>
43#include <linux/mm_inline.h>
44#include <linux/slab.h>
45#include <linux/random.h>
46#include <linux/reboot.h>
47#include <linux/notifier.h>
48#include <linux/kthread.h>
49
50#define __KERNEL_SYSCALLS__
51#include <linux/unistd.h>
52#include <linux/vmalloc.h>
53
54#include <linux/drbd_limits.h>
55#include "drbd_int.h"
56#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
57
58#include "drbd_vli.h"
59
60struct after_state_chg_work {
61 struct drbd_work w;
62 union drbd_state os;
63 union drbd_state ns;
64 enum chg_state_flags flags;
65 struct completion *done;
66};
67
68int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81
82MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
83 "Lars Ellenberg <lars@linbit.com>");
84MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
85MODULE_VERSION(REL_VERSION);
86MODULE_LICENSE("GPL");
87MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
88MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
89
90#include <linux/moduleparam.h>
91/* allow_open_on_secondary */
92MODULE_PARM_DESC(allow_oos, "DONT USE!");
93/* thanks to these macros, if compiled into the kernel (not-module),
94 * this becomes the boot parameter drbd.minor_count */
95module_param(minor_count, uint, 0444);
96module_param(disable_sendpage, bool, 0644);
97module_param(allow_oos, bool, 0);
98module_param(cn_idx, uint, 0444);
99module_param(proc_details, int, 0644);
100
101#ifdef CONFIG_DRBD_FAULT_INJECTION
102int enable_faults;
103int fault_rate;
104static int fault_count;
105int fault_devs;
106/* bitmap of enabled faults */
107module_param(enable_faults, int, 0664);
108/* fault rate % value - applies to all enabled faults */
109module_param(fault_rate, int, 0664);
110/* count of faults inserted */
111module_param(fault_count, int, 0664);
112/* bitmap of devices to insert faults on */
113module_param(fault_devs, int, 0644);
114#endif
115
116/* module parameter, defined */
117unsigned int minor_count = 32;
118int disable_sendpage;
119int allow_oos;
120unsigned int cn_idx = CN_IDX_DRBD;
121int proc_details; /* Detail level in proc drbd*/
122
123/* Module parameter for setting the user mode helper program
124 * to run. Default is /sbin/drbdadm */
125char usermode_helper[80] = "/sbin/drbdadm";
126
127module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
128
129/* in 2.6.x, our device mapping and config info contains our virtual gendisks
130 * as member "struct gendisk *vdisk;"
131 */
132struct drbd_conf **minor_table;
133
134struct kmem_cache *drbd_request_cache;
135struct kmem_cache *drbd_ee_cache; /* epoch entries */
136struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
137struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
138mempool_t *drbd_request_mempool;
139mempool_t *drbd_ee_mempool;
140
141/* I do not use a standard mempool, because:
142 1) I want to hand out the pre-allocated objects first.
143 2) I want to be able to interrupt sleeping allocation with a signal.
144 Note: This is a single linked list, the next pointer is the private
145 member of struct page.
146 */
147struct page *drbd_pp_pool;
148spinlock_t drbd_pp_lock;
149int drbd_pp_vacant;
150wait_queue_head_t drbd_pp_wait;
151
152DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
153
154static struct block_device_operations drbd_ops = {
155 .owner = THIS_MODULE,
156 .open = drbd_open,
157 .release = drbd_release,
158};
159
/* Number of elements of the array A.  A must be a real array, not a
 * pointer.  The argument is fully parenthesized so that any array-valued
 * expression may be passed. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
161
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.

   Takes a reference on the local disk (local_cnt) and keeps it only if the
   disk state is at least @mins.  Returns 1 if the reference is held,
   0 if not; in the latter case misc_wait is woken up if the count dropped
   to zero.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);
	if (mdev->state.disk >= mins)
		return 1;

	/* state too low: give the reference back again */
	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);

	return 0;
}

#endif
180
181/**
182 * DOC: The transfer log
183 *
184 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
185 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
186 * of the list. There is always at least one &struct drbd_tl_epoch object.
187 *
188 * Each &struct drbd_tl_epoch has a circular double linked list of requests
189 * attached.
190 */
191static int tl_init(struct drbd_conf *mdev)
192{
193 struct drbd_tl_epoch *b;
194
195 /* during device minor initialization, we may well use GFP_KERNEL */
196 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
197 if (!b)
198 return 0;
199 INIT_LIST_HEAD(&b->requests);
200 INIT_LIST_HEAD(&b->w.list);
201 b->next = NULL;
202 b->br_number = 4711;
203 b->n_req = 0;
204 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
205
206 mdev->oldest_tle = b;
207 mdev->newest_tle = b;
208 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
209
210 mdev->tl_hash = NULL;
211 mdev->tl_hash_s = 0;
212
213 return 1;
214}
215
216static void tl_cleanup(struct drbd_conf *mdev)
217{
218 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
219 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
220 kfree(mdev->oldest_tle);
221 mdev->oldest_tle = NULL;
222 kfree(mdev->unused_spare_tle);
223 mdev->unused_spare_tle = NULL;
224 kfree(mdev->tl_hash);
225 mdev->tl_hash = NULL;
226 mdev->tl_hash_s = 0;
227}
228
229/**
230 * _tl_add_barrier() - Adds a barrier to the transfer log
231 * @mdev: DRBD device.
232 * @new: Barrier to be added before the current head of the TL.
233 *
234 * The caller must hold the req_lock.
235 */
236void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
237{
238 struct drbd_tl_epoch *newest_before;
239
240 INIT_LIST_HEAD(&new->requests);
241 INIT_LIST_HEAD(&new->w.list);
242 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
243 new->next = NULL;
244 new->n_req = 0;
245
246 newest_before = mdev->newest_tle;
247 /* never send a barrier number == 0, because that is special-cased
248 * when using TCQ for our write ordering code */
249 new->br_number = (newest_before->br_number+1) ?: 1;
250 if (mdev->newest_tle != new) {
251 mdev->newest_tle->next = new;
252 mdev->newest_tle = new;
253 }
254}
255
256/**
257 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
258 * @mdev: DRBD device.
259 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
260 * @set_size: Expected number of requests before that barrier.
261 *
262 * In case the passed barrier_nr or set_size does not match the oldest
263 * &struct drbd_tl_epoch objects this function will cause a termination
264 * of the connection.
265 */
266void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
267 unsigned int set_size)
268{
269 struct drbd_tl_epoch *b, *nob; /* next old barrier */
270 struct list_head *le, *tle;
271 struct drbd_request *r;
272
273 spin_lock_irq(&mdev->req_lock);
274
275 b = mdev->oldest_tle;
276
277 /* first some paranoia code */
278 if (b == NULL) {
279 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
280 barrier_nr);
281 goto bail;
282 }
283 if (b->br_number != barrier_nr) {
284 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
285 barrier_nr, b->br_number);
286 goto bail;
287 }
288 if (b->n_req != set_size) {
289 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
290 barrier_nr, set_size, b->n_req);
291 goto bail;
292 }
293
294 /* Clean up list of requests processed during current epoch */
295 list_for_each_safe(le, tle, &b->requests) {
296 r = list_entry(le, struct drbd_request, tl_requests);
297 _req_mod(r, barrier_acked);
298 }
299 /* There could be requests on the list waiting for completion
300 of the write to the local disk. To avoid corruptions of
301 slab's data structures we have to remove the lists head.
302
303 Also there could have been a barrier ack out of sequence, overtaking
304 the write acks - which would be a bug and violating write ordering.
305 To not deadlock in case we lose connection while such requests are
306 still pending, we need some way to find them for the
307 _req_mode(connection_lost_while_pending).
308
309 These have been list_move'd to the out_of_sequence_requests list in
310 _req_mod(, barrier_acked) above.
311 */
312 list_del_init(&b->requests);
313
314 nob = b->next;
315 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
316 _tl_add_barrier(mdev, b);
317 if (nob)
318 mdev->oldest_tle = nob;
319 /* if nob == NULL b was the only barrier, and becomes the new
320 barrier. Therefore mdev->oldest_tle points already to b */
321 } else {
322 D_ASSERT(nob != NULL);
323 mdev->oldest_tle = nob;
324 kfree(b);
325 }
326
327 spin_unlock_irq(&mdev->req_lock);
328 dec_ap_pending(mdev);
329
330 return;
331
332bail:
333 spin_unlock_irq(&mdev->req_lock);
334 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
335}
336
337
338/**
339 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
340 * @mdev: DRBD device.
341 *
342 * This is called after the connection to the peer was lost. The storage covered
343 * by the requests on the transfer gets marked as our of sync. Called from the
344 * receiver thread and the worker thread.
345 */
346void tl_clear(struct drbd_conf *mdev)
347{
348 struct drbd_tl_epoch *b, *tmp;
349 struct list_head *le, *tle;
350 struct drbd_request *r;
351 int new_initial_bnr = net_random();
352
353 spin_lock_irq(&mdev->req_lock);
354
355 b = mdev->oldest_tle;
356 while (b) {
357 list_for_each_safe(le, tle, &b->requests) {
358 r = list_entry(le, struct drbd_request, tl_requests);
359 /* It would be nice to complete outside of spinlock.
360 * But this is easier for now. */
361 _req_mod(r, connection_lost_while_pending);
362 }
363 tmp = b->next;
364
365 /* there could still be requests on that ring list,
366 * in case local io is still pending */
367 list_del(&b->requests);
368
369 /* dec_ap_pending corresponding to queue_barrier.
370 * the newest barrier may not have been queued yet,
371 * in which case w.cb is still NULL. */
372 if (b->w.cb != NULL)
373 dec_ap_pending(mdev);
374
375 if (b == mdev->newest_tle) {
376 /* recycle, but reinit! */
377 D_ASSERT(tmp == NULL);
378 INIT_LIST_HEAD(&b->requests);
379 INIT_LIST_HEAD(&b->w.list);
380 b->w.cb = NULL;
381 b->br_number = new_initial_bnr;
382 b->n_req = 0;
383
384 mdev->oldest_tle = b;
385 break;
386 }
387 kfree(b);
388 b = tmp;
389 }
390
391 /* we expect this list to be empty. */
392 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
393
394 /* but just in case, clean it up anyways! */
395 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
396 r = list_entry(le, struct drbd_request, tl_requests);
397 /* It would be nice to complete outside of spinlock.
398 * But this is easier for now. */
399 _req_mod(r, connection_lost_while_pending);
400 }
401
402 /* ensure bit indicating barrier is required is clear */
403 clear_bit(CREATE_BARRIER, &mdev->flags);
404
405 spin_unlock_irq(&mdev->req_lock);
406}
407
408/**
409 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
410 * @mdev: DRBD device.
411 * @os: old (current) state.
412 * @ns: new (wanted) state.
413 */
414static int cl_wide_st_chg(struct drbd_conf *mdev,
415 union drbd_state os, union drbd_state ns)
416{
417 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
418 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
419 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
420 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
421 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
422 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
423 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
424}
425
426int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
427 union drbd_state mask, union drbd_state val)
428{
429 unsigned long flags;
430 union drbd_state os, ns;
431 int rv;
432
433 spin_lock_irqsave(&mdev->req_lock, flags);
434 os = mdev->state;
435 ns.i = (os.i & ~mask.i) | val.i;
436 rv = _drbd_set_state(mdev, ns, f, NULL);
437 ns = mdev->state;
438 spin_unlock_irqrestore(&mdev->req_lock, flags);
439
440 return rv;
441}
442
443/**
444 * drbd_force_state() - Impose a change which happens outside our control on our state
445 * @mdev: DRBD device.
446 * @mask: mask of state bits to change.
447 * @val: value of new state bits.
448 */
449void drbd_force_state(struct drbd_conf *mdev,
450 union drbd_state mask, union drbd_state val)
451{
452 drbd_change_state(mdev, CS_HARD, mask, val);
453}
454
455static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
456static int is_valid_state_transition(struct drbd_conf *,
457 union drbd_state, union drbd_state);
458static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
459 union drbd_state ns, int *warn_sync_abort);
460int drbd_send_state_req(struct drbd_conf *,
461 union drbd_state, union drbd_state);
462
463static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
464 union drbd_state mask, union drbd_state val)
465{
466 union drbd_state os, ns;
467 unsigned long flags;
468 int rv;
469
470 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
471 return SS_CW_SUCCESS;
472
473 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
474 return SS_CW_FAILED_BY_PEER;
475
476 rv = 0;
477 spin_lock_irqsave(&mdev->req_lock, flags);
478 os = mdev->state;
479 ns.i = (os.i & ~mask.i) | val.i;
480 ns = sanitize_state(mdev, os, ns, NULL);
481
482 if (!cl_wide_st_chg(mdev, os, ns))
483 rv = SS_CW_NO_NEED;
484 if (!rv) {
485 rv = is_valid_state(mdev, ns);
486 if (rv == SS_SUCCESS) {
487 rv = is_valid_state_transition(mdev, ns, os);
488 if (rv == SS_SUCCESS)
489 rv = 0; /* cont waiting, otherwise fail. */
490 }
491 }
492 spin_unlock_irqrestore(&mdev->req_lock, flags);
493
494 return rv;
495}
496
497/**
498 * drbd_req_state() - Perform an eventually cluster wide state change
499 * @mdev: DRBD device.
500 * @mask: mask of state bits to change.
501 * @val: value of new state bits.
502 * @f: flags
503 *
504 * Should not be called directly, use drbd_request_state() or
505 * _drbd_request_state().
506 */
507static int drbd_req_state(struct drbd_conf *mdev,
508 union drbd_state mask, union drbd_state val,
509 enum chg_state_flags f)
510{
511 struct completion done;
512 unsigned long flags;
513 union drbd_state os, ns;
514 int rv;
515
516 init_completion(&done);
517
518 if (f & CS_SERIALIZE)
519 mutex_lock(&mdev->state_mutex);
520
521 spin_lock_irqsave(&mdev->req_lock, flags);
522 os = mdev->state;
523 ns.i = (os.i & ~mask.i) | val.i;
524 ns = sanitize_state(mdev, os, ns, NULL);
525
526 if (cl_wide_st_chg(mdev, os, ns)) {
527 rv = is_valid_state(mdev, ns);
528 if (rv == SS_SUCCESS)
529 rv = is_valid_state_transition(mdev, ns, os);
530 spin_unlock_irqrestore(&mdev->req_lock, flags);
531
532 if (rv < SS_SUCCESS) {
533 if (f & CS_VERBOSE)
534 print_st_err(mdev, os, ns, rv);
535 goto abort;
536 }
537
538 drbd_state_lock(mdev);
539 if (!drbd_send_state_req(mdev, mask, val)) {
540 drbd_state_unlock(mdev);
541 rv = SS_CW_FAILED_BY_PEER;
542 if (f & CS_VERBOSE)
543 print_st_err(mdev, os, ns, rv);
544 goto abort;
545 }
546
547 wait_event(mdev->state_wait,
548 (rv = _req_st_cond(mdev, mask, val)));
549
550 if (rv < SS_SUCCESS) {
551 drbd_state_unlock(mdev);
552 if (f & CS_VERBOSE)
553 print_st_err(mdev, os, ns, rv);
554 goto abort;
555 }
556 spin_lock_irqsave(&mdev->req_lock, flags);
557 os = mdev->state;
558 ns.i = (os.i & ~mask.i) | val.i;
559 rv = _drbd_set_state(mdev, ns, f, &done);
560 drbd_state_unlock(mdev);
561 } else {
562 rv = _drbd_set_state(mdev, ns, f, &done);
563 }
564
565 spin_unlock_irqrestore(&mdev->req_lock, flags);
566
567 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
568 D_ASSERT(current != mdev->worker.task);
569 wait_for_completion(&done);
570 }
571
572abort:
573 if (f & CS_SERIALIZE)
574 mutex_unlock(&mdev->state_mutex);
575
576 return rv;
577}
578
579/**
580 * _drbd_request_state() - Request a state change (with flags)
581 * @mdev: DRBD device.
582 * @mask: mask of state bits to change.
583 * @val: value of new state bits.
584 * @f: flags
585 *
586 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
587 * flag, or when logging of failed state change requests is not desired.
588 */
589int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
590 union drbd_state val, enum chg_state_flags f)
591{
592 int rv;
593
594 wait_event(mdev->state_wait,
595 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
596
597 return rv;
598}
599
600static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
601{
602 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
603 name,
604 drbd_conn_str(ns.conn),
605 drbd_role_str(ns.role),
606 drbd_role_str(ns.peer),
607 drbd_disk_str(ns.disk),
608 drbd_disk_str(ns.pdsk),
609 ns.susp ? 's' : 'r',
610 ns.aftr_isp ? 'a' : '-',
611 ns.peer_isp ? 'p' : '-',
612 ns.user_isp ? 'u' : '-'
613 );
614}
615
616void print_st_err(struct drbd_conf *mdev,
617 union drbd_state os, union drbd_state ns, int err)
618{
619 if (err == SS_IN_TRANSIENT_STATE)
620 return;
621 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
622 print_st(mdev, " state", os);
623 print_st(mdev, "wanted", ns);
624}
625
626
/* String helpers for those state fields that have no drbd_<field>_str()
 * function of their own; PSC() below relies on the drbd_<field>_str naming. */
#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

/* PSC(field): if the field differs between os and ns, append
 * "field( old -> new ) " at pbp and advance pbp.
 * Expects os, ns and pbp to be in scope at the place of use. */
#define PSC(A) \
	({ if (ns.A != os.A) { \
		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
			      drbd_##A##_str(os.A), \
			      drbd_##A##_str(ns.A)); \
	} })
641
642/**
643 * is_valid_state() - Returns an SS_ error code if ns is not valid
644 * @mdev: DRBD device.
645 * @ns: State to consider.
646 */
647static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
648{
649 /* See drbd_state_sw_errors in drbd_strings.c */
650
651 enum drbd_fencing_p fp;
652 int rv = SS_SUCCESS;
653
654 fp = FP_DONT_CARE;
655 if (get_ldev(mdev)) {
656 fp = mdev->ldev->dc.fencing;
657 put_ldev(mdev);
658 }
659
660 if (get_net_conf(mdev)) {
661 if (!mdev->net_conf->two_primaries &&
662 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
663 rv = SS_TWO_PRIMARIES;
664 put_net_conf(mdev);
665 }
666
667 if (rv <= 0)
668 /* already found a reason to abort */;
669 else if (ns.role == R_SECONDARY && mdev->open_cnt)
670 rv = SS_DEVICE_IN_USE;
671
672 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
673 rv = SS_NO_UP_TO_DATE_DISK;
674
675 else if (fp >= FP_RESOURCE &&
676 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
677 rv = SS_PRIMARY_NOP;
678
679 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
680 rv = SS_NO_UP_TO_DATE_DISK;
681
682 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
683 rv = SS_NO_LOCAL_DISK;
684
685 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
686 rv = SS_NO_REMOTE_DISK;
687
688 else if ((ns.conn == C_CONNECTED ||
689 ns.conn == C_WF_BITMAP_S ||
690 ns.conn == C_SYNC_SOURCE ||
691 ns.conn == C_PAUSED_SYNC_S) &&
692 ns.disk == D_OUTDATED)
693 rv = SS_CONNECTED_OUTDATES;
694
695 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
696 (mdev->sync_conf.verify_alg[0] == 0))
697 rv = SS_NO_VERIFY_ALG;
698
699 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
700 mdev->agreed_pro_version < 88)
701 rv = SS_NOT_SUPPORTED;
702
703 return rv;
704}
705
706/**
707 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
708 * @mdev: DRBD device.
709 * @ns: new state.
710 * @os: old state.
711 */
712static int is_valid_state_transition(struct drbd_conf *mdev,
713 union drbd_state ns, union drbd_state os)
714{
715 int rv = SS_SUCCESS;
716
717 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
718 os.conn > C_CONNECTED)
719 rv = SS_RESYNC_RUNNING;
720
721 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
722 rv = SS_ALREADY_STANDALONE;
723
724 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
725 rv = SS_IS_DISKLESS;
726
727 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
728 rv = SS_NO_NET_CONFIG;
729
730 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
731 rv = SS_LOWER_THAN_OUTDATED;
732
733 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
734 rv = SS_IN_TRANSIENT_STATE;
735
736 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
737 rv = SS_IN_TRANSIENT_STATE;
738
739 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
740 rv = SS_NEED_CONNECTION;
741
742 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
743 ns.conn != os.conn && os.conn > C_CONNECTED)
744 rv = SS_RESYNC_RUNNING;
745
746 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
747 os.conn < C_CONNECTED)
748 rv = SS_NEED_CONNECTION;
749
750 return rv;
751}
752
753/**
754 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
755 * @mdev: DRBD device.
756 * @os: old state.
757 * @ns: new state.
758 * @warn_sync_abort:
759 *
760 * When we loose connection, we have to set the state of the peers disk (pdsk)
761 * to D_UNKNOWN. This rule and many more along those lines are in this function.
762 */
763static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
764 union drbd_state ns, int *warn_sync_abort)
765{
766 enum drbd_fencing_p fp;
767
768 fp = FP_DONT_CARE;
769 if (get_ldev(mdev)) {
770 fp = mdev->ldev->dc.fencing;
771 put_ldev(mdev);
772 }
773
774 /* Disallow Network errors to configure a device's network part */
775 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
776 os.conn <= C_DISCONNECTING)
777 ns.conn = os.conn;
778
779 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
780 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
781 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
782 ns.conn = os.conn;
783
784 /* After C_DISCONNECTING only C_STANDALONE may follow */
785 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
786 ns.conn = os.conn;
787
788 if (ns.conn < C_CONNECTED) {
789 ns.peer_isp = 0;
790 ns.peer = R_UNKNOWN;
791 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
792 ns.pdsk = D_UNKNOWN;
793 }
794
795 /* Clear the aftr_isp when becoming unconfigured */
796 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
797 ns.aftr_isp = 0;
798
799 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
800 ns.pdsk = D_UNKNOWN;
801
802 /* Abort resync if a disk fails/detaches */
803 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
804 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
805 if (warn_sync_abort)
806 *warn_sync_abort = 1;
807 ns.conn = C_CONNECTED;
808 }
809
810 if (ns.conn >= C_CONNECTED &&
811 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
812 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
813 switch (ns.conn) {
814 case C_WF_BITMAP_T:
815 case C_PAUSED_SYNC_T:
816 ns.disk = D_OUTDATED;
817 break;
818 case C_CONNECTED:
819 case C_WF_BITMAP_S:
820 case C_SYNC_SOURCE:
821 case C_PAUSED_SYNC_S:
822 ns.disk = D_UP_TO_DATE;
823 break;
824 case C_SYNC_TARGET:
825 ns.disk = D_INCONSISTENT;
826 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
827 break;
828 }
829 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
830 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
831 }
832
833 if (ns.conn >= C_CONNECTED &&
834 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
835 switch (ns.conn) {
836 case C_CONNECTED:
837 case C_WF_BITMAP_T:
838 case C_PAUSED_SYNC_T:
839 case C_SYNC_TARGET:
840 ns.pdsk = D_UP_TO_DATE;
841 break;
842 case C_WF_BITMAP_S:
843 case C_PAUSED_SYNC_S:
844 ns.pdsk = D_OUTDATED;
845 break;
846 case C_SYNC_SOURCE:
847 ns.pdsk = D_INCONSISTENT;
848 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
849 break;
850 }
851 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
852 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
853 }
854
855 /* Connection breaks down before we finished "Negotiating" */
856 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
857 get_ldev_if_state(mdev, D_NEGOTIATING)) {
858 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
859 ns.disk = mdev->new_state_tmp.disk;
860 ns.pdsk = mdev->new_state_tmp.pdsk;
861 } else {
862 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
863 ns.disk = D_DISKLESS;
864 ns.pdsk = D_UNKNOWN;
865 }
866 put_ldev(mdev);
867 }
868
869 if (fp == FP_STONITH &&
870 (ns.role == R_PRIMARY &&
871 ns.conn < C_CONNECTED &&
872 ns.pdsk > D_OUTDATED))
873 ns.susp = 1;
874
875 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
876 if (ns.conn == C_SYNC_SOURCE)
877 ns.conn = C_PAUSED_SYNC_S;
878 if (ns.conn == C_SYNC_TARGET)
879 ns.conn = C_PAUSED_SYNC_T;
880 } else {
881 if (ns.conn == C_PAUSED_SYNC_S)
882 ns.conn = C_SYNC_SOURCE;
883 if (ns.conn == C_PAUSED_SYNC_T)
884 ns.conn = C_SYNC_TARGET;
885 }
886
887 return ns;
888}
889
890/* helper for __drbd_set_state */
891static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
892{
893 if (cs == C_VERIFY_T) {
894 /* starting online verify from an arbitrary position
895 * does not fit well into the existing protocol.
896 * on C_VERIFY_T, we initialize ov_left and friends
897 * implicitly in receive_DataRequest once the
898 * first P_OV_REQUEST is received */
899 mdev->ov_start_sector = ~(sector_t)0;
900 } else {
901 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
902 if (bit >= mdev->rs_total)
903 mdev->ov_start_sector =
904 BM_BIT_TO_SECT(mdev->rs_total - 1);
905 mdev->ov_position = mdev->ov_start_sector;
906 }
907}
908
909/**
910 * __drbd_set_state() - Set a new DRBD state
911 * @mdev: DRBD device.
912 * @ns: new state; it is passed through sanitize_state() before it is used.
913 * @flags: Flags. CS_HARD skips the validity checks, CS_VERBOSE logs a
 *         rejected change via print_st_err(). The flags are also stored in
 *         the queued after-state-change work.
914 * @done: Optional completion, stored in the queued work item; w_after_state_ch()
 *        completes it after after_state_ch() finished if CS_WAIT_COMPLETE is set.
915 *
916 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 *
 * Returns SS_NOTHING_TO_DO if the sanitized new state equals the current
 * state, a value below SS_SUCCESS if the change was rejected (nothing is
 * modified in that case), and otherwise the result of the validity checks
 * (SS_SUCCESS when they were skipped because of CS_HARD).
917 */
918int __drbd_set_state(struct drbd_conf *mdev,
919 union drbd_state ns, enum chg_state_flags flags,
920 struct completion *done)
921{
922 union drbd_state os;
923 int rv = SS_SUCCESS;
924 int warn_sync_abort = 0;
925 struct after_state_chg_work *ascw;
926
 /* remember the state we are coming from */
927 os = mdev->state;
928
929 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
930
931 if (ns.i == os.i)
932 return SS_NOTHING_TO_DO;
933
934 if (!(flags & CS_HARD)) {
935 /* pre-state-change checks ; only look at ns */
936 /* See drbd_state_sw_errors in drbd_strings.c */
937
938 rv = is_valid_state(mdev, ns);
939 if (rv < SS_SUCCESS) {
940 /* If the old state was illegal as well, then let
941 this happen...*/
942
943 if (is_valid_state(mdev, os) == rv) {
944 dev_err(DEV, "Considering state change from bad state. "
945 "Error would be: '%s'\n",
946 drbd_set_st_err_str(rv));
947 print_st(mdev, "old", os);
948 print_st(mdev, "new", ns);
949 rv = is_valid_state_transition(mdev, ns, os);
950 }
951 } else
952 rv = is_valid_state_transition(mdev, ns, os);
953 }
954
 /* rejected: report if asked to, and leave mdev->state untouched */
955 if (rv < SS_SUCCESS) {
956 if (flags & CS_VERBOSE)
957 print_st_err(mdev, os, ns, rv);
958 return rv;
959 }
960
961 if (warn_sync_abort)
962 dev_warn(DEV, "Resync aborted.\n");
963
 /* log the state change; the PSC() macro is defined outside this function
  * and is used with the local buffer pb/pbp (and presumably os/ns) */
964 {
965 char *pbp, pb[300];
966 pbp = pb;
967 *pbp = 0;
968 PSC(role);
969 PSC(peer);
970 PSC(conn);
971 PSC(disk);
972 PSC(pdsk);
973 PSC(susp);
974 PSC(aftr_isp);
975 PSC(peer_isp);
976 PSC(user_isp);
977 dev_info(DEV, "%s\n", pb);
978 }
979
980 /* solve the race between becoming unconfigured,
981 * worker doing the cleanup, and
982 * admin reconfiguring us:
983 * on (re)configure, first set CONFIG_PENDING,
984 * then wait for a potentially exiting worker,
985 * start the worker, and schedule one no_op.
986 * then proceed with configuration.
987 */
988 if (ns.disk == D_DISKLESS &&
989 ns.conn == C_STANDALONE &&
990 ns.role == R_SECONDARY &&
991 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
992 set_bit(DEVICE_DYING, &mdev->flags);
993
 /* commit the new state and wake everybody waiting on it */
994 mdev->state.i = ns.i;
995 wake_up(&mdev->misc_wait);
996 wake_up(&mdev->state_wait);
997
998 /* post-state-change actions */
999 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1000 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1001 mod_timer(&mdev->resync_timer, jiffies);
1002 }
1003
1004 /* aborted verify run. log the last position */
1005 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1006 ns.conn < C_CONNECTED) {
1007 mdev->ov_start_sector =
1008 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1009 dev_info(DEV, "Online Verify reached sector %llu\n",
1010 (unsigned long long)mdev->ov_start_sector);
1011 }
1012
 /* resync resumed: account the time spent paused */
1013 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1014 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1015 dev_info(DEV, "Syncer continues.\n");
1016 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1017 if (ns.conn == C_SYNC_TARGET) {
1018 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1019 mod_timer(&mdev->resync_timer, jiffies);
1020 /* This if (!test_bit) is only needed for the case
1021 that a device that has ceased to use its timer,
1022 i.e. it is already in drbd_resync_finished() gets
1023 paused and resumed. */
1024 }
1025 }
1026
 /* resync paused: remember when, so the pause can be accounted on resume */
1027 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1028 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1029 dev_info(DEV, "Resync suspended\n");
1030 mdev->rs_mark_time = jiffies;
1031 if (ns.conn == C_PAUSED_SYNC_T)
1032 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1033 }
1034
 /* online verify starts: reset the verify bookkeeping */
1035 if (os.conn == C_CONNECTED &&
1036 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1037 mdev->ov_position = 0;
1038 mdev->rs_total =
1039 mdev->rs_mark_left = drbd_bm_bits(mdev);
1040 if (mdev->agreed_pro_version >= 90)
1041 set_ov_position(mdev, ns.conn);
1042 else
1043 mdev->ov_start_sector = 0;
1044 mdev->ov_left = mdev->rs_total
1045 - BM_SECT_TO_BIT(mdev->ov_position);
1046 mdev->rs_start =
1047 mdev->rs_mark_time = jiffies;
1048 mdev->ov_last_oos_size = 0;
1049 mdev->ov_last_oos_start = 0;
1050
1051 if (ns.conn == C_VERIFY_S) {
1052 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1053 (unsigned long long)mdev->ov_position);
1054 mod_timer(&mdev->resync_timer, jiffies);
1055 }
1056 }
1057
 /* recompute the meta data flags from the new state; mark the meta data
  * dirty only if they actually changed */
1058 if (get_ldev(mdev)) {
1059 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1060 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1061 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1062
1063 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1064 mdf |= MDF_CRASHED_PRIMARY;
1065 if (mdev->state.role == R_PRIMARY ||
1066 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1067 mdf |= MDF_PRIMARY_IND;
1068 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1069 mdf |= MDF_CONNECTED_IND;
1070 if (mdev->state.disk > D_INCONSISTENT)
1071 mdf |= MDF_CONSISTENT;
1072 if (mdev->state.disk > D_OUTDATED)
1073 mdf |= MDF_WAS_UP_TO_DATE;
1074 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1075 mdf |= MDF_PEER_OUT_DATED;
1076 if (mdf != mdev->ldev->md.flags) {
1077 mdev->ldev->md.flags = mdf;
1078 drbd_md_mark_dirty(mdev);
1079 }
1080 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1081 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1082 put_ldev(mdev);
1083 }
1084
1085 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1086 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1087 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1088 set_bit(CONSIDER_RESYNC, &mdev->flags);
1089
1090 /* Receiver should clean up itself */
1091 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1092 drbd_thread_stop_nowait(&mdev->receiver);
1093
1094 /* Now the receiver finished cleaning up itself, it should die */
1095 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1096 drbd_thread_stop_nowait(&mdev->receiver);
1097
1098 /* Upon network failure, we need to restart the receiver. */
1099 if (os.conn > C_TEAR_DOWN &&
1100 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1101 drbd_thread_restart_nowait(&mdev->receiver);
1102
 /* hand the actions that may sleep over to the worker (w_after_state_ch);
  * GFP_ATOMIC allocation.
  * NOTE(review): if this allocation fails, @done is never completed;
  * a caller waiting on it would presumably wait forever -- confirm. */
1103 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1104 if (ascw) {
1105 ascw->os = os;
1106 ascw->ns = ns;
1107 ascw->flags = flags;
1108 ascw->w.cb = w_after_state_ch;
1109 ascw->done = done;
1110 drbd_queue_work(&mdev->data.work, &ascw->w);
1111 } else {
1112 dev_warn(DEV, "Could not kmalloc an ascw\n");
1113 }
1114
1115 return rv;
1116}
1117
1118static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1119{
1120 struct after_state_chg_work *ascw =
1121 container_of(w, struct after_state_chg_work, w);
1122 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1123 if (ascw->flags & CS_WAIT_COMPLETE) {
1124 D_ASSERT(ascw->done != NULL);
1125 complete(ascw->done);
1126 }
1127 kfree(ascw);
1128
1129 return 1;
1130}
1131
1132static void abw_start_sync(struct drbd_conf *mdev, int rv)
1133{
1134 if (rv) {
1135 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1136 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1137 return;
1138 }
1139
1140 switch (mdev->state.conn) {
1141 case C_STARTING_SYNC_T:
1142 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1143 break;
1144 case C_STARTING_SYNC_S:
1145 drbd_start_resync(mdev, C_SYNC_SOURCE);
1146 break;
1147 }
1148}
1149
1150/**
1151 * after_state_ch() - Perform after state change actions that may sleep
1152 * @mdev: DRBD device.
1153 * @os: old state.
1154 * @ns: new state.
1155 * @flags: Flags
1156 */
1157static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1158 union drbd_state ns, enum chg_state_flags flags)
1159{
1160 enum drbd_fencing_p fp;
1161
1162 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1163 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1164 if (mdev->p_uuid)
1165 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1166 }
1167
1168 fp = FP_DONT_CARE;
1169 if (get_ldev(mdev)) {
1170 fp = mdev->ldev->dc.fencing;
1171 put_ldev(mdev);
1172 }
1173
1174 /* Inform userspace about the change... */
1175 drbd_bcast_state(mdev, ns);
1176
1177 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1178 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1179 drbd_khelper(mdev, "pri-on-incon-degr");
1180
1181 /* Here we have the actions that are performed after a
1182 state change. This function might sleep */
1183
1184 if (fp == FP_STONITH && ns.susp) {
1185 /* case1: The outdate peer handler is successful:
1186 * case2: The connection was established again: */
1187 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1188 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1189 tl_clear(mdev);
1190 spin_lock_irq(&mdev->req_lock);
1191 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1192 spin_unlock_irq(&mdev->req_lock);
1193 }
1194 }
1195 /* Do not change the order of the if above and the two below... */
1196 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1197 drbd_send_uuids(mdev);
1198 drbd_send_state(mdev);
1199 }
1200 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1201 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1202
1203 /* Lost contact to peer's copy of the data */
1204 if ((os.pdsk >= D_INCONSISTENT &&
1205 os.pdsk != D_UNKNOWN &&
1206 os.pdsk != D_OUTDATED)
1207 && (ns.pdsk < D_INCONSISTENT ||
1208 ns.pdsk == D_UNKNOWN ||
1209 ns.pdsk == D_OUTDATED)) {
1210 kfree(mdev->p_uuid);
1211 mdev->p_uuid = NULL;
1212 if (get_ldev(mdev)) {
1213 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1214 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1215 drbd_uuid_new_current(mdev);
1216 drbd_send_uuids(mdev);
1217 }
1218 put_ldev(mdev);
1219 }
1220 }
1221
1222 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1223 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1224 drbd_uuid_new_current(mdev);
1225
1226 /* D_DISKLESS Peer becomes secondary */
1227 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1228 drbd_al_to_on_disk_bm(mdev);
1229 put_ldev(mdev);
1230 }
1231
1232 /* Last part of the attaching process ... */
1233 if (ns.conn >= C_CONNECTED &&
1234 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1235 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1236 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1237 drbd_send_sizes(mdev, 0); /* to start sync... */
1238 drbd_send_uuids(mdev);
1239 drbd_send_state(mdev);
1240 }
1241
1242 /* We want to pause/continue resync, tell peer. */
1243 if (ns.conn >= C_CONNECTED &&
1244 ((os.aftr_isp != ns.aftr_isp) ||
1245 (os.user_isp != ns.user_isp)))
1246 drbd_send_state(mdev);
1247
1248 /* In case one of the isp bits got set, suspend other devices. */
1249 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1250 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1251 suspend_other_sg(mdev);
1252
1253 /* Make sure the peer gets informed about eventual state
1254 changes (ISP bits) while we were in WFReportParams. */
1255 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1256 drbd_send_state(mdev);
1257
1258 /* We are in the progress to start a full sync... */
1259 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1260 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1261 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1262
1263 /* We are invalidating our self... */
1264 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1265 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1266 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1267
1268 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1269 enum drbd_io_error_p eh;
1270
1271 eh = EP_PASS_ON;
1272 if (get_ldev_if_state(mdev, D_FAILED)) {
1273 eh = mdev->ldev->dc.on_io_error;
1274 put_ldev(mdev);
1275 }
1276
1277 drbd_rs_cancel_all(mdev);
1278 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1279 and it is D_DISKLESS here, local_cnt can only go down, it can
1280 not increase... It will reach zero */
1281 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1282 mdev->rs_total = 0;
1283 mdev->rs_failed = 0;
1284 atomic_set(&mdev->rs_pending_cnt, 0);
1285
1286 spin_lock_irq(&mdev->req_lock);
1287 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1288 spin_unlock_irq(&mdev->req_lock);
1289
1290 if (eh == EP_CALL_HELPER)
1291 drbd_khelper(mdev, "local-io-error");
1292 }
1293
1294 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1295
1296 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1297 if (drbd_send_state(mdev))
1298 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1299 else
1300 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1301 }
1302
1303 lc_destroy(mdev->resync);
1304 mdev->resync = NULL;
1305 lc_destroy(mdev->act_log);
1306 mdev->act_log = NULL;
1307 __no_warn(local,
1308 drbd_free_bc(mdev->ldev);
1309 mdev->ldev = NULL;);
1310
1311 if (mdev->md_io_tmpp)
1312 __free_page(mdev->md_io_tmpp);
1313 }
1314
1315 /* Disks got bigger while they were detached */
1316 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1317 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1318 if (ns.conn == C_CONNECTED)
1319 resync_after_online_grow(mdev);
1320 }
1321
1322 /* A resync finished or aborted, wake paused devices... */
1323 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1324 (os.peer_isp && !ns.peer_isp) ||
1325 (os.user_isp && !ns.user_isp))
1326 resume_next_sg(mdev);
1327
1328 /* Upon network connection, we need to start the receiver */
1329 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1330 drbd_thread_start(&mdev->receiver);
1331
1332 /* Terminate worker thread if we are unconfigured - it will be
1333 restarted as needed... */
1334 if (ns.disk == D_DISKLESS &&
1335 ns.conn == C_STANDALONE &&
1336 ns.role == R_SECONDARY) {
1337 if (os.aftr_isp != ns.aftr_isp)
1338 resume_next_sg(mdev);
1339 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1340 if (test_bit(DEVICE_DYING, &mdev->flags))
1341 drbd_thread_stop_nowait(&mdev->worker);
1342 }
1343
1344 drbd_md_sync(mdev);
1345}
1346
1347
1348static int drbd_thread_setup(void *arg)
1349{
1350 struct drbd_thread *thi = (struct drbd_thread *) arg;
1351 struct drbd_conf *mdev = thi->mdev;
1352 unsigned long flags;
1353 int retval;
1354
1355restart:
1356 retval = thi->function(thi);
1357
1358 spin_lock_irqsave(&thi->t_lock, flags);
1359
1360 /* if the receiver has been "Exiting", the last thing it did
1361 * was set the conn state to "StandAlone",
1362 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1363 * and receiver thread will be "started".
1364 * drbd_thread_start needs to set "Restarting" in that case.
1365 * t_state check and assignment needs to be within the same spinlock,
1366 * so either thread_start sees Exiting, and can remap to Restarting,
1367 * or thread_start see None, and can proceed as normal.
1368 */
1369
1370 if (thi->t_state == Restarting) {
1371 dev_info(DEV, "Restarting %s\n", current->comm);
1372 thi->t_state = Running;
1373 spin_unlock_irqrestore(&thi->t_lock, flags);
1374 goto restart;
1375 }
1376
1377 thi->task = NULL;
1378 thi->t_state = None;
1379 smp_mb();
1380 complete(&thi->stop);
1381 spin_unlock_irqrestore(&thi->t_lock, flags);
1382
1383 dev_info(DEV, "Terminating %s\n", current->comm);
1384
1385 /* Release mod reference taken when thread was started */
1386 module_put(THIS_MODULE);
1387 return retval;
1388}
1389
1390static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1391 int (*func) (struct drbd_thread *))
1392{
1393 spin_lock_init(&thi->t_lock);
1394 thi->task = NULL;
1395 thi->t_state = None;
1396 thi->function = func;
1397 thi->mdev = mdev;
1398}
1399
1400int drbd_thread_start(struct drbd_thread *thi)
1401{
1402 struct drbd_conf *mdev = thi->mdev;
1403 struct task_struct *nt;
1404 unsigned long flags;
1405
1406 const char *me =
1407 thi == &mdev->receiver ? "receiver" :
1408 thi == &mdev->asender ? "asender" :
1409 thi == &mdev->worker ? "worker" : "NONSENSE";
1410
1411 /* is used from state engine doing drbd_thread_stop_nowait,
1412 * while holding the req lock irqsave */
1413 spin_lock_irqsave(&thi->t_lock, flags);
1414
1415 switch (thi->t_state) {
1416 case None:
1417 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1418 me, current->comm, current->pid);
1419
1420 /* Get ref on module for thread - this is released when thread exits */
1421 if (!try_module_get(THIS_MODULE)) {
1422 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1423 spin_unlock_irqrestore(&thi->t_lock, flags);
1424 return FALSE;
1425 }
1426
1427 init_completion(&thi->stop);
1428 D_ASSERT(thi->task == NULL);
1429 thi->reset_cpu_mask = 1;
1430 thi->t_state = Running;
1431 spin_unlock_irqrestore(&thi->t_lock, flags);
1432 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1433
1434 nt = kthread_create(drbd_thread_setup, (void *) thi,
1435 "drbd%d_%s", mdev_to_minor(mdev), me);
1436
1437 if (IS_ERR(nt)) {
1438 dev_err(DEV, "Couldn't start thread\n");
1439
1440 module_put(THIS_MODULE);
1441 return FALSE;
1442 }
1443 spin_lock_irqsave(&thi->t_lock, flags);
1444 thi->task = nt;
1445 thi->t_state = Running;
1446 spin_unlock_irqrestore(&thi->t_lock, flags);
1447 wake_up_process(nt);
1448 break;
1449 case Exiting:
1450 thi->t_state = Restarting;
1451 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1452 me, current->comm, current->pid);
1453 /* fall through */
1454 case Running:
1455 case Restarting:
1456 default:
1457 spin_unlock_irqrestore(&thi->t_lock, flags);
1458 break;
1459 }
1460
1461 return TRUE;
1462}
1463
1464
1465void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1466{
1467 unsigned long flags;
1468
1469 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1470
1471 /* may be called from state engine, holding the req lock irqsave */
1472 spin_lock_irqsave(&thi->t_lock, flags);
1473
1474 if (thi->t_state == None) {
1475 spin_unlock_irqrestore(&thi->t_lock, flags);
1476 if (restart)
1477 drbd_thread_start(thi);
1478 return;
1479 }
1480
1481 if (thi->t_state != ns) {
1482 if (thi->task == NULL) {
1483 spin_unlock_irqrestore(&thi->t_lock, flags);
1484 return;
1485 }
1486
1487 thi->t_state = ns;
1488 smp_mb();
1489 init_completion(&thi->stop);
1490 if (thi->task != current)
1491 force_sig(DRBD_SIGKILL, thi->task);
1492
1493 }
1494
1495 spin_unlock_irqrestore(&thi->t_lock, flags);
1496
1497 if (wait)
1498 wait_for_completion(&thi->stop);
1499}
1500
1501#ifdef CONFIG_SMP
1502/**
1503 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1504 * @mdev: DRBD device.
1505 *
1506 * Forces all threads of a device onto the same CPU. This is beneficial for
1507 * DRBD's performance. May be overwritten by user's configuration.
1508 */
1509void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1510{
1511 int ord, cpu;
1512
1513 /* user override. */
1514 if (cpumask_weight(mdev->cpu_mask))
1515 return;
1516
1517 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1518 for_each_online_cpu(cpu) {
1519 if (ord-- == 0) {
1520 cpumask_set_cpu(cpu, mdev->cpu_mask);
1521 return;
1522 }
1523 }
1524 /* should not be reached */
1525 cpumask_setall(mdev->cpu_mask);
1526}
1527
1528/**
1529 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1530 * @mdev: DRBD device.
1531 *
1532 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1533 * prematurely.
1534 */
1535void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1536{
1537 struct task_struct *p = current;
1538 struct drbd_thread *thi =
1539 p == mdev->asender.task ? &mdev->asender :
1540 p == mdev->receiver.task ? &mdev->receiver :
1541 p == mdev->worker.task ? &mdev->worker :
1542 NULL;
1543 ERR_IF(thi == NULL)
1544 return;
1545 if (!thi->reset_cpu_mask)
1546 return;
1547 thi->reset_cpu_mask = 0;
1548 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1549}
1550#endif
1551
1552/* the appropriate socket mutex must be held already */
1553int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1554 enum drbd_packets cmd, struct p_header *h,
1555 size_t size, unsigned msg_flags)
1556{
1557 int sent, ok;
1558
1559 ERR_IF(!h) return FALSE;
1560 ERR_IF(!size) return FALSE;
1561
1562 h->magic = BE_DRBD_MAGIC;
1563 h->command = cpu_to_be16(cmd);
1564 h->length = cpu_to_be16(size-sizeof(struct p_header));
1565
1566 sent = drbd_send(mdev, sock, h, size, msg_flags);
1567
1568 ok = (sent == size);
1569 if (!ok)
1570 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1571 cmdname(cmd), (int)size, sent);
1572 return ok;
1573}
1574
1575/* don't pass the socket. we may only look at it
1576 * when we hold the appropriate socket mutex.
1577 */
1578int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1579 enum drbd_packets cmd, struct p_header *h, size_t size)
1580{
1581 int ok = 0;
1582 struct socket *sock;
1583
1584 if (use_data_socket) {
1585 mutex_lock(&mdev->data.mutex);
1586 sock = mdev->data.socket;
1587 } else {
1588 mutex_lock(&mdev->meta.mutex);
1589 sock = mdev->meta.socket;
1590 }
1591
1592 /* drbd_disconnect() could have called drbd_free_sock()
1593 * while we were waiting in down()... */
1594 if (likely(sock != NULL))
1595 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1596
1597 if (use_data_socket)
1598 mutex_unlock(&mdev->data.mutex);
1599 else
1600 mutex_unlock(&mdev->meta.mutex);
1601 return ok;
1602}
1603
1604int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1605 size_t size)
1606{
1607 struct p_header h;
1608 int ok;
1609
1610 h.magic = BE_DRBD_MAGIC;
1611 h.command = cpu_to_be16(cmd);
1612 h.length = cpu_to_be16(size);
1613
1614 if (!drbd_get_data_sock(mdev))
1615 return 0;
1616
1617 ok = (sizeof(h) ==
1618 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1619 ok = ok && (size ==
1620 drbd_send(mdev, mdev->data.socket, data, size, 0));
1621
1622 drbd_put_data_sock(mdev);
1623
1624 return ok;
1625}
1626
1627int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1628{
1629 struct p_rs_param_89 *p;
1630 struct socket *sock;
1631 int size, rv;
1632 const int apv = mdev->agreed_pro_version;
1633
1634 size = apv <= 87 ? sizeof(struct p_rs_param)
1635 : apv == 88 ? sizeof(struct p_rs_param)
1636 + strlen(mdev->sync_conf.verify_alg) + 1
1637 : /* 89 */ sizeof(struct p_rs_param_89);
1638
1639 /* used from admin command context and receiver/worker context.
1640 * to avoid kmalloc, grab the socket right here,
1641 * then use the pre-allocated sbuf there */
1642 mutex_lock(&mdev->data.mutex);
1643 sock = mdev->data.socket;
1644
1645 if (likely(sock != NULL)) {
1646 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1647
1648 p = &mdev->data.sbuf.rs_param_89;
1649
1650 /* initialize verify_alg and csums_alg */
1651 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1652
1653 p->rate = cpu_to_be32(sc->rate);
1654
1655 if (apv >= 88)
1656 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1657 if (apv >= 89)
1658 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1659
1660 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1661 } else
1662 rv = 0; /* not ok */
1663
1664 mutex_unlock(&mdev->data.mutex);
1665
1666 return rv;
1667}
1668
1669int drbd_send_protocol(struct drbd_conf *mdev)
1670{
1671 struct p_protocol *p;
1672 int size, rv;
1673
1674 size = sizeof(struct p_protocol);
1675
1676 if (mdev->agreed_pro_version >= 87)
1677 size += strlen(mdev->net_conf->integrity_alg) + 1;
1678
1679 /* we must not recurse into our own queue,
1680 * as that is blocked during handshake */
1681 p = kmalloc(size, GFP_NOIO);
1682 if (p == NULL)
1683 return 0;
1684
1685 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1686 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1687 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1688 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1689 p->want_lose = cpu_to_be32(mdev->net_conf->want_lose);
1690 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1691
1692 if (mdev->agreed_pro_version >= 87)
1693 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1694
1695 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1696 (struct p_header *)p, size);
1697 kfree(p);
1698 return rv;
1699}
1700
1701int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1702{
1703 struct p_uuids p;
1704 int i;
1705
1706 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1707 return 1;
1708
1709 for (i = UI_CURRENT; i < UI_SIZE; i++)
1710 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1711
1712 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1713 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1714 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1715 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1716 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1717 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1718
1719 put_ldev(mdev);
1720
1721 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1722 (struct p_header *)&p, sizeof(p));
1723}
1724
1725int drbd_send_uuids(struct drbd_conf *mdev)
1726{
1727 return _drbd_send_uuids(mdev, 0);
1728}
1729
1730int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1731{
1732 return _drbd_send_uuids(mdev, 8);
1733}
1734
1735
1736int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1737{
1738 struct p_rs_uuid p;
1739
1740 p.uuid = cpu_to_be64(val);
1741
1742 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1743 (struct p_header *)&p, sizeof(p));
1744}
1745
1746int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1747{
1748 struct p_sizes p;
1749 sector_t d_size, u_size;
1750 int q_order_type;
1751 int ok;
1752
1753 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1754 D_ASSERT(mdev->ldev->backing_bdev);
1755 d_size = drbd_get_max_capacity(mdev->ldev);
1756 u_size = mdev->ldev->dc.disk_size;
1757 q_order_type = drbd_queue_order_type(mdev);
1758 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1759 put_ldev(mdev);
1760 } else {
1761 d_size = 0;
1762 u_size = 0;
1763 q_order_type = QUEUE_ORDERED_NONE;
1764 }
1765
1766 p.d_size = cpu_to_be64(d_size);
1767 p.u_size = cpu_to_be64(u_size);
1768 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1769 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1770 p.queue_order_type = cpu_to_be32(q_order_type);
1771
1772 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1773 (struct p_header *)&p, sizeof(p));
1774 return ok;
1775}
1776
1777/**
1778 * drbd_send_state() - Sends the drbd state to the peer
1779 * @mdev: DRBD device.
1780 */
1781int drbd_send_state(struct drbd_conf *mdev)
1782{
1783 struct socket *sock;
1784 struct p_state p;
1785 int ok = 0;
1786
1787 /* Grab state lock so we wont send state if we're in the middle
1788 * of a cluster wide state change on another thread */
1789 drbd_state_lock(mdev);
1790
1791 mutex_lock(&mdev->data.mutex);
1792
1793 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1794 sock = mdev->data.socket;
1795
1796 if (likely(sock != NULL)) {
1797 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1798 (struct p_header *)&p, sizeof(p), 0);
1799 }
1800
1801 mutex_unlock(&mdev->data.mutex);
1802
1803 drbd_state_unlock(mdev);
1804 return ok;
1805}
1806
1807int drbd_send_state_req(struct drbd_conf *mdev,
1808 union drbd_state mask, union drbd_state val)
1809{
1810 struct p_req_state p;
1811
1812 p.mask = cpu_to_be32(mask.i);
1813 p.val = cpu_to_be32(val.i);
1814
1815 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1816 (struct p_header *)&p, sizeof(p));
1817}
1818
1819int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1820{
1821 struct p_req_state_reply p;
1822
1823 p.retcode = cpu_to_be32(retcode);
1824
1825 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1826 (struct p_header *)&p, sizeof(p));
1827}
1828
/*
 * fill_bitmap_rle_bits() - encode the next part of the bitmap as run lengths
 * @mdev: DRBD device.
 * @p:    compressed bitmap packet; p->code receives the encoded bit stream,
 *        and the start value and pad bits are stored via DCBP_set_*().
 * @c:    transfer context; bit_offset (and word_offset) are advanced over
 *        the bits that were encoded.
 *
 * Starting at c->bit_offset, alternating run lengths are VLI-encoded into
 * p->code until the buffer (BM_PACKET_VLI_BYTES_MAX bytes) is full or the
 * end of the bitmap (c->bm_bits) is reached.
 *
 * Returns the number of code bytes used (> 0) on success,
 * 0 if RLE must not be used (use_rle off or agreed protocol version < 90),
 * if there is nothing left to encode, if encoding failed, or if the
 * encoding would be larger than the plain bits it covers (the context is
 * rewound in that case), and
 * -1 if a zero run length shows up after the first iteration.
 */
1829int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1830 struct p_compressed_bm *p,
1831 struct bm_xfer_ctx *c)
1832{
1833 struct bitstream bs;
1834 unsigned long plain_bits;
1835 unsigned long tmp;
1836 unsigned long rl;
1837 unsigned len;
1838 unsigned toggle;
1839 int bits;
1840
1841 /* may we use this feature? */
1842 if ((mdev->sync_conf.use_rle == 0) ||
1843 (mdev->agreed_pro_version < 90))
1844 return 0;
1845
1846 if (c->bit_offset >= c->bm_bits)
1847 return 0; /* nothing to do. */
1848
1849 /* use at most thus many bytes */
1850 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1851 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1852 /* plain bits covered in this code string */
1853 plain_bits = 0;
1854
1855 /* p->encoding & 0x80 stores whether the first run length is set.
1856 * bit offset is implicit.
1857 * start with toggle == 2 to be able to tell the first iteration */
1858 toggle = 2;
1859
1860 /* see how much plain bits we can stuff into one packet
1861 * using RLE and VLI. */
1862 do {
 /* toggle == 0: search for the next zero bit; otherwise (1, or 2 on the
  * first iteration) search for the next set bit.  Either way tmp is the
  * end of the current run. */
1863 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1864 : _drbd_bm_find_next(mdev, c->bit_offset);
 /* nothing found: the run extends to the end of the bitmap */
1865 if (tmp == -1UL)
1866 tmp = c->bm_bits;
1867 rl = tmp - c->bit_offset;
1868
1869 if (toggle == 2) { /* first iteration */
1870 if (rl == 0) {
1871 /* the first checked bit was set,
1872 * store start value, */
1873 DCBP_set_start(p, 1);
1874 /* but skip encoding of zero run length */
 /* !2 == 0, so the next iteration searches for a zero bit */
1875 toggle = !toggle;
1876 continue;
1877 }
1878 DCBP_set_start(p, 0);
1879 }
1880
1881 /* paranoia: catch zero runlength.
1882 * can only happen if bitmap is modified while we scan it. */
1883 if (rl == 0) {
1884 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1885 "t:%u bo:%lu\n", toggle, c->bit_offset);
1886 return -1;
1887 }
1888
1889 bits = vli_encode_bits(&bs, rl);
1890 if (bits == -ENOBUFS) /* buffer full */
1891 break;
 /* any other encoding problem: report it and return 0, unlike the
  * zero run length case above, which returns -1 */
1892 if (bits <= 0) {
1893 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1894 return 0;
1895 }
1896
 /* this run is encoded: switch to the other bit value and advance */
1897 toggle = !toggle;
1898 plain_bits += rl;
1899 c->bit_offset = tmp;
1900 } while (c->bit_offset < c->bm_bits);
1901
 /* bytes used in p->code, counting a partially filled last byte */
1902 len = bs.cur.b - p->code + !!bs.cur.bit;
1903
1904 if (plain_bits < (len << 3)) {
1905 /* incompressible with this method.
1906 * we need to rewind both word and bit position. */
1907 c->bit_offset -= plain_bits;
1908 bm_xfer_ctx_bit_to_word_offset(c);
1909 c->bit_offset = c->word_offset * BITS_PER_LONG;
1910 return 0;
1911 }
1912
1913 /* RLE + VLI was able to compress it just fine.
1914 * update c->word_offset. */
1915 bm_xfer_ctx_bit_to_word_offset(c);
1916
1917 /* store pad_bits */
1918 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1919
1920 return len;
1921}
1922
1923enum { OK, FAILED, DONE }
1924send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1925 struct p_header *h, struct bm_xfer_ctx *c)
1926{
1927 struct p_compressed_bm *p = (void*)h;
1928 unsigned long num_words;
1929 int len;
1930 int ok;
1931
1932 len = fill_bitmap_rle_bits(mdev, p, c);
1933
1934 if (len < 0)
1935 return FAILED;
1936
1937 if (len) {
1938 DCBP_set_code(p, RLE_VLI_Bits);
1939 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1940 sizeof(*p) + len, 0);
1941
1942 c->packets[0]++;
1943 c->bytes[0] += sizeof(*p) + len;
1944
1945 if (c->bit_offset >= c->bm_bits)
1946 len = 0; /* DONE */
1947 } else {
1948 /* was not compressible.
1949 * send a buffer full of plain text bits instead. */
1950 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1951 len = num_words * sizeof(long);
1952 if (len)
1953 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1954 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1955 h, sizeof(struct p_header) + len, 0);
1956 c->word_offset += num_words;
1957 c->bit_offset = c->word_offset * BITS_PER_LONG;
1958
1959 c->packets[1]++;
1960 c->bytes[1] += sizeof(struct p_header) + len;
1961
1962 if (c->bit_offset > c->bm_bits)
1963 c->bit_offset = c->bm_bits;
1964 }
1965 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1966
1967 if (ok == DONE)
1968 INFO_bm_xfer_stats(mdev, "send", c);
1969 return ok;
1970}
1971
1972/* See the comment at receive_bitmap() */
1973int _drbd_send_bitmap(struct drbd_conf *mdev)
1974{
1975 struct bm_xfer_ctx c;
1976 struct p_header *p;
1977 int ret;
1978
1979 ERR_IF(!mdev->bitmap) return FALSE;
1980
1981 /* maybe we should use some per thread scratch page,
1982 * and allocate that during initial device creation? */
1983 p = (struct p_header *) __get_free_page(GFP_NOIO);
1984 if (!p) {
1985 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
1986 return FALSE;
1987 }
1988
1989 if (get_ldev(mdev)) {
1990 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1991 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
1992 drbd_bm_set_all(mdev);
1993 if (drbd_bm_write(mdev)) {
1994 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
1995 * but otherwise process as per normal - need to tell other
1996 * side that a full resync is required! */
1997 dev_err(DEV, "Failed to write bitmap to disk!\n");
1998 } else {
1999 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2000 drbd_md_sync(mdev);
2001 }
2002 }
2003 put_ldev(mdev);
2004 }
2005
2006 c = (struct bm_xfer_ctx) {
2007 .bm_bits = drbd_bm_bits(mdev),
2008 .bm_words = drbd_bm_words(mdev),
2009 };
2010
2011 do {
2012 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2013 } while (ret == OK);
2014
2015 free_page((unsigned long) p);
2016 return (ret == DONE);
2017}
2018
/*
 * Wrapper around _drbd_send_bitmap() that acquires and releases the
 * data socket around the transfer.
 *
 * Returns 0 on success, 1 if sending failed, and -1 if the data socket
 * could not be obtained.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int sent_ok;

	if (!drbd_get_data_sock(mdev))
		return -1;

	sent_ok = _drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return !sent_ok;
}
2029
2030int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2031{
2032 int ok;
2033 struct p_barrier_ack p;
2034
2035 p.barrier = barrier_nr;
2036 p.set_size = cpu_to_be32(set_size);
2037
2038 if (mdev->state.conn < C_CONNECTED)
2039 return FALSE;
2040 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2041 (struct p_header *)&p, sizeof(p));
2042 return ok;
2043}
2044
2045/**
2046 * _drbd_send_ack() - Sends an ack packet
2047 * @mdev: DRBD device.
2048 * @cmd: Packet command code.
2049 * @sector: sector, needs to be in big endian byte order
2050 * @blksize: size in byte, needs to be in big endian byte order
2051 * @block_id: Id, big endian byte order
2052 */
2053static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2054 u64 sector,
2055 u32 blksize,
2056 u64 block_id)
2057{
2058 int ok;
2059 struct p_block_ack p;
2060
2061 p.sector = sector;
2062 p.block_id = block_id;
2063 p.blksize = blksize;
2064 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2065
2066 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2067 return FALSE;
2068 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2069 (struct p_header *)&p, sizeof(p));
2070 return ok;
2071}
2072
2073int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2074 struct p_data *dp)
2075{
2076 const int header_size = sizeof(struct p_data)
2077 - sizeof(struct p_header);
2078 int data_size = ((struct p_header *)dp)->length - header_size;
2079
2080 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2081 dp->block_id);
2082}
2083
2084int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2085 struct p_block_req *rp)
2086{
2087 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2088}
2089
2090/**
2091 * drbd_send_ack() - Sends an ack packet
2092 * @mdev: DRBD device.
2093 * @cmd: Packet command code.
2094 * @e: Epoch entry.
2095 */
2096int drbd_send_ack(struct drbd_conf *mdev,
2097 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2098{
2099 return _drbd_send_ack(mdev, cmd,
2100 cpu_to_be64(e->sector),
2101 cpu_to_be32(e->size),
2102 e->block_id);
2103}
2104
2105/* This function misuses the block_id field to signal if the blocks
2106 * are is sync or not. */
2107int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2108 sector_t sector, int blksize, u64 block_id)
2109{
2110 return _drbd_send_ack(mdev, cmd,
2111 cpu_to_be64(sector),
2112 cpu_to_be32(blksize),
2113 cpu_to_be64(block_id));
2114}
2115
2116int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2117 sector_t sector, int size, u64 block_id)
2118{
2119 int ok;
2120 struct p_block_req p;
2121
2122 p.sector = cpu_to_be64(sector);
2123 p.block_id = block_id;
2124 p.blksize = cpu_to_be32(size);
2125
2126 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2127 (struct p_header *)&p, sizeof(p));
2128 return ok;
2129}
2130
2131int drbd_send_drequest_csum(struct drbd_conf *mdev,
2132 sector_t sector, int size,
2133 void *digest, int digest_size,
2134 enum drbd_packets cmd)
2135{
2136 int ok;
2137 struct p_block_req p;
2138
2139 p.sector = cpu_to_be64(sector);
2140 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2141 p.blksize = cpu_to_be32(size);
2142
2143 p.head.magic = BE_DRBD_MAGIC;
2144 p.head.command = cpu_to_be16(cmd);
2145 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2146
2147 mutex_lock(&mdev->data.mutex);
2148
2149 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2150 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2151
2152 mutex_unlock(&mdev->data.mutex);
2153
2154 return ok;
2155}
2156
2157int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2158{
2159 int ok;
2160 struct p_block_req p;
2161
2162 p.sector = cpu_to_be64(sector);
2163 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2164 p.blksize = cpu_to_be32(size);
2165
2166 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2167 (struct p_header *)&p, sizeof(p));
2168 return ok;
2169}
2170
2171/* called on sndtimeo
2172 * returns FALSE if we should retry,
2173 * TRUE if we think connection is dead
2174 */
2175static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2176{
2177 int drop_it;
2178 /* long elapsed = (long)(jiffies - mdev->last_received); */
2179
2180 drop_it = mdev->meta.socket == sock
2181 || !mdev->asender.task
2182 || get_t_state(&mdev->asender) != Running
2183 || mdev->state.conn < C_CONNECTED;
2184
2185 if (drop_it)
2186 return TRUE;
2187
2188 drop_it = !--mdev->ko_count;
2189 if (!drop_it) {
2190 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2191 current->comm, current->pid, mdev->ko_count);
2192 request_ping(mdev);
2193 }
2194
2195 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2196}
2197
2198/* The idea of sendpage seems to be to put some kind of reference
2199 * to the page into the skb, and to hand it over to the NIC. In
2200 * this process get_page() gets called.
2201 *
2202 * As soon as the page was really sent over the network put_page()
2203 * gets called by some part of the network layer. [ NIC driver? ]
2204 *
2205 * [ get_page() / put_page() increment/decrement the count. If count
2206 * reaches 0 the page will be freed. ]
2207 *
2208 * This works nicely with pages from FSs.
2209 * But this means that in protocol A we might signal IO completion too early!
2210 *
2211 * In order not to corrupt data during a resync we must make sure
2212 * that we do not reuse our own buffer pages (EEs) to early, therefore
2213 * we have the net_ee list.
2214 *
2215 * XFS seems to have problems, still, it submits pages with page_count == 0!
2216 * As a workaround, we disable sendpage on pages
2217 * with page_count == 0 or PageSlab.
2218 */
2219static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2220 int offset, size_t size)
2221{
2222 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2223 kunmap(page);
2224 if (sent == size)
2225 mdev->send_cnt += size>>9;
2226 return sent == size;
2227}
2228
2229static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2230 int offset, size_t size)
2231{
2232 mm_segment_t oldfs = get_fs();
2233 int sent, ok;
2234 int len = size;
2235
2236 /* e.g. XFS meta- & log-data is in slab pages, which have a
2237 * page_count of 0 and/or have PageSlab() set.
2238 * we cannot use send_page for those, as that does get_page();
2239 * put_page(); and would cause either a VM_BUG directly, or
2240 * __page_cache_release a page that would actually still be referenced
2241 * by someone, leading to some obscure delayed Oops somewhere else. */
2242 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2243 return _drbd_no_send_page(mdev, page, offset, size);
2244
2245 drbd_update_congested(mdev);
2246 set_fs(KERNEL_DS);
2247 do {
2248 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2249 offset, len,
2250 MSG_NOSIGNAL);
2251 if (sent == -EAGAIN) {
2252 if (we_should_drop_the_connection(mdev,
2253 mdev->data.socket))
2254 break;
2255 else
2256 continue;
2257 }
2258 if (sent <= 0) {
2259 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2260 __func__, (int)size, len, sent);
2261 break;
2262 }
2263 len -= sent;
2264 offset += sent;
2265 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2266 set_fs(oldfs);
2267 clear_bit(NET_CONGESTED, &mdev->flags);
2268
2269 ok = (len == 0);
2270 if (likely(ok))
2271 mdev->send_cnt += size>>9;
2272 return ok;
2273}
2274
2275static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2276{
2277 struct bio_vec *bvec;
2278 int i;
2279 __bio_for_each_segment(bvec, bio, i, 0) {
2280 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2281 bvec->bv_offset, bvec->bv_len))
2282 return 0;
2283 }
2284 return 1;
2285}
2286
2287static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2288{
2289 struct bio_vec *bvec;
2290 int i;
2291 __bio_for_each_segment(bvec, bio, i, 0) {
2292 if (!_drbd_send_page(mdev, bvec->bv_page,
2293 bvec->bv_offset, bvec->bv_len))
2294 return 0;
2295 }
2296
2297 return 1;
2298}
2299
2300/* Used to send write requests
2301 * R_PRIMARY -> Peer (P_DATA)
2302 */
2303int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2304{
2305 int ok = 1;
2306 struct p_data p;
2307 unsigned int dp_flags = 0;
2308 void *dgb;
2309 int dgs;
2310
2311 if (!drbd_get_data_sock(mdev))
2312 return 0;
2313
2314 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2315 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2316
2317 p.head.magic = BE_DRBD_MAGIC;
2318 p.head.command = cpu_to_be16(P_DATA);
2319 p.head.length =
2320 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2321
2322 p.sector = cpu_to_be64(req->sector);
2323 p.block_id = (unsigned long)req;
2324 p.seq_num = cpu_to_be32(req->seq_num =
2325 atomic_add_return(1, &mdev->packet_seq));
2326 dp_flags = 0;
2327
2328 /* NOTE: no need to check if barriers supported here as we would
2329 * not pass the test in make_request_common in that case
2330 */
2331 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2332 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2333 /* dp_flags |= DP_HARDBARRIER; */
2334 }
2335 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2336 dp_flags |= DP_RW_SYNC;
2337 /* for now handle SYNCIO and UNPLUG
2338 * as if they still were one and the same flag */
2339 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2340 dp_flags |= DP_RW_SYNC;
2341 if (mdev->state.conn >= C_SYNC_SOURCE &&
2342 mdev->state.conn <= C_PAUSED_SYNC_T)
2343 dp_flags |= DP_MAY_SET_IN_SYNC;
2344
2345 p.dp_flags = cpu_to_be32(dp_flags);
2346 set_bit(UNPLUG_REMOTE, &mdev->flags);
2347 ok = (sizeof(p) ==
2348 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2349 if (ok && dgs) {
2350 dgb = mdev->int_dig_out;
2351 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2352 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2353 }
2354 if (ok) {
2355 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2356 ok = _drbd_send_bio(mdev, req->master_bio);
2357 else
2358 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2359 }
2360
2361 drbd_put_data_sock(mdev);
2362 return ok;
2363}
2364
2365/* answer packet, used to send data back for read requests:
2366 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2367 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2368 */
2369int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2370 struct drbd_epoch_entry *e)
2371{
2372 int ok;
2373 struct p_data p;
2374 void *dgb;
2375 int dgs;
2376
2377 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2378 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2379
2380 p.head.magic = BE_DRBD_MAGIC;
2381 p.head.command = cpu_to_be16(cmd);
2382 p.head.length =
2383 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2384
2385 p.sector = cpu_to_be64(e->sector);
2386 p.block_id = e->block_id;
2387 /* p.seq_num = 0; No sequence numbers here.. */
2388
2389 /* Only called by our kernel thread.
2390 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2391 * in response to admin command or module unload.
2392 */
2393 if (!drbd_get_data_sock(mdev))
2394 return 0;
2395
2396 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2397 sizeof(p), MSG_MORE);
2398 if (ok && dgs) {
2399 dgb = mdev->int_dig_out;
2400 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2401 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2402 }
2403 if (ok)
2404 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2405
2406 drbd_put_data_sock(mdev);
2407 return ok;
2408}
2409
2410/*
2411 drbd_send distinguishes two cases:
2412
2413 Packets sent via the data socket "sock"
2414 and packets sent via the meta data socket "msock"
2415
2416 sock msock
2417 -----------------+-------------------------+------------------------------
2418 timeout conf.timeout / 2 conf.timeout / 2
2419 timeout action send a ping via msock Abort communication
2420 and close all sockets
2421*/
2422
2423/*
2424 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2425 */
2426int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2427 void *buf, size_t size, unsigned msg_flags)
2428{
2429 struct kvec iov;
2430 struct msghdr msg;
2431 int rv, sent = 0;
2432
2433 if (!sock)
2434 return -1000;
2435
2436 /* THINK if (signal_pending) return ... ? */
2437
2438 iov.iov_base = buf;
2439 iov.iov_len = size;
2440
2441 msg.msg_name = NULL;
2442 msg.msg_namelen = 0;
2443 msg.msg_control = NULL;
2444 msg.msg_controllen = 0;
2445 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2446
2447 if (sock == mdev->data.socket) {
2448 mdev->ko_count = mdev->net_conf->ko_count;
2449 drbd_update_congested(mdev);
2450 }
2451 do {
2452 /* STRANGE
2453 * tcp_sendmsg does _not_ use its size parameter at all ?
2454 *
2455 * -EAGAIN on timeout, -EINTR on signal.
2456 */
2457/* THINK
2458 * do we need to block DRBD_SIG if sock == &meta.socket ??
2459 * otherwise wake_asender() might interrupt some send_*Ack !
2460 */
2461 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2462 if (rv == -EAGAIN) {
2463 if (we_should_drop_the_connection(mdev, sock))
2464 break;
2465 else
2466 continue;
2467 }
2468 D_ASSERT(rv != 0);
2469 if (rv == -EINTR) {
2470 flush_signals(current);
2471 rv = 0;
2472 }
2473 if (rv < 0)
2474 break;
2475 sent += rv;
2476 iov.iov_base += rv;
2477 iov.iov_len -= rv;
2478 } while (sent < size);
2479
2480 if (sock == mdev->data.socket)
2481 clear_bit(NET_CONGESTED, &mdev->flags);
2482
2483 if (rv <= 0) {
2484 if (rv != -EAGAIN) {
2485 dev_err(DEV, "%s_sendmsg returned %d\n",
2486 sock == mdev->meta.socket ? "msock" : "sock",
2487 rv);
2488 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2489 } else
2490 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2491 }
2492
2493 return sent;
2494}
2495
2496static int drbd_open(struct block_device *bdev, fmode_t mode)
2497{
2498 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2499 unsigned long flags;
2500 int rv = 0;
2501
2502 spin_lock_irqsave(&mdev->req_lock, flags);
2503 /* to have a stable mdev->state.role
2504 * and no race with updating open_cnt */
2505
2506 if (mdev->state.role != R_PRIMARY) {
2507 if (mode & FMODE_WRITE)
2508 rv = -EROFS;
2509 else if (!allow_oos)
2510 rv = -EMEDIUMTYPE;
2511 }
2512
2513 if (!rv)
2514 mdev->open_cnt++;
2515 spin_unlock_irqrestore(&mdev->req_lock, flags);
2516
2517 return rv;
2518}
2519
2520static int drbd_release(struct gendisk *gd, fmode_t mode)
2521{
2522 struct drbd_conf *mdev = gd->private_data;
2523 mdev->open_cnt--;
2524 return 0;
2525}
2526
2527static void drbd_unplug_fn(struct request_queue *q)
2528{
2529 struct drbd_conf *mdev = q->queuedata;
2530
2531 /* unplug FIRST */
2532 spin_lock_irq(q->queue_lock);
2533 blk_remove_plug(q);
2534 spin_unlock_irq(q->queue_lock);
2535
2536 /* only if connected */
2537 spin_lock_irq(&mdev->req_lock);
2538 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2539 D_ASSERT(mdev->state.role == R_PRIMARY);
2540 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2541 /* add to the data.work queue,
2542 * unless already queued.
2543 * XXX this might be a good addition to drbd_queue_work
2544 * anyways, to detect "double queuing" ... */
2545 if (list_empty(&mdev->unplug_work.list))
2546 drbd_queue_work(&mdev->data.work,
2547 &mdev->unplug_work);
2548 }
2549 }
2550 spin_unlock_irq(&mdev->req_lock);
2551
2552 if (mdev->state.disk >= D_INCONSISTENT)
2553 drbd_kick_lo(mdev);
2554}
2555
2556static void drbd_set_defaults(struct drbd_conf *mdev)
2557{
2558 mdev->sync_conf.after = DRBD_AFTER_DEF;
2559 mdev->sync_conf.rate = DRBD_RATE_DEF;
2560 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2561 mdev->state = (union drbd_state) {
2562 { .role = R_SECONDARY,
2563 .peer = R_UNKNOWN,
2564 .conn = C_STANDALONE,
2565 .disk = D_DISKLESS,
2566 .pdsk = D_UNKNOWN,
2567 .susp = 0
2568 } };
2569}
2570
2571void drbd_init_set_defaults(struct drbd_conf *mdev)
2572{
2573 /* the memset(,0,) did most of this.
2574 * note: only assignments, no allocation in here */
2575
2576 drbd_set_defaults(mdev);
2577
2578 /* for now, we do NOT yet support it,
2579 * even though we start some framework
2580 * to eventually support barriers */
2581 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2582
2583 atomic_set(&mdev->ap_bio_cnt, 0);
2584 atomic_set(&mdev->ap_pending_cnt, 0);
2585 atomic_set(&mdev->rs_pending_cnt, 0);
2586 atomic_set(&mdev->unacked_cnt, 0);
2587 atomic_set(&mdev->local_cnt, 0);
2588 atomic_set(&mdev->net_cnt, 0);
2589 atomic_set(&mdev->packet_seq, 0);
2590 atomic_set(&mdev->pp_in_use, 0);
2591
2592 mutex_init(&mdev->md_io_mutex);
2593 mutex_init(&mdev->data.mutex);
2594 mutex_init(&mdev->meta.mutex);
2595 sema_init(&mdev->data.work.s, 0);
2596 sema_init(&mdev->meta.work.s, 0);
2597 mutex_init(&mdev->state_mutex);
2598
2599 spin_lock_init(&mdev->data.work.q_lock);
2600 spin_lock_init(&mdev->meta.work.q_lock);
2601
2602 spin_lock_init(&mdev->al_lock);
2603 spin_lock_init(&mdev->req_lock);
2604 spin_lock_init(&mdev->peer_seq_lock);
2605 spin_lock_init(&mdev->epoch_lock);
2606
2607 INIT_LIST_HEAD(&mdev->active_ee);
2608 INIT_LIST_HEAD(&mdev->sync_ee);
2609 INIT_LIST_HEAD(&mdev->done_ee);
2610 INIT_LIST_HEAD(&mdev->read_ee);
2611 INIT_LIST_HEAD(&mdev->net_ee);
2612 INIT_LIST_HEAD(&mdev->resync_reads);
2613 INIT_LIST_HEAD(&mdev->data.work.q);
2614 INIT_LIST_HEAD(&mdev->meta.work.q);
2615 INIT_LIST_HEAD(&mdev->resync_work.list);
2616 INIT_LIST_HEAD(&mdev->unplug_work.list);
2617 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2618 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2619 mdev->resync_work.cb = w_resync_inactive;
2620 mdev->unplug_work.cb = w_send_write_hint;
2621 mdev->md_sync_work.cb = w_md_sync;
2622 mdev->bm_io_work.w.cb = w_bitmap_io;
2623 init_timer(&mdev->resync_timer);
2624 init_timer(&mdev->md_sync_timer);
2625 mdev->resync_timer.function = resync_timer_fn;
2626 mdev->resync_timer.data = (unsigned long) mdev;
2627 mdev->md_sync_timer.function = md_sync_timer_fn;
2628 mdev->md_sync_timer.data = (unsigned long) mdev;
2629
2630 init_waitqueue_head(&mdev->misc_wait);
2631 init_waitqueue_head(&mdev->state_wait);
2632 init_waitqueue_head(&mdev->ee_wait);
2633 init_waitqueue_head(&mdev->al_wait);
2634 init_waitqueue_head(&mdev->seq_wait);
2635
2636 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2637 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2638 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2639
2640 mdev->agreed_pro_version = PRO_VERSION_MAX;
2641 mdev->write_ordering = WO_bio_barrier;
2642 mdev->resync_wenr = LC_FREE;
2643}
2644
2645void drbd_mdev_cleanup(struct drbd_conf *mdev)
2646{
2647 if (mdev->receiver.t_state != None)
2648 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2649 mdev->receiver.t_state);
2650
2651 /* no need to lock it, I'm the only thread alive */
2652 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2653 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2654 mdev->al_writ_cnt =
2655 mdev->bm_writ_cnt =
2656 mdev->read_cnt =
2657 mdev->recv_cnt =
2658 mdev->send_cnt =
2659 mdev->writ_cnt =
2660 mdev->p_size =
2661 mdev->rs_start =
2662 mdev->rs_total =
2663 mdev->rs_failed =
2664 mdev->rs_mark_left =
2665 mdev->rs_mark_time = 0;
2666 D_ASSERT(mdev->net_conf == NULL);
2667
2668 drbd_set_my_capacity(mdev, 0);
2669 if (mdev->bitmap) {
2670 /* maybe never allocated. */
2671 drbd_bm_resize(mdev, 0);
2672 drbd_bm_cleanup(mdev);
2673 }
2674
2675 drbd_free_resources(mdev);
2676
2677 /*
2678 * currently we drbd_init_ee only on module load, so
2679 * we may do drbd_release_ee only on module unload!
2680 */
2681 D_ASSERT(list_empty(&mdev->active_ee));
2682 D_ASSERT(list_empty(&mdev->sync_ee));
2683 D_ASSERT(list_empty(&mdev->done_ee));
2684 D_ASSERT(list_empty(&mdev->read_ee));
2685 D_ASSERT(list_empty(&mdev->net_ee));
2686 D_ASSERT(list_empty(&mdev->resync_reads));
2687 D_ASSERT(list_empty(&mdev->data.work.q));
2688 D_ASSERT(list_empty(&mdev->meta.work.q));
2689 D_ASSERT(list_empty(&mdev->resync_work.list));
2690 D_ASSERT(list_empty(&mdev->unplug_work.list));
2691
2692}
2693
2694
2695static void drbd_destroy_mempools(void)
2696{
2697 struct page *page;
2698
2699 while (drbd_pp_pool) {
2700 page = drbd_pp_pool;
2701 drbd_pp_pool = (struct page *)page_private(page);
2702 __free_page(page);
2703 drbd_pp_vacant--;
2704 }
2705
2706 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2707
2708 if (drbd_ee_mempool)
2709 mempool_destroy(drbd_ee_mempool);
2710 if (drbd_request_mempool)
2711 mempool_destroy(drbd_request_mempool);
2712 if (drbd_ee_cache)
2713 kmem_cache_destroy(drbd_ee_cache);
2714 if (drbd_request_cache)
2715 kmem_cache_destroy(drbd_request_cache);
2716 if (drbd_bm_ext_cache)
2717 kmem_cache_destroy(drbd_bm_ext_cache);
2718 if (drbd_al_ext_cache)
2719 kmem_cache_destroy(drbd_al_ext_cache);
2720
2721 drbd_ee_mempool = NULL;
2722 drbd_request_mempool = NULL;
2723 drbd_ee_cache = NULL;
2724 drbd_request_cache = NULL;
2725 drbd_bm_ext_cache = NULL;
2726 drbd_al_ext_cache = NULL;
2727
2728 return;
2729}
2730
2731static int drbd_create_mempools(void)
2732{
2733 struct page *page;
2734 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2735 int i;
2736
2737 /* prepare our caches and mempools */
2738 drbd_request_mempool = NULL;
2739 drbd_ee_cache = NULL;
2740 drbd_request_cache = NULL;
2741 drbd_bm_ext_cache = NULL;
2742 drbd_al_ext_cache = NULL;
2743 drbd_pp_pool = NULL;
2744
2745 /* caches */
2746 drbd_request_cache = kmem_cache_create(
2747 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2748 if (drbd_request_cache == NULL)
2749 goto Enomem;
2750
2751 drbd_ee_cache = kmem_cache_create(
2752 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2753 if (drbd_ee_cache == NULL)
2754 goto Enomem;
2755
2756 drbd_bm_ext_cache = kmem_cache_create(
2757 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2758 if (drbd_bm_ext_cache == NULL)
2759 goto Enomem;
2760
2761 drbd_al_ext_cache = kmem_cache_create(
2762 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2763 if (drbd_al_ext_cache == NULL)
2764 goto Enomem;
2765
2766 /* mempools */
2767 drbd_request_mempool = mempool_create(number,
2768 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2769 if (drbd_request_mempool == NULL)
2770 goto Enomem;
2771
2772 drbd_ee_mempool = mempool_create(number,
2773 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2774 if (drbd_request_mempool == NULL)
2775 goto Enomem;
2776
2777 /* drbd's page pool */
2778 spin_lock_init(&drbd_pp_lock);
2779
2780 for (i = 0; i < number; i++) {
2781 page = alloc_page(GFP_HIGHUSER);
2782 if (!page)
2783 goto Enomem;
2784 set_page_private(page, (unsigned long)drbd_pp_pool);
2785 drbd_pp_pool = page;
2786 }
2787 drbd_pp_vacant = number;
2788
2789 return 0;
2790
2791Enomem:
2792 drbd_destroy_mempools(); /* in case we allocated some */
2793 return -ENOMEM;
2794}
2795
2796static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2797 void *unused)
2798{
2799 /* just so we have it. you never know what interesting things we
2800 * might want to do here some day...
2801 */
2802
2803 return NOTIFY_DONE;
2804}
2805
2806static struct notifier_block drbd_notifier = {
2807 .notifier_call = drbd_notify_sys,
2808};
2809
2810static void drbd_release_ee_lists(struct drbd_conf *mdev)
2811{
2812 int rr;
2813
2814 rr = drbd_release_ee(mdev, &mdev->active_ee);
2815 if (rr)
2816 dev_err(DEV, "%d EEs in active list found!\n", rr);
2817
2818 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2819 if (rr)
2820 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2821
2822 rr = drbd_release_ee(mdev, &mdev->read_ee);
2823 if (rr)
2824 dev_err(DEV, "%d EEs in read list found!\n", rr);
2825
2826 rr = drbd_release_ee(mdev, &mdev->done_ee);
2827 if (rr)
2828 dev_err(DEV, "%d EEs in done list found!\n", rr);
2829
2830 rr = drbd_release_ee(mdev, &mdev->net_ee);
2831 if (rr)
2832 dev_err(DEV, "%d EEs in net list found!\n", rr);
2833}
2834
2835/* caution. no locking.
2836 * currently only used from module cleanup code. */
2837static void drbd_delete_device(unsigned int minor)
2838{
2839 struct drbd_conf *mdev = minor_to_mdev(minor);
2840
2841 if (!mdev)
2842 return;
2843
2844 /* paranoia asserts */
2845 if (mdev->open_cnt != 0)
2846 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2847 __FILE__ , __LINE__);
2848
2849 ERR_IF (!list_empty(&mdev->data.work.q)) {
2850 struct list_head *lp;
2851 list_for_each(lp, &mdev->data.work.q) {
2852 dev_err(DEV, "lp = %p\n", lp);
2853 }
2854 };
2855 /* end paranoia asserts */
2856
2857 del_gendisk(mdev->vdisk);
2858
2859 /* cleanup stuff that may have been allocated during
2860 * device (re-)configuration or state changes */
2861
2862 if (mdev->this_bdev)
2863 bdput(mdev->this_bdev);
2864
2865 drbd_free_resources(mdev);
2866
2867 drbd_release_ee_lists(mdev);
2868
2869 /* should be free'd on disconnect? */
2870 kfree(mdev->ee_hash);
2871 /*
2872 mdev->ee_hash_s = 0;
2873 mdev->ee_hash = NULL;
2874 */
2875
2876 lc_destroy(mdev->act_log);
2877 lc_destroy(mdev->resync);
2878
2879 kfree(mdev->p_uuid);
2880 /* mdev->p_uuid = NULL; */
2881
2882 kfree(mdev->int_dig_out);
2883 kfree(mdev->int_dig_in);
2884 kfree(mdev->int_dig_vv);
2885
2886 /* cleanup the rest that has been
2887 * allocated from drbd_new_device
2888 * and actually free the mdev itself */
2889 drbd_free_mdev(mdev);
2890}
2891
2892static void drbd_cleanup(void)
2893{
2894 unsigned int i;
2895
2896 unregister_reboot_notifier(&drbd_notifier);
2897
2898 drbd_nl_cleanup();
2899
2900 if (minor_table) {
2901 if (drbd_proc)
2902 remove_proc_entry("drbd", NULL);
2903 i = minor_count;
2904 while (i--)
2905 drbd_delete_device(i);
2906 drbd_destroy_mempools();
2907 }
2908
2909 kfree(minor_table);
2910
2911 unregister_blkdev(DRBD_MAJOR, "drbd");
2912
2913 printk(KERN_INFO "drbd: module cleanup done.\n");
2914}
2915
2916/**
2917 * drbd_congested() - Callback for pdflush
2918 * @congested_data: User data
2919 * @bdi_bits: Bits pdflush is currently interested in
2920 *
2921 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2922 */
2923static int drbd_congested(void *congested_data, int bdi_bits)
2924{
2925 struct drbd_conf *mdev = congested_data;
2926 struct request_queue *q;
2927 char reason = '-';
2928 int r = 0;
2929
2930 if (!__inc_ap_bio_cond(mdev)) {
2931 /* DRBD has frozen IO */
2932 r = bdi_bits;
2933 reason = 'd';
2934 goto out;
2935 }
2936
2937 if (get_ldev(mdev)) {
2938 q = bdev_get_queue(mdev->ldev->backing_bdev);
2939 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2940 put_ldev(mdev);
2941 if (r)
2942 reason = 'b';
2943 }
2944
2945 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2946 r |= (1 << BDI_async_congested);
2947 reason = reason == 'b' ? 'a' : 'n';
2948 }
2949
2950out:
2951 mdev->congestion_reason = reason;
2952 return r;
2953}
2954
2955struct drbd_conf *drbd_new_device(unsigned int minor)
2956{
2957 struct drbd_conf *mdev;
2958 struct gendisk *disk;
2959 struct request_queue *q;
2960
2961 /* GFP_KERNEL, we are outside of all write-out paths */
2962 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2963 if (!mdev)
2964 return NULL;
2965 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2966 goto out_no_cpumask;
2967
2968 mdev->minor = minor;
2969
2970 drbd_init_set_defaults(mdev);
2971
2972 q = blk_alloc_queue(GFP_KERNEL);
2973 if (!q)
2974 goto out_no_q;
2975 mdev->rq_queue = q;
2976 q->queuedata = mdev;
2977 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
2978
2979 disk = alloc_disk(1);
2980 if (!disk)
2981 goto out_no_disk;
2982 mdev->vdisk = disk;
2983
2984 set_disk_ro(disk, TRUE);
2985
2986 disk->queue = q;
2987 disk->major = DRBD_MAJOR;
2988 disk->first_minor = minor;
2989 disk->fops = &drbd_ops;
2990 sprintf(disk->disk_name, "drbd%d", minor);
2991 disk->private_data = mdev;
2992
2993 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2994 /* we have no partitions. we contain only ourselves. */
2995 mdev->this_bdev->bd_contains = mdev->this_bdev;
2996
2997 q->backing_dev_info.congested_fn = drbd_congested;
2998 q->backing_dev_info.congested_data = mdev;
2999
3000 blk_queue_make_request(q, drbd_make_request_26);
3001 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3002 blk_queue_merge_bvec(q, drbd_merge_bvec);
3003 q->queue_lock = &mdev->req_lock; /* needed since we use */
3004 /* plugging on a queue, that actually has no requests! */
3005 q->unplug_fn = drbd_unplug_fn;
3006
3007 mdev->md_io_page = alloc_page(GFP_KERNEL);
3008 if (!mdev->md_io_page)
3009 goto out_no_io_page;
3010
3011 if (drbd_bm_init(mdev))
3012 goto out_no_bitmap;
3013 /* no need to lock access, we are still initializing this minor device. */
3014 if (!tl_init(mdev))
3015 goto out_no_tl;
3016
3017 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3018 if (!mdev->app_reads_hash)
3019 goto out_no_app_reads;
3020
3021 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3022 if (!mdev->current_epoch)
3023 goto out_no_epoch;
3024
3025 INIT_LIST_HEAD(&mdev->current_epoch->list);
3026 mdev->epochs = 1;
3027
3028 return mdev;
3029
3030/* out_whatever_else:
3031 kfree(mdev->current_epoch); */
3032out_no_epoch:
3033 kfree(mdev->app_reads_hash);
3034out_no_app_reads:
3035 tl_cleanup(mdev);
3036out_no_tl:
3037 drbd_bm_cleanup(mdev);
3038out_no_bitmap:
3039 __free_page(mdev->md_io_page);
3040out_no_io_page:
3041 put_disk(disk);
3042out_no_disk:
3043 blk_cleanup_queue(q);
3044out_no_q:
3045 free_cpumask_var(mdev->cpu_mask);
3046out_no_cpumask:
3047 kfree(mdev);
3048 return NULL;
3049}
3050
3051/* counterpart of drbd_new_device.
3052 * last part of drbd_delete_device. */
3053void drbd_free_mdev(struct drbd_conf *mdev)
3054{
3055 kfree(mdev->current_epoch);
3056 kfree(mdev->app_reads_hash);
3057 tl_cleanup(mdev);
3058 if (mdev->bitmap) /* should no longer be there. */
3059 drbd_bm_cleanup(mdev);
3060 __free_page(mdev->md_io_page);
3061 put_disk(mdev->vdisk);
3062 blk_cleanup_queue(mdev->rq_queue);
3063 free_cpumask_var(mdev->cpu_mask);
3064 kfree(mdev);
3065}
3066
3067
3068int __init drbd_init(void)
3069{
3070 int err;
3071
3072 if (sizeof(struct p_handshake) != 80) {
3073 printk(KERN_ERR
3074 "drbd: never change the size or layout "
3075 "of the HandShake packet.\n");
3076 return -EINVAL;
3077 }
3078
3079 if (1 > minor_count || minor_count > 255) {
3080 printk(KERN_ERR
3081 "drbd: invalid minor_count (%d)\n", minor_count);
3082#ifdef MODULE
3083 return -EINVAL;
3084#else
3085 minor_count = 8;
3086#endif
3087 }
3088
3089 err = drbd_nl_init();
3090 if (err)
3091 return err;
3092
3093 err = register_blkdev(DRBD_MAJOR, "drbd");
3094 if (err) {
3095 printk(KERN_ERR
3096 "drbd: unable to register block device major %d\n",
3097 DRBD_MAJOR);
3098 return err;
3099 }
3100
3101 register_reboot_notifier(&drbd_notifier);
3102
3103 /*
3104 * allocate all necessary structs
3105 */
3106 err = -ENOMEM;
3107
3108 init_waitqueue_head(&drbd_pp_wait);
3109
3110 drbd_proc = NULL; /* play safe for drbd_cleanup */
3111 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3112 GFP_KERNEL);
3113 if (!minor_table)
3114 goto Enomem;
3115
3116 err = drbd_create_mempools();
3117 if (err)
3118 goto Enomem;
3119
3120 drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
3121 if (!drbd_proc) {
3122 printk(KERN_ERR "drbd: unable to register proc file\n");
3123 goto Enomem;
3124 }
3125
3126 rwlock_init(&global_state_lock);
3127
3128 printk(KERN_INFO "drbd: initialized. "
3129 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3130 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3131 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3132 printk(KERN_INFO "drbd: registered as block device major %d\n",
3133 DRBD_MAJOR);
3134 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3135
3136 return 0; /* Success! */
3137
3138Enomem:
3139 drbd_cleanup();
3140 if (err == -ENOMEM)
3141 /* currently always the case */
3142 printk(KERN_ERR "drbd: ran out of memory\n");
3143 else
3144 printk(KERN_ERR "drbd: initialization failure\n");
3145 return err;
3146}
3147
3148void drbd_free_bc(struct drbd_backing_dev *ldev)
3149{
3150 if (ldev == NULL)
3151 return;
3152
3153 bd_release(ldev->backing_bdev);
3154 bd_release(ldev->md_bdev);
3155
3156 fput(ldev->lo_file);
3157 fput(ldev->md_file);
3158
3159 kfree(ldev);
3160}
3161
3162void drbd_free_sock(struct drbd_conf *mdev)
3163{
3164 if (mdev->data.socket) {
3165 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3166 sock_release(mdev->data.socket);
3167 mdev->data.socket = NULL;
3168 }
3169 if (mdev->meta.socket) {
3170 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3171 sock_release(mdev->meta.socket);
3172 mdev->meta.socket = NULL;
3173 }
3174}
3175
3176
3177void drbd_free_resources(struct drbd_conf *mdev)
3178{
3179 crypto_free_hash(mdev->csums_tfm);
3180 mdev->csums_tfm = NULL;
3181 crypto_free_hash(mdev->verify_tfm);
3182 mdev->verify_tfm = NULL;
3183 crypto_free_hash(mdev->cram_hmac_tfm);
3184 mdev->cram_hmac_tfm = NULL;
3185 crypto_free_hash(mdev->integrity_w_tfm);
3186 mdev->integrity_w_tfm = NULL;
3187 crypto_free_hash(mdev->integrity_r_tfm);
3188 mdev->integrity_r_tfm = NULL;
3189
3190 drbd_free_sock(mdev);
3191
3192 __no_warn(local,
3193 drbd_free_bc(mdev->ldev);
3194 mdev->ldev = NULL;);
3195}
3196
3197/* meta data management */
3198
3199struct meta_data_on_disk {
3200 u64 la_size; /* last agreed size. */
3201 u64 uuid[UI_SIZE]; /* UUIDs. */
3202 u64 device_uuid;
3203 u64 reserved_u64_1;
3204 u32 flags; /* MDF */
3205 u32 magic;
3206 u32 md_size_sect;
3207 u32 al_offset; /* offset to this block */
3208 u32 al_nr_extents; /* important for restoring the AL */
3209 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3210 u32 bm_offset; /* offset to the bitmap, from here */
3211 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3212 u32 reserved_u32[4];
3213
3214} __packed;
3215
3216/**
3217 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3218 * @mdev: DRBD device.
3219 */
3220void drbd_md_sync(struct drbd_conf *mdev)
3221{
3222 struct meta_data_on_disk *buffer;
3223 sector_t sector;
3224 int i;
3225
3226 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3227 return;
3228 del_timer(&mdev->md_sync_timer);
3229
3230 /* We use here D_FAILED and not D_ATTACHING because we try to write
3231 * metadata even if we detach due to a disk failure! */
3232 if (!get_ldev_if_state(mdev, D_FAILED))
3233 return;
3234
3235 mutex_lock(&mdev->md_io_mutex);
3236 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3237 memset(buffer, 0, 512);
3238
3239 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3240 for (i = UI_CURRENT; i < UI_SIZE; i++)
3241 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3242 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3243 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3244
3245 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3246 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3247 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3248 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3249 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3250
3251 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3252
3253 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3254 sector = mdev->ldev->md.md_offset;
3255
3256 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3257 clear_bit(MD_DIRTY, &mdev->flags);
3258 } else {
3259 /* this was a try anyways ... */
3260 dev_err(DEV, "meta data update failed!\n");
3261
3262 drbd_chk_io_error(mdev, 1, TRUE);
3263 }
3264
3265 /* Update mdev->ldev->md.la_size_sect,
3266 * since we updated it on metadata. */
3267 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3268
3269 mutex_unlock(&mdev->md_io_mutex);
3270 put_ldev(mdev);
3271}
3272
3273/**
3274 * drbd_md_read() - Reads in the meta data super block
3275 * @mdev: DRBD device.
3276 * @bdev: Device from which the meta data should be read in.
3277 *
3278 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3279 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3280 */
3281int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3282{
3283 struct meta_data_on_disk *buffer;
3284 int i, rv = NO_ERROR;
3285
3286 if (!get_ldev_if_state(mdev, D_ATTACHING))
3287 return ERR_IO_MD_DISK;
3288
3289 mutex_lock(&mdev->md_io_mutex);
3290 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3291
3292 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3293 /* NOTE: cant do normal error processing here as this is
3294 called BEFORE disk is attached */
3295 dev_err(DEV, "Error while reading metadata.\n");
3296 rv = ERR_IO_MD_DISK;
3297 goto err;
3298 }
3299
3300 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3301 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3302 rv = ERR_MD_INVALID;
3303 goto err;
3304 }
3305 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3306 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3307 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3308 rv = ERR_MD_INVALID;
3309 goto err;
3310 }
3311 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3312 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3313 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3314 rv = ERR_MD_INVALID;
3315 goto err;
3316 }
3317 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3318 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3319 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3320 rv = ERR_MD_INVALID;
3321 goto err;
3322 }
3323
3324 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3325 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3326 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3327 rv = ERR_MD_INVALID;
3328 goto err;
3329 }
3330
3331 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3332 for (i = UI_CURRENT; i < UI_SIZE; i++)
3333 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3334 bdev->md.flags = be32_to_cpu(buffer->flags);
3335 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3336 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3337
3338 if (mdev->sync_conf.al_extents < 7)
3339 mdev->sync_conf.al_extents = 127;
3340
3341 err:
3342 mutex_unlock(&mdev->md_io_mutex);
3343 put_ldev(mdev);
3344
3345 return rv;
3346}
3347
3348/**
3349 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3350 * @mdev: DRBD device.
3351 *
3352 * Call this function if you change anything that should be written to
3353 * the meta-data super block. This function sets MD_DIRTY, and starts a
3354 * timer that ensures that within five seconds you have to call drbd_md_sync().
3355 */
3356void drbd_md_mark_dirty(struct drbd_conf *mdev)
3357{
3358 set_bit(MD_DIRTY, &mdev->flags);
3359 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3360}
3361
3362
3363static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3364{
3365 int i;
3366
3367 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3368 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3369}
3370
3371void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3372{
3373 if (idx == UI_CURRENT) {
3374 if (mdev->state.role == R_PRIMARY)
3375 val |= 1;
3376 else
3377 val &= ~((u64)1);
3378
3379 drbd_set_ed_uuid(mdev, val);
3380 }
3381
3382 mdev->ldev->md.uuid[idx] = val;
3383 drbd_md_mark_dirty(mdev);
3384}
3385
3386
3387void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3388{
3389 if (mdev->ldev->md.uuid[idx]) {
3390 drbd_uuid_move_history(mdev);
3391 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3392 }
3393 _drbd_uuid_set(mdev, idx, val);
3394}
3395
3396/**
3397 * drbd_uuid_new_current() - Creates a new current UUID
3398 * @mdev: DRBD device.
3399 *
3400 * Creates a new current UUID, and rotates the old current UUID into
3401 * the bitmap slot. Causes an incremental resync upon next connect.
3402 */
3403void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3404{
3405 u64 val;
3406
3407 dev_info(DEV, "Creating new current UUID\n");
3408 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3409 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3410
3411 get_random_bytes(&val, sizeof(u64));
3412 _drbd_uuid_set(mdev, UI_CURRENT, val);
3413}
3414
3415void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3416{
3417 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3418 return;
3419
3420 if (val == 0) {
3421 drbd_uuid_move_history(mdev);
3422 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3423 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3424 } else {
3425 if (mdev->ldev->md.uuid[UI_BITMAP])
3426 dev_warn(DEV, "bm UUID already set");
3427
3428 mdev->ldev->md.uuid[UI_BITMAP] = val;
3429 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3430
3431 }
3432 drbd_md_mark_dirty(mdev);
3433}
3434
3435/**
3436 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3437 * @mdev: DRBD device.
3438 *
3439 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3440 */
3441int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3442{
3443 int rv = -EIO;
3444
3445 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3446 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3447 drbd_md_sync(mdev);
3448 drbd_bm_set_all(mdev);
3449
3450 rv = drbd_bm_write(mdev);
3451
3452 if (!rv) {
3453 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3454 drbd_md_sync(mdev);
3455 }
3456
3457 put_ldev(mdev);
3458 }
3459
3460 return rv;
3461}
3462
3463/**
3464 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3465 * @mdev: DRBD device.
3466 *
3467 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3468 */
3469int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3470{
3471 int rv = -EIO;
3472
3473 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3474 drbd_bm_clear_all(mdev);
3475 rv = drbd_bm_write(mdev);
3476 put_ldev(mdev);
3477 }
3478
3479 return rv;
3480}
3481
3482static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3483{
3484 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3485 int rv;
3486
3487 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3488
3489 drbd_bm_lock(mdev, work->why);
3490 rv = work->io_fn(mdev);
3491 drbd_bm_unlock(mdev);
3492
3493 clear_bit(BITMAP_IO, &mdev->flags);
3494 wake_up(&mdev->misc_wait);
3495
3496 if (work->done)
3497 work->done(mdev, rv);
3498
3499 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3500 work->why = NULL;
3501
3502 return 1;
3503}
3504
3505/**
3506 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3507 * @mdev: DRBD device.
3508 * @io_fn: IO callback to be called when bitmap IO is possible
3509 * @done: callback to be called after the bitmap IO was performed
3510 * @why: Descriptive text of the reason for doing the IO
3511 *
3512 * While IO on the bitmap happens we freeze application IO thus we ensure
3513 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3514 * called from worker context. It MUST NOT be used while a previous such
3515 * work is still pending!
3516 */
3517void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3518 int (*io_fn)(struct drbd_conf *),
3519 void (*done)(struct drbd_conf *, int),
3520 char *why)
3521{
3522 D_ASSERT(current == mdev->worker.task);
3523
3524 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3525 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3526 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3527 if (mdev->bm_io_work.why)
3528 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3529 why, mdev->bm_io_work.why);
3530
3531 mdev->bm_io_work.io_fn = io_fn;
3532 mdev->bm_io_work.done = done;
3533 mdev->bm_io_work.why = why;
3534
3535 set_bit(BITMAP_IO, &mdev->flags);
3536 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3537 if (list_empty(&mdev->bm_io_work.w.list)) {
3538 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3539 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3540 } else
3541 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3542 }
3543}
3544
3545/**
3546 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3547 * @mdev: DRBD device.
3548 * @io_fn: IO callback to be called when bitmap IO is possible
3549 * @why: Descriptive text of the reason for doing the IO
3550 *
3551 * freezes application IO while that the actual IO operations runs. This
3552 * functions MAY NOT be called from worker context.
3553 */
3554int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3555{
3556 int rv;
3557
3558 D_ASSERT(current != mdev->worker.task);
3559
3560 drbd_suspend_io(mdev);
3561
3562 drbd_bm_lock(mdev, why);
3563 rv = io_fn(mdev);
3564 drbd_bm_unlock(mdev);
3565
3566 drbd_resume_io(mdev);
3567
3568 return rv;
3569}
3570
3571void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3572{
3573 if ((mdev->ldev->md.flags & flag) != flag) {
3574 drbd_md_mark_dirty(mdev);
3575 mdev->ldev->md.flags |= flag;
3576 }
3577}
3578
3579void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3580{
3581 if ((mdev->ldev->md.flags & flag) != 0) {
3582 drbd_md_mark_dirty(mdev);
3583 mdev->ldev->md.flags &= ~flag;
3584 }
3585}
3586int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3587{
3588 return (bdev->md.flags & flag) != 0;
3589}
3590
3591static void md_sync_timer_fn(unsigned long data)
3592{
3593 struct drbd_conf *mdev = (struct drbd_conf *) data;
3594
3595 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3596}
3597
3598static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3599{
3600 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3601 drbd_md_sync(mdev);
3602
3603 return 1;
3604}
3605
3606#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;	/* current generator value */
	unsigned long count;	/* calls left until the next reseed */
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD	479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 *
 * @rsp->count is unsigned, so the reseed condition must test for zero
 * before decrementing; a "--count < 0" test can never be true and would
 * disable the reseeding altogether.
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	if (!rsp->count--) {
		/* countdown expired: mix in fresh entropy and restart it */
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
3635
3636static char *
3637_drbd_fault_str(unsigned int type) {
3638 static char *_faults[] = {
3639 [DRBD_FAULT_MD_WR] = "Meta-data write",
3640 [DRBD_FAULT_MD_RD] = "Meta-data read",
3641 [DRBD_FAULT_RS_WR] = "Resync write",
3642 [DRBD_FAULT_RS_RD] = "Resync read",
3643 [DRBD_FAULT_DT_WR] = "Data write",
3644 [DRBD_FAULT_DT_RD] = "Data read",
3645 [DRBD_FAULT_DT_RA] = "Data read ahead",
3646 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3647 [DRBD_FAULT_AL_EE] = "EE allocation"
3648 };
3649
3650 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3651}
3652
3653unsigned int
3654_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3655{
3656 static struct fault_random_state rrs = {0, 0};
3657
3658 unsigned int ret = (
3659 (fault_devs == 0 ||
3660 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3661 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3662
3663 if (ret) {
3664 fault_count++;
3665
3666 if (printk_ratelimit())
3667 dev_warn(DEV, "***Simulating %s failure\n",
3668 _drbd_fault_str(type));
3669 }
3670
3671 return ret;
3672}
3673#endif
3674
/*
 * Returns a string identifying this build.  The static buffer starts out
 * as "\0uilt-in"; on first use it is either overwritten with the module's
 * srcversion or turned into "built-in" by patching the first byte.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: never write past the static buffer */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3693
/* entry and exit points */
module_init(drbd_init);
module_exit(drbd_cleanup);

/* string helpers made available to other modules */
EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000000..22538d9628f1
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2360 @@
1/*
2 drbd_nl.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/in.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/slab.h>
32#include <linux/connector.h>
33#include <linux/blkpg.h>
34#include <linux/cpumask.h>
35#include "drbd_int.h"
36#include "drbd_wrappers.h"
37#include <asm/unaligned.h>
38#include <linux/drbd_tag_magic.h>
39#include <linux/drbd_limits.h>
40
/* tag list writers, defined further down in this file */
static unsigned short *tl_add_blob(unsigned short *tl, enum drbd_tags tag,
				   const void *data, int len);
static unsigned short *tl_add_str(unsigned short *tl, enum drbd_tags tag,
				  const char *str);
static unsigned short *tl_add_int(unsigned short *tl, enum drbd_tags tag,
				  const void *val);

/* see get_sb_bdev and bd_claim */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
47
48/* Generate the tag_list to struct functions */
49#define NL_PACKET(name, number, fields) \
50static int name ## _from_tags(struct drbd_conf *mdev, \
51 unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
52static int name ## _from_tags(struct drbd_conf *mdev, \
53 unsigned short *tags, struct name *arg) \
54{ \
55 int tag; \
56 int dlen; \
57 \
58 while ((tag = get_unaligned(tags++)) != TT_END) { \
59 dlen = get_unaligned(tags++); \
60 switch (tag_number(tag)) { \
61 fields \
62 default: \
63 if (tag & T_MANDATORY) { \
64 dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
65 return 0; \
66 } \
67 } \
68 tags = (unsigned short *)((char *)tags + dlen); \
69 } \
70 return 1; \
71}
72#define NL_INTEGER(pn, pr, member) \
73 case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
74 arg->member = get_unaligned((int *)(tags)); \
75 break;
76#define NL_INT64(pn, pr, member) \
77 case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
78 arg->member = get_unaligned((u64 *)(tags)); \
79 break;
80#define NL_BIT(pn, pr, member) \
81 case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
82 arg->member = *(char *)(tags) ? 1 : 0; \
83 break;
84#define NL_STRING(pn, pr, member, len) \
85 case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
86 if (dlen > len) { \
87 dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
88 #member, dlen, (unsigned int)len); \
89 return 0; \
90 } \
91 arg->member ## _len = dlen; \
92 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
93 break;
94#include "linux/drbd_nl.h"
95
96/* Generate the struct to tag_list functions */
97#define NL_PACKET(name, number, fields) \
98static unsigned short* \
99name ## _to_tags(struct drbd_conf *mdev, \
100 struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
101static unsigned short* \
102name ## _to_tags(struct drbd_conf *mdev, \
103 struct name *arg, unsigned short *tags) \
104{ \
105 fields \
106 return tags; \
107}
108
109#define NL_INTEGER(pn, pr, member) \
110 put_unaligned(pn | pr | TT_INTEGER, tags++); \
111 put_unaligned(sizeof(int), tags++); \
112 put_unaligned(arg->member, (int *)tags); \
113 tags = (unsigned short *)((char *)tags+sizeof(int));
114#define NL_INT64(pn, pr, member) \
115 put_unaligned(pn | pr | TT_INT64, tags++); \
116 put_unaligned(sizeof(u64), tags++); \
117 put_unaligned(arg->member, (u64 *)tags); \
118 tags = (unsigned short *)((char *)tags+sizeof(u64));
119#define NL_BIT(pn, pr, member) \
120 put_unaligned(pn | pr | TT_BIT, tags++); \
121 put_unaligned(sizeof(char), tags++); \
122 *(char *)tags = arg->member; \
123 tags = (unsigned short *)((char *)tags+sizeof(char));
124#define NL_STRING(pn, pr, member, len) \
125 put_unaligned(pn | pr | TT_STRING, tags++); \
126 put_unaligned(arg->member ## _len, tags++); \
127 memcpy(tags, arg->member, arg->member ## _len); \
128 tags = (unsigned short *)((char *)tags + arg->member ## _len);
129#include "linux/drbd_nl.h"
130
/* prototypes of functions used before their definition */
void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
void drbd_nl_send_reply(struct cn_msg *req, int ret_code);
133
134int drbd_khelper(struct drbd_conf *mdev, char *cmd)
135{
136 char *envp[] = { "HOME=/",
137 "TERM=linux",
138 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
139 NULL, /* Will be set to address family */
140 NULL, /* Will be set to address */
141 NULL };
142
143 char mb[12], af[20], ad[60], *afs;
144 char *argv[] = {usermode_helper, cmd, mb, NULL };
145 int ret;
146
147 snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
148
149 if (get_net_conf(mdev)) {
150 switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
151 case AF_INET6:
152 afs = "ipv6";
153 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
154 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
155 break;
156 case AF_INET:
157 afs = "ipv4";
158 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
159 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
160 break;
161 default:
162 afs = "ssocks";
163 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
164 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
165 }
166 snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
167 envp[3]=af;
168 envp[4]=ad;
169 put_net_conf(mdev);
170 }
171
172 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
173
174 drbd_bcast_ev_helper(mdev, cmd);
175 ret = call_usermodehelper(usermode_helper, argv, envp, 1);
176 if (ret)
177 dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
178 usermode_helper, cmd, mb,
179 (ret >> 8) & 0xff, ret);
180 else
181 dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
182 usermode_helper, cmd, mb,
183 (ret >> 8) & 0xff, ret);
184
185 if (ret < 0) /* Ignore any ERRNOs we got. */
186 ret = 0;
187
188 return ret;
189}
190
191enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
192{
193 char *ex_to_string;
194 int r;
195 enum drbd_disk_state nps;
196 enum drbd_fencing_p fp;
197
198 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
199
200 if (get_ldev_if_state(mdev, D_CONSISTENT)) {
201 fp = mdev->ldev->dc.fencing;
202 put_ldev(mdev);
203 } else {
204 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
205 return mdev->state.pdsk;
206 }
207
208 if (fp == FP_STONITH)
209 _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
210
211 r = drbd_khelper(mdev, "fence-peer");
212
213 switch ((r>>8) & 0xff) {
214 case 3: /* peer is inconsistent */
215 ex_to_string = "peer is inconsistent or worse";
216 nps = D_INCONSISTENT;
217 break;
218 case 4: /* peer got outdated, or was already outdated */
219 ex_to_string = "peer was fenced";
220 nps = D_OUTDATED;
221 break;
222 case 5: /* peer was down */
223 if (mdev->state.disk == D_UP_TO_DATE) {
224 /* we will(have) create(d) a new UUID anyways... */
225 ex_to_string = "peer is unreachable, assumed to be dead";
226 nps = D_OUTDATED;
227 } else {
228 ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
229 nps = mdev->state.pdsk;
230 }
231 break;
232 case 6: /* Peer is primary, voluntarily outdate myself.
233 * This is useful when an unconnected R_SECONDARY is asked to
234 * become R_PRIMARY, but finds the other peer being active. */
235 ex_to_string = "peer is active";
236 dev_warn(DEV, "Peer is primary, outdating myself.\n");
237 nps = D_UNKNOWN;
238 _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
239 break;
240 case 7:
241 if (fp != FP_STONITH)
242 dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
243 ex_to_string = "peer was stonithed";
244 nps = D_OUTDATED;
245 break;
246 default:
247 /* The script is broken ... */
248 nps = D_UNKNOWN;
249 dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
250 return nps;
251 }
252
253 dev_info(DEV, "fence-peer helper returned %d (%s)\n",
254 (r>>8) & 0xff, ex_to_string);
255 return nps;
256}
257
258
259int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
260{
261 const int max_tries = 4;
262 int r = 0;
263 int try = 0;
264 int forced = 0;
265 union drbd_state mask, val;
266 enum drbd_disk_state nps;
267
268 if (new_role == R_PRIMARY)
269 request_ping(mdev); /* Detect a dead peer ASAP */
270
271 mutex_lock(&mdev->state_mutex);
272
273 mask.i = 0; mask.role = R_MASK;
274 val.i = 0; val.role = new_role;
275
276 while (try++ < max_tries) {
277 r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
278
279 /* in case we first succeeded to outdate,
280 * but now suddenly could establish a connection */
281 if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
282 val.pdsk = 0;
283 mask.pdsk = 0;
284 continue;
285 }
286
287 if (r == SS_NO_UP_TO_DATE_DISK && force &&
288 (mdev->state.disk == D_INCONSISTENT ||
289 mdev->state.disk == D_OUTDATED)) {
290 mask.disk = D_MASK;
291 val.disk = D_UP_TO_DATE;
292 forced = 1;
293 continue;
294 }
295
296 if (r == SS_NO_UP_TO_DATE_DISK &&
297 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
298 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
299 nps = drbd_try_outdate_peer(mdev);
300
301 if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
302 val.disk = D_UP_TO_DATE;
303 mask.disk = D_MASK;
304 }
305
306 val.pdsk = nps;
307 mask.pdsk = D_MASK;
308
309 continue;
310 }
311
312 if (r == SS_NOTHING_TO_DO)
313 goto fail;
314 if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
315 nps = drbd_try_outdate_peer(mdev);
316
317 if (force && nps > D_OUTDATED) {
318 dev_warn(DEV, "Forced into split brain situation!\n");
319 nps = D_OUTDATED;
320 }
321
322 mask.pdsk = D_MASK;
323 val.pdsk = nps;
324
325 continue;
326 }
327 if (r == SS_TWO_PRIMARIES) {
328 /* Maybe the peer is detected as dead very soon...
329 retry at most once more in this case. */
330 __set_current_state(TASK_INTERRUPTIBLE);
331 schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
332 if (try < max_tries)
333 try = max_tries - 1;
334 continue;
335 }
336 if (r < SS_SUCCESS) {
337 r = _drbd_request_state(mdev, mask, val,
338 CS_VERBOSE + CS_WAIT_COMPLETE);
339 if (r < SS_SUCCESS)
340 goto fail;
341 }
342 break;
343 }
344
345 if (r < SS_SUCCESS)
346 goto fail;
347
348 if (forced)
349 dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
350
351 /* Wait until nothing is on the fly :) */
352 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
353
354 if (new_role == R_SECONDARY) {
355 set_disk_ro(mdev->vdisk, TRUE);
356 if (get_ldev(mdev)) {
357 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
358 put_ldev(mdev);
359 }
360 } else {
361 if (get_net_conf(mdev)) {
362 mdev->net_conf->want_lose = 0;
363 put_net_conf(mdev);
364 }
365 set_disk_ro(mdev->vdisk, FALSE);
366 if (get_ldev(mdev)) {
367 if (((mdev->state.conn < C_CONNECTED ||
368 mdev->state.pdsk <= D_FAILED)
369 && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
370 drbd_uuid_new_current(mdev);
371
372 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
373 put_ldev(mdev);
374 }
375 }
376
377 if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
378 drbd_al_to_on_disk_bm(mdev);
379 put_ldev(mdev);
380 }
381
382 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
383 /* if this was forced, we should consider sync */
384 if (forced)
385 drbd_send_uuids(mdev);
386 drbd_send_state(mdev);
387 }
388
389 drbd_md_sync(mdev);
390
391 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
392 fail:
393 mutex_unlock(&mdev->state_mutex);
394 return r;
395}
396
397
398static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
399 struct drbd_nl_cfg_reply *reply)
400{
401 struct primary primary_args;
402
403 memset(&primary_args, 0, sizeof(struct primary));
404 if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
405 reply->ret_code = ERR_MANDATORY_TAG;
406 return 0;
407 }
408
409 reply->ret_code =
410 drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
411
412 return 0;
413}
414
415static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
416 struct drbd_nl_cfg_reply *reply)
417{
418 reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
419
420 return 0;
421}
422
423/* initializes the md.*_offset members, so we are able to find
424 * the on disk meta data */
425static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
426 struct drbd_backing_dev *bdev)
427{
428 sector_t md_size_sect = 0;
429 switch (bdev->dc.meta_dev_idx) {
430 default:
431 /* v07 style fixed size indexed meta data */
432 bdev->md.md_size_sect = MD_RESERVED_SECT;
433 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
434 bdev->md.al_offset = MD_AL_OFFSET;
435 bdev->md.bm_offset = MD_BM_OFFSET;
436 break;
437 case DRBD_MD_INDEX_FLEX_EXT:
438 /* just occupy the full device; unit: sectors */
439 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
440 bdev->md.md_offset = 0;
441 bdev->md.al_offset = MD_AL_OFFSET;
442 bdev->md.bm_offset = MD_BM_OFFSET;
443 break;
444 case DRBD_MD_INDEX_INTERNAL:
445 case DRBD_MD_INDEX_FLEX_INT:
446 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
447 /* al size is still fixed */
448 bdev->md.al_offset = -MD_AL_MAX_SIZE;
449 /* we need (slightly less than) ~ this much bitmap sectors: */
450 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
451 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
452 md_size_sect = BM_SECT_TO_EXT(md_size_sect);
453 md_size_sect = ALIGN(md_size_sect, 8);
454
455 /* plus the "drbd meta data super block",
456 * and the activity log; */
457 md_size_sect += MD_BM_OFFSET;
458
459 bdev->md.md_size_sect = md_size_sect;
460 /* bitmap offset is adjusted by 'super' block size */
461 bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
462 break;
463 }
464}
465
/**
 * ppsize() - Pretty-print a size with a binary unit suffix
 * @buf:	output buffer; needs 9 bytes at max including the trailing NUL
 *		(the longest result is "16384 EB")
 * @size:	size to print; the smallest unit used is "KB", and the caller
 *		visible in this file passes a sector count shifted right by one
 *
 * Divides @size by 1024 (rounding to nearest) until it drops below 10000 or
 * the largest known unit has been reached, then formats it as
 * "<number> <unit>B".
 *
 * Returns @buf.
 */
char *ppsize(char *buf, unsigned long long size)
{
	static const char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	unsigned int base = 0;

	/* Stop at the last unit: without this bound, values close to
	 * ULLONG_MAX would need a sixth shift and index past units[]. */
	while (size >= 10000 && base < sizeof(units) - 1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	/* at most 16384 remains here, so it fits an unsigned int */
	sprintf(buf, "%u %cB", (unsigned int)size, units[base]);

	return buf;
}
480
481/* there is still a theoretical deadlock when called from receiver
482 * on an D_INCONSISTENT R_PRIMARY:
483 * remote READ does inc_ap_bio, receiver would need to receive answer
484 * packet from remote to dec_ap_bio again.
485 * receiver receive_sizes(), comes here,
486 * waits for ap_bio_cnt == 0. -> deadlock.
487 * but this cannot happen, actually, because:
488 * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
489 * (not connected, or bad/no disk on peer):
490 * see drbd_fail_request_early, ap_bio_cnt is zero.
491 * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
492 * peer may not initiate a resize.
493 */
494void drbd_suspend_io(struct drbd_conf *mdev)
495{
496 set_bit(SUSPEND_IO, &mdev->flags);
497 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
498}
499
500void drbd_resume_io(struct drbd_conf *mdev)
501{
502 clear_bit(SUSPEND_IO, &mdev->flags);
503 wake_up(&mdev->misc_wait);
504}
505
506/**
507 * drbd_determine_dev_size() - Sets the right device size obeying all constraints
508 * @mdev: DRBD device.
509 *
510 * Returns 0 on success, negative return values indicate errors.
511 * You should call drbd_md_sync() after calling this function.
512 */
513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
514{
515 sector_t prev_first_sect, prev_size; /* previous meta location */
516 sector_t la_size;
517 sector_t size;
518 char ppb[10];
519
520 int md_moved, la_size_changed;
521 enum determine_dev_size rv = unchanged;
522
523 /* race:
524 * application request passes inc_ap_bio,
525 * but then cannot get an AL-reference.
526 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
527 *
528 * to avoid that:
529 * Suspend IO right here.
530 * still lock the act_log to not trigger ASSERTs there.
531 */
532 drbd_suspend_io(mdev);
533
534 /* no wait necessary anymore, actually we could assert that */
535 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
536
537 prev_first_sect = drbd_md_first_sector(mdev->ldev);
538 prev_size = mdev->ldev->md.md_size_sect;
539 la_size = mdev->ldev->md.la_size_sect;
540
541 /* TODO: should only be some assert here, not (re)init... */
542 drbd_md_set_sector_offsets(mdev, mdev->ldev);
543
544 size = drbd_new_dev_size(mdev, mdev->ldev);
545
546 if (drbd_get_capacity(mdev->this_bdev) != size ||
547 drbd_bm_capacity(mdev) != size) {
548 int err;
549 err = drbd_bm_resize(mdev, size);
550 if (unlikely(err)) {
551 /* currently there is only one error: ENOMEM! */
552 size = drbd_bm_capacity(mdev)>>1;
553 if (size == 0) {
554 dev_err(DEV, "OUT OF MEMORY! "
555 "Could not allocate bitmap!\n");
556 } else {
557 dev_err(DEV, "BM resizing failed. "
558 "Leaving size unchanged at size = %lu KB\n",
559 (unsigned long)size);
560 }
561 rv = dev_size_error;
562 }
563 /* racy, see comments above. */
564 drbd_set_my_capacity(mdev, size);
565 mdev->ldev->md.la_size_sect = size;
566 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
567 (unsigned long long)size>>1);
568 }
569 if (rv == dev_size_error)
570 goto out;
571
572 la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
573
574 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
575 || prev_size != mdev->ldev->md.md_size_sect;
576
577 if (la_size_changed || md_moved) {
578 drbd_al_shrink(mdev); /* All extents inactive. */
579 dev_info(DEV, "Writing the whole bitmap, %s\n",
580 la_size_changed && md_moved ? "size changed and md moved" :
581 la_size_changed ? "size changed" : "md moved");
582 rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
583 drbd_md_mark_dirty(mdev);
584 }
585
586 if (size > la_size)
587 rv = grew;
588 if (size < la_size)
589 rv = shrunk;
590out:
591 lc_unlock(mdev->act_log);
592 wake_up(&mdev->al_wait);
593 drbd_resume_io(mdev);
594
595 return rv;
596}
597
598sector_t
599drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
600{
601 sector_t p_size = mdev->p_size; /* partner's disk size. */
602 sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
603 sector_t m_size; /* my size */
604 sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
605 sector_t size = 0;
606
607 m_size = drbd_get_max_capacity(bdev);
608
609 if (p_size && m_size) {
610 size = min_t(sector_t, p_size, m_size);
611 } else {
612 if (la_size) {
613 size = la_size;
614 if (m_size && m_size < size)
615 size = m_size;
616 if (p_size && p_size < size)
617 size = p_size;
618 } else {
619 if (m_size)
620 size = m_size;
621 if (p_size)
622 size = p_size;
623 }
624 }
625
626 if (size == 0)
627 dev_err(DEV, "Both nodes diskless!\n");
628
629 if (u_size) {
630 if (u_size > size)
631 dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
632 (unsigned long)u_size>>1, (unsigned long)size>>1);
633 else
634 size = u_size;
635 }
636
637 return size;
638}
639
640/**
641 * drbd_check_al_size() - Ensures that the AL is of the right size
642 * @mdev: DRBD device.
643 *
644 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
645 * failed, and 0 on success. You should call drbd_md_sync() after you called
646 * this function.
647 */
648static int drbd_check_al_size(struct drbd_conf *mdev)
649{
650 struct lru_cache *n, *t;
651 struct lc_element *e;
652 unsigned int in_use;
653 int i;
654
655 ERR_IF(mdev->sync_conf.al_extents < 7)
656 mdev->sync_conf.al_extents = 127;
657
658 if (mdev->act_log &&
659 mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
660 return 0;
661
662 in_use = 0;
663 t = mdev->act_log;
664 n = lc_create("act_log", drbd_al_ext_cache,
665 mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
666
667 if (n == NULL) {
668 dev_err(DEV, "Cannot allocate act_log lru!\n");
669 return -ENOMEM;
670 }
671 spin_lock_irq(&mdev->al_lock);
672 if (t) {
673 for (i = 0; i < t->nr_elements; i++) {
674 e = lc_element_by_index(t, i);
675 if (e->refcnt)
676 dev_err(DEV, "refcnt(%d)==%d\n",
677 e->lc_number, e->refcnt);
678 in_use += e->refcnt;
679 }
680 }
681 if (!in_use)
682 mdev->act_log = n;
683 spin_unlock_irq(&mdev->al_lock);
684 if (in_use) {
685 dev_err(DEV, "Activity log still in use!\n");
686 lc_destroy(n);
687 return -EBUSY;
688 } else {
689 if (t)
690 lc_destroy(t);
691 }
692 drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
693 return 0;
694}
695
696void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
697{
698 struct request_queue * const q = mdev->rq_queue;
699 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
700 int max_segments = mdev->ldev->dc.max_bio_bvecs;
701
702 if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
703 max_seg_s = PAGE_SIZE;
704
705 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
706
707 blk_queue_max_sectors(q, max_seg_s >> 9);
708 blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
709 blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
710 blk_queue_max_segment_size(q, max_seg_s);
711 blk_queue_logical_block_size(q, 512);
712 blk_queue_segment_boundary(q, PAGE_SIZE-1);
713 blk_stack_limits(&q->limits, &b->limits, 0);
714
715 if (b->merge_bvec_fn)
716 dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
717 b->merge_bvec_fn);
718 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
719
720 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
721 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
722 q->backing_dev_info.ra_pages,
723 b->backing_dev_info.ra_pages);
724 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
725 }
726}
727
728/* serialize deconfig (worker exiting, doing cleanup)
729 * and reconfig (drbdsetup disk, drbdsetup net)
730 *
731 * wait for a potentially exiting worker, then restart it,
732 * or start a new one.
733 */
734static void drbd_reconfig_start(struct drbd_conf *mdev)
735{
736 wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags));
737 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
738 drbd_thread_start(&mdev->worker);
739}
740
741/* if still unconfigured, stops worker again.
742 * if configured now, clears CONFIG_PENDING.
743 * wakes potential waiters */
744static void drbd_reconfig_done(struct drbd_conf *mdev)
745{
746 spin_lock_irq(&mdev->req_lock);
747 if (mdev->state.disk == D_DISKLESS &&
748 mdev->state.conn == C_STANDALONE &&
749 mdev->state.role == R_SECONDARY) {
750 set_bit(DEVICE_DYING, &mdev->flags);
751 drbd_thread_stop_nowait(&mdev->worker);
752 } else
753 clear_bit(CONFIG_PENDING, &mdev->flags);
754 spin_unlock_irq(&mdev->req_lock);
755 wake_up(&mdev->state_wait);
756}
757
758/* does always return 0;
759 * interesting return code is in reply->ret_code */
760static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
761 struct drbd_nl_cfg_reply *reply)
762{
763 enum drbd_ret_codes retcode;
764 enum determine_dev_size dd;
765 sector_t max_possible_sectors;
766 sector_t min_md_device_sectors;
767 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
768 struct inode *inode, *inode2;
769 struct lru_cache *resync_lru = NULL;
770 union drbd_state ns, os;
771 int rv;
772 int cp_discovered = 0;
773 int logical_block_size;
774
775 drbd_reconfig_start(mdev);
776
777 /* if you want to reconfigure, please tear down first */
778 if (mdev->state.disk > D_DISKLESS) {
779 retcode = ERR_DISK_CONFIGURED;
780 goto fail;
781 }
782
783 /* allocation not in the IO path, cqueue thread context */
784 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
785 if (!nbc) {
786 retcode = ERR_NOMEM;
787 goto fail;
788 }
789
790 nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
791 nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
792 nbc->dc.fencing = DRBD_FENCING_DEF;
793 nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
794
795 if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
796 retcode = ERR_MANDATORY_TAG;
797 goto fail;
798 }
799
800 if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
801 retcode = ERR_MD_IDX_INVALID;
802 goto fail;
803 }
804
805 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
806 if (IS_ERR(nbc->lo_file)) {
807 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
808 PTR_ERR(nbc->lo_file));
809 nbc->lo_file = NULL;
810 retcode = ERR_OPEN_DISK;
811 goto fail;
812 }
813
814 inode = nbc->lo_file->f_dentry->d_inode;
815
816 if (!S_ISBLK(inode->i_mode)) {
817 retcode = ERR_DISK_NOT_BDEV;
818 goto fail;
819 }
820
821 nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
822 if (IS_ERR(nbc->md_file)) {
823 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
824 PTR_ERR(nbc->md_file));
825 nbc->md_file = NULL;
826 retcode = ERR_OPEN_MD_DISK;
827 goto fail;
828 }
829
830 inode2 = nbc->md_file->f_dentry->d_inode;
831
832 if (!S_ISBLK(inode2->i_mode)) {
833 retcode = ERR_MD_NOT_BDEV;
834 goto fail;
835 }
836
837 nbc->backing_bdev = inode->i_bdev;
838 if (bd_claim(nbc->backing_bdev, mdev)) {
839 printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
840 nbc->backing_bdev, mdev,
841 nbc->backing_bdev->bd_holder,
842 nbc->backing_bdev->bd_contains->bd_holder,
843 nbc->backing_bdev->bd_holders);
844 retcode = ERR_BDCLAIM_DISK;
845 goto fail;
846 }
847
848 resync_lru = lc_create("resync", drbd_bm_ext_cache,
849 61, sizeof(struct bm_extent),
850 offsetof(struct bm_extent, lce));
851 if (!resync_lru) {
852 retcode = ERR_NOMEM;
853 goto release_bdev_fail;
854 }
855
856 /* meta_dev_idx >= 0: external fixed size,
857 * possibly multiple drbd sharing one meta device.
858 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
859 * not yet used by some other drbd minor!
860 * (if you use drbd.conf + drbdadm,
861 * that should check it for you already; but if you don't, or someone
862 * fooled it, we need to double check here) */
863 nbc->md_bdev = inode2->i_bdev;
864 if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
865 : (void *) drbd_m_holder)) {
866 retcode = ERR_BDCLAIM_MD_DISK;
867 goto release_bdev_fail;
868 }
869
870 if ((nbc->backing_bdev == nbc->md_bdev) !=
871 (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
872 nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
873 retcode = ERR_MD_IDX_INVALID;
874 goto release_bdev2_fail;
875 }
876
877 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
878 drbd_md_set_sector_offsets(mdev, nbc);
879
880 if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
881 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
882 (unsigned long long) drbd_get_max_capacity(nbc),
883 (unsigned long long) nbc->dc.disk_size);
884 retcode = ERR_DISK_TO_SMALL;
885 goto release_bdev2_fail;
886 }
887
888 if (nbc->dc.meta_dev_idx < 0) {
889 max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
890 /* at least one MB, otherwise it does not make sense */
891 min_md_device_sectors = (2<<10);
892 } else {
893 max_possible_sectors = DRBD_MAX_SECTORS;
894 min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
895 }
896
897 if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors)
898 dev_warn(DEV, "truncating very big lower level device "
899 "to currently maximum possible %llu sectors\n",
900 (unsigned long long) max_possible_sectors);
901
902 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
903 retcode = ERR_MD_DISK_TO_SMALL;
904 dev_warn(DEV, "refusing attach: md-device too small, "
905 "at least %llu sectors needed for this meta-disk type\n",
906 (unsigned long long) min_md_device_sectors);
907 goto release_bdev2_fail;
908 }
909
910 /* Make sure the new disk is big enough
911 * (we may currently be R_PRIMARY with no local disk...) */
912 if (drbd_get_max_capacity(nbc) <
913 drbd_get_capacity(mdev->this_bdev)) {
914 retcode = ERR_DISK_TO_SMALL;
915 goto release_bdev2_fail;
916 }
917
918 nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
919
920 drbd_suspend_io(mdev);
921 /* also wait for the last barrier ack. */
922 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
923 /* and for any other previously queued work */
924 drbd_flush_workqueue(mdev);
925
926 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
927 drbd_resume_io(mdev);
928 if (retcode < SS_SUCCESS)
929 goto release_bdev2_fail;
930
931 if (!get_ldev_if_state(mdev, D_ATTACHING))
932 goto force_diskless;
933
934 drbd_md_set_sector_offsets(mdev, nbc);
935
936 if (!mdev->bitmap) {
937 if (drbd_bm_init(mdev)) {
938 retcode = ERR_NOMEM;
939 goto force_diskless_dec;
940 }
941 }
942
943 retcode = drbd_md_read(mdev, nbc);
944 if (retcode != NO_ERROR)
945 goto force_diskless_dec;
946
947 if (mdev->state.conn < C_CONNECTED &&
948 mdev->state.role == R_PRIMARY &&
949 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
950 dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
951 (unsigned long long)mdev->ed_uuid);
952 retcode = ERR_DATA_NOT_CURRENT;
953 goto force_diskless_dec;
954 }
955
956 /* Since we are diskless, fix the activity log first... */
957 if (drbd_check_al_size(mdev)) {
958 retcode = ERR_NOMEM;
959 goto force_diskless_dec;
960 }
961
962 /* Prevent shrinking of consistent devices ! */
963 if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
964 drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
965 dev_warn(DEV, "refusing to truncate a consistent device\n");
966 retcode = ERR_DISK_TO_SMALL;
967 goto force_diskless_dec;
968 }
969
970 if (!drbd_al_read_log(mdev, nbc)) {
971 retcode = ERR_IO_MD_DISK;
972 goto force_diskless_dec;
973 }
974
975 /* allocate a second IO page if logical_block_size != 512 */
976 logical_block_size = bdev_logical_block_size(nbc->md_bdev);
977 if (logical_block_size == 0)
978 logical_block_size = MD_SECTOR_SIZE;
979
980 if (logical_block_size != MD_SECTOR_SIZE) {
981 if (!mdev->md_io_tmpp) {
982 struct page *page = alloc_page(GFP_NOIO);
983 if (!page)
984 goto force_diskless_dec;
985
986 dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
987 logical_block_size, MD_SECTOR_SIZE);
988 dev_warn(DEV, "Workaround engaged (has performance impact).\n");
989
990 mdev->md_io_tmpp = page;
991 }
992 }
993
994 /* Reset the "barriers don't work" bits here, then force meta data to
995 * be written, to ensure we determine if barriers are supported. */
996 if (nbc->dc.no_md_flush)
997 set_bit(MD_NO_BARRIER, &mdev->flags);
998 else
999 clear_bit(MD_NO_BARRIER, &mdev->flags);
1000
1001 /* Point of no return reached.
1002 * Devices and memory are no longer released by error cleanup below.
1003 * now mdev takes over responsibility, and the state engine should
1004 * clean it up somewhere. */
1005 D_ASSERT(mdev->ldev == NULL);
1006 mdev->ldev = nbc;
1007 mdev->resync = resync_lru;
1008 nbc = NULL;
1009 resync_lru = NULL;
1010
1011 mdev->write_ordering = WO_bio_barrier;
1012 drbd_bump_write_ordering(mdev, WO_bio_barrier);
1013
1014 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1015 set_bit(CRASHED_PRIMARY, &mdev->flags);
1016 else
1017 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1018
1019 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
1020 set_bit(CRASHED_PRIMARY, &mdev->flags);
1021 cp_discovered = 1;
1022 }
1023
1024 mdev->send_cnt = 0;
1025 mdev->recv_cnt = 0;
1026 mdev->read_cnt = 0;
1027 mdev->writ_cnt = 0;
1028
1029 drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
1030
1031 /* If I am currently not R_PRIMARY,
1032 * but meta data primary indicator is set,
1033 * I just now recover from a hard crash,
1034 * and have been R_PRIMARY before that crash.
1035 *
1036 * Now, if I had no connection before that crash
1037 * (have been degraded R_PRIMARY), chances are that
1038 * I won't find my peer now either.
1039 *
1040 * In that case, and _only_ in that case,
1041 * we use the degr-wfc-timeout instead of the default,
1042 * so we can automatically recover from a crash of a
1043 * degraded but active "cluster" after a certain timeout.
1044 */
1045 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
1046 if (mdev->state.role != R_PRIMARY &&
1047 drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1048 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1049 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1050
1051 dd = drbd_determin_dev_size(mdev);
1052 if (dd == dev_size_error) {
1053 retcode = ERR_NOMEM_BITMAP;
1054 goto force_diskless_dec;
1055 } else if (dd == grew)
1056 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1057
1058 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1059 dev_info(DEV, "Assuming that all blocks are out of sync "
1060 "(aka FullSync)\n");
1061 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
1062 retcode = ERR_IO_MD_DISK;
1063 goto force_diskless_dec;
1064 }
1065 } else {
1066 if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
1067 retcode = ERR_IO_MD_DISK;
1068 goto force_diskless_dec;
1069 }
1070 }
1071
1072 if (cp_discovered) {
1073 drbd_al_apply_to_bm(mdev);
1074 drbd_al_to_on_disk_bm(mdev);
1075 }
1076
1077 spin_lock_irq(&mdev->req_lock);
1078 os = mdev->state;
1079 ns.i = os.i;
1080 /* If MDF_CONSISTENT is not set go into inconsistent state,
1081 otherwise investigate MDF_WasUpToDate...
1082 If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1083 otherwise into D_CONSISTENT state.
1084 */
1085 if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
1086 if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
1087 ns.disk = D_CONSISTENT;
1088 else
1089 ns.disk = D_OUTDATED;
1090 } else {
1091 ns.disk = D_INCONSISTENT;
1092 }
1093
1094 if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
1095 ns.pdsk = D_OUTDATED;
1096
1097 if ( ns.disk == D_CONSISTENT &&
1098 (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
1099 ns.disk = D_UP_TO_DATE;
1100
1101 /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1102 MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1103 this point, because drbd_request_state() modifies these
1104 flags. */
1105
1106 /* In case we are C_CONNECTED postpone any decision on the new disk
1107 state after the negotiation phase. */
1108 if (mdev->state.conn == C_CONNECTED) {
1109 mdev->new_state_tmp.i = ns.i;
1110 ns.i = os.i;
1111 ns.disk = D_NEGOTIATING;
1112 }
1113
1114 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1115 ns = mdev->state;
1116 spin_unlock_irq(&mdev->req_lock);
1117
1118 if (rv < SS_SUCCESS)
1119 goto force_diskless_dec;
1120
1121 if (mdev->state.role == R_PRIMARY)
1122 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
1123 else
1124 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1125
1126 drbd_md_mark_dirty(mdev);
1127 drbd_md_sync(mdev);
1128
1129 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1130 put_ldev(mdev);
1131 reply->ret_code = retcode;
1132 drbd_reconfig_done(mdev);
1133 return 0;
1134
1135 force_diskless_dec:
1136 put_ldev(mdev);
1137 force_diskless:
1138 drbd_force_state(mdev, NS(disk, D_DISKLESS));
1139 drbd_md_sync(mdev);
1140 release_bdev2_fail:
1141 if (nbc)
1142 bd_release(nbc->md_bdev);
1143 release_bdev_fail:
1144 if (nbc)
1145 bd_release(nbc->backing_bdev);
1146 fail:
1147 if (nbc) {
1148 if (nbc->lo_file)
1149 fput(nbc->lo_file);
1150 if (nbc->md_file)
1151 fput(nbc->md_file);
1152 kfree(nbc);
1153 }
1154 lc_destroy(resync_lru);
1155
1156 reply->ret_code = retcode;
1157 drbd_reconfig_done(mdev);
1158 return 0;
1159}
1160
1161static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1162 struct drbd_nl_cfg_reply *reply)
1163{
1164 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1165 return 0;
1166}
1167
1168static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1169 struct drbd_nl_cfg_reply *reply)
1170{
1171 int i, ns;
1172 enum drbd_ret_codes retcode;
1173 struct net_conf *new_conf = NULL;
1174 struct crypto_hash *tfm = NULL;
1175 struct crypto_hash *integrity_w_tfm = NULL;
1176 struct crypto_hash *integrity_r_tfm = NULL;
1177 struct hlist_head *new_tl_hash = NULL;
1178 struct hlist_head *new_ee_hash = NULL;
1179 struct drbd_conf *odev;
1180 char hmac_name[CRYPTO_MAX_ALG_NAME];
1181 void *int_dig_out = NULL;
1182 void *int_dig_in = NULL;
1183 void *int_dig_vv = NULL;
1184 struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
1185
1186 drbd_reconfig_start(mdev);
1187
1188 if (mdev->state.conn > C_STANDALONE) {
1189 retcode = ERR_NET_CONFIGURED;
1190 goto fail;
1191 }
1192
1193 /* allocation not in the IO path, cqueue thread context */
1194 new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
1195 if (!new_conf) {
1196 retcode = ERR_NOMEM;
1197 goto fail;
1198 }
1199
1200 memset(new_conf, 0, sizeof(struct net_conf));
1201 new_conf->timeout = DRBD_TIMEOUT_DEF;
1202 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
1203 new_conf->ping_int = DRBD_PING_INT_DEF;
1204 new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
1205 new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
1206 new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
1207 new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
1208 new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
1209 new_conf->ko_count = DRBD_KO_COUNT_DEF;
1210 new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
1211 new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
1212 new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
1213 new_conf->want_lose = 0;
1214 new_conf->two_primaries = 0;
1215 new_conf->wire_protocol = DRBD_PROT_C;
1216 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
1217 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
1218
1219 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
1220 retcode = ERR_MANDATORY_TAG;
1221 goto fail;
1222 }
1223
1224 if (new_conf->two_primaries
1225 && (new_conf->wire_protocol != DRBD_PROT_C)) {
1226 retcode = ERR_NOT_PROTO_C;
1227 goto fail;
1228 };
1229
1230 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1231 retcode = ERR_DISCARD;
1232 goto fail;
1233 }
1234
1235 retcode = NO_ERROR;
1236
1237 new_my_addr = (struct sockaddr *)&new_conf->my_addr;
1238 new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
1239 for (i = 0; i < minor_count; i++) {
1240 odev = minor_to_mdev(i);
1241 if (!odev || odev == mdev)
1242 continue;
1243 if (get_net_conf(odev)) {
1244 taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
1245 if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
1246 !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
1247 retcode = ERR_LOCAL_ADDR;
1248
1249 taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
1250 if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
1251 !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
1252 retcode = ERR_PEER_ADDR;
1253
1254 put_net_conf(odev);
1255 if (retcode != NO_ERROR)
1256 goto fail;
1257 }
1258 }
1259
1260 if (new_conf->cram_hmac_alg[0] != 0) {
1261 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
1262 new_conf->cram_hmac_alg);
1263 tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
1264 if (IS_ERR(tfm)) {
1265 tfm = NULL;
1266 retcode = ERR_AUTH_ALG;
1267 goto fail;
1268 }
1269
1270 if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
1271 != CRYPTO_ALG_TYPE_HASH) {
1272 retcode = ERR_AUTH_ALG_ND;
1273 goto fail;
1274 }
1275 }
1276
1277 if (new_conf->integrity_alg[0]) {
1278 integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1279 if (IS_ERR(integrity_w_tfm)) {
1280 integrity_w_tfm = NULL;
1281 retcode=ERR_INTEGRITY_ALG;
1282 goto fail;
1283 }
1284
1285 if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
1286 retcode=ERR_INTEGRITY_ALG_ND;
1287 goto fail;
1288 }
1289
1290 integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1291 if (IS_ERR(integrity_r_tfm)) {
1292 integrity_r_tfm = NULL;
1293 retcode=ERR_INTEGRITY_ALG;
1294 goto fail;
1295 }
1296 }
1297
1298 ns = new_conf->max_epoch_size/8;
1299 if (mdev->tl_hash_s != ns) {
1300 new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1301 if (!new_tl_hash) {
1302 retcode = ERR_NOMEM;
1303 goto fail;
1304 }
1305 }
1306
1307 ns = new_conf->max_buffers/8;
1308 if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
1309 new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1310 if (!new_ee_hash) {
1311 retcode = ERR_NOMEM;
1312 goto fail;
1313 }
1314 }
1315
1316 ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
1317
1318 if (integrity_w_tfm) {
1319 i = crypto_hash_digestsize(integrity_w_tfm);
1320 int_dig_out = kmalloc(i, GFP_KERNEL);
1321 if (!int_dig_out) {
1322 retcode = ERR_NOMEM;
1323 goto fail;
1324 }
1325 int_dig_in = kmalloc(i, GFP_KERNEL);
1326 if (!int_dig_in) {
1327 retcode = ERR_NOMEM;
1328 goto fail;
1329 }
1330 int_dig_vv = kmalloc(i, GFP_KERNEL);
1331 if (!int_dig_vv) {
1332 retcode = ERR_NOMEM;
1333 goto fail;
1334 }
1335 }
1336
1337 if (!mdev->bitmap) {
1338 if(drbd_bm_init(mdev)) {
1339 retcode = ERR_NOMEM;
1340 goto fail;
1341 }
1342 }
1343
1344 spin_lock_irq(&mdev->req_lock);
1345 if (mdev->net_conf != NULL) {
1346 retcode = ERR_NET_CONFIGURED;
1347 spin_unlock_irq(&mdev->req_lock);
1348 goto fail;
1349 }
1350 mdev->net_conf = new_conf;
1351
1352 mdev->send_cnt = 0;
1353 mdev->recv_cnt = 0;
1354
1355 if (new_tl_hash) {
1356 kfree(mdev->tl_hash);
1357 mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
1358 mdev->tl_hash = new_tl_hash;
1359 }
1360
1361 if (new_ee_hash) {
1362 kfree(mdev->ee_hash);
1363 mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
1364 mdev->ee_hash = new_ee_hash;
1365 }
1366
1367 crypto_free_hash(mdev->cram_hmac_tfm);
1368 mdev->cram_hmac_tfm = tfm;
1369
1370 crypto_free_hash(mdev->integrity_w_tfm);
1371 mdev->integrity_w_tfm = integrity_w_tfm;
1372
1373 crypto_free_hash(mdev->integrity_r_tfm);
1374 mdev->integrity_r_tfm = integrity_r_tfm;
1375
1376 kfree(mdev->int_dig_out);
1377 kfree(mdev->int_dig_in);
1378 kfree(mdev->int_dig_vv);
1379 mdev->int_dig_out=int_dig_out;
1380 mdev->int_dig_in=int_dig_in;
1381 mdev->int_dig_vv=int_dig_vv;
1382 spin_unlock_irq(&mdev->req_lock);
1383
1384 retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
1385
1386 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1387 reply->ret_code = retcode;
1388 drbd_reconfig_done(mdev);
1389 return 0;
1390
1391fail:
1392 kfree(int_dig_out);
1393 kfree(int_dig_in);
1394 kfree(int_dig_vv);
1395 crypto_free_hash(tfm);
1396 crypto_free_hash(integrity_w_tfm);
1397 crypto_free_hash(integrity_r_tfm);
1398 kfree(new_tl_hash);
1399 kfree(new_ee_hash);
1400 kfree(new_conf);
1401
1402 reply->ret_code = retcode;
1403 drbd_reconfig_done(mdev);
1404 return 0;
1405}
1406
1407static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1408 struct drbd_nl_cfg_reply *reply)
1409{
1410 int retcode;
1411
1412 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1413
1414 if (retcode == SS_NOTHING_TO_DO)
1415 goto done;
1416 else if (retcode == SS_ALREADY_STANDALONE)
1417 goto done;
1418 else if (retcode == SS_PRIMARY_NOP) {
1419 /* Our statche checking code wants to see the peer outdated. */
1420 retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1421 pdsk, D_OUTDATED));
1422 } else if (retcode == SS_CW_FAILED_BY_PEER) {
1423 /* The peer probably wants to see us outdated. */
1424 retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1425 disk, D_OUTDATED),
1426 CS_ORDERED);
1427 if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
1428 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1429 retcode = SS_SUCCESS;
1430 }
1431 }
1432
1433 if (retcode < SS_SUCCESS)
1434 goto fail;
1435
1436 if (wait_event_interruptible(mdev->state_wait,
1437 mdev->state.conn != C_DISCONNECTING)) {
1438 /* Do not test for mdev->state.conn == C_STANDALONE, since
1439 someone else might connect us in the mean time! */
1440 retcode = ERR_INTR;
1441 goto fail;
1442 }
1443
1444 done:
1445 retcode = NO_ERROR;
1446 fail:
1447 drbd_md_sync(mdev);
1448 reply->ret_code = retcode;
1449 return 0;
1450}
1451
1452void resync_after_online_grow(struct drbd_conf *mdev)
1453{
1454 int iass; /* I am sync source */
1455
1456 dev_info(DEV, "Resync of new storage after online grow\n");
1457 if (mdev->state.role != mdev->state.peer)
1458 iass = (mdev->state.role == R_PRIMARY);
1459 else
1460 iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1461
1462 if (iass)
1463 drbd_start_resync(mdev, C_SYNC_SOURCE);
1464 else
1465 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
1466}
1467
1468static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1469 struct drbd_nl_cfg_reply *reply)
1470{
1471 struct resize rs;
1472 int retcode = NO_ERROR;
1473 int ldsc = 0; /* local disk size changed */
1474 enum determine_dev_size dd;
1475
1476 memset(&rs, 0, sizeof(struct resize));
1477 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
1478 retcode = ERR_MANDATORY_TAG;
1479 goto fail;
1480 }
1481
1482 if (mdev->state.conn > C_CONNECTED) {
1483 retcode = ERR_RESIZE_RESYNC;
1484 goto fail;
1485 }
1486
1487 if (mdev->state.role == R_SECONDARY &&
1488 mdev->state.peer == R_SECONDARY) {
1489 retcode = ERR_NO_PRIMARY;
1490 goto fail;
1491 }
1492
1493 if (!get_ldev(mdev)) {
1494 retcode = ERR_NO_DISK;
1495 goto fail;
1496 }
1497
1498 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
1499 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1500 ldsc = 1;
1501 }
1502
1503 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1504 dd = drbd_determin_dev_size(mdev);
1505 drbd_md_sync(mdev);
1506 put_ldev(mdev);
1507 if (dd == dev_size_error) {
1508 retcode = ERR_NOMEM_BITMAP;
1509 goto fail;
1510 }
1511
1512 if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
1513 if (dd == grew)
1514 set_bit(RESIZE_PENDING, &mdev->flags);
1515
1516 drbd_send_uuids(mdev);
1517 drbd_send_sizes(mdev, 1);
1518 }
1519
1520 fail:
1521 reply->ret_code = retcode;
1522 return 0;
1523}
1524
1525static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1526 struct drbd_nl_cfg_reply *reply)
1527{
1528 int retcode = NO_ERROR;
1529 int err;
1530 int ovr; /* online verify running */
1531 int rsr; /* re-sync running */
1532 struct crypto_hash *verify_tfm = NULL;
1533 struct crypto_hash *csums_tfm = NULL;
1534 struct syncer_conf sc;
1535 cpumask_var_t new_cpu_mask;
1536
1537 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1538 retcode = ERR_NOMEM;
1539 goto fail;
1540 }
1541
1542 if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
1543 memset(&sc, 0, sizeof(struct syncer_conf));
1544 sc.rate = DRBD_RATE_DEF;
1545 sc.after = DRBD_AFTER_DEF;
1546 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1547 } else
1548 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1549
1550 if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
1551 retcode = ERR_MANDATORY_TAG;
1552 goto fail;
1553 }
1554
1555 /* re-sync running */
1556 rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
1557 mdev->state.conn == C_SYNC_TARGET ||
1558 mdev->state.conn == C_PAUSED_SYNC_S ||
1559 mdev->state.conn == C_PAUSED_SYNC_T );
1560
1561 if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
1562 retcode = ERR_CSUMS_RESYNC_RUNNING;
1563 goto fail;
1564 }
1565
1566 if (!rsr && sc.csums_alg[0]) {
1567 csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
1568 if (IS_ERR(csums_tfm)) {
1569 csums_tfm = NULL;
1570 retcode = ERR_CSUMS_ALG;
1571 goto fail;
1572 }
1573
1574 if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
1575 retcode = ERR_CSUMS_ALG_ND;
1576 goto fail;
1577 }
1578 }
1579
1580 /* online verify running */
1581 ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
1582
1583 if (ovr) {
1584 if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
1585 retcode = ERR_VERIFY_RUNNING;
1586 goto fail;
1587 }
1588 }
1589
1590 if (!ovr && sc.verify_alg[0]) {
1591 verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
1592 if (IS_ERR(verify_tfm)) {
1593 verify_tfm = NULL;
1594 retcode = ERR_VERIFY_ALG;
1595 goto fail;
1596 }
1597
1598 if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
1599 retcode = ERR_VERIFY_ALG_ND;
1600 goto fail;
1601 }
1602 }
1603
1604 /* silently ignore cpu mask on UP kernel */
1605 if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
1606 err = __bitmap_parse(sc.cpu_mask, 32, 0,
1607 cpumask_bits(new_cpu_mask), nr_cpu_ids);
1608 if (err) {
1609 dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
1610 retcode = ERR_CPU_MASK_PARSE;
1611 goto fail;
1612 }
1613 }
1614
1615 ERR_IF (sc.rate < 1) sc.rate = 1;
1616 ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
1617#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
1618 if (sc.al_extents > AL_MAX) {
1619 dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
1620 sc.al_extents = AL_MAX;
1621 }
1622#undef AL_MAX
1623
1624 /* most sanity checks done, try to assign the new sync-after
1625 * dependency. need to hold the global lock in there,
1626 * to avoid a race in the dependency loop check. */
1627 retcode = drbd_alter_sa(mdev, sc.after);
1628 if (retcode != NO_ERROR)
1629 goto fail;
1630
1631 /* ok, assign the rest of it as well.
1632 * lock against receive_SyncParam() */
1633 spin_lock(&mdev->peer_seq_lock);
1634 mdev->sync_conf = sc;
1635
1636 if (!rsr) {
1637 crypto_free_hash(mdev->csums_tfm);
1638 mdev->csums_tfm = csums_tfm;
1639 csums_tfm = NULL;
1640 }
1641
1642 if (!ovr) {
1643 crypto_free_hash(mdev->verify_tfm);
1644 mdev->verify_tfm = verify_tfm;
1645 verify_tfm = NULL;
1646 }
1647 spin_unlock(&mdev->peer_seq_lock);
1648
1649 if (get_ldev(mdev)) {
1650 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
1651 drbd_al_shrink(mdev);
1652 err = drbd_check_al_size(mdev);
1653 lc_unlock(mdev->act_log);
1654 wake_up(&mdev->al_wait);
1655
1656 put_ldev(mdev);
1657 drbd_md_sync(mdev);
1658
1659 if (err) {
1660 retcode = ERR_NOMEM;
1661 goto fail;
1662 }
1663 }
1664
1665 if (mdev->state.conn >= C_CONNECTED)
1666 drbd_send_sync_param(mdev, &sc);
1667
1668 if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
1669 cpumask_copy(mdev->cpu_mask, new_cpu_mask);
1670 drbd_calc_cpu_mask(mdev);
1671 mdev->receiver.reset_cpu_mask = 1;
1672 mdev->asender.reset_cpu_mask = 1;
1673 mdev->worker.reset_cpu_mask = 1;
1674 }
1675
1676 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1677fail:
1678 free_cpumask_var(new_cpu_mask);
1679 crypto_free_hash(csums_tfm);
1680 crypto_free_hash(verify_tfm);
1681 reply->ret_code = retcode;
1682 return 0;
1683}
1684
1685static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1686 struct drbd_nl_cfg_reply *reply)
1687{
1688 int retcode;
1689
1690 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1691
1692 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
1693 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1694
1695 while (retcode == SS_NEED_CONNECTION) {
1696 spin_lock_irq(&mdev->req_lock);
1697 if (mdev->state.conn < C_CONNECTED)
1698 retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
1699 spin_unlock_irq(&mdev->req_lock);
1700
1701 if (retcode != SS_NEED_CONNECTION)
1702 break;
1703
1704 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1705 }
1706
1707 reply->ret_code = retcode;
1708 return 0;
1709}
1710
1711static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1712 struct drbd_nl_cfg_reply *reply)
1713{
1714
1715 reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
1716
1717 return 0;
1718}
1719
1720static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1721 struct drbd_nl_cfg_reply *reply)
1722{
1723 int retcode = NO_ERROR;
1724
1725 if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
1726 retcode = ERR_PAUSE_IS_SET;
1727
1728 reply->ret_code = retcode;
1729 return 0;
1730}
1731
1732static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1733 struct drbd_nl_cfg_reply *reply)
1734{
1735 int retcode = NO_ERROR;
1736
1737 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
1738 retcode = ERR_PAUSE_IS_CLEAR;
1739
1740 reply->ret_code = retcode;
1741 return 0;
1742}
1743
1744static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1745 struct drbd_nl_cfg_reply *reply)
1746{
1747 reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
1748
1749 return 0;
1750}
1751
1752static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1753 struct drbd_nl_cfg_reply *reply)
1754{
1755 reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
1756 return 0;
1757}
1758
1759static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1760 struct drbd_nl_cfg_reply *reply)
1761{
1762 reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
1763 return 0;
1764}
1765
1766static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1767 struct drbd_nl_cfg_reply *reply)
1768{
1769 unsigned short *tl;
1770
1771 tl = reply->tag_list;
1772
1773 if (get_ldev(mdev)) {
1774 tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
1775 put_ldev(mdev);
1776 }
1777
1778 if (get_net_conf(mdev)) {
1779 tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
1780 put_net_conf(mdev);
1781 }
1782 tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
1783
1784 put_unaligned(TT_END, tl++); /* Close the tag list */
1785
1786 return (int)((char *)tl - (char *)reply->tag_list);
1787}
1788
1789static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1790 struct drbd_nl_cfg_reply *reply)
1791{
1792 unsigned short *tl = reply->tag_list;
1793 union drbd_state s = mdev->state;
1794 unsigned long rs_left;
1795 unsigned int res;
1796
1797 tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
1798
1799 /* no local ref, no bitmap, no syncer progress. */
1800 if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
1801 if (get_ldev(mdev)) {
1802 drbd_get_syncer_progress(mdev, &rs_left, &res);
1803 tl = tl_add_int(tl, T_sync_progress, &res);
1804 put_ldev(mdev);
1805 }
1806 }
1807 put_unaligned(TT_END, tl++); /* Close the tag list */
1808
1809 return (int)((char *)tl - (char *)reply->tag_list);
1810}
1811
1812static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1813 struct drbd_nl_cfg_reply *reply)
1814{
1815 unsigned short *tl;
1816
1817 tl = reply->tag_list;
1818
1819 if (get_ldev(mdev)) {
1820 tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
1821 tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
1822 put_ldev(mdev);
1823 }
1824 put_unaligned(TT_END, tl++); /* Close the tag list */
1825
1826 return (int)((char *)tl - (char *)reply->tag_list);
1827}
1828
1829/**
1830 * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
1831 * @mdev: DRBD device.
1832 * @nlp: Netlink/connector packet from drbdsetup
1833 * @reply: Reply packet for drbdsetup
1834 */
1835static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1836 struct drbd_nl_cfg_reply *reply)
1837{
1838 unsigned short *tl;
1839 char rv;
1840
1841 tl = reply->tag_list;
1842
1843 rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
1844 test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
1845
1846 tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
1847 put_unaligned(TT_END, tl++); /* Close the tag list */
1848
1849 return (int)((char *)tl - (char *)reply->tag_list);
1850}
1851
1852static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1853 struct drbd_nl_cfg_reply *reply)
1854{
1855 /* default to resume from last known position, if possible */
1856 struct start_ov args =
1857 { .start_sector = mdev->ov_start_sector };
1858
1859 if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
1860 reply->ret_code = ERR_MANDATORY_TAG;
1861 return 0;
1862 }
1863 /* w_make_ov_request expects position to be aligned */
1864 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
1865 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
1866 return 0;
1867}
1868
1869
1870static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1871 struct drbd_nl_cfg_reply *reply)
1872{
1873 int retcode = NO_ERROR;
1874 int skip_initial_sync = 0;
1875 int err;
1876
1877 struct new_c_uuid args;
1878
1879 memset(&args, 0, sizeof(struct new_c_uuid));
1880 if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
1881 reply->ret_code = ERR_MANDATORY_TAG;
1882 return 0;
1883 }
1884
1885 mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
1886
1887 if (!get_ldev(mdev)) {
1888 retcode = ERR_NO_DISK;
1889 goto out;
1890 }
1891
1892 /* this is "skip initial sync", assume to be clean */
1893 if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
1894 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
1895 dev_info(DEV, "Preparing to skip initial sync\n");
1896 skip_initial_sync = 1;
1897 } else if (mdev->state.conn != C_STANDALONE) {
1898 retcode = ERR_CONNECTED;
1899 goto out_dec;
1900 }
1901
1902 drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
1903 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
1904
1905 if (args.clear_bm) {
1906 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
1907 if (err) {
1908 dev_err(DEV, "Writing bitmap failed with %d\n",err);
1909 retcode = ERR_IO_MD_DISK;
1910 }
1911 if (skip_initial_sync) {
1912 drbd_send_uuids_skip_initial_sync(mdev);
1913 _drbd_uuid_set(mdev, UI_BITMAP, 0);
1914 spin_lock_irq(&mdev->req_lock);
1915 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
1916 CS_VERBOSE, NULL);
1917 spin_unlock_irq(&mdev->req_lock);
1918 }
1919 }
1920
1921 drbd_md_sync(mdev);
1922out_dec:
1923 put_ldev(mdev);
1924out:
1925 mutex_unlock(&mdev->state_mutex);
1926
1927 reply->ret_code = retcode;
1928 return 0;
1929}
1930
1931static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
1932{
1933 struct drbd_conf *mdev;
1934
1935 if (nlp->drbd_minor >= minor_count)
1936 return NULL;
1937
1938 mdev = minor_to_mdev(nlp->drbd_minor);
1939
1940 if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
1941 struct gendisk *disk = NULL;
1942 mdev = drbd_new_device(nlp->drbd_minor);
1943
1944 spin_lock_irq(&drbd_pp_lock);
1945 if (minor_table[nlp->drbd_minor] == NULL) {
1946 minor_table[nlp->drbd_minor] = mdev;
1947 disk = mdev->vdisk;
1948 mdev = NULL;
1949 } /* else: we lost the race */
1950 spin_unlock_irq(&drbd_pp_lock);
1951
1952 if (disk) /* we won the race above */
1953 /* in case we ever add a drbd_delete_device(),
1954 * don't forget the del_gendisk! */
1955 add_disk(disk);
1956 else /* we lost the race above */
1957 drbd_free_mdev(mdev);
1958
1959 mdev = minor_to_mdev(nlp->drbd_minor);
1960 }
1961
1962 return mdev;
1963}
1964
/*
 * One entry of the request dispatch table.
 *
 * @function:        handler invoked for the packet type; its return value is
 *                   added to the reply length by the dispatcher.
 * @reply_body_size: extra bytes reserved in the reply buffer for this
 *                   packet type (0 for replies carrying only a return code).
 */
struct cn_handler_struct {
	int (*function)(struct drbd_conf *mdev,
			struct drbd_nl_cfg_req *nlp,
			struct drbd_nl_cfg_reply *reply);
	int reply_body_size;
};
1971
1972static struct cn_handler_struct cnd_table[] = {
1973 [ P_primary ] = { &drbd_nl_primary, 0 },
1974 [ P_secondary ] = { &drbd_nl_secondary, 0 },
1975 [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
1976 [ P_detach ] = { &drbd_nl_detach, 0 },
1977 [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
1978 [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
1979 [ P_resize ] = { &drbd_nl_resize, 0 },
1980 [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
1981 [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
1982 [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
1983 [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
1984 [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
1985 [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
1986 [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
1987 [ P_outdate ] = { &drbd_nl_outdate, 0 },
1988 [ P_get_config ] = { &drbd_nl_get_config,
1989 sizeof(struct syncer_conf_tag_len_struct) +
1990 sizeof(struct disk_conf_tag_len_struct) +
1991 sizeof(struct net_conf_tag_len_struct) },
1992 [ P_get_state ] = { &drbd_nl_get_state,
1993 sizeof(struct get_state_tag_len_struct) +
1994 sizeof(struct sync_progress_tag_len_struct) },
1995 [ P_get_uuids ] = { &drbd_nl_get_uuids,
1996 sizeof(struct get_uuids_tag_len_struct) },
1997 [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
1998 sizeof(struct get_timeout_flag_tag_len_struct)},
1999 [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
2000 [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
2001};
2002
2003static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
2004{
2005 struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
2006 struct cn_handler_struct *cm;
2007 struct cn_msg *cn_reply;
2008 struct drbd_nl_cfg_reply *reply;
2009 struct drbd_conf *mdev;
2010 int retcode, rr;
2011 int reply_size = sizeof(struct cn_msg)
2012 + sizeof(struct drbd_nl_cfg_reply)
2013 + sizeof(short int);
2014
2015 if (!try_module_get(THIS_MODULE)) {
2016 printk(KERN_ERR "drbd: try_module_get() failed!\n");
2017 return;
2018 }
2019
2020 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
2021 retcode = ERR_PERM;
2022 goto fail;
2023 }
2024
2025 mdev = ensure_mdev(nlp);
2026 if (!mdev) {
2027 retcode = ERR_MINOR_INVALID;
2028 goto fail;
2029 }
2030
2031 if (nlp->packet_type >= P_nl_after_last_packet) {
2032 retcode = ERR_PACKET_NR;
2033 goto fail;
2034 }
2035
2036 cm = cnd_table + nlp->packet_type;
2037
2038 /* This may happen if packet number is 0: */
2039 if (cm->function == NULL) {
2040 retcode = ERR_PACKET_NR;
2041 goto fail;
2042 }
2043
2044 reply_size += cm->reply_body_size;
2045
2046 /* allocation not in the IO path, cqueue thread context */
2047 cn_reply = kmalloc(reply_size, GFP_KERNEL);
2048 if (!cn_reply) {
2049 retcode = ERR_NOMEM;
2050 goto fail;
2051 }
2052 reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
2053
2054 reply->packet_type =
2055 cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
2056 reply->minor = nlp->drbd_minor;
2057 reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
2058 /* reply->tag_list; might be modified by cm->function. */
2059
2060 rr = cm->function(mdev, nlp, reply);
2061
2062 cn_reply->id = req->id;
2063 cn_reply->seq = req->seq;
2064 cn_reply->ack = req->ack + 1;
2065 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
2066 cn_reply->flags = 0;
2067
2068 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
2069 if (rr && rr != -ESRCH)
2070 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2071
2072 kfree(cn_reply);
2073 module_put(THIS_MODULE);
2074 return;
2075 fail:
2076 drbd_nl_send_reply(req, retcode);
2077 module_put(THIS_MODULE);
2078}
2079
/* Sequence number source for the broadcast messages built in this file;
 * each broadcast takes the next value via atomic_add_return().
 * Starts at two. */
static atomic_t drbd_nl_seq = ATOMIC_INIT(2);
2081
2082static unsigned short *
2083__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
2084 unsigned short len, int nul_terminated)
2085{
2086 unsigned short l = tag_descriptions[tag_number(tag)].max_len;
2087 len = (len < l) ? len : l;
2088 put_unaligned(tag, tl++);
2089 put_unaligned(len, tl++);
2090 memcpy(tl, data, len);
2091 tl = (unsigned short*)((char*)tl + len);
2092 if (nul_terminated)
2093 *((char*)tl - 1) = 0;
2094 return tl;
2095}
2096
2097static unsigned short *
2098tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
2099{
2100 return __tl_add_blob(tl, tag, data, len, 0);
2101}
2102
2103static unsigned short *
2104tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
2105{
2106 return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
2107}
2108
2109static unsigned short *
2110tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
2111{
2112 put_unaligned(tag, tl++);
2113 switch(tag_type(tag)) {
2114 case TT_INTEGER:
2115 put_unaligned(sizeof(int), tl++);
2116 put_unaligned(*(int *)val, (int *)tl);
2117 tl = (unsigned short*)((char*)tl+sizeof(int));
2118 break;
2119 case TT_INT64:
2120 put_unaligned(sizeof(u64), tl++);
2121 put_unaligned(*(u64 *)val, (u64 *)tl);
2122 tl = (unsigned short*)((char*)tl+sizeof(u64));
2123 break;
2124 default:
2125 /* someone did something stupid. */
2126 ;
2127 }
2128 return tl;
2129}
2130
2131void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
2132{
2133 char buffer[sizeof(struct cn_msg)+
2134 sizeof(struct drbd_nl_cfg_reply)+
2135 sizeof(struct get_state_tag_len_struct)+
2136 sizeof(short int)];
2137 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2138 struct drbd_nl_cfg_reply *reply =
2139 (struct drbd_nl_cfg_reply *)cn_reply->data;
2140 unsigned short *tl = reply->tag_list;
2141
2142 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2143
2144 tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
2145
2146 put_unaligned(TT_END, tl++); /* Close the tag list */
2147
2148 cn_reply->id.idx = CN_IDX_DRBD;
2149 cn_reply->id.val = CN_VAL_DRBD;
2150
2151 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2152 cn_reply->ack = 0; /* not used here. */
2153 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2154 (int)((char *)tl - (char *)reply->tag_list);
2155 cn_reply->flags = 0;
2156
2157 reply->packet_type = P_get_state;
2158 reply->minor = mdev_to_minor(mdev);
2159 reply->ret_code = NO_ERROR;
2160
2161 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2162}
2163
2164void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
2165{
2166 char buffer[sizeof(struct cn_msg)+
2167 sizeof(struct drbd_nl_cfg_reply)+
2168 sizeof(struct call_helper_tag_len_struct)+
2169 sizeof(short int)];
2170 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2171 struct drbd_nl_cfg_reply *reply =
2172 (struct drbd_nl_cfg_reply *)cn_reply->data;
2173 unsigned short *tl = reply->tag_list;
2174
2175 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2176
2177 tl = tl_add_str(tl, T_helper, helper_name);
2178 put_unaligned(TT_END, tl++); /* Close the tag list */
2179
2180 cn_reply->id.idx = CN_IDX_DRBD;
2181 cn_reply->id.val = CN_VAL_DRBD;
2182
2183 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2184 cn_reply->ack = 0; /* not used here. */
2185 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2186 (int)((char *)tl - (char *)reply->tag_list);
2187 cn_reply->flags = 0;
2188
2189 reply->packet_type = P_call_helper;
2190 reply->minor = mdev_to_minor(mdev);
2191 reply->ret_code = NO_ERROR;
2192
2193 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2194}
2195
2196void drbd_bcast_ee(struct drbd_conf *mdev,
2197 const char *reason, const int dgs,
2198 const char* seen_hash, const char* calc_hash,
2199 const struct drbd_epoch_entry* e)
2200{
2201 struct cn_msg *cn_reply;
2202 struct drbd_nl_cfg_reply *reply;
2203 struct bio_vec *bvec;
2204 unsigned short *tl;
2205 int i;
2206
2207 if (!e)
2208 return;
2209 if (!reason || !reason[0])
2210 return;
2211
2212 /* apparently we have to memcpy twice, first to prepare the data for the
2213 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
2214 * netlink skb. */
2215 /* receiver thread context, which is not in the writeout path (of this node),
2216 * but may be in the writeout path of the _other_ node.
2217 * GFP_NOIO to avoid potential "distributed deadlock". */
2218 cn_reply = kmalloc(
2219 sizeof(struct cn_msg)+
2220 sizeof(struct drbd_nl_cfg_reply)+
2221 sizeof(struct dump_ee_tag_len_struct)+
2222 sizeof(short int),
2223 GFP_NOIO);
2224
2225 if (!cn_reply) {
2226 dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
2227 (unsigned long long)e->sector, e->size);
2228 return;
2229 }
2230
2231 reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
2232 tl = reply->tag_list;
2233
2234 tl = tl_add_str(tl, T_dump_ee_reason, reason);
2235 tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
2236 tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
2237 tl = tl_add_int(tl, T_ee_sector, &e->sector);
2238 tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
2239
2240 put_unaligned(T_ee_data, tl++);
2241 put_unaligned(e->size, tl++);
2242
2243 __bio_for_each_segment(bvec, e->private_bio, i, 0) {
2244 void *d = kmap(bvec->bv_page);
2245 memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
2246 kunmap(bvec->bv_page);
2247 tl=(unsigned short*)((char*)tl + bvec->bv_len);
2248 }
2249 put_unaligned(TT_END, tl++); /* Close the tag list */
2250
2251 cn_reply->id.idx = CN_IDX_DRBD;
2252 cn_reply->id.val = CN_VAL_DRBD;
2253
2254 cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
2255 cn_reply->ack = 0; // not used here.
2256 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2257 (int)((char*)tl - (char*)reply->tag_list);
2258 cn_reply->flags = 0;
2259
2260 reply->packet_type = P_dump_ee;
2261 reply->minor = mdev_to_minor(mdev);
2262 reply->ret_code = NO_ERROR;
2263
2264 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2265 kfree(cn_reply);
2266}
2267
2268void drbd_bcast_sync_progress(struct drbd_conf *mdev)
2269{
2270 char buffer[sizeof(struct cn_msg)+
2271 sizeof(struct drbd_nl_cfg_reply)+
2272 sizeof(struct sync_progress_tag_len_struct)+
2273 sizeof(short int)];
2274 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2275 struct drbd_nl_cfg_reply *reply =
2276 (struct drbd_nl_cfg_reply *)cn_reply->data;
2277 unsigned short *tl = reply->tag_list;
2278 unsigned long rs_left;
2279 unsigned int res;
2280
2281 /* no local ref, no bitmap, no syncer progress, no broadcast. */
2282 if (!get_ldev(mdev))
2283 return;
2284 drbd_get_syncer_progress(mdev, &rs_left, &res);
2285 put_ldev(mdev);
2286
2287 tl = tl_add_int(tl, T_sync_progress, &res);
2288 put_unaligned(TT_END, tl++); /* Close the tag list */
2289
2290 cn_reply->id.idx = CN_IDX_DRBD;
2291 cn_reply->id.val = CN_VAL_DRBD;
2292
2293 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2294 cn_reply->ack = 0; /* not used here. */
2295 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2296 (int)((char *)tl - (char *)reply->tag_list);
2297 cn_reply->flags = 0;
2298
2299 reply->packet_type = P_sync_progress;
2300 reply->minor = mdev_to_minor(mdev);
2301 reply->ret_code = NO_ERROR;
2302
2303 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2304}
2305
2306int __init drbd_nl_init(void)
2307{
2308 static struct cb_id cn_id_drbd;
2309 int err, try=10;
2310
2311 cn_id_drbd.val = CN_VAL_DRBD;
2312 do {
2313 cn_id_drbd.idx = cn_idx;
2314 err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
2315 if (!err)
2316 break;
2317 cn_idx = (cn_idx + CN_IDX_STEP);
2318 } while (try--);
2319
2320 if (err) {
2321 printk(KERN_ERR "drbd: cn_drbd failed to register\n");
2322 return err;
2323 }
2324
2325 return 0;
2326}
2327
2328void drbd_nl_cleanup(void)
2329{
2330 static struct cb_id cn_id_drbd;
2331
2332 cn_id_drbd.idx = cn_idx;
2333 cn_id_drbd.val = CN_VAL_DRBD;
2334
2335 cn_del_callback(&cn_id_drbd);
2336}
2337
2338void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2339{
2340 char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
2341 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2342 struct drbd_nl_cfg_reply *reply =
2343 (struct drbd_nl_cfg_reply *)cn_reply->data;
2344 int rr;
2345
2346 cn_reply->id = req->id;
2347
2348 cn_reply->seq = req->seq;
2349 cn_reply->ack = req->ack + 1;
2350 cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2351 cn_reply->flags = 0;
2352
2353 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2354 reply->ret_code = ret_code;
2355
2356 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2357 if (rr && rr != -ESRCH)
2358 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2359}
2360
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000000..bdd0b4943b10
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,265 @@
1/*
2 drbd_proc.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/slab.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/drbd.h>
35#include "drbd_int.h"
36
37static int drbd_proc_open(struct inode *inode, struct file *file);
38
39
40struct proc_dir_entry *drbd_proc;
41struct file_operations drbd_proc_fops = {
42 .owner = THIS_MODULE,
43 .open = drbd_proc_open,
44 .read = seq_read,
45 .llseek = seq_lseek,
46 .release = single_release,
47};
48
49
50/*lge
51 * progress bars shamelessly adapted from driver/md/md.c
52 * output looks like
53 * [=====>..............] 33.5% (23456/123456)
54 * finish: 2:20:20 speed: 6,345 (6,456) K/sec
55 */
56static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
57{
58 unsigned long db, dt, dbdt, rt, rs_left;
59 unsigned int res;
60 int i, x, y;
61
62 drbd_get_syncer_progress(mdev, &rs_left, &res);
63
64 x = res/50;
65 y = 20-x;
66 seq_printf(seq, "\t[");
67 for (i = 1; i < x; i++)
68 seq_printf(seq, "=");
69 seq_printf(seq, ">");
70 for (i = 0; i < y; i++)
71 seq_printf(seq, ".");
72 seq_printf(seq, "] ");
73
74 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
75 /* if more than 1 GB display in MB */
76 if (mdev->rs_total > 0x100000L)
77 seq_printf(seq, "(%lu/%lu)M\n\t",
78 (unsigned long) Bit2KB(rs_left >> 10),
79 (unsigned long) Bit2KB(mdev->rs_total >> 10));
80 else
81 seq_printf(seq, "(%lu/%lu)K\n\t",
82 (unsigned long) Bit2KB(rs_left),
83 (unsigned long) Bit2KB(mdev->rs_total));
84
85 /* see drivers/md/md.c
86 * We do not want to overflow, so the order of operands and
87 * the * 100 / 100 trick are important. We do a +1 to be
88 * safe against division by zero. We only estimate anyway.
89 *
90 * dt: time from mark until now
91 * db: blocks written from mark until now
92 * rt: remaining time
93 */
94 dt = (jiffies - mdev->rs_mark_time) / HZ;
95
96 if (dt > 20) {
97 /* if we made no update to rs_mark_time for too long,
98 * we are stalled. show that. */
99 seq_printf(seq, "stalled\n");
100 return;
101 }
102
103 if (!dt)
104 dt++;
105 db = mdev->rs_mark_left - rs_left;
106 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
107
108 seq_printf(seq, "finish: %lu:%02lu:%02lu",
109 rt / 3600, (rt % 3600) / 60, rt % 60);
110
111 /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
112 dbdt = Bit2KB(db/dt);
113 if (dbdt > 1000)
114 seq_printf(seq, " speed: %ld,%03ld",
115 dbdt/1000, dbdt % 1000);
116 else
117 seq_printf(seq, " speed: %ld", dbdt);
118
119 /* mean speed since syncer started
120 * we do account for PausedSync periods */
121 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
122 if (dt <= 0)
123 dt = 1;
124 db = mdev->rs_total - rs_left;
125 dbdt = Bit2KB(db/dt);
126 if (dbdt > 1000)
127 seq_printf(seq, " (%ld,%03ld)",
128 dbdt/1000, dbdt % 1000);
129 else
130 seq_printf(seq, " (%ld)", dbdt);
131
132 seq_printf(seq, " K/sec\n");
133}
134
135static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
136{
137 struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
138
139 seq_printf(seq, "%5d %s %s\n", bme->rs_left,
140 bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
141 bme->flags & BME_LOCKED ? "LOCKED" : "------"
142 );
143}
144
145static int drbd_seq_show(struct seq_file *seq, void *v)
146{
147 int i, hole = 0;
148 const char *sn;
149 struct drbd_conf *mdev;
150
151 static char write_ordering_chars[] = {
152 [WO_none] = 'n',
153 [WO_drain_io] = 'd',
154 [WO_bdev_flush] = 'f',
155 [WO_bio_barrier] = 'b',
156 };
157
158 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
159 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
160
161 /*
162 cs .. connection state
163 ro .. node role (local/remote)
164 ds .. disk state (local/remote)
165 protocol
166 various flags
167 ns .. network send
168 nr .. network receive
169 dw .. disk write
170 dr .. disk read
171 al .. activity log write count
172 bm .. bitmap update write count
173 pe .. pending (waiting for ack or data reply)
174 ua .. unack'd (still need to send ack or data reply)
175 ap .. application requests accepted, but not yet completed
176 ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
177 wo .. write ordering mode currently in use
178 oos .. known out-of-sync kB
179 */
180
181 for (i = 0; i < minor_count; i++) {
182 mdev = minor_to_mdev(i);
183 if (!mdev) {
184 hole = 1;
185 continue;
186 }
187 if (hole) {
188 hole = 0;
189 seq_printf(seq, "\n");
190 }
191
192 sn = drbd_conn_str(mdev->state.conn);
193
194 if (mdev->state.conn == C_STANDALONE &&
195 mdev->state.disk == D_DISKLESS &&
196 mdev->state.role == R_SECONDARY) {
197 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
198 } else {
199 seq_printf(seq,
200 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
201 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
202 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
203 i, sn,
204 drbd_role_str(mdev->state.role),
205 drbd_role_str(mdev->state.peer),
206 drbd_disk_str(mdev->state.disk),
207 drbd_disk_str(mdev->state.pdsk),
208 (mdev->net_conf == NULL ? ' ' :
209 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
210 mdev->state.susp ? 's' : 'r',
211 mdev->state.aftr_isp ? 'a' : '-',
212 mdev->state.peer_isp ? 'p' : '-',
213 mdev->state.user_isp ? 'u' : '-',
214 mdev->congestion_reason ?: '-',
215 mdev->send_cnt/2,
216 mdev->recv_cnt/2,
217 mdev->writ_cnt/2,
218 mdev->read_cnt/2,
219 mdev->al_writ_cnt,
220 mdev->bm_writ_cnt,
221 atomic_read(&mdev->local_cnt),
222 atomic_read(&mdev->ap_pending_cnt) +
223 atomic_read(&mdev->rs_pending_cnt),
224 atomic_read(&mdev->unacked_cnt),
225 atomic_read(&mdev->ap_bio_cnt),
226 mdev->epochs,
227 write_ordering_chars[mdev->write_ordering]
228 );
229 seq_printf(seq, " oos:%lu\n",
230 Bit2KB(drbd_bm_total_weight(mdev)));
231 }
232 if (mdev->state.conn == C_SYNC_SOURCE ||
233 mdev->state.conn == C_SYNC_TARGET)
234 drbd_syncer_progress(mdev, seq);
235
236 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
237 seq_printf(seq, "\t%3d%% %lu/%lu\n",
238 (int)((mdev->rs_total-mdev->ov_left) /
239 (mdev->rs_total/100+1)),
240 mdev->rs_total - mdev->ov_left,
241 mdev->rs_total);
242
243 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
244 lc_seq_printf_stats(seq, mdev->resync);
245 lc_seq_printf_stats(seq, mdev->act_log);
246 put_ldev(mdev);
247 }
248
249 if (proc_details >= 2) {
250 if (mdev->resync) {
251 lc_seq_dump_details(seq, mdev->resync, "rs_left",
252 resync_dump_detail);
253 }
254 }
255 }
256
257 return 0;
258}
259
260static int drbd_proc_open(struct inode *inode, struct file *file)
261{
262 return single_open(file, drbd_seq_show, PDE(inode)->data);
263}
264
265/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000000..360baf60f574
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4427 @@
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
31#include <linux/version.h>
32#include <linux/drbd.h>
33#include <linux/fs.h>
34#include <linux/file.h>
35#include <linux/in.h>
36#include <linux/mm.h>
37#include <linux/memcontrol.h>
38#include <linux/mm_inline.h>
39#include <linux/slab.h>
40#include <linux/smp_lock.h>
41#include <linux/pkt_sched.h>
42#define __KERNEL_SYSCALLS__
43#include <linux/unistd.h>
44#include <linux/vmalloc.h>
45#include <linux/random.h>
46#include <linux/mm.h>
47#include <linux/string.h>
48#include <linux/scatterlist.h>
49#include "drbd_int.h"
50#include "drbd_req.h"
51
52#include "drbd_vli.h"
53
/* A drbd_work item that additionally carries a reference to an epoch.
 * NOTE(review): presumably queued to finish that epoch after a flush --
 * confirm at the users of this struct. */
struct flush_work {
	struct drbd_work w;
	struct drbd_epoch *epoch;
};
58
/* Return type of drbd_may_finish_epoch() and drbd_flush_after_epoch().
 * NOTE(review): judging by the names, it tells the caller whether the
 * epoch it passed in is still in use, was destroyed, or was recycled --
 * confirm against drbd_may_finish_epoch(). */
enum finish_epoch {
	FE_STILL_LIVE,
	FE_DESTROYED,
	FE_RECYCLED,
};
64
65static int drbd_do_handshake(struct drbd_conf *mdev);
66static int drbd_do_auth(struct drbd_conf *mdev);
67
68static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
69static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
70
71static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
72{
73 struct drbd_epoch *prev;
74 spin_lock(&mdev->epoch_lock);
75 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
76 if (prev == epoch || prev == mdev->current_epoch)
77 prev = NULL;
78 spin_unlock(&mdev->epoch_lock);
79 return prev;
80}
81
82#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
83
84static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
85{
86 struct page *page = NULL;
87
88 /* Yes, testing drbd_pp_vacant outside the lock is racy.
89 * So what. It saves a spin_lock. */
90 if (drbd_pp_vacant > 0) {
91 spin_lock(&drbd_pp_lock);
92 page = drbd_pp_pool;
93 if (page) {
94 drbd_pp_pool = (struct page *)page_private(page);
95 set_page_private(page, 0); /* just to be polite */
96 drbd_pp_vacant--;
97 }
98 spin_unlock(&drbd_pp_lock);
99 }
100 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
101 * "criss-cross" setup, that might cause write-out on some other DRBD,
102 * which in turn might block on the other node at this very place. */
103 if (!page)
104 page = alloc_page(GFP_TRY);
105 if (page)
106 atomic_inc(&mdev->pp_in_use);
107 return page;
108}
109
110/* kick lower level device, if we have more than (arbitrary number)
111 * reference counts on it, which typically are locally submitted io
112 * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
113static void maybe_kick_lo(struct drbd_conf *mdev)
114{
115 if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
116 drbd_kick_lo(mdev);
117}
118
119static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
120{
121 struct drbd_epoch_entry *e;
122 struct list_head *le, *tle;
123
124 /* The EEs are always appended to the end of the list. Since
125 they are sent in order over the wire, they have to finish
126 in order. As soon as we see the first not finished we can
127 stop to examine the list... */
128
129 list_for_each_safe(le, tle, &mdev->net_ee) {
130 e = list_entry(le, struct drbd_epoch_entry, w.list);
131 if (drbd_bio_has_active_page(e->private_bio))
132 break;
133 list_move(le, to_be_freed);
134 }
135}
136
137static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
138{
139 LIST_HEAD(reclaimed);
140 struct drbd_epoch_entry *e, *t;
141
142 maybe_kick_lo(mdev);
143 spin_lock_irq(&mdev->req_lock);
144 reclaim_net_ee(mdev, &reclaimed);
145 spin_unlock_irq(&mdev->req_lock);
146
147 list_for_each_entry_safe(e, t, &reclaimed, w.list)
148 drbd_free_ee(mdev, e);
149}
150
151/**
152 * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
153 * @mdev: DRBD device.
154 * @retry: whether or not to retry allocation forever (or until signalled)
155 *
156 * Tries to allocate a page, first from our own page pool, then from the
157 * kernel, unless this allocation would exceed the max_buffers setting.
158 * If @retry is non-zero, retry until DRBD frees a page somewhere else.
159 */
160static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
161{
162 struct page *page = NULL;
163 DEFINE_WAIT(wait);
164
165 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
166 page = drbd_pp_first_page_or_try_alloc(mdev);
167 if (page)
168 return page;
169 }
170
171 for (;;) {
172 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
173
174 drbd_kick_lo_and_reclaim_net(mdev);
175
176 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
177 page = drbd_pp_first_page_or_try_alloc(mdev);
178 if (page)
179 break;
180 }
181
182 if (!retry)
183 break;
184
185 if (signal_pending(current)) {
186 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
187 break;
188 }
189
190 schedule();
191 }
192 finish_wait(&drbd_pp_wait, &wait);
193
194 return page;
195}
196
197/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
198 * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
199static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
200{
201 int free_it;
202
203 spin_lock(&drbd_pp_lock);
204 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
205 free_it = 1;
206 } else {
207 set_page_private(page, (unsigned long)drbd_pp_pool);
208 drbd_pp_pool = page;
209 drbd_pp_vacant++;
210 free_it = 0;
211 }
212 spin_unlock(&drbd_pp_lock);
213
214 atomic_dec(&mdev->pp_in_use);
215
216 if (free_it)
217 __free_page(page);
218
219 wake_up(&drbd_pp_wait);
220}
221
222static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
223{
224 struct page *p_to_be_freed = NULL;
225 struct page *page;
226 struct bio_vec *bvec;
227 int i;
228
229 spin_lock(&drbd_pp_lock);
230 __bio_for_each_segment(bvec, bio, i, 0) {
231 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
232 set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
233 p_to_be_freed = bvec->bv_page;
234 } else {
235 set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
236 drbd_pp_pool = bvec->bv_page;
237 drbd_pp_vacant++;
238 }
239 }
240 spin_unlock(&drbd_pp_lock);
241 atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
242
243 while (p_to_be_freed) {
244 page = p_to_be_freed;
245 p_to_be_freed = (struct page *)page_private(page);
246 set_page_private(page, 0); /* just to be polite */
247 put_page(page);
248 }
249
250 wake_up(&drbd_pp_wait);
251}
252
253/*
254You need to hold the req_lock:
255 _drbd_wait_ee_list_empty()
256
257You must not have the req_lock:
258 drbd_free_ee()
259 drbd_alloc_ee()
260 drbd_init_ee()
261 drbd_release_ee()
262 drbd_ee_fix_bhs()
263 drbd_process_done_ee()
264 drbd_clear_done_ee()
265 drbd_wait_ee_list_empty()
266*/
267
268struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
269 u64 id,
270 sector_t sector,
271 unsigned int data_size,
272 gfp_t gfp_mask) __must_hold(local)
273{
274 struct request_queue *q;
275 struct drbd_epoch_entry *e;
276 struct page *page;
277 struct bio *bio;
278 unsigned int ds;
279
280 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
281 return NULL;
282
283 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
284 if (!e) {
285 if (!(gfp_mask & __GFP_NOWARN))
286 dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
287 return NULL;
288 }
289
290 bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
291 if (!bio) {
292 if (!(gfp_mask & __GFP_NOWARN))
293 dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
294 goto fail1;
295 }
296
297 bio->bi_bdev = mdev->ldev->backing_bdev;
298 bio->bi_sector = sector;
299
300 ds = data_size;
301 while (ds) {
302 page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
303 if (!page) {
304 if (!(gfp_mask & __GFP_NOWARN))
305 dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
306 goto fail2;
307 }
308 if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
309 drbd_pp_free(mdev, page);
310 dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
311 "data_size=%u,ds=%u) failed\n",
312 (unsigned long long)sector, data_size, ds);
313
314 q = bdev_get_queue(bio->bi_bdev);
315 if (q->merge_bvec_fn) {
316 struct bvec_merge_data bvm = {
317 .bi_bdev = bio->bi_bdev,
318 .bi_sector = bio->bi_sector,
319 .bi_size = bio->bi_size,
320 .bi_rw = bio->bi_rw,
321 };
322 int l = q->merge_bvec_fn(q, &bvm,
323 &bio->bi_io_vec[bio->bi_vcnt]);
324 dev_err(DEV, "merge_bvec_fn() = %d\n", l);
325 }
326
327 /* dump more of the bio. */
328 dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
329 dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
330 dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
331 dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
332
333 goto fail2;
334 break;
335 }
336 ds -= min_t(int, ds, PAGE_SIZE);
337 }
338
339 D_ASSERT(data_size == bio->bi_size);
340
341 bio->bi_private = e;
342 e->mdev = mdev;
343 e->sector = sector;
344 e->size = bio->bi_size;
345
346 e->private_bio = bio;
347 e->block_id = id;
348 INIT_HLIST_NODE(&e->colision);
349 e->epoch = NULL;
350 e->flags = 0;
351
352 return e;
353
354 fail2:
355 drbd_pp_free_bio_pages(mdev, bio);
356 bio_put(bio);
357 fail1:
358 mempool_free(e, drbd_ee_mempool);
359
360 return NULL;
361}
362
363void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
364{
365 struct bio *bio = e->private_bio;
366 drbd_pp_free_bio_pages(mdev, bio);
367 bio_put(bio);
368 D_ASSERT(hlist_unhashed(&e->colision));
369 mempool_free(e, drbd_ee_mempool);
370}
371
372int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
373{
374 LIST_HEAD(work_list);
375 struct drbd_epoch_entry *e, *t;
376 int count = 0;
377
378 spin_lock_irq(&mdev->req_lock);
379 list_splice_init(list, &work_list);
380 spin_unlock_irq(&mdev->req_lock);
381
382 list_for_each_entry_safe(e, t, &work_list, w.list) {
383 drbd_free_ee(mdev, e);
384 count++;
385 }
386 return count;
387}
388
389
390/*
391 * This function is called from _asender only_
392 * but see also comments in _req_mod(,barrier_acked)
393 * and receive_Barrier.
394 *
395 * Move entries from net_ee to done_ee, if ready.
396 * Grab done_ee, call all callbacks, free the entries.
397 * The callbacks typically send out ACKs.
398 */
399static int drbd_process_done_ee(struct drbd_conf *mdev)
400{
401 LIST_HEAD(work_list);
402 LIST_HEAD(reclaimed);
403 struct drbd_epoch_entry *e, *t;
404 int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
405
406 spin_lock_irq(&mdev->req_lock);
407 reclaim_net_ee(mdev, &reclaimed);
408 list_splice_init(&mdev->done_ee, &work_list);
409 spin_unlock_irq(&mdev->req_lock);
410
411 list_for_each_entry_safe(e, t, &reclaimed, w.list)
412 drbd_free_ee(mdev, e);
413
414 /* possible callbacks here:
415 * e_end_block, and e_end_resync_block, e_send_discard_ack.
416 * all ignore the last argument.
417 */
418 list_for_each_entry_safe(e, t, &work_list, w.list) {
419 /* list_del not necessary, next/prev members not touched */
420 ok = e->w.cb(mdev, &e->w, !ok) && ok;
421 drbd_free_ee(mdev, e);
422 }
423 wake_up(&mdev->ee_wait);
424
425 return ok;
426}
427
428void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
429{
430 DEFINE_WAIT(wait);
431
432 /* avoids spin_lock/unlock
433 * and calling prepare_to_wait in the fast path */
434 while (!list_empty(head)) {
435 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
436 spin_unlock_irq(&mdev->req_lock);
437 drbd_kick_lo(mdev);
438 schedule();
439 finish_wait(&mdev->ee_wait, &wait);
440 spin_lock_irq(&mdev->req_lock);
441 }
442}
443
444void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
445{
446 spin_lock_irq(&mdev->req_lock);
447 _drbd_wait_ee_list_empty(mdev, head);
448 spin_unlock_irq(&mdev->req_lock);
449}
450
451/* see also kernel_accept; which is only present since 2.6.18.
452 * also we want to log which part of it failed, exactly */
453static int drbd_accept(struct drbd_conf *mdev, const char **what,
454 struct socket *sock, struct socket **newsock)
455{
456 struct sock *sk = sock->sk;
457 int err = 0;
458
459 *what = "listen";
460 err = sock->ops->listen(sock, 5);
461 if (err < 0)
462 goto out;
463
464 *what = "sock_create_lite";
465 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
466 newsock);
467 if (err < 0)
468 goto out;
469
470 *what = "accept";
471 err = sock->ops->accept(sock, *newsock, 0);
472 if (err < 0) {
473 sock_release(*newsock);
474 *newsock = NULL;
475 goto out;
476 }
477 (*newsock)->ops = sock->ops;
478
479out:
480 return err;
481}
482
483static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
484 void *buf, size_t size, int flags)
485{
486 mm_segment_t oldfs;
487 struct kvec iov = {
488 .iov_base = buf,
489 .iov_len = size,
490 };
491 struct msghdr msg = {
492 .msg_iovlen = 1,
493 .msg_iov = (struct iovec *)&iov,
494 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
495 };
496 int rv;
497
498 oldfs = get_fs();
499 set_fs(KERNEL_DS);
500 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
501 set_fs(oldfs);
502
503 return rv;
504}
505
506static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
507{
508 mm_segment_t oldfs;
509 struct kvec iov = {
510 .iov_base = buf,
511 .iov_len = size,
512 };
513 struct msghdr msg = {
514 .msg_iovlen = 1,
515 .msg_iov = (struct iovec *)&iov,
516 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
517 };
518 int rv;
519
520 oldfs = get_fs();
521 set_fs(KERNEL_DS);
522
523 for (;;) {
524 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
525 if (rv == size)
526 break;
527
528 /* Note:
529 * ECONNRESET other side closed the connection
530 * ERESTARTSYS (on sock) we got a signal
531 */
532
533 if (rv < 0) {
534 if (rv == -ECONNRESET)
535 dev_info(DEV, "sock was reset by peer\n");
536 else if (rv != -ERESTARTSYS)
537 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
538 break;
539 } else if (rv == 0) {
540 dev_info(DEV, "sock was shut down by peer\n");
541 break;
542 } else {
543 /* signal came in, or peer/link went down,
544 * after we read a partial message
545 */
546 /* D_ASSERT(signal_pending(current)); */
547 break;
548 }
549 };
550
551 set_fs(oldfs);
552
553 if (rv != size)
554 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
555
556 return rv;
557}
558
/* Try once to actively connect to the peer.
 *
 * Returns the connected socket, or NULL.  Errors that merely mean "not
 * now" (timeout, signal, peer not reachable) are silently ignored; any
 * other error is logged, and if it happened before the connect step the
 * connection state is forced to C_DISCONNECTING. */
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	const char *step;
	struct socket *sock;
	struct sockaddr_in6 local_addr;
	int err;
	int give_up_on_error = 1;

	if (!get_net_conf(mdev))
		return NULL;

	step = "sock_create_kern";
	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
		SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_rcvtimeo =
	sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	memcpy(&local_addr, mdev->net_conf->my_addr,
	       min_t(int, mdev->net_conf->my_addr_len, sizeof(local_addr)));
	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family != AF_INET6)
		((struct sockaddr_in *)&local_addr)->sin_port = 0; /* AF_INET & AF_SCI */
	else
		local_addr.sin6_port = 0;

	step = "bind before connect";
	err = sock->ops->bind(sock,
			      (struct sockaddr *) &local_addr,
			      mdev->net_conf->my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	give_up_on_error = 0;
	step = "connect";
	err = sock->ops->connect(sock,
				 (struct sockaddr *)mdev->net_conf->peer_addr,
				 mdev->net_conf->peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN:    case EHOSTUNREACH:
			give_up_on_error = 0;
			break;
		default:
			dev_err(DEV, "%s failed, err = %d\n", step, err);
		}
		if (give_up_on_error)
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}
	put_net_conf(mdev);
	return sock;
}
634
635static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
636{
637 int timeo, err;
638 struct socket *s_estab = NULL, *s_listen;
639 const char *what;
640
641 if (!get_net_conf(mdev))
642 return NULL;
643
644 what = "sock_create_kern";
645 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
646 SOCK_STREAM, IPPROTO_TCP, &s_listen);
647 if (err) {
648 s_listen = NULL;
649 goto out;
650 }
651
652 timeo = mdev->net_conf->try_connect_int * HZ;
653 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
654
655 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
656 s_listen->sk->sk_rcvtimeo = timeo;
657 s_listen->sk->sk_sndtimeo = timeo;
658
659 what = "bind before listen";
660 err = s_listen->ops->bind(s_listen,
661 (struct sockaddr *) mdev->net_conf->my_addr,
662 mdev->net_conf->my_addr_len);
663 if (err < 0)
664 goto out;
665
666 err = drbd_accept(mdev, &what, s_listen, &s_estab);
667
668out:
669 if (s_listen)
670 sock_release(s_listen);
671 if (err < 0) {
672 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
673 dev_err(DEV, "%s failed, err = %d\n", what, err);
674 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
675 }
676 }
677 put_net_conf(mdev);
678
679 return s_estab;
680}
681
682static int drbd_send_fp(struct drbd_conf *mdev,
683 struct socket *sock, enum drbd_packets cmd)
684{
685 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
686
687 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
688}
689
690static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
691{
692 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
693 int rr;
694
695 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
696
697 if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
698 return be16_to_cpu(h->command);
699
700 return 0xffff;
701}
702
703/**
704 * drbd_socket_okay() - Free the socket if its connection is not okay
705 * @mdev: DRBD device.
706 * @sock: pointer to the pointer to the socket.
707 */
708static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
709{
710 int rr;
711 char tb[4];
712
713 if (!*sock)
714 return FALSE;
715
716 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
717
718 if (rr > 0 || rr == -EAGAIN) {
719 return TRUE;
720 } else {
721 sock_release(*sock);
722 *sock = NULL;
723 return FALSE;
724 }
725}
726
727/*
728 * return values:
729 * 1 yes, we have a valid connection
730 * 0 oops, did not work out, please try again
731 * -1 peer talks different language,
732 * no point in trying again, please go standalone.
733 * -2 We do not have a network config...
734 */
735static int drbd_connect(struct drbd_conf *mdev)
736{
737 struct socket *s, *sock, *msock;
738 int try, h, ok;
739
740 D_ASSERT(!mdev->data.socket);
741
742 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
743 dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
744
745 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
746 return -2;
747
748 clear_bit(DISCARD_CONCURRENT, &mdev->flags);
749
750 sock = NULL;
751 msock = NULL;
752
753 do {
754 for (try = 0;;) {
755 /* 3 tries, this should take less than a second! */
756 s = drbd_try_connect(mdev);
757 if (s || ++try >= 3)
758 break;
759 /* give the other side time to call bind() & listen() */
760 __set_current_state(TASK_INTERRUPTIBLE);
761 schedule_timeout(HZ / 10);
762 }
763
764 if (s) {
765 if (!sock) {
766 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
767 sock = s;
768 s = NULL;
769 } else if (!msock) {
770 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
771 msock = s;
772 s = NULL;
773 } else {
774 dev_err(DEV, "Logic error in drbd_connect()\n");
775 goto out_release_sockets;
776 }
777 }
778
779 if (sock && msock) {
780 __set_current_state(TASK_INTERRUPTIBLE);
781 schedule_timeout(HZ / 10);
782 ok = drbd_socket_okay(mdev, &sock);
783 ok = drbd_socket_okay(mdev, &msock) && ok;
784 if (ok)
785 break;
786 }
787
788retry:
789 s = drbd_wait_for_connect(mdev);
790 if (s) {
791 try = drbd_recv_fp(mdev, s);
792 drbd_socket_okay(mdev, &sock);
793 drbd_socket_okay(mdev, &msock);
794 switch (try) {
795 case P_HAND_SHAKE_S:
796 if (sock) {
797 dev_warn(DEV, "initial packet S crossed\n");
798 sock_release(sock);
799 }
800 sock = s;
801 break;
802 case P_HAND_SHAKE_M:
803 if (msock) {
804 dev_warn(DEV, "initial packet M crossed\n");
805 sock_release(msock);
806 }
807 msock = s;
808 set_bit(DISCARD_CONCURRENT, &mdev->flags);
809 break;
810 default:
811 dev_warn(DEV, "Error receiving initial packet\n");
812 sock_release(s);
813 if (random32() & 1)
814 goto retry;
815 }
816 }
817
818 if (mdev->state.conn <= C_DISCONNECTING)
819 goto out_release_sockets;
820 if (signal_pending(current)) {
821 flush_signals(current);
822 smp_rmb();
823 if (get_t_state(&mdev->receiver) == Exiting)
824 goto out_release_sockets;
825 }
826
827 if (sock && msock) {
828 ok = drbd_socket_okay(mdev, &sock);
829 ok = drbd_socket_okay(mdev, &msock) && ok;
830 if (ok)
831 break;
832 }
833 } while (1);
834
835 msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
836 sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
837
838 sock->sk->sk_allocation = GFP_NOIO;
839 msock->sk->sk_allocation = GFP_NOIO;
840
841 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
842 msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
843
844 if (mdev->net_conf->sndbuf_size) {
845 sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
846 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
847 }
848
849 if (mdev->net_conf->rcvbuf_size) {
850 sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
851 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
852 }
853
854 /* NOT YET ...
855 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
856 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
857 * first set it to the P_HAND_SHAKE timeout,
858 * which we set to 4x the configured ping_timeout. */
859 sock->sk->sk_sndtimeo =
860 sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
861
862 msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
863 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
864
865 /* we don't want delays.
866 * we use TCP_CORK where apropriate, though */
867 drbd_tcp_nodelay(sock);
868 drbd_tcp_nodelay(msock);
869
870 mdev->data.socket = sock;
871 mdev->meta.socket = msock;
872 mdev->last_received = jiffies;
873
874 D_ASSERT(mdev->asender.task == NULL);
875
876 h = drbd_do_handshake(mdev);
877 if (h <= 0)
878 return h;
879
880 if (mdev->cram_hmac_tfm) {
881 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
882 if (!drbd_do_auth(mdev)) {
883 dev_err(DEV, "Authentication of peer failed\n");
884 return -1;
885 }
886 }
887
888 if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
889 return 0;
890
891 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
892 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
893
894 atomic_set(&mdev->packet_seq, 0);
895 mdev->peer_seq = 0;
896
897 drbd_thread_start(&mdev->asender);
898
899 drbd_send_protocol(mdev);
900 drbd_send_sync_param(mdev, &mdev->sync_conf);
901 drbd_send_sizes(mdev, 0);
902 drbd_send_uuids(mdev);
903 drbd_send_state(mdev);
904 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
905 clear_bit(RESIZE_PENDING, &mdev->flags);
906
907 return 1;
908
909out_release_sockets:
910 if (sock)
911 sock_release(sock);
912 if (msock)
913 sock_release(msock);
914 return -1;
915}
916
917static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
918{
919 int r;
920
921 r = drbd_recv(mdev, h, sizeof(*h));
922
923 if (unlikely(r != sizeof(*h))) {
924 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
925 return FALSE;
926 };
927 h->command = be16_to_cpu(h->command);
928 h->length = be16_to_cpu(h->length);
929 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
930 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
931 (long)be32_to_cpu(h->magic),
932 h->command, h->length);
933 return FALSE;
934 }
935 mdev->last_received = jiffies;
936
937 return TRUE;
938}
939
940static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
941{
942 int rv;
943
944 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
945 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
946 if (rv) {
947 dev_err(DEV, "local disk flush failed with status %d\n", rv);
948 /* would rather check on EOPNOTSUPP, but that is not reliable.
949 * don't try again for ANY return value != 0
950 * if (rv == -EOPNOTSUPP) */
951 drbd_bump_write_ordering(mdev, WO_drain_io);
952 }
953 put_ldev(mdev);
954 }
955
956 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
957}
958
959static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
960{
961 struct flush_work *fw = (struct flush_work *)w;
962 struct drbd_epoch *epoch = fw->epoch;
963
964 kfree(w);
965
966 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
967 drbd_flush_after_epoch(mdev, epoch);
968
969 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
970 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
971
972 return 1;
973}
974
975/**
976 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
977 * @mdev: DRBD device.
978 * @epoch: Epoch object.
979 * @ev: Epoch event.
980 */
981static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
982 struct drbd_epoch *epoch,
983 enum epoch_event ev)
984{
985 int finish, epoch_size;
986 struct drbd_epoch *next_epoch;
987 int schedule_flush = 0;
988 enum finish_epoch rv = FE_STILL_LIVE;
989
990 spin_lock(&mdev->epoch_lock);
991 do {
992 next_epoch = NULL;
993 finish = 0;
994
995 epoch_size = atomic_read(&epoch->epoch_size);
996
997 switch (ev & ~EV_CLEANUP) {
998 case EV_PUT:
999 atomic_dec(&epoch->active);
1000 break;
1001 case EV_GOT_BARRIER_NR:
1002 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1003
1004 /* Special case: If we just switched from WO_bio_barrier to
1005 WO_bdev_flush we should not finish the current epoch */
1006 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1007 mdev->write_ordering != WO_bio_barrier &&
1008 epoch == mdev->current_epoch)
1009 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1010 break;
1011 case EV_BARRIER_DONE:
1012 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1013 break;
1014 case EV_BECAME_LAST:
1015 /* nothing to do*/
1016 break;
1017 }
1018
1019 if (epoch_size != 0 &&
1020 atomic_read(&epoch->active) == 0 &&
1021 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1022 epoch->list.prev == &mdev->current_epoch->list &&
1023 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1024 /* Nearly all conditions are met to finish that epoch... */
1025 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1026 mdev->write_ordering == WO_none ||
1027 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1028 ev & EV_CLEANUP) {
1029 finish = 1;
1030 set_bit(DE_IS_FINISHING, &epoch->flags);
1031 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1032 mdev->write_ordering == WO_bio_barrier) {
1033 atomic_inc(&epoch->active);
1034 schedule_flush = 1;
1035 }
1036 }
1037 if (finish) {
1038 if (!(ev & EV_CLEANUP)) {
1039 spin_unlock(&mdev->epoch_lock);
1040 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1041 spin_lock(&mdev->epoch_lock);
1042 }
1043 dec_unacked(mdev);
1044
1045 if (mdev->current_epoch != epoch) {
1046 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1047 list_del(&epoch->list);
1048 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1049 mdev->epochs--;
1050 kfree(epoch);
1051
1052 if (rv == FE_STILL_LIVE)
1053 rv = FE_DESTROYED;
1054 } else {
1055 epoch->flags = 0;
1056 atomic_set(&epoch->epoch_size, 0);
1057 /* atomic_set(&epoch->active, 0); is alrady zero */
1058 if (rv == FE_STILL_LIVE)
1059 rv = FE_RECYCLED;
1060 }
1061 }
1062
1063 if (!next_epoch)
1064 break;
1065
1066 epoch = next_epoch;
1067 } while (1);
1068
1069 spin_unlock(&mdev->epoch_lock);
1070
1071 if (schedule_flush) {
1072 struct flush_work *fw;
1073 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1074 if (fw) {
1075 fw->w.cb = w_flush;
1076 fw->epoch = epoch;
1077 drbd_queue_work(&mdev->data.work, &fw->w);
1078 } else {
1079 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1080 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1081 /* That is not a recursion, only one level */
1082 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1083 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1084 }
1085 }
1086
1087 return rv;
1088}
1089
1090/**
1091 * drbd_bump_write_ordering() - Fall back to an other write ordering method
1092 * @mdev: DRBD device.
1093 * @wo: Write ordering method to try.
1094 */
1095void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1096{
1097 enum write_ordering_e pwo;
1098 static char *write_ordering_str[] = {
1099 [WO_none] = "none",
1100 [WO_drain_io] = "drain",
1101 [WO_bdev_flush] = "flush",
1102 [WO_bio_barrier] = "barrier",
1103 };
1104
1105 pwo = mdev->write_ordering;
1106 wo = min(pwo, wo);
1107 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1108 wo = WO_bdev_flush;
1109 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1110 wo = WO_drain_io;
1111 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1112 wo = WO_none;
1113 mdev->write_ordering = wo;
1114 if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1115 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1116}
1117
1118/**
1119 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1120 * @mdev: DRBD device.
1121 * @w: work object.
1122 * @cancel: The connection will be closed anyways (unused in this callback)
1123 */
1124int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1125{
1126 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1127 struct bio *bio = e->private_bio;
1128
1129 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1130 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1131 so that we can finish that epoch in drbd_may_finish_epoch().
1132 That is necessary if we already have a long chain of Epochs, before
1133 we realize that BIO_RW_BARRIER is actually not supported */
1134
1135 /* As long as the -ENOTSUPP on the barrier is reported immediately
1136 that will never trigger. If it is reported late, we will just
1137 print that warning and continue correctly for all future requests
1138 with WO_bdev_flush */
1139 if (previous_epoch(mdev, e->epoch))
1140 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1141
1142 /* prepare bio for re-submit,
1143 * re-init volatile members */
1144 /* we still have a local reference,
1145 * get_ldev was done in receive_Data. */
1146 bio->bi_bdev = mdev->ldev->backing_bdev;
1147 bio->bi_sector = e->sector;
1148 bio->bi_size = e->size;
1149 bio->bi_idx = 0;
1150
1151 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1152 bio->bi_flags |= 1 << BIO_UPTODATE;
1153
1154 /* don't know whether this is necessary: */
1155 bio->bi_phys_segments = 0;
1156 bio->bi_next = NULL;
1157
1158 /* these should be unchanged: */
1159 /* bio->bi_end_io = drbd_endio_write_sec; */
1160 /* bio->bi_vcnt = whatever; */
1161
1162 e->w.cb = e_end_block;
1163
1164 /* This is no longer a barrier request. */
1165 bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
1166
1167 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
1168
1169 return 1;
1170}
1171
1172static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1173{
1174 int rv, issue_flush;
1175 struct p_barrier *p = (struct p_barrier *)h;
1176 struct drbd_epoch *epoch;
1177
1178 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1179
1180 rv = drbd_recv(mdev, h->payload, h->length);
1181 ERR_IF(rv != h->length) return FALSE;
1182
1183 inc_unacked(mdev);
1184
1185 if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1186 drbd_kick_lo(mdev);
1187
1188 mdev->current_epoch->barrier_nr = p->barrier;
1189 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1190
1191 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1192 * the activity log, which means it would not be resynced in case the
1193 * R_PRIMARY crashes now.
1194 * Therefore we must send the barrier_ack after the barrier request was
1195 * completed. */
1196 switch (mdev->write_ordering) {
1197 case WO_bio_barrier:
1198 case WO_none:
1199 if (rv == FE_RECYCLED)
1200 return TRUE;
1201 break;
1202
1203 case WO_bdev_flush:
1204 case WO_drain_io:
1205 D_ASSERT(rv == FE_STILL_LIVE);
1206 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1207 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1208 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1209 if (rv == FE_RECYCLED)
1210 return TRUE;
1211
1212 /* The asender will send all the ACKs and barrier ACKs out, since
1213 all EEs moved from the active_ee to the done_ee. We need to
1214 provide a new epoch object for the EEs that come in soon */
1215 break;
1216 }
1217
1218 /* receiver context, in the writeout path of the other node.
1219 * avoid potential distributed deadlock */
1220 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1221 if (!epoch) {
1222 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1223 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1224 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1225 if (issue_flush) {
1226 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1227 if (rv == FE_RECYCLED)
1228 return TRUE;
1229 }
1230
1231 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1232
1233 return TRUE;
1234 }
1235
1236 epoch->flags = 0;
1237 atomic_set(&epoch->epoch_size, 0);
1238 atomic_set(&epoch->active, 0);
1239
1240 spin_lock(&mdev->epoch_lock);
1241 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1242 list_add(&epoch->list, &mdev->current_epoch->list);
1243 mdev->current_epoch = epoch;
1244 mdev->epochs++;
1245 } else {
1246 /* The current_epoch got recycled while we allocated this one... */
1247 kfree(epoch);
1248 }
1249 spin_unlock(&mdev->epoch_lock);
1250
1251 return TRUE;
1252}
1253
1254/* used from receive_RSDataReply (recv_resync_read)
1255 * and from receive_Data */
1256static struct drbd_epoch_entry *
1257read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1258{
1259 struct drbd_epoch_entry *e;
1260 struct bio_vec *bvec;
1261 struct page *page;
1262 struct bio *bio;
1263 int dgs, ds, i, rr;
1264 void *dig_in = mdev->int_dig_in;
1265 void *dig_vv = mdev->int_dig_vv;
1266
1267 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1268 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1269
1270 if (dgs) {
1271 rr = drbd_recv(mdev, dig_in, dgs);
1272 if (rr != dgs) {
1273 dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1274 rr, dgs);
1275 return NULL;
1276 }
1277 }
1278
1279 data_size -= dgs;
1280
1281 ERR_IF(data_size & 0x1ff) return NULL;
1282 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1283
1284 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1285 * "criss-cross" setup, that might cause write-out on some other DRBD,
1286 * which in turn might block on the other node at this very place. */
1287 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1288 if (!e)
1289 return NULL;
1290 bio = e->private_bio;
1291 ds = data_size;
1292 bio_for_each_segment(bvec, bio, i) {
1293 page = bvec->bv_page;
1294 rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
1295 kunmap(page);
1296 if (rr != min_t(int, ds, PAGE_SIZE)) {
1297 drbd_free_ee(mdev, e);
1298 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1299 rr, min_t(int, ds, PAGE_SIZE));
1300 return NULL;
1301 }
1302 ds -= rr;
1303 }
1304
1305 if (dgs) {
1306 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1307 if (memcmp(dig_in, dig_vv, dgs)) {
1308 dev_err(DEV, "Digest integrity check FAILED.\n");
1309 drbd_bcast_ee(mdev, "digest failed",
1310 dgs, dig_in, dig_vv, e);
1311 drbd_free_ee(mdev, e);
1312 return NULL;
1313 }
1314 }
1315 mdev->recv_cnt += data_size>>9;
1316 return e;
1317}
1318
1319/* drbd_drain_block() just takes a data block
1320 * out of the socket input buffer, and discards it.
1321 */
1322static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1323{
1324 struct page *page;
1325 int rr, rv = 1;
1326 void *data;
1327
1328 page = drbd_pp_alloc(mdev, 1);
1329
1330 data = kmap(page);
1331 while (data_size) {
1332 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1333 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1334 rv = 0;
1335 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1336 rr, min_t(int, data_size, PAGE_SIZE));
1337 break;
1338 }
1339 data_size -= rr;
1340 }
1341 kunmap(page);
1342 drbd_pp_free(mdev, page);
1343 return rv;
1344}
1345
1346static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1347 sector_t sector, int data_size)
1348{
1349 struct bio_vec *bvec;
1350 struct bio *bio;
1351 int dgs, rr, i, expect;
1352 void *dig_in = mdev->int_dig_in;
1353 void *dig_vv = mdev->int_dig_vv;
1354
1355 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1356 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1357
1358 if (dgs) {
1359 rr = drbd_recv(mdev, dig_in, dgs);
1360 if (rr != dgs) {
1361 dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1362 rr, dgs);
1363 return 0;
1364 }
1365 }
1366
1367 data_size -= dgs;
1368
1369 /* optimistically update recv_cnt. if receiving fails below,
1370 * we disconnect anyways, and counters will be reset. */
1371 mdev->recv_cnt += data_size>>9;
1372
1373 bio = req->master_bio;
1374 D_ASSERT(sector == bio->bi_sector);
1375
1376 bio_for_each_segment(bvec, bio, i) {
1377 expect = min_t(int, data_size, bvec->bv_len);
1378 rr = drbd_recv(mdev,
1379 kmap(bvec->bv_page)+bvec->bv_offset,
1380 expect);
1381 kunmap(bvec->bv_page);
1382 if (rr != expect) {
1383 dev_warn(DEV, "short read receiving data reply: "
1384 "read %d expected %d\n",
1385 rr, expect);
1386 return 0;
1387 }
1388 data_size -= rr;
1389 }
1390
1391 if (dgs) {
1392 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1393 if (memcmp(dig_in, dig_vv, dgs)) {
1394 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1395 return 0;
1396 }
1397 }
1398
1399 D_ASSERT(data_size == 0);
1400 return 1;
1401}
1402
1403/* e_end_resync_block() is called via
1404 * drbd_process_done_ee() by asender only */
1405static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1406{
1407 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1408 sector_t sector = e->sector;
1409 int ok;
1410
1411 D_ASSERT(hlist_unhashed(&e->colision));
1412
1413 if (likely(drbd_bio_uptodate(e->private_bio))) {
1414 drbd_set_in_sync(mdev, sector, e->size);
1415 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1416 } else {
1417 /* Record failure to sync */
1418 drbd_rs_failed_io(mdev, sector, e->size);
1419
1420 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1421 }
1422 dec_unacked(mdev);
1423
1424 return ok;
1425}
1426
1427static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1428{
1429 struct drbd_epoch_entry *e;
1430
1431 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1432 if (!e) {
1433 put_ldev(mdev);
1434 return FALSE;
1435 }
1436
1437 dec_rs_pending(mdev);
1438
1439 e->private_bio->bi_end_io = drbd_endio_write_sec;
1440 e->private_bio->bi_rw = WRITE;
1441 e->w.cb = e_end_resync_block;
1442
1443 inc_unacked(mdev);
1444 /* corresponding dec_unacked() in e_end_resync_block()
1445 * respective _drbd_clear_done_ee */
1446
1447 spin_lock_irq(&mdev->req_lock);
1448 list_add(&e->w.list, &mdev->sync_ee);
1449 spin_unlock_irq(&mdev->req_lock);
1450
1451 drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
1452 /* accounting done in endio */
1453
1454 maybe_kick_lo(mdev);
1455 return TRUE;
1456}
1457
1458static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1459{
1460 struct drbd_request *req;
1461 sector_t sector;
1462 unsigned int header_size, data_size;
1463 int ok;
1464 struct p_data *p = (struct p_data *)h;
1465
1466 header_size = sizeof(*p) - sizeof(*h);
1467 data_size = h->length - header_size;
1468
1469 ERR_IF(data_size == 0) return FALSE;
1470
1471 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1472 return FALSE;
1473
1474 sector = be64_to_cpu(p->sector);
1475
1476 spin_lock_irq(&mdev->req_lock);
1477 req = _ar_id_to_req(mdev, p->block_id, sector);
1478 spin_unlock_irq(&mdev->req_lock);
1479 if (unlikely(!req)) {
1480 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1481 return FALSE;
1482 }
1483
1484 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1485 * special casing it there for the various failure cases.
1486 * still no race with drbd_fail_pending_reads */
1487 ok = recv_dless_read(mdev, req, sector, data_size);
1488
1489 if (ok)
1490 req_mod(req, data_received);
1491 /* else: nothing. handled from drbd_disconnect...
1492 * I don't think we may complete this just yet
1493 * in case we are "on-disconnect: freeze" */
1494
1495 return ok;
1496}
1497
1498static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1499{
1500 sector_t sector;
1501 unsigned int header_size, data_size;
1502 int ok;
1503 struct p_data *p = (struct p_data *)h;
1504
1505 header_size = sizeof(*p) - sizeof(*h);
1506 data_size = h->length - header_size;
1507
1508 ERR_IF(data_size == 0) return FALSE;
1509
1510 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1511 return FALSE;
1512
1513 sector = be64_to_cpu(p->sector);
1514 D_ASSERT(p->block_id == ID_SYNCER);
1515
1516 if (get_ldev(mdev)) {
1517 /* data is submitted to disk within recv_resync_read.
1518 * corresponding put_ldev done below on error,
1519 * or in drbd_endio_write_sec. */
1520 ok = recv_resync_read(mdev, sector, data_size);
1521 } else {
1522 if (__ratelimit(&drbd_ratelimit_state))
1523 dev_err(DEV, "Can not write resync data to local disk.\n");
1524
1525 ok = drbd_drain_block(mdev, data_size);
1526
1527 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1528 }
1529
1530 return ok;
1531}
1532
1533/* e_end_block() is called via drbd_process_done_ee().
1534 * this means this function only runs in the asender thread
1535 */
1536static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1537{
1538 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1539 sector_t sector = e->sector;
1540 struct drbd_epoch *epoch;
1541 int ok = 1, pcmd;
1542
1543 if (e->flags & EE_IS_BARRIER) {
1544 epoch = previous_epoch(mdev, e->epoch);
1545 if (epoch)
1546 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1547 }
1548
1549 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1550 if (likely(drbd_bio_uptodate(e->private_bio))) {
1551 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1552 mdev->state.conn <= C_PAUSED_SYNC_T &&
1553 e->flags & EE_MAY_SET_IN_SYNC) ?
1554 P_RS_WRITE_ACK : P_WRITE_ACK;
1555 ok &= drbd_send_ack(mdev, pcmd, e);
1556 if (pcmd == P_RS_WRITE_ACK)
1557 drbd_set_in_sync(mdev, sector, e->size);
1558 } else {
1559 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1560 /* we expect it to be marked out of sync anyways...
1561 * maybe assert this? */
1562 }
1563 dec_unacked(mdev);
1564 }
1565 /* we delete from the conflict detection hash _after_ we sent out the
1566 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1567 if (mdev->net_conf->two_primaries) {
1568 spin_lock_irq(&mdev->req_lock);
1569 D_ASSERT(!hlist_unhashed(&e->colision));
1570 hlist_del_init(&e->colision);
1571 spin_unlock_irq(&mdev->req_lock);
1572 } else {
1573 D_ASSERT(hlist_unhashed(&e->colision));
1574 }
1575
1576 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1577
1578 return ok;
1579}
1580
1581static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1582{
1583 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1584 int ok = 1;
1585
1586 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1587 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1588
1589 spin_lock_irq(&mdev->req_lock);
1590 D_ASSERT(!hlist_unhashed(&e->colision));
1591 hlist_del_init(&e->colision);
1592 spin_unlock_irq(&mdev->req_lock);
1593
1594 dec_unacked(mdev);
1595
1596 return ok;
1597}
1598
1599/* Called from receive_Data.
1600 * Synchronize packets on sock with packets on msock.
1601 *
1602 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1603 * packet traveling on msock, they are still processed in the order they have
1604 * been sent.
1605 *
1606 * Note: we don't care for Ack packets overtaking P_DATA packets.
1607 *
1608 * In case packet_seq is larger than mdev->peer_seq number, there are
1609 * outstanding packets on the msock. We wait for them to arrive.
1610 * In case we are the logically next packet, we update mdev->peer_seq
1611 * ourselves. Correctly handles 32bit wrap around.
1612 *
1613 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1614 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1615 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1616 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1617 *
1618 * returns 0 if we may process the packet,
1619 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1620static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1621{
1622 DEFINE_WAIT(wait);
1623 unsigned int p_seq;
1624 long timeout;
1625 int ret = 0;
1626 spin_lock(&mdev->peer_seq_lock);
1627 for (;;) {
1628 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1629 if (seq_le(packet_seq, mdev->peer_seq+1))
1630 break;
1631 if (signal_pending(current)) {
1632 ret = -ERESTARTSYS;
1633 break;
1634 }
1635 p_seq = mdev->peer_seq;
1636 spin_unlock(&mdev->peer_seq_lock);
1637 timeout = schedule_timeout(30*HZ);
1638 spin_lock(&mdev->peer_seq_lock);
1639 if (timeout == 0 && p_seq == mdev->peer_seq) {
1640 ret = -ETIMEDOUT;
1641 dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1642 break;
1643 }
1644 }
1645 finish_wait(&mdev->seq_wait, &wait);
1646 if (mdev->peer_seq+1 == packet_seq)
1647 mdev->peer_seq++;
1648 spin_unlock(&mdev->peer_seq_lock);
1649 return ret;
1650}
1651
1652/* mirrored write */
1653static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1654{
1655 sector_t sector;
1656 struct drbd_epoch_entry *e;
1657 struct p_data *p = (struct p_data *)h;
1658 int header_size, data_size;
1659 int rw = WRITE;
1660 u32 dp_flags;
1661
1662 header_size = sizeof(*p) - sizeof(*h);
1663 data_size = h->length - header_size;
1664
1665 ERR_IF(data_size == 0) return FALSE;
1666
1667 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1668 return FALSE;
1669
1670 if (!get_ldev(mdev)) {
1671 if (__ratelimit(&drbd_ratelimit_state))
1672 dev_err(DEV, "Can not write mirrored data block "
1673 "to local disk.\n");
1674 spin_lock(&mdev->peer_seq_lock);
1675 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1676 mdev->peer_seq++;
1677 spin_unlock(&mdev->peer_seq_lock);
1678
1679 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1680 atomic_inc(&mdev->current_epoch->epoch_size);
1681 return drbd_drain_block(mdev, data_size);
1682 }
1683
1684 /* get_ldev(mdev) successful.
1685 * Corresponding put_ldev done either below (on various errors),
1686 * or in drbd_endio_write_sec, if we successfully submit the data at
1687 * the end of this function. */
1688
1689 sector = be64_to_cpu(p->sector);
1690 e = read_in_block(mdev, p->block_id, sector, data_size);
1691 if (!e) {
1692 put_ldev(mdev);
1693 return FALSE;
1694 }
1695
1696 e->private_bio->bi_end_io = drbd_endio_write_sec;
1697 e->w.cb = e_end_block;
1698
1699 spin_lock(&mdev->epoch_lock);
1700 e->epoch = mdev->current_epoch;
1701 atomic_inc(&e->epoch->epoch_size);
1702 atomic_inc(&e->epoch->active);
1703
1704 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1705 struct drbd_epoch *epoch;
1706 /* Issue a barrier if we start a new epoch, and the previous epoch
1707 was not a epoch containing a single request which already was
1708 a Barrier. */
1709 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1710 if (epoch == e->epoch) {
1711 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1712 rw |= (1<<BIO_RW_BARRIER);
1713 e->flags |= EE_IS_BARRIER;
1714 } else {
1715 if (atomic_read(&epoch->epoch_size) > 1 ||
1716 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1717 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1718 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1719 rw |= (1<<BIO_RW_BARRIER);
1720 e->flags |= EE_IS_BARRIER;
1721 }
1722 }
1723 }
1724 spin_unlock(&mdev->epoch_lock);
1725
1726 dp_flags = be32_to_cpu(p->dp_flags);
1727 if (dp_flags & DP_HARDBARRIER) {
1728 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1729 /* rw |= (1<<BIO_RW_BARRIER); */
1730 }
1731 if (dp_flags & DP_RW_SYNC)
1732 rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
1733 if (dp_flags & DP_MAY_SET_IN_SYNC)
1734 e->flags |= EE_MAY_SET_IN_SYNC;
1735
1736 /* I'm the receiver, I do hold a net_cnt reference. */
1737 if (!mdev->net_conf->two_primaries) {
1738 spin_lock_irq(&mdev->req_lock);
1739 } else {
1740 /* don't get the req_lock yet,
1741 * we may sleep in drbd_wait_peer_seq */
1742 const int size = e->size;
1743 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1744 DEFINE_WAIT(wait);
1745 struct drbd_request *i;
1746 struct hlist_node *n;
1747 struct hlist_head *slot;
1748 int first;
1749
1750 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1751 BUG_ON(mdev->ee_hash == NULL);
1752 BUG_ON(mdev->tl_hash == NULL);
1753
1754 /* conflict detection and handling:
1755 * 1. wait on the sequence number,
1756 * in case this data packet overtook ACK packets.
1757 * 2. check our hash tables for conflicting requests.
1758 * we only need to walk the tl_hash, since an ee can not
1759 * have a conflict with an other ee: on the submitting
1760 * node, the corresponding req had already been conflicting,
1761 * and a conflicting req is never sent.
1762 *
1763 * Note: for two_primaries, we are protocol C,
1764 * so there cannot be any request that is DONE
1765 * but still on the transfer log.
1766 *
1767 * unconditionally add to the ee_hash.
1768 *
1769 * if no conflicting request is found:
1770 * submit.
1771 *
1772 * if any conflicting request is found
1773 * that has not yet been acked,
1774 * AND I have the "discard concurrent writes" flag:
1775 * queue (via done_ee) the P_DISCARD_ACK; OUT.
1776 *
1777 * if any conflicting request is found:
1778 * block the receiver, waiting on misc_wait
1779 * until no more conflicting requests are there,
1780 * or we get interrupted (disconnect).
1781 *
1782 * we do not just write after local io completion of those
1783 * requests, but only after req is done completely, i.e.
1784 * we wait for the P_DISCARD_ACK to arrive!
1785 *
1786 * then proceed normally, i.e. submit.
1787 */
1788 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1789 goto out_interrupted;
1790
1791 spin_lock_irq(&mdev->req_lock);
1792
1793 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1794
1795#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1796 slot = tl_hash_slot(mdev, sector);
1797 first = 1;
1798 for (;;) {
1799 int have_unacked = 0;
1800 int have_conflict = 0;
1801 prepare_to_wait(&mdev->misc_wait, &wait,
1802 TASK_INTERRUPTIBLE);
1803 hlist_for_each_entry(i, n, slot, colision) {
1804 if (OVERLAPS) {
1805 /* only ALERT on first iteration,
1806 * we may be woken up early... */
1807 if (first)
1808 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1809 " new: %llus +%u; pending: %llus +%u\n",
1810 current->comm, current->pid,
1811 (unsigned long long)sector, size,
1812 (unsigned long long)i->sector, i->size);
1813 if (i->rq_state & RQ_NET_PENDING)
1814 ++have_unacked;
1815 ++have_conflict;
1816 }
1817 }
1818#undef OVERLAPS
1819 if (!have_conflict)
1820 break;
1821
1822 /* Discard Ack only for the _first_ iteration */
1823 if (first && discard && have_unacked) {
1824 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1825 (unsigned long long)sector);
1826 inc_unacked(mdev);
1827 e->w.cb = e_send_discard_ack;
1828 list_add_tail(&e->w.list, &mdev->done_ee);
1829
1830 spin_unlock_irq(&mdev->req_lock);
1831
1832 /* we could probably send that P_DISCARD_ACK ourselves,
1833 * but I don't like the receiver using the msock */
1834
1835 put_ldev(mdev);
1836 wake_asender(mdev);
1837 finish_wait(&mdev->misc_wait, &wait);
1838 return TRUE;
1839 }
1840
1841 if (signal_pending(current)) {
1842 hlist_del_init(&e->colision);
1843
1844 spin_unlock_irq(&mdev->req_lock);
1845
1846 finish_wait(&mdev->misc_wait, &wait);
1847 goto out_interrupted;
1848 }
1849
1850 spin_unlock_irq(&mdev->req_lock);
1851 if (first) {
1852 first = 0;
1853 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1854 "sec=%llus\n", (unsigned long long)sector);
1855 } else if (discard) {
1856 /* we had none on the first iteration.
1857 * there must be none now. */
1858 D_ASSERT(have_unacked == 0);
1859 }
1860 schedule();
1861 spin_lock_irq(&mdev->req_lock);
1862 }
1863 finish_wait(&mdev->misc_wait, &wait);
1864 }
1865
1866 list_add(&e->w.list, &mdev->active_ee);
1867 spin_unlock_irq(&mdev->req_lock);
1868
1869 switch (mdev->net_conf->wire_protocol) {
1870 case DRBD_PROT_C:
1871 inc_unacked(mdev);
1872 /* corresponding dec_unacked() in e_end_block()
1873 * respective _drbd_clear_done_ee */
1874 break;
1875 case DRBD_PROT_B:
1876 /* I really don't like it that the receiver thread
1877 * sends on the msock, but anyways */
1878 drbd_send_ack(mdev, P_RECV_ACK, e);
1879 break;
1880 case DRBD_PROT_A:
1881 /* nothing to do */
1882 break;
1883 }
1884
1885 if (mdev->state.pdsk == D_DISKLESS) {
1886 /* In case we have the only disk of the cluster, */
1887 drbd_set_out_of_sync(mdev, e->sector, e->size);
1888 e->flags |= EE_CALL_AL_COMPLETE_IO;
1889 drbd_al_begin_io(mdev, e->sector);
1890 }
1891
1892 e->private_bio->bi_rw = rw;
1893 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
1894 /* accounting done in endio */
1895
1896 maybe_kick_lo(mdev);
1897 return TRUE;
1898
1899out_interrupted:
1900 /* yes, the epoch_size now is imbalanced.
1901 * but we drop the connection anyways, so we don't have a chance to
1902 * receive a barrier... atomic_inc(&mdev->epoch_size); */
1903 put_ldev(mdev);
1904 drbd_free_ee(mdev, e);
1905 return FALSE;
1906}
1907
1908static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1909{
1910 sector_t sector;
1911 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1912 struct drbd_epoch_entry *e;
1913 struct digest_info *di = NULL;
1914 int size, digest_size;
1915 unsigned int fault_type;
1916 struct p_block_req *p =
1917 (struct p_block_req *)h;
1918 const int brps = sizeof(*p)-sizeof(*h);
1919
1920 if (drbd_recv(mdev, h->payload, brps) != brps)
1921 return FALSE;
1922
1923 sector = be64_to_cpu(p->sector);
1924 size = be32_to_cpu(p->blksize);
1925
1926 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1927 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1928 (unsigned long long)sector, size);
1929 return FALSE;
1930 }
1931 if (sector + (size>>9) > capacity) {
1932 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1933 (unsigned long long)sector, size);
1934 return FALSE;
1935 }
1936
1937 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
1938 if (__ratelimit(&drbd_ratelimit_state))
1939 dev_err(DEV, "Can not satisfy peer's read request, "
1940 "no local data.\n");
1941 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
1942 P_NEG_RS_DREPLY , p);
1943 return TRUE;
1944 }
1945
1946 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1947 * "criss-cross" setup, that might cause write-out on some other DRBD,
1948 * which in turn might block on the other node at this very place. */
1949 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1950 if (!e) {
1951 put_ldev(mdev);
1952 return FALSE;
1953 }
1954
1955 e->private_bio->bi_rw = READ;
1956 e->private_bio->bi_end_io = drbd_endio_read_sec;
1957
1958 switch (h->command) {
1959 case P_DATA_REQUEST:
1960 e->w.cb = w_e_end_data_req;
1961 fault_type = DRBD_FAULT_DT_RD;
1962 break;
1963 case P_RS_DATA_REQUEST:
1964 e->w.cb = w_e_end_rsdata_req;
1965 fault_type = DRBD_FAULT_RS_RD;
1966 /* Eventually this should become asynchronously. Currently it
1967 * blocks the whole receiver just to delay the reading of a
1968 * resync data block.
1969 * the drbd_work_queue mechanism is made for this...
1970 */
1971 if (!drbd_rs_begin_io(mdev, sector)) {
1972 /* we have been interrupted,
1973 * probably connection lost! */
1974 D_ASSERT(signal_pending(current));
1975 goto out_free_e;
1976 }
1977 break;
1978
1979 case P_OV_REPLY:
1980 case P_CSUM_RS_REQUEST:
1981 fault_type = DRBD_FAULT_RS_RD;
1982 digest_size = h->length - brps ;
1983 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
1984 if (!di)
1985 goto out_free_e;
1986
1987 di->digest_size = digest_size;
1988 di->digest = (((char *)di)+sizeof(struct digest_info));
1989
1990 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
1991 goto out_free_e;
1992
1993 e->block_id = (u64)(unsigned long)di;
1994 if (h->command == P_CSUM_RS_REQUEST) {
1995 D_ASSERT(mdev->agreed_pro_version >= 89);
1996 e->w.cb = w_e_end_csum_rs_req;
1997 } else if (h->command == P_OV_REPLY) {
1998 e->w.cb = w_e_end_ov_reply;
1999 dec_rs_pending(mdev);
2000 break;
2001 }
2002
2003 if (!drbd_rs_begin_io(mdev, sector)) {
2004 /* we have been interrupted, probably connection lost! */
2005 D_ASSERT(signal_pending(current));
2006 goto out_free_e;
2007 }
2008 break;
2009
2010 case P_OV_REQUEST:
2011 if (mdev->state.conn >= C_CONNECTED &&
2012 mdev->state.conn != C_VERIFY_T)
2013 dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2014 drbd_conn_str(mdev->state.conn));
2015 if (mdev->ov_start_sector == ~(sector_t)0 &&
2016 mdev->agreed_pro_version >= 90) {
2017 mdev->ov_start_sector = sector;
2018 mdev->ov_position = sector;
2019 mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2020 dev_info(DEV, "Online Verify start sector: %llu\n",
2021 (unsigned long long)sector);
2022 }
2023 e->w.cb = w_e_end_ov_req;
2024 fault_type = DRBD_FAULT_RS_RD;
2025 /* Eventually this should become asynchronous. Currently it
2026 * blocks the whole receiver just to delay the reading of a
2027 * resync data block.
2028 * the drbd_work_queue mechanism is made for this...
2029 */
2030 if (!drbd_rs_begin_io(mdev, sector)) {
2031 /* we have been interrupted,
2032 * probably connection lost! */
2033 D_ASSERT(signal_pending(current));
2034 goto out_free_e;
2035 }
2036 break;
2037
2038
2039 default:
2040 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2041 cmdname(h->command));
2042 fault_type = DRBD_FAULT_MAX;
2043 }
2044
2045 spin_lock_irq(&mdev->req_lock);
2046 list_add(&e->w.list, &mdev->read_ee);
2047 spin_unlock_irq(&mdev->req_lock);
2048
2049 inc_unacked(mdev);
2050
2051 drbd_generic_make_request(mdev, fault_type, e->private_bio);
2052 maybe_kick_lo(mdev);
2053
2054 return TRUE;
2055
2056out_free_e:
2057 kfree(di);
2058 put_ldev(mdev);
2059 drbd_free_ee(mdev, e);
2060 return FALSE;
2061}
2062
2063static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2064{
2065 int self, peer, rv = -100;
2066 unsigned long ch_self, ch_peer;
2067
2068 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2069 peer = mdev->p_uuid[UI_BITMAP] & 1;
2070
2071 ch_peer = mdev->p_uuid[UI_SIZE];
2072 ch_self = mdev->comm_bm_set;
2073
2074 switch (mdev->net_conf->after_sb_0p) {
2075 case ASB_CONSENSUS:
2076 case ASB_DISCARD_SECONDARY:
2077 case ASB_CALL_HELPER:
2078 dev_err(DEV, "Configuration error.\n");
2079 break;
2080 case ASB_DISCONNECT:
2081 break;
2082 case ASB_DISCARD_YOUNGER_PRI:
2083 if (self == 0 && peer == 1) {
2084 rv = -1;
2085 break;
2086 }
2087 if (self == 1 && peer == 0) {
2088 rv = 1;
2089 break;
2090 }
2091 /* Else fall through to one of the other strategies... */
2092 case ASB_DISCARD_OLDER_PRI:
2093 if (self == 0 && peer == 1) {
2094 rv = 1;
2095 break;
2096 }
2097 if (self == 1 && peer == 0) {
2098 rv = -1;
2099 break;
2100 }
2101 /* Else fall through to one of the other strategies... */
2102 dev_warn(DEV, "Discard younger/older primary did not found a decision\n"
2103 "Using discard-least-changes instead\n");
2104 case ASB_DISCARD_ZERO_CHG:
2105 if (ch_peer == 0 && ch_self == 0) {
2106 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2107 ? -1 : 1;
2108 break;
2109 } else {
2110 if (ch_peer == 0) { rv = 1; break; }
2111 if (ch_self == 0) { rv = -1; break; }
2112 }
2113 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2114 break;
2115 case ASB_DISCARD_LEAST_CHG:
2116 if (ch_self < ch_peer)
2117 rv = -1;
2118 else if (ch_self > ch_peer)
2119 rv = 1;
2120 else /* ( ch_self == ch_peer ) */
2121 /* Well, then use something else. */
2122 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2123 ? -1 : 1;
2124 break;
2125 case ASB_DISCARD_LOCAL:
2126 rv = -1;
2127 break;
2128 case ASB_DISCARD_REMOTE:
2129 rv = 1;
2130 }
2131
2132 return rv;
2133}
2134
2135static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2136{
2137 int self, peer, hg, rv = -100;
2138
2139 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2140 peer = mdev->p_uuid[UI_BITMAP] & 1;
2141
2142 switch (mdev->net_conf->after_sb_1p) {
2143 case ASB_DISCARD_YOUNGER_PRI:
2144 case ASB_DISCARD_OLDER_PRI:
2145 case ASB_DISCARD_LEAST_CHG:
2146 case ASB_DISCARD_LOCAL:
2147 case ASB_DISCARD_REMOTE:
2148 dev_err(DEV, "Configuration error.\n");
2149 break;
2150 case ASB_DISCONNECT:
2151 break;
2152 case ASB_CONSENSUS:
2153 hg = drbd_asb_recover_0p(mdev);
2154 if (hg == -1 && mdev->state.role == R_SECONDARY)
2155 rv = hg;
2156 if (hg == 1 && mdev->state.role == R_PRIMARY)
2157 rv = hg;
2158 break;
2159 case ASB_VIOLENTLY:
2160 rv = drbd_asb_recover_0p(mdev);
2161 break;
2162 case ASB_DISCARD_SECONDARY:
2163 return mdev->state.role == R_PRIMARY ? 1 : -1;
2164 case ASB_CALL_HELPER:
2165 hg = drbd_asb_recover_0p(mdev);
2166 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2167 self = drbd_set_role(mdev, R_SECONDARY, 0);
2168 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2169 * we might be here in C_WF_REPORT_PARAMS which is transient.
2170 * we do not need to wait for the after state change work either. */
2171 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2172 if (self != SS_SUCCESS) {
2173 drbd_khelper(mdev, "pri-lost-after-sb");
2174 } else {
2175 dev_warn(DEV, "Successfully gave up primary role.\n");
2176 rv = hg;
2177 }
2178 } else
2179 rv = hg;
2180 }
2181
2182 return rv;
2183}
2184
2185static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2186{
2187 int self, peer, hg, rv = -100;
2188
2189 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2190 peer = mdev->p_uuid[UI_BITMAP] & 1;
2191
2192 switch (mdev->net_conf->after_sb_2p) {
2193 case ASB_DISCARD_YOUNGER_PRI:
2194 case ASB_DISCARD_OLDER_PRI:
2195 case ASB_DISCARD_LEAST_CHG:
2196 case ASB_DISCARD_LOCAL:
2197 case ASB_DISCARD_REMOTE:
2198 case ASB_CONSENSUS:
2199 case ASB_DISCARD_SECONDARY:
2200 dev_err(DEV, "Configuration error.\n");
2201 break;
2202 case ASB_VIOLENTLY:
2203 rv = drbd_asb_recover_0p(mdev);
2204 break;
2205 case ASB_DISCONNECT:
2206 break;
2207 case ASB_CALL_HELPER:
2208 hg = drbd_asb_recover_0p(mdev);
2209 if (hg == -1) {
2210 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2211 * we might be here in C_WF_REPORT_PARAMS which is transient.
2212 * we do not need to wait for the after state change work either. */
2213 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2214 if (self != SS_SUCCESS) {
2215 drbd_khelper(mdev, "pri-lost-after-sb");
2216 } else {
2217 dev_warn(DEV, "Successfully gave up primary role.\n");
2218 rv = hg;
2219 }
2220 } else
2221 rv = hg;
2222 }
2223
2224 return rv;
2225}
2226
2227static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2228 u64 bits, u64 flags)
2229{
2230 if (!uuid) {
2231 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2232 return;
2233 }
2234 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2235 text,
2236 (unsigned long long)uuid[UI_CURRENT],
2237 (unsigned long long)uuid[UI_BITMAP],
2238 (unsigned long long)uuid[UI_HISTORY_START],
2239 (unsigned long long)uuid[UI_HISTORY_END],
2240 (unsigned long long)bits,
2241 (unsigned long long)flags);
2242}
2243
2244/*
2245 100 after split brain try auto recover
2246 2 C_SYNC_SOURCE set BitMap
2247 1 C_SYNC_SOURCE use BitMap
2248 0 no Sync
2249 -1 C_SYNC_TARGET use BitMap
2250 -2 C_SYNC_TARGET set BitMap
2251 -100 after split brain, disconnect
2252-1000 unrelated data
2253 */
2254static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2255{
2256 u64 self, peer;
2257 int i, j;
2258
2259 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2260 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2261
2262 *rule_nr = 10;
2263 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2264 return 0;
2265
2266 *rule_nr = 20;
2267 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2268 peer != UUID_JUST_CREATED)
2269 return -2;
2270
2271 *rule_nr = 30;
2272 if (self != UUID_JUST_CREATED &&
2273 (peer == UUID_JUST_CREATED || peer == (u64)0))
2274 return 2;
2275
2276 if (self == peer) {
2277 int rct, dc; /* roles at crash time */
2278
2279 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2280
2281 if (mdev->agreed_pro_version < 91)
2282 return -1001;
2283
2284 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2285 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2286 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2287 drbd_uuid_set_bm(mdev, 0UL);
2288
2289 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2290 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2291 *rule_nr = 34;
2292 } else {
2293 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2294 *rule_nr = 36;
2295 }
2296
2297 return 1;
2298 }
2299
2300 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2301
2302 if (mdev->agreed_pro_version < 91)
2303 return -1001;
2304
2305 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2306 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2307 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2308
2309 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2310 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2311 mdev->p_uuid[UI_BITMAP] = 0UL;
2312
2313 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2314 *rule_nr = 35;
2315 } else {
2316 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2317 *rule_nr = 37;
2318 }
2319
2320 return -1;
2321 }
2322
2323 /* Common power [off|failure] */
2324 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2325 (mdev->p_uuid[UI_FLAGS] & 2);
2326 /* lowest bit is set when we were primary,
2327 * next bit (weight 2) is set when peer was primary */
2328 *rule_nr = 40;
2329
2330 switch (rct) {
2331 case 0: /* !self_pri && !peer_pri */ return 0;
2332 case 1: /* self_pri && !peer_pri */ return 1;
2333 case 2: /* !self_pri && peer_pri */ return -1;
2334 case 3: /* self_pri && peer_pri */
2335 dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2336 return dc ? -1 : 1;
2337 }
2338 }
2339
2340 *rule_nr = 50;
2341 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2342 if (self == peer)
2343 return -1;
2344
2345 *rule_nr = 51;
2346 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2347 if (self == peer) {
2348 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2349 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2350 if (self == peer) {
2351 /* The last P_SYNC_UUID did not get though. Undo the last start of
2352 resync as sync source modifications of the peer's UUIDs. */
2353
2354 if (mdev->agreed_pro_version < 91)
2355 return -1001;
2356
2357 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2358 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2359 return -1;
2360 }
2361 }
2362
2363 *rule_nr = 60;
2364 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2365 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2366 peer = mdev->p_uuid[i] & ~((u64)1);
2367 if (self == peer)
2368 return -2;
2369 }
2370
2371 *rule_nr = 70;
2372 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2373 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2374 if (self == peer)
2375 return 1;
2376
2377 *rule_nr = 71;
2378 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2379 if (self == peer) {
2380 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2381 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2382 if (self == peer) {
2383 /* The last P_SYNC_UUID did not get though. Undo the last start of
2384 resync as sync source modifications of our UUIDs. */
2385
2386 if (mdev->agreed_pro_version < 91)
2387 return -1001;
2388
2389 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2390 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2391
2392 dev_info(DEV, "Undid last start of resync:\n");
2393
2394 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2395 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2396
2397 return 1;
2398 }
2399 }
2400
2401
2402 *rule_nr = 80;
2403 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2404 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2405 if (self == peer)
2406 return 2;
2407 }
2408
2409 *rule_nr = 90;
2410 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2411 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2412 if (self == peer && self != ((u64)0))
2413 return 100;
2414
2415 *rule_nr = 100;
2416 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2417 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2418 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2419 peer = mdev->p_uuid[j] & ~((u64)1);
2420 if (self == peer)
2421 return -100;
2422 }
2423 }
2424
2425 return -1000;
2426}
2427
2428/* drbd_sync_handshake() returns the new conn state on success, or
2429 CONN_MASK (-1) on failure.
2430 */
2431static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2432 enum drbd_disk_state peer_disk) __must_hold(local)
2433{
2434 int hg, rule_nr;
2435 enum drbd_conns rv = C_MASK;
2436 enum drbd_disk_state mydisk;
2437
2438 mydisk = mdev->state.disk;
2439 if (mydisk == D_NEGOTIATING)
2440 mydisk = mdev->new_state_tmp.disk;
2441
2442 dev_info(DEV, "drbd_sync_handshake:\n");
2443 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2444 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2445 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2446
2447 hg = drbd_uuid_compare(mdev, &rule_nr);
2448
2449 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2450
2451 if (hg == -1000) {
2452 dev_alert(DEV, "Unrelated data, aborting!\n");
2453 return C_MASK;
2454 }
2455 if (hg == -1001) {
2456 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
2457 return C_MASK;
2458 }
2459
2460 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2461 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2462 int f = (hg == -100) || abs(hg) == 2;
2463 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2464 if (f)
2465 hg = hg*2;
2466 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2467 hg > 0 ? "source" : "target");
2468 }
2469
2470 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2471 int pcount = (mdev->state.role == R_PRIMARY)
2472 + (peer_role == R_PRIMARY);
2473 int forced = (hg == -100);
2474
2475 switch (pcount) {
2476 case 0:
2477 hg = drbd_asb_recover_0p(mdev);
2478 break;
2479 case 1:
2480 hg = drbd_asb_recover_1p(mdev);
2481 break;
2482 case 2:
2483 hg = drbd_asb_recover_2p(mdev);
2484 break;
2485 }
2486 if (abs(hg) < 100) {
2487 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2488 "automatically solved. Sync from %s node\n",
2489 pcount, (hg < 0) ? "peer" : "this");
2490 if (forced) {
2491 dev_warn(DEV, "Doing a full sync, since"
2492 " UUIDs where ambiguous.\n");
2493 hg = hg*2;
2494 }
2495 }
2496 }
2497
2498 if (hg == -100) {
2499 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2500 hg = -1;
2501 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2502 hg = 1;
2503
2504 if (abs(hg) < 100)
2505 dev_warn(DEV, "Split-Brain detected, manually solved. "
2506 "Sync from %s node\n",
2507 (hg < 0) ? "peer" : "this");
2508 }
2509
2510 if (hg == -100) {
2511 dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
2512 drbd_khelper(mdev, "split-brain");
2513 return C_MASK;
2514 }
2515
2516 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2517 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2518 return C_MASK;
2519 }
2520
2521 if (hg < 0 && /* by intention we do not use mydisk here. */
2522 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2523 switch (mdev->net_conf->rr_conflict) {
2524 case ASB_CALL_HELPER:
2525 drbd_khelper(mdev, "pri-lost");
2526 /* fall through */
2527 case ASB_DISCONNECT:
2528 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2529 return C_MASK;
2530 case ASB_VIOLENTLY:
2531 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2532 "assumption\n");
2533 }
2534 }
2535
2536 if (abs(hg) >= 2) {
2537 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2538 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2539 return C_MASK;
2540 }
2541
2542 if (hg > 0) { /* become sync source. */
2543 rv = C_WF_BITMAP_S;
2544 } else if (hg < 0) { /* become sync target */
2545 rv = C_WF_BITMAP_T;
2546 } else {
2547 rv = C_CONNECTED;
2548 if (drbd_bm_total_weight(mdev)) {
2549 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2550 drbd_bm_total_weight(mdev));
2551 }
2552 }
2553
2554 return rv;
2555}
2556
2557/* returns 1 if invalid */
2558static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2559{
2560 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2561 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2562 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2563 return 0;
2564
2565 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2566 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2567 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2568 return 1;
2569
2570 /* everything else is valid if they are equal on both sides. */
2571 if (peer == self)
2572 return 0;
2573
2574 /* everything es is invalid. */
2575 return 1;
2576}
2577
2578static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2579{
2580 struct p_protocol *p = (struct p_protocol *)h;
2581 int header_size, data_size;
2582 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2583 int p_want_lose, p_two_primaries;
2584 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2585
2586 header_size = sizeof(*p) - sizeof(*h);
2587 data_size = h->length - header_size;
2588
2589 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2590 return FALSE;
2591
2592 p_proto = be32_to_cpu(p->protocol);
2593 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2594 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2595 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
2596 p_want_lose = be32_to_cpu(p->want_lose);
2597 p_two_primaries = be32_to_cpu(p->two_primaries);
2598
2599 if (p_proto != mdev->net_conf->wire_protocol) {
2600 dev_err(DEV, "incompatible communication protocols\n");
2601 goto disconnect;
2602 }
2603
2604 if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2605 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2606 goto disconnect;
2607 }
2608
2609 if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2610 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2611 goto disconnect;
2612 }
2613
2614 if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2615 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2616 goto disconnect;
2617 }
2618
2619 if (p_want_lose && mdev->net_conf->want_lose) {
2620 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2621 goto disconnect;
2622 }
2623
2624 if (p_two_primaries != mdev->net_conf->two_primaries) {
2625 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2626 goto disconnect;
2627 }
2628
2629 if (mdev->agreed_pro_version >= 87) {
2630 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2631
2632 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2633 return FALSE;
2634
2635 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2636 if (strcmp(p_integrity_alg, my_alg)) {
2637 dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2638 goto disconnect;
2639 }
2640 dev_info(DEV, "data-integrity-alg: %s\n",
2641 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2642 }
2643
2644 return TRUE;
2645
2646disconnect:
2647 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2648 return FALSE;
2649}
2650
2651/* helper function
2652 * input: alg name, feature name
2653 * return: NULL (alg name was "")
2654 * ERR_PTR(error) if something goes wrong
2655 * or the crypto hash ptr, if it worked out ok. */
2656struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2657 const char *alg, const char *name)
2658{
2659 struct crypto_hash *tfm;
2660
2661 if (!alg[0])
2662 return NULL;
2663
2664 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2665 if (IS_ERR(tfm)) {
2666 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2667 alg, name, PTR_ERR(tfm));
2668 return tfm;
2669 }
2670 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2671 crypto_free_hash(tfm);
2672 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2673 return ERR_PTR(-EINVAL);
2674 }
2675 return tfm;
2676}
2677
2678static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2679{
2680 int ok = TRUE;
2681 struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
2682 unsigned int header_size, data_size, exp_max_sz;
2683 struct crypto_hash *verify_tfm = NULL;
2684 struct crypto_hash *csums_tfm = NULL;
2685 const int apv = mdev->agreed_pro_version;
2686
2687 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2688 : apv == 88 ? sizeof(struct p_rs_param)
2689 + SHARED_SECRET_MAX
2690 : /* 89 */ sizeof(struct p_rs_param_89);
2691
2692 if (h->length > exp_max_sz) {
2693 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2694 h->length, exp_max_sz);
2695 return FALSE;
2696 }
2697
2698 if (apv <= 88) {
2699 header_size = sizeof(struct p_rs_param) - sizeof(*h);
2700 data_size = h->length - header_size;
2701 } else /* apv >= 89 */ {
2702 header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
2703 data_size = h->length - header_size;
2704 D_ASSERT(data_size == 0);
2705 }
2706
2707 /* initialize verify_alg and csums_alg */
2708 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2709
2710 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2711 return FALSE;
2712
2713 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2714
2715 if (apv >= 88) {
2716 if (apv == 88) {
2717 if (data_size > SHARED_SECRET_MAX) {
2718 dev_err(DEV, "verify-alg too long, "
2719 "peer wants %u, accepting only %u byte\n",
2720 data_size, SHARED_SECRET_MAX);
2721 return FALSE;
2722 }
2723
2724 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2725 return FALSE;
2726
2727 /* we expect NUL terminated string */
2728 /* but just in case someone tries to be evil */
2729 D_ASSERT(p->verify_alg[data_size-1] == 0);
2730 p->verify_alg[data_size-1] = 0;
2731
2732 } else /* apv >= 89 */ {
2733 /* we still expect NUL terminated strings */
2734 /* but just in case someone tries to be evil */
2735 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2736 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2737 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2738 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2739 }
2740
2741 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2742 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2743 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2744 mdev->sync_conf.verify_alg, p->verify_alg);
2745 goto disconnect;
2746 }
2747 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2748 p->verify_alg, "verify-alg");
2749 if (IS_ERR(verify_tfm)) {
2750 verify_tfm = NULL;
2751 goto disconnect;
2752 }
2753 }
2754
2755 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2756 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2757 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2758 mdev->sync_conf.csums_alg, p->csums_alg);
2759 goto disconnect;
2760 }
2761 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2762 p->csums_alg, "csums-alg");
2763 if (IS_ERR(csums_tfm)) {
2764 csums_tfm = NULL;
2765 goto disconnect;
2766 }
2767 }
2768
2769
2770 spin_lock(&mdev->peer_seq_lock);
2771 /* lock against drbd_nl_syncer_conf() */
2772 if (verify_tfm) {
2773 strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2774 mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2775 crypto_free_hash(mdev->verify_tfm);
2776 mdev->verify_tfm = verify_tfm;
2777 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2778 }
2779 if (csums_tfm) {
2780 strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2781 mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2782 crypto_free_hash(mdev->csums_tfm);
2783 mdev->csums_tfm = csums_tfm;
2784 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2785 }
2786 spin_unlock(&mdev->peer_seq_lock);
2787 }
2788
2789 return ok;
2790disconnect:
2791 /* just for completeness: actually not needed,
2792 * as this is not reached if csums_tfm was ok. */
2793 crypto_free_hash(csums_tfm);
2794 /* but free the verify_tfm again, if csums_tfm did not work out */
2795 crypto_free_hash(verify_tfm);
2796 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2797 return FALSE;
2798}
2799
/*
 * drbd_setup_order_type() - placeholder, intentionally does nothing
 * @mdev: device (unused)
 * @peer: queue order type announced by the peer (unused)
 */
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
	/* There is currently no working implementation of distributed TCQ,
	 * sorry; so there is nothing to set up here. */
}
2805
2806/* warn if the arguments differ by more than 12.5% */
2807static void warn_if_differ_considerably(struct drbd_conf *mdev,
2808 const char *s, sector_t a, sector_t b)
2809{
2810 sector_t d;
2811 if (a == 0 || b == 0)
2812 return;
2813 d = (a > b) ? (a - b) : (b - a);
2814 if (d > (a>>3) || d > (b>>3))
2815 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2816 (unsigned long long)a, (unsigned long long)b);
2817}
2818
2819static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2820{
2821 struct p_sizes *p = (struct p_sizes *)h;
2822 enum determine_dev_size dd = unchanged;
2823 unsigned int max_seg_s;
2824 sector_t p_size, p_usize, my_usize;
2825 int ldsc = 0; /* local disk size changed */
2826 enum drbd_conns nconn;
2827
2828 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2829 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2830 return FALSE;
2831
2832 p_size = be64_to_cpu(p->d_size);
2833 p_usize = be64_to_cpu(p->u_size);
2834
2835 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2836 dev_err(DEV, "some backing storage is needed\n");
2837 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2838 return FALSE;
2839 }
2840
2841 /* just store the peer's disk size for now.
2842 * we still need to figure out whether we accept that. */
2843 mdev->p_size = p_size;
2844
2845#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
2846 if (get_ldev(mdev)) {
2847 warn_if_differ_considerably(mdev, "lower level device sizes",
2848 p_size, drbd_get_max_capacity(mdev->ldev));
2849 warn_if_differ_considerably(mdev, "user requested size",
2850 p_usize, mdev->ldev->dc.disk_size);
2851
2852 /* if this is the first connect, or an otherwise expected
2853 * param exchange, choose the minimum */
2854 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2855 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
2856 p_usize);
2857
2858 my_usize = mdev->ldev->dc.disk_size;
2859
2860 if (mdev->ldev->dc.disk_size != p_usize) {
2861 mdev->ldev->dc.disk_size = p_usize;
2862 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
2863 (unsigned long)mdev->ldev->dc.disk_size);
2864 }
2865
2866 /* Never shrink a device with usable data during connect.
2867 But allow online shrinking if we are connected. */
2868 if (drbd_new_dev_size(mdev, mdev->ldev) <
2869 drbd_get_capacity(mdev->this_bdev) &&
2870 mdev->state.disk >= D_OUTDATED &&
2871 mdev->state.conn < C_CONNECTED) {
2872 dev_err(DEV, "The peer's disk size is too small!\n");
2873 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2874 mdev->ldev->dc.disk_size = my_usize;
2875 put_ldev(mdev);
2876 return FALSE;
2877 }
2878 put_ldev(mdev);
2879 }
2880#undef min_not_zero
2881
2882 if (get_ldev(mdev)) {
2883 dd = drbd_determin_dev_size(mdev);
2884 put_ldev(mdev);
2885 if (dd == dev_size_error)
2886 return FALSE;
2887 drbd_md_sync(mdev);
2888 } else {
2889 /* I am diskless, need to accept the peer's size. */
2890 drbd_set_my_capacity(mdev, p_size);
2891 }
2892
2893 if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
2894 nconn = drbd_sync_handshake(mdev,
2895 mdev->state.peer, mdev->state.pdsk);
2896 put_ldev(mdev);
2897
2898 if (nconn == C_MASK) {
2899 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2900 return FALSE;
2901 }
2902
2903 if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
2904 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2905 return FALSE;
2906 }
2907 }
2908
2909 if (get_ldev(mdev)) {
2910 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
2911 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2912 ldsc = 1;
2913 }
2914
2915 max_seg_s = be32_to_cpu(p->max_segment_size);
2916 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
2917 drbd_setup_queue_param(mdev, max_seg_s);
2918
2919 drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
2920 put_ldev(mdev);
2921 }
2922
2923 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
2924 if (be64_to_cpu(p->c_size) !=
2925 drbd_get_capacity(mdev->this_bdev) || ldsc) {
2926 /* we have different sizes, probably peer
2927 * needs to know my new size... */
2928 drbd_send_sizes(mdev, 0);
2929 }
2930 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
2931 (dd == grew && mdev->state.conn == C_CONNECTED)) {
2932 if (mdev->state.pdsk >= D_INCONSISTENT &&
2933 mdev->state.disk >= D_INCONSISTENT)
2934 resync_after_online_grow(mdev);
2935 else
2936 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
2937 }
2938 }
2939
2940 return TRUE;
2941}
2942
2943static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
2944{
2945 struct p_uuids *p = (struct p_uuids *)h;
2946 u64 *p_uuid;
2947 int i;
2948
2949 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2950 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2951 return FALSE;
2952
2953 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
2954
2955 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
2956 p_uuid[i] = be64_to_cpu(p->uuid[i]);
2957
2958 kfree(mdev->p_uuid);
2959 mdev->p_uuid = p_uuid;
2960
2961 if (mdev->state.conn < C_CONNECTED &&
2962 mdev->state.disk < D_INCONSISTENT &&
2963 mdev->state.role == R_PRIMARY &&
2964 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
2965 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
2966 (unsigned long long)mdev->ed_uuid);
2967 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2968 return FALSE;
2969 }
2970
2971 if (get_ldev(mdev)) {
2972 int skip_initial_sync =
2973 mdev->state.conn == C_CONNECTED &&
2974 mdev->agreed_pro_version >= 90 &&
2975 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
2976 (p_uuid[UI_FLAGS] & 8);
2977 if (skip_initial_sync) {
2978 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
2979 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
2980 "clear_n_write from receive_uuids");
2981 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
2982 _drbd_uuid_set(mdev, UI_BITMAP, 0);
2983 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
2984 CS_VERBOSE, NULL);
2985 drbd_md_sync(mdev);
2986 }
2987 put_ldev(mdev);
2988 }
2989
2990 /* Before we test for the disk state, we should wait until an eventually
2991 ongoing cluster wide state change is finished. That is important if
2992 we are primary and are detaching from our disk. We need to see the
2993 new disk state... */
2994 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
2995 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
2996 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
2997
2998 return TRUE;
2999}
3000
3001/**
3002 * convert_state() - Converts the peer's view of the cluster state to our point of view
3003 * @ps: The state as seen by the peer.
3004 */
3005static union drbd_state convert_state(union drbd_state ps)
3006{
3007 union drbd_state ms;
3008
3009 static enum drbd_conns c_tab[] = {
3010 [C_CONNECTED] = C_CONNECTED,
3011
3012 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3013 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3014 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3015 [C_VERIFY_S] = C_VERIFY_T,
3016 [C_MASK] = C_MASK,
3017 };
3018
3019 ms.i = ps.i;
3020
3021 ms.conn = c_tab[ps.conn];
3022 ms.peer = ps.role;
3023 ms.role = ps.peer;
3024 ms.pdsk = ps.disk;
3025 ms.disk = ps.pdsk;
3026 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3027
3028 return ms;
3029}
3030
3031static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3032{
3033 struct p_req_state *p = (struct p_req_state *)h;
3034 union drbd_state mask, val;
3035 int rv;
3036
3037 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3038 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3039 return FALSE;
3040
3041 mask.i = be32_to_cpu(p->mask);
3042 val.i = be32_to_cpu(p->val);
3043
3044 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3045 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3046 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3047 return TRUE;
3048 }
3049
3050 mask = convert_state(mask);
3051 val = convert_state(val);
3052
3053 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3054
3055 drbd_send_sr_reply(mdev, rv);
3056 drbd_md_sync(mdev);
3057
3058 return TRUE;
3059}
3060
3061static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3062{
3063 struct p_state *p = (struct p_state *)h;
3064 enum drbd_conns nconn, oconn;
3065 union drbd_state ns, peer_state;
3066 enum drbd_disk_state real_peer_disk;
3067 int rv;
3068
3069 ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3070 return FALSE;
3071
3072 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3073 return FALSE;
3074
3075 peer_state.i = be32_to_cpu(p->state);
3076
3077 real_peer_disk = peer_state.disk;
3078 if (peer_state.disk == D_NEGOTIATING) {
3079 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3080 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3081 }
3082
3083 spin_lock_irq(&mdev->req_lock);
3084 retry:
3085 oconn = nconn = mdev->state.conn;
3086 spin_unlock_irq(&mdev->req_lock);
3087
3088 if (nconn == C_WF_REPORT_PARAMS)
3089 nconn = C_CONNECTED;
3090
3091 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3092 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3093 int cr; /* consider resync */
3094
3095 /* if we established a new connection */
3096 cr = (oconn < C_CONNECTED);
3097 /* if we had an established connection
3098 * and one of the nodes newly attaches a disk */
3099 cr |= (oconn == C_CONNECTED &&
3100 (peer_state.disk == D_NEGOTIATING ||
3101 mdev->state.disk == D_NEGOTIATING));
3102 /* if we have both been inconsistent, and the peer has been
3103 * forced to be UpToDate with --overwrite-data */
3104 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3105 /* if we had been plain connected, and the admin requested to
3106 * start a sync by "invalidate" or "invalidate-remote" */
3107 cr |= (oconn == C_CONNECTED &&
3108 (peer_state.conn >= C_STARTING_SYNC_S &&
3109 peer_state.conn <= C_WF_BITMAP_T));
3110
3111 if (cr)
3112 nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3113
3114 put_ldev(mdev);
3115 if (nconn == C_MASK) {
3116 if (mdev->state.disk == D_NEGOTIATING) {
3117 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3118 nconn = C_CONNECTED;
3119 } else if (peer_state.disk == D_NEGOTIATING) {
3120 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3121 peer_state.disk = D_DISKLESS;
3122 } else {
3123 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3124 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3125 return FALSE;
3126 }
3127 }
3128 }
3129
3130 spin_lock_irq(&mdev->req_lock);
3131 if (mdev->state.conn != oconn)
3132 goto retry;
3133 clear_bit(CONSIDER_RESYNC, &mdev->flags);
3134 ns.i = mdev->state.i;
3135 ns.conn = nconn;
3136 ns.peer = peer_state.role;
3137 ns.pdsk = real_peer_disk;
3138 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3139 if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3140 ns.disk = mdev->new_state_tmp.disk;
3141
3142 rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
3143 ns = mdev->state;
3144 spin_unlock_irq(&mdev->req_lock);
3145
3146 if (rv < SS_SUCCESS) {
3147 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3148 return FALSE;
3149 }
3150
3151 if (oconn > C_WF_REPORT_PARAMS) {
3152 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3153 peer_state.disk != D_NEGOTIATING ) {
3154 /* we want resync, peer has not yet decided to sync... */
3155 /* Nowadays only used when forcing a node into primary role and
3156 setting its disk to UpToDate with that */
3157 drbd_send_uuids(mdev);
3158 drbd_send_state(mdev);
3159 }
3160 }
3161
3162 mdev->net_conf->want_lose = 0;
3163
3164 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3165
3166 return TRUE;
3167}
3168
3169static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3170{
3171 struct p_rs_uuid *p = (struct p_rs_uuid *)h;
3172
3173 wait_event(mdev->misc_wait,
3174 mdev->state.conn == C_WF_SYNC_UUID ||
3175 mdev->state.conn < C_CONNECTED ||
3176 mdev->state.disk < D_NEGOTIATING);
3177
3178 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3179
3180 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3181 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3182 return FALSE;
3183
3184 /* Here the _drbd_uuid_ functions are right, current should
3185 _not_ be rotated into the history */
3186 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3187 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3188 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3189
3190 drbd_start_resync(mdev, C_SYNC_TARGET);
3191
3192 put_ldev(mdev);
3193 } else
3194 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3195
3196 return TRUE;
3197}
3198
3199enum receive_bitmap_ret { OK, DONE, FAILED };
3200
3201static enum receive_bitmap_ret
3202receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
3203 unsigned long *buffer, struct bm_xfer_ctx *c)
3204{
3205 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3206 unsigned want = num_words * sizeof(long);
3207
3208 if (want != h->length) {
3209 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
3210 return FAILED;
3211 }
3212 if (want == 0)
3213 return DONE;
3214 if (drbd_recv(mdev, buffer, want) != want)
3215 return FAILED;
3216
3217 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3218
3219 c->word_offset += num_words;
3220 c->bit_offset = c->word_offset * BITS_PER_LONG;
3221 if (c->bit_offset > c->bm_bits)
3222 c->bit_offset = c->bm_bits;
3223
3224 return OK;
3225}
3226
3227static enum receive_bitmap_ret
3228recv_bm_rle_bits(struct drbd_conf *mdev,
3229 struct p_compressed_bm *p,
3230 struct bm_xfer_ctx *c)
3231{
3232 struct bitstream bs;
3233 u64 look_ahead;
3234 u64 rl;
3235 u64 tmp;
3236 unsigned long s = c->bit_offset;
3237 unsigned long e;
3238 int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3239 int toggle = DCBP_get_start(p);
3240 int have;
3241 int bits;
3242
3243 bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3244
3245 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3246 if (bits < 0)
3247 return FAILED;
3248
3249 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3250 bits = vli_decode_bits(&rl, look_ahead);
3251 if (bits <= 0)
3252 return FAILED;
3253
3254 if (toggle) {
3255 e = s + rl -1;
3256 if (e >= c->bm_bits) {
3257 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3258 return FAILED;
3259 }
3260 _drbd_bm_set_bits(mdev, s, e);
3261 }
3262
3263 if (have < bits) {
3264 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3265 have, bits, look_ahead,
3266 (unsigned int)(bs.cur.b - p->code),
3267 (unsigned int)bs.buf_len);
3268 return FAILED;
3269 }
3270 look_ahead >>= bits;
3271 have -= bits;
3272
3273 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3274 if (bits < 0)
3275 return FAILED;
3276 look_ahead |= tmp << have;
3277 have += bits;
3278 }
3279
3280 c->bit_offset = s;
3281 bm_xfer_ctx_bit_to_word_offset(c);
3282
3283 return (s == c->bm_bits) ? DONE : OK;
3284}
3285
3286static enum receive_bitmap_ret
3287decode_bitmap_c(struct drbd_conf *mdev,
3288 struct p_compressed_bm *p,
3289 struct bm_xfer_ctx *c)
3290{
3291 if (DCBP_get_code(p) == RLE_VLI_Bits)
3292 return recv_bm_rle_bits(mdev, p, c);
3293
3294 /* other variants had been implemented for evaluation,
3295 * but have been dropped as this one turned out to be "best"
3296 * during all our tests. */
3297
3298 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3299 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3300 return FAILED;
3301}
3302
3303void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3304 const char *direction, struct bm_xfer_ctx *c)
3305{
3306 /* what would it take to transfer it "plaintext" */
3307 unsigned plain = sizeof(struct p_header) *
3308 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3309 + c->bm_words * sizeof(long);
3310 unsigned total = c->bytes[0] + c->bytes[1];
3311 unsigned r;
3312
3313 /* total can not be zero. but just in case: */
3314 if (total == 0)
3315 return;
3316
3317 /* don't report if not compressed */
3318 if (total >= plain)
3319 return;
3320
3321 /* total < plain. check for overflow, still */
3322 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3323 : (1000 * total / plain);
3324
3325 if (r > 1000)
3326 r = 1000;
3327
3328 r = 1000 - r;
3329 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3330 "total %u; compression: %u.%u%%\n",
3331 direction,
3332 c->bytes[1], c->packets[1],
3333 c->bytes[0], c->packets[0],
3334 total, r/10, r % 10);
3335}
3336
3337/* Since we are processing the bitfield from lower addresses to higher,
3338 it does not matter if the process it in 32 bit chunks or 64 bit
3339 chunks as long as it is little endian. (Understand it as byte stream,
3340 beginning with the lowest byte...) If we would use big endian
3341 we would need to process it from the highest address to the lowest,
3342 in order to be agnostic to the 32 vs 64 bits issue.
3343
3344 returns 0 on failure, 1 if we successfully received it. */
3345static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3346{
3347 struct bm_xfer_ctx c;
3348 void *buffer;
3349 enum receive_bitmap_ret ret;
3350 int ok = FALSE;
3351
3352 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3353
3354 drbd_bm_lock(mdev, "receive bitmap");
3355
3356 /* maybe we should use some per thread scratch page,
3357 * and allocate that during initial device creation? */
3358 buffer = (unsigned long *) __get_free_page(GFP_NOIO);
3359 if (!buffer) {
3360 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3361 goto out;
3362 }
3363
3364 c = (struct bm_xfer_ctx) {
3365 .bm_bits = drbd_bm_bits(mdev),
3366 .bm_words = drbd_bm_words(mdev),
3367 };
3368
3369 do {
3370 if (h->command == P_BITMAP) {
3371 ret = receive_bitmap_plain(mdev, h, buffer, &c);
3372 } else if (h->command == P_COMPRESSED_BITMAP) {
3373 /* MAYBE: sanity check that we speak proto >= 90,
3374 * and the feature is enabled! */
3375 struct p_compressed_bm *p;
3376
3377 if (h->length > BM_PACKET_PAYLOAD_BYTES) {
3378 dev_err(DEV, "ReportCBitmap packet too large\n");
3379 goto out;
3380 }
3381 /* use the page buff */
3382 p = buffer;
3383 memcpy(p, h, sizeof(*h));
3384 if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
3385 goto out;
3386 if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3387 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3388 return FAILED;
3389 }
3390 ret = decode_bitmap_c(mdev, p, &c);
3391 } else {
3392 dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
3393 goto out;
3394 }
3395
3396 c.packets[h->command == P_BITMAP]++;
3397 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
3398
3399 if (ret != OK)
3400 break;
3401
3402 if (!drbd_recv_header(mdev, h))
3403 goto out;
3404 } while (ret == OK);
3405 if (ret == FAILED)
3406 goto out;
3407
3408 INFO_bm_xfer_stats(mdev, "receive", &c);
3409
3410 if (mdev->state.conn == C_WF_BITMAP_T) {
3411 ok = !drbd_send_bitmap(mdev);
3412 if (!ok)
3413 goto out;
3414 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3415 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3416 D_ASSERT(ok == SS_SUCCESS);
3417 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3418 /* admin may have requested C_DISCONNECTING,
3419 * other threads may have noticed network errors */
3420 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3421 drbd_conn_str(mdev->state.conn));
3422 }
3423
3424 ok = TRUE;
3425 out:
3426 drbd_bm_unlock(mdev);
3427 if (ok && mdev->state.conn == C_WF_BITMAP_S)
3428 drbd_start_resync(mdev, C_SYNC_SOURCE);
3429 free_page((unsigned long) buffer);
3430 return ok;
3431}
3432
3433static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
3434{
3435 /* TODO zero copy sink :) */
3436 static char sink[128];
3437 int size, want, r;
3438
3439 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3440 h->command, h->length);
3441
3442 size = h->length;
3443 while (size > 0) {
3444 want = min_t(int, size, sizeof(sink));
3445 r = drbd_recv(mdev, sink, want);
3446 ERR_IF(r <= 0) break;
3447 size -= r;
3448 }
3449 return size == 0;
3450}
3451
3452static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3453{
3454 if (mdev->state.disk >= D_INCONSISTENT)
3455 drbd_kick_lo(mdev);
3456
3457 /* Make sure we've acked all the TCP data associated
3458 * with the data requests being unplugged */
3459 drbd_tcp_quickack(mdev->data.socket);
3460
3461 return TRUE;
3462}
3463
/* Signature of a data-socket packet handler: gets the device and the
 * already received packet header.  drbdd() treats a zero return value
 * as a receive error. */
typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);

/* Dispatch table indexed by packet command.  Commands not listed here
 * are left NULL by the designated initializers. */
static drbd_cmd_handler_f drbd_default_handler[] = {
	[P_DATA]	    = receive_Data,
	[P_DATA_REPLY]	    = receive_DataReply,
	[P_RS_DATA_REPLY]   = receive_RSDataReply,
	[P_BARRIER]	    = receive_Barrier,
	[P_BITMAP]	    = receive_bitmap,
	[P_COMPRESSED_BITMAP]    = receive_bitmap,
	[P_UNPLUG_REMOTE]   = receive_UnplugRemote,
	[P_DATA_REQUEST]    = receive_DataRequest,
	[P_RS_DATA_REQUEST] = receive_DataRequest,
	[P_SYNC_PARAM]	    = receive_SyncParam,
	[P_SYNC_PARAM89]	   = receive_SyncParam,
	[P_PROTOCOL]        = receive_protocol,
	[P_UUIDS]	    = receive_uuids,
	[P_SIZES]	    = receive_sizes,
	[P_STATE]	    = receive_state,
	[P_STATE_CHG_REQ]   = receive_req_state,
	[P_SYNC_UUID]       = receive_sync_uuid,
	[P_OV_REQUEST]      = receive_DataRequest,
	[P_OV_REPLY]        = receive_DataRequest,
	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
	/* anything missing from this table is in
	 * the asender_tbl, see get_asender_cmd */
	[P_MAX_CMD]	    = NULL,
};

/* Table used by drbdd() for commands below P_MAX_CMD. */
static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
/* Table used by drbdd() for commands between P_MAY_IGNORE and
 * P_MAX_OPT_CMD; it has no initializer here, so it starts out NULL. */
static drbd_cmd_handler_f *drbd_opt_cmd_handler;
3494
3495static void drbdd(struct drbd_conf *mdev)
3496{
3497 drbd_cmd_handler_f handler;
3498 struct p_header *header = &mdev->data.rbuf.header;
3499
3500 while (get_t_state(&mdev->receiver) == Running) {
3501 drbd_thread_current_set_cpu(mdev);
3502 if (!drbd_recv_header(mdev, header))
3503 break;
3504
3505 if (header->command < P_MAX_CMD)
3506 handler = drbd_cmd_handler[header->command];
3507 else if (P_MAY_IGNORE < header->command
3508 && header->command < P_MAX_OPT_CMD)
3509 handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3510 else if (header->command > P_MAX_OPT_CMD)
3511 handler = receive_skip;
3512 else
3513 handler = NULL;
3514
3515 if (unlikely(!handler)) {
3516 dev_err(DEV, "unknown packet type %d, l: %d!\n",
3517 header->command, header->length);
3518 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3519 break;
3520 }
3521 if (unlikely(!handler(mdev, header))) {
3522 dev_err(DEV, "error receiving %s, l: %d!\n",
3523 cmdname(header->command), header->length);
3524 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3525 break;
3526 }
3527 }
3528}
3529
3530static void drbd_fail_pending_reads(struct drbd_conf *mdev)
3531{
3532 struct hlist_head *slot;
3533 struct hlist_node *pos;
3534 struct hlist_node *tmp;
3535 struct drbd_request *req;
3536 int i;
3537
3538 /*
3539 * Application READ requests
3540 */
3541 spin_lock_irq(&mdev->req_lock);
3542 for (i = 0; i < APP_R_HSIZE; i++) {
3543 slot = mdev->app_reads_hash+i;
3544 hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
3545 /* it may (but should not any longer!)
3546 * be on the work queue; if that assert triggers,
3547 * we need to also grab the
3548 * spin_lock_irq(&mdev->data.work.q_lock);
3549 * and list_del_init here. */
3550 D_ASSERT(list_empty(&req->w.list));
3551 /* It would be nice to complete outside of spinlock.
3552 * But this is easier for now. */
3553 _req_mod(req, connection_lost_while_pending);
3554 }
3555 }
3556 for (i = 0; i < APP_R_HSIZE; i++)
3557 if (!hlist_empty(mdev->app_reads_hash+i))
3558 dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
3559 "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
3560
3561 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
3562 spin_unlock_irq(&mdev->req_lock);
3563}
3564
3565void drbd_flush_workqueue(struct drbd_conf *mdev)
3566{
3567 struct drbd_wq_barrier barr;
3568
3569 barr.w.cb = w_prev_work_done;
3570 init_completion(&barr.done);
3571 drbd_queue_work(&mdev->data.work, &barr.w);
3572 wait_for_completion(&barr.done);
3573}
3574
/*
 * drbd_disconnect() - tear down the connection and clean up after it.
 *
 * Does nothing in C_STANDALONE.  Otherwise, in this order: stops the
 * asender, frees the sockets, waits for the active/sync/read ee lists to
 * drain, cancels resync bookkeeping, flushes the worker queue, processes
 * done_ee, drops the peer UUIDs, fails pending application reads and
 * finally moves the connection state to C_UNCONNECTED.  If the state was
 * C_DISCONNECTING, the network configuration is freed as well and the
 * device goes to C_STANDALONE.
 */
static void drbd_disconnect(struct drbd_conf *mdev)
{
	enum drbd_fencing_p fp;
	union drbd_state os, ns;
	int rv = SS_UNKNOWN_ERROR;
	unsigned int i;

	if (mdev->state.conn == C_STANDALONE)
		return;
	if (mdev->state.conn >= C_WF_CONNECTION)
		dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
				drbd_conn_str(mdev->state.conn));

	/* asender does not clean up anything. it must not interfere, either */
	drbd_thread_stop(&mdev->asender);

	/* sockets are released under the data mutex */
	mutex_lock(&mdev->data.mutex);
	drbd_free_sock(mdev);
	mutex_unlock(&mdev->data.mutex);

	spin_lock_irq(&mdev->req_lock);
	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
	_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	/* We do not have data structures that would allow us to
	 * get the rs_pending_cnt down to 0 again.
	 *  * On C_SYNC_TARGET we do not have any data structures describing
	 *    the pending RSDataRequest's we have sent.
	 *  * On C_SYNC_SOURCE there is no data structure that tracks
	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
	 *  And no, it is not the sum of the reference counts in the
	 *  resync_LRU. The resync_LRU tracks the whole operation including
	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
	 *  on the fly. */
	drbd_rs_cancel_all(mdev);
	mdev->rs_total = 0;
	mdev->rs_failed = 0;
	atomic_set(&mdev->rs_pending_cnt, 0);
	wake_up(&mdev->misc_wait);

	/* make sure syncer is stopped and w_resume_next_sg queued */
	del_timer_sync(&mdev->resync_timer);
	set_bit(STOP_SYNC_TIMER, &mdev->flags);
	resync_timer_fn((unsigned long)mdev);

	/* so we can be sure that all remote or resync reads
	 * made it at least to net_ee */
	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));

	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
	 * w_make_resync_request etc. which may still be on the worker queue
	 * to be "canceled" */
	drbd_flush_workqueue(mdev);

	/* This also does reclaim_net_ee().  If we do this too early, we might
	 * miss some resync ee and pages.*/
	drbd_process_done_ee(mdev);

	/* the peer's UUIDs are only valid for this connection */
	kfree(mdev->p_uuid);
	mdev->p_uuid = NULL;

	if (!mdev->state.susp)
		tl_clear(mdev);

	drbd_fail_pending_reads(mdev);

	dev_info(DEV, "Connection closed\n");

	drbd_md_sync(mdev);

	/* fencing policy comes from the local disk configuration, if any */
	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (mdev->state.role == R_PRIMARY) {
		if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
			enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
			drbd_request_state(mdev, NS(pdsk, nps));
		}
	}

	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;
	if (os.conn >= C_UNCONNECTED) {
		/* Do not restart in case we are C_DISCONNECTING */
		ns = os;
		ns.conn = C_UNCONNECTED;
		rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	}
	spin_unlock_irq(&mdev->req_lock);

	/* os is the state sampled under req_lock above */
	if (os.conn == C_DISCONNECTING) {
		struct hlist_head *h;
		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);

		/* we must not free the tl_hash
		 * while application io is still on the fly */
		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);

		spin_lock_irq(&mdev->req_lock);
		/* paranoia code */
		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
			if (h->first)
				dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
						(int)(h - mdev->ee_hash), h->first);
		kfree(mdev->ee_hash);
		mdev->ee_hash = NULL;
		mdev->ee_hash_s = 0;

		/* paranoia code */
		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
			if (h->first)
				dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
						(int)(h - mdev->tl_hash), h->first);
		kfree(mdev->tl_hash);
		mdev->tl_hash = NULL;
		mdev->tl_hash_s = 0;
		spin_unlock_irq(&mdev->req_lock);

		crypto_free_hash(mdev->cram_hmac_tfm);
		mdev->cram_hmac_tfm = NULL;

		kfree(mdev->net_conf);
		mdev->net_conf = NULL;
		drbd_request_state(mdev, NS(conn, C_STANDALONE));
	}

	/* tcp_close and release of sendpage pages can be deferred.  I don't
	 * want to use SO_LINGER, because apparently it can be deferred for
	 * more than 20 seconds (longest time I checked).
	 *
	 * Actually we don't care for exactly when the network stack does its
	 * put_page(), but release our reference on these pages right here.
	 */
	i = drbd_release_ee(mdev, &mdev->net_ee);
	if (i)
		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
	i = atomic_read(&mdev->pp_in_use);
	if (i)
		dev_info(DEV, "pp_in_use = %u, expected 0\n", i);

	D_ASSERT(list_empty(&mdev->read_ee));
	D_ASSERT(list_empty(&mdev->active_ee));
	D_ASSERT(list_empty(&mdev->sync_ee));
	D_ASSERT(list_empty(&mdev->done_ee));

	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
	atomic_set(&mdev->current_epoch->epoch_size, 0);
	D_ASSERT(list_empty(&mdev->current_epoch->list));
}
3729
3730/*
3731 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3732 * we can agree on is stored in agreed_pro_version.
3733 *
3734 * feature flags and the reserved array should be enough room for future
3735 * enhancements of the handshake protocol, and possible plugins...
3736 *
3737 * for now, they are expected to be zero, but ignored.
3738 */
3739static int drbd_send_handshake(struct drbd_conf *mdev)
3740{
3741 /* ASSERT current == mdev->receiver ... */
3742 struct p_handshake *p = &mdev->data.sbuf.handshake;
3743 int ok;
3744
3745 if (mutex_lock_interruptible(&mdev->data.mutex)) {
3746 dev_err(DEV, "interrupted during initial handshake\n");
3747 return 0; /* interrupted. not ok. */
3748 }
3749
3750 if (mdev->data.socket == NULL) {
3751 mutex_unlock(&mdev->data.mutex);
3752 return 0;
3753 }
3754
3755 memset(p, 0, sizeof(*p));
3756 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3757 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3758 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3759 (struct p_header *)p, sizeof(*p), 0 );
3760 mutex_unlock(&mdev->data.mutex);
3761 return ok;
3762}
3763
3764/*
3765 * return values:
3766 * 1 yes, we have a valid connection
3767 * 0 oops, did not work out, please try again
3768 * -1 peer talks different language,
3769 * no point in trying again, please go standalone.
3770 */
3771static int drbd_do_handshake(struct drbd_conf *mdev)
3772{
3773 /* ASSERT current == mdev->receiver ... */
3774 struct p_handshake *p = &mdev->data.rbuf.handshake;
3775 const int expect = sizeof(struct p_handshake)
3776 -sizeof(struct p_header);
3777 int rv;
3778
3779 rv = drbd_send_handshake(mdev);
3780 if (!rv)
3781 return 0;
3782
3783 rv = drbd_recv_header(mdev, &p->head);
3784 if (!rv)
3785 return 0;
3786
3787 if (p->head.command != P_HAND_SHAKE) {
3788 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3789 cmdname(p->head.command), p->head.command);
3790 return -1;
3791 }
3792
3793 if (p->head.length != expect) {
3794 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3795 expect, p->head.length);
3796 return -1;
3797 }
3798
3799 rv = drbd_recv(mdev, &p->head.payload, expect);
3800
3801 if (rv != expect) {
3802 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3803 return 0;
3804 }
3805
3806 p->protocol_min = be32_to_cpu(p->protocol_min);
3807 p->protocol_max = be32_to_cpu(p->protocol_max);
3808 if (p->protocol_max == 0)
3809 p->protocol_max = p->protocol_min;
3810
3811 if (PRO_VERSION_MAX < p->protocol_min ||
3812 PRO_VERSION_MIN > p->protocol_max)
3813 goto incompat;
3814
3815 mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3816
3817 dev_info(DEV, "Handshake successful: "
3818 "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3819
3820 return 1;
3821
3822 incompat:
3823 dev_err(DEV, "incompatible DRBD dialects: "
3824 "I support %d-%d, peer supports %d-%d\n",
3825 PRO_VERSION_MIN, PRO_VERSION_MAX,
3826 p->protocol_min, p->protocol_max);
3827 return -1;
3828}
3829
3830#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3831static int drbd_do_auth(struct drbd_conf *mdev)
3832{
3833 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
3834 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
3835 return 0;
3836}
3837#else
3838#define CHALLENGE_LEN 64
3839static int drbd_do_auth(struct drbd_conf *mdev)
3840{
3841 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
3842 struct scatterlist sg;
3843 char *response = NULL;
3844 char *right_response = NULL;
3845 char *peers_ch = NULL;
3846 struct p_header p;
3847 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
3848 unsigned int resp_size;
3849 struct hash_desc desc;
3850 int rv;
3851
3852 desc.tfm = mdev->cram_hmac_tfm;
3853 desc.flags = 0;
3854
3855 rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
3856 (u8 *)mdev->net_conf->shared_secret, key_len);
3857 if (rv) {
3858 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
3859 rv = 0;
3860 goto fail;
3861 }
3862
3863 get_random_bytes(my_challenge, CHALLENGE_LEN);
3864
3865 rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
3866 if (!rv)
3867 goto fail;
3868
3869 rv = drbd_recv_header(mdev, &p);
3870 if (!rv)
3871 goto fail;
3872
3873 if (p.command != P_AUTH_CHALLENGE) {
3874 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
3875 cmdname(p.command), p.command);
3876 rv = 0;
3877 goto fail;
3878 }
3879
3880 if (p.length > CHALLENGE_LEN*2) {
3881 dev_err(DEV, "expected AuthChallenge payload too big.\n");
3882 rv = 0;
3883 goto fail;
3884 }
3885
3886 peers_ch = kmalloc(p.length, GFP_NOIO);
3887 if (peers_ch == NULL) {
3888 dev_err(DEV, "kmalloc of peers_ch failed\n");
3889 rv = 0;
3890 goto fail;
3891 }
3892
3893 rv = drbd_recv(mdev, peers_ch, p.length);
3894
3895 if (rv != p.length) {
3896 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
3897 rv = 0;
3898 goto fail;
3899 }
3900
3901 resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
3902 response = kmalloc(resp_size, GFP_NOIO);
3903 if (response == NULL) {
3904 dev_err(DEV, "kmalloc of response failed\n");
3905 rv = 0;
3906 goto fail;
3907 }
3908
3909 sg_init_table(&sg, 1);
3910 sg_set_buf(&sg, peers_ch, p.length);
3911
3912 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
3913 if (rv) {
3914 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3915 rv = 0;
3916 goto fail;
3917 }
3918
3919 rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
3920 if (!rv)
3921 goto fail;
3922
3923 rv = drbd_recv_header(mdev, &p);
3924 if (!rv)
3925 goto fail;
3926
3927 if (p.command != P_AUTH_RESPONSE) {
3928 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
3929 cmdname(p.command), p.command);
3930 rv = 0;
3931 goto fail;
3932 }
3933
3934 if (p.length != resp_size) {
3935 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
3936 rv = 0;
3937 goto fail;
3938 }
3939
3940 rv = drbd_recv(mdev, response , resp_size);
3941
3942 if (rv != resp_size) {
3943 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
3944 rv = 0;
3945 goto fail;
3946 }
3947
3948 right_response = kmalloc(resp_size, GFP_NOIO);
3949 if (response == NULL) {
3950 dev_err(DEV, "kmalloc of right_response failed\n");
3951 rv = 0;
3952 goto fail;
3953 }
3954
3955 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
3956
3957 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
3958 if (rv) {
3959 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3960 rv = 0;
3961 goto fail;
3962 }
3963
3964 rv = !memcmp(response, right_response, resp_size);
3965
3966 if (rv)
3967 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
3968 resp_size, mdev->net_conf->cram_hmac_alg);
3969
3970 fail:
3971 kfree(peers_ch);
3972 kfree(response);
3973 kfree(right_response);
3974
3975 return rv;
3976}
3977#endif
3978
3979int drbdd_init(struct drbd_thread *thi)
3980{
3981 struct drbd_conf *mdev = thi->mdev;
3982 unsigned int minor = mdev_to_minor(mdev);
3983 int h;
3984
3985 sprintf(current->comm, "drbd%d_receiver", minor);
3986
3987 dev_info(DEV, "receiver (re)started\n");
3988
3989 do {
3990 h = drbd_connect(mdev);
3991 if (h == 0) {
3992 drbd_disconnect(mdev);
3993 __set_current_state(TASK_INTERRUPTIBLE);
3994 schedule_timeout(HZ);
3995 }
3996 if (h == -1) {
3997 dev_warn(DEV, "Discarding network configuration.\n");
3998 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3999 }
4000 } while (h == 0);
4001
4002 if (h > 0) {
4003 if (get_net_conf(mdev)) {
4004 drbdd(mdev);
4005 put_net_conf(mdev);
4006 }
4007 }
4008
4009 drbd_disconnect(mdev);
4010
4011 dev_info(DEV, "receiver terminated\n");
4012 return 0;
4013}
4014
4015/* ********* acknowledge sender ******** */
4016
4017static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4018{
4019 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4020
4021 int retcode = be32_to_cpu(p->retcode);
4022
4023 if (retcode >= SS_SUCCESS) {
4024 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4025 } else {
4026 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4027 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4028 drbd_set_st_err_str(retcode), retcode);
4029 }
4030 wake_up(&mdev->state_wait);
4031
4032 return TRUE;
4033}
4034
/*
 * Handle P_PING: answer with a ping ack.  The packet header itself is not
 * inspected.  Returns whatever drbd_send_ping_ack() returns.
 */
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
{
	int sent = drbd_send_ping_ack(mdev);

	return sent;
}
4040
4041static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
4042{
4043 /* restore idle timeout */
4044 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4045
4046 return TRUE;
4047}
4048
4049static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4050{
4051 struct p_block_ack *p = (struct p_block_ack *)h;
4052 sector_t sector = be64_to_cpu(p->sector);
4053 int blksize = be32_to_cpu(p->blksize);
4054
4055 D_ASSERT(mdev->agreed_pro_version >= 89);
4056
4057 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4058
4059 drbd_rs_complete_io(mdev, sector);
4060 drbd_set_in_sync(mdev, sector, blksize);
4061 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4062 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4063 dec_rs_pending(mdev);
4064
4065 return TRUE;
4066}
4067
4068/* when we receive the ACK for a write request,
4069 * verify that we actually know about it */
4070static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4071 u64 id, sector_t sector)
4072{
4073 struct hlist_head *slot = tl_hash_slot(mdev, sector);
4074 struct hlist_node *n;
4075 struct drbd_request *req;
4076
4077 hlist_for_each_entry(req, n, slot, colision) {
4078 if ((unsigned long)req == (unsigned long)id) {
4079 if (req->sector != sector) {
4080 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4081 "wrong sector (%llus versus %llus)\n", req,
4082 (unsigned long long)req->sector,
4083 (unsigned long long)sector);
4084 break;
4085 }
4086 return req;
4087 }
4088 }
4089 dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4090 (void *)(unsigned long)id, (unsigned long long)sector);
4091 return NULL;
4092}
4093
4094typedef struct drbd_request *(req_validator_fn)
4095 (struct drbd_conf *mdev, u64 id, sector_t sector);
4096
4097static int validate_req_change_req_state(struct drbd_conf *mdev,
4098 u64 id, sector_t sector, req_validator_fn validator,
4099 const char *func, enum drbd_req_event what)
4100{
4101 struct drbd_request *req;
4102 struct bio_and_error m;
4103
4104 spin_lock_irq(&mdev->req_lock);
4105 req = validator(mdev, id, sector);
4106 if (unlikely(!req)) {
4107 spin_unlock_irq(&mdev->req_lock);
4108 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4109 return FALSE;
4110 }
4111 __req_mod(req, what, &m);
4112 spin_unlock_irq(&mdev->req_lock);
4113
4114 if (m.bio)
4115 complete_master_bio(mdev, &m);
4116 return TRUE;
4117}
4118
4119static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4120{
4121 struct p_block_ack *p = (struct p_block_ack *)h;
4122 sector_t sector = be64_to_cpu(p->sector);
4123 int blksize = be32_to_cpu(p->blksize);
4124 enum drbd_req_event what;
4125
4126 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4127
4128 if (is_syncer_block_id(p->block_id)) {
4129 drbd_set_in_sync(mdev, sector, blksize);
4130 dec_rs_pending(mdev);
4131 return TRUE;
4132 }
4133 switch (be16_to_cpu(h->command)) {
4134 case P_RS_WRITE_ACK:
4135 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4136 what = write_acked_by_peer_and_sis;
4137 break;
4138 case P_WRITE_ACK:
4139 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4140 what = write_acked_by_peer;
4141 break;
4142 case P_RECV_ACK:
4143 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4144 what = recv_acked_by_peer;
4145 break;
4146 case P_DISCARD_ACK:
4147 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4148 what = conflict_discarded_by_peer;
4149 break;
4150 default:
4151 D_ASSERT(0);
4152 return FALSE;
4153 }
4154
4155 return validate_req_change_req_state(mdev, p->block_id, sector,
4156 _ack_id_to_req, __func__ , what);
4157}
4158
4159static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
4160{
4161 struct p_block_ack *p = (struct p_block_ack *)h;
4162 sector_t sector = be64_to_cpu(p->sector);
4163
4164 if (__ratelimit(&drbd_ratelimit_state))
4165 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
4166
4167 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4168
4169 if (is_syncer_block_id(p->block_id)) {
4170 int size = be32_to_cpu(p->blksize);
4171 dec_rs_pending(mdev);
4172 drbd_rs_failed_io(mdev, sector, size);
4173 return TRUE;
4174 }
4175 return validate_req_change_req_state(mdev, p->block_id, sector,
4176 _ack_id_to_req, __func__ , neg_acked);
4177}
4178
4179static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4180{
4181 struct p_block_ack *p = (struct p_block_ack *)h;
4182 sector_t sector = be64_to_cpu(p->sector);
4183
4184 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4185 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4186 (unsigned long long)sector, be32_to_cpu(p->blksize));
4187
4188 return validate_req_change_req_state(mdev, p->block_id, sector,
4189 _ar_id_to_req, __func__ , neg_acked);
4190}
4191
4192static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4193{
4194 sector_t sector;
4195 int size;
4196 struct p_block_ack *p = (struct p_block_ack *)h;
4197
4198 sector = be64_to_cpu(p->sector);
4199 size = be32_to_cpu(p->blksize);
4200 D_ASSERT(p->block_id == ID_SYNCER);
4201
4202 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4203
4204 dec_rs_pending(mdev);
4205
4206 if (get_ldev_if_state(mdev, D_FAILED)) {
4207 drbd_rs_complete_io(mdev, sector);
4208 drbd_rs_failed_io(mdev, sector, size);
4209 put_ldev(mdev);
4210 }
4211
4212 return TRUE;
4213}
4214
4215static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
4216{
4217 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4218
4219 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4220
4221 return TRUE;
4222}
4223
4224static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4225{
4226 struct p_block_ack *p = (struct p_block_ack *)h;
4227 struct drbd_work *w;
4228 sector_t sector;
4229 int size;
4230
4231 sector = be64_to_cpu(p->sector);
4232 size = be32_to_cpu(p->blksize);
4233
4234 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4235
4236 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4237 drbd_ov_oos_found(mdev, sector, size);
4238 else
4239 ov_oos_print(mdev);
4240
4241 drbd_rs_complete_io(mdev, sector);
4242 dec_rs_pending(mdev);
4243
4244 if (--mdev->ov_left == 0) {
4245 w = kmalloc(sizeof(*w), GFP_NOIO);
4246 if (w) {
4247 w->cb = w_ov_finished;
4248 drbd_queue_work_front(&mdev->data.work, w);
4249 } else {
4250 dev_err(DEV, "kmalloc(w) failed.");
4251 ov_oos_print(mdev);
4252 drbd_resync_finished(mdev);
4253 }
4254 }
4255 return TRUE;
4256}
4257
4258struct asender_cmd {
4259 size_t pkt_size;
4260 int (*process)(struct drbd_conf *mdev, struct p_header *h);
4261};
4262
4263static struct asender_cmd *get_asender_cmd(int cmd)
4264{
4265 static struct asender_cmd asender_tbl[] = {
4266 /* anything missing from this table is in
4267 * the drbd_cmd_handler (drbd_default_handler) table,
4268 * see the beginning of drbdd() */
4269 [P_PING] = { sizeof(struct p_header), got_Ping },
4270 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
4271 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4272 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4273 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4274 [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4275 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4276 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4277 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
4278 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4279 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4280 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4281 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4282 [P_MAX_CMD] = { 0, NULL },
4283 };
4284 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4285 return NULL;
4286 return &asender_tbl[cmd];
4287}
4288
4289int drbd_asender(struct drbd_thread *thi)
4290{
4291 struct drbd_conf *mdev = thi->mdev;
4292 struct p_header *h = &mdev->meta.rbuf.header;
4293 struct asender_cmd *cmd = NULL;
4294
4295 int rv, len;
4296 void *buf = h;
4297 int received = 0;
4298 int expect = sizeof(struct p_header);
4299 int empty;
4300
4301 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4302
4303 current->policy = SCHED_RR; /* Make this a realtime task! */
4304 current->rt_priority = 2; /* more important than all other tasks */
4305
4306 while (get_t_state(thi) == Running) {
4307 drbd_thread_current_set_cpu(mdev);
4308 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4309 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4310 mdev->meta.socket->sk->sk_rcvtimeo =
4311 mdev->net_conf->ping_timeo*HZ/10;
4312 }
4313
4314 /* conditionally cork;
4315 * it may hurt latency if we cork without much to send */
4316 if (!mdev->net_conf->no_cork &&
4317 3 < atomic_read(&mdev->unacked_cnt))
4318 drbd_tcp_cork(mdev->meta.socket);
4319 while (1) {
4320 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4321 flush_signals(current);
4322 if (!drbd_process_done_ee(mdev)) {
4323 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4324 goto reconnect;
4325 }
4326 /* to avoid race with newly queued ACKs */
4327 set_bit(SIGNAL_ASENDER, &mdev->flags);
4328 spin_lock_irq(&mdev->req_lock);
4329 empty = list_empty(&mdev->done_ee);
4330 spin_unlock_irq(&mdev->req_lock);
4331 /* new ack may have been queued right here,
4332 * but then there is also a signal pending,
4333 * and we start over... */
4334 if (empty)
4335 break;
4336 }
4337 /* but unconditionally uncork unless disabled */
4338 if (!mdev->net_conf->no_cork)
4339 drbd_tcp_uncork(mdev->meta.socket);
4340
4341 /* short circuit, recv_msg would return EINTR anyways. */
4342 if (signal_pending(current))
4343 continue;
4344
4345 rv = drbd_recv_short(mdev, mdev->meta.socket,
4346 buf, expect-received, 0);
4347 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4348
4349 flush_signals(current);
4350
4351 /* Note:
4352 * -EINTR (on meta) we got a signal
4353 * -EAGAIN (on meta) rcvtimeo expired
4354 * -ECONNRESET other side closed the connection
4355 * -ERESTARTSYS (on data) we got a signal
4356 * rv < 0 other than above: unexpected error!
4357 * rv == expected: full header or command
4358 * rv < expected: "woken" by signal during receive
4359 * rv == 0 : "connection shut down by peer"
4360 */
4361 if (likely(rv > 0)) {
4362 received += rv;
4363 buf += rv;
4364 } else if (rv == 0) {
4365 dev_err(DEV, "meta connection shut down by peer.\n");
4366 goto reconnect;
4367 } else if (rv == -EAGAIN) {
4368 if (mdev->meta.socket->sk->sk_rcvtimeo ==
4369 mdev->net_conf->ping_timeo*HZ/10) {
4370 dev_err(DEV, "PingAck did not arrive in time.\n");
4371 goto reconnect;
4372 }
4373 set_bit(SEND_PING, &mdev->flags);
4374 continue;
4375 } else if (rv == -EINTR) {
4376 continue;
4377 } else {
4378 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4379 goto reconnect;
4380 }
4381
4382 if (received == expect && cmd == NULL) {
4383 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4384 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
4385 (long)be32_to_cpu(h->magic),
4386 h->command, h->length);
4387 goto reconnect;
4388 }
4389 cmd = get_asender_cmd(be16_to_cpu(h->command));
4390 len = be16_to_cpu(h->length);
4391 if (unlikely(cmd == NULL)) {
4392 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
4393 (long)be32_to_cpu(h->magic),
4394 h->command, h->length);
4395 goto disconnect;
4396 }
4397 expect = cmd->pkt_size;
4398 ERR_IF(len != expect-sizeof(struct p_header))
4399 goto reconnect;
4400 }
4401 if (received == expect) {
4402 D_ASSERT(cmd != NULL);
4403 if (!cmd->process(mdev, h))
4404 goto reconnect;
4405
4406 buf = h;
4407 received = 0;
4408 expect = sizeof(struct p_header);
4409 cmd = NULL;
4410 }
4411 }
4412
4413 if (0) {
4414reconnect:
4415 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4416 }
4417 if (0) {
4418disconnect:
4419 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4420 }
4421 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4422
4423 D_ASSERT(mdev->state.conn < C_CONNECTED);
4424 dev_info(DEV, "asender terminated\n");
4425
4426 return 0;
4427}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000000..3678d3d66c6c
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1120 @@
1/*
2 drbd_req.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27
28#include <linux/slab.h>
29#include <linux/drbd.h>
30#include "drbd_int.h"
31#include "drbd_req.h"
32
33
34/* Update disk stats at start of I/O request */
35static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
36{
37 const int rw = bio_data_dir(bio);
38 int cpu;
39 cpu = part_stat_lock();
40 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
41 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
42 part_stat_unlock();
43 mdev->vdisk->part0.in_flight[rw]++;
44}
45
46/* Update disk stats when completing request upwards */
47static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
48{
49 int rw = bio_data_dir(req->master_bio);
50 unsigned long duration = jiffies - req->start_time;
51 int cpu;
52 cpu = part_stat_lock();
53 part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
54 part_round_stats(cpu, &mdev->vdisk->part0);
55 part_stat_unlock();
56 mdev->vdisk->part0.in_flight[rw]--;
57}
58
59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
60{
61 const unsigned long s = req->rq_state;
62 /* if it was a write, we may have to set the corresponding
63 * bit(s) out-of-sync first. If it had a local part, we need to
64 * release the reference to the activity log. */
65 if (rw == WRITE) {
66 /* remove it from the transfer log.
67 * well, only if it had been there in the first
68 * place... if it had not (local only or conflicting
69 * and never sent), it should still be "empty" as
70 * initialized in drbd_req_new(), so we can list_del() it
71 * here unconditionally */
72 list_del(&req->tl_requests);
73 /* Set out-of-sync unless both OK flags are set
74 * (local only or remote failed).
75 * Other places where we set out-of-sync:
76 * READ with local io-error */
77 if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
78 drbd_set_out_of_sync(mdev, req->sector, req->size);
79
80 if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
81 drbd_set_in_sync(mdev, req->sector, req->size);
82
83 /* one might be tempted to move the drbd_al_complete_io
84 * to the local io completion callback drbd_endio_pri.
85 * but, if this was a mirror write, we may only
86 * drbd_al_complete_io after this is RQ_NET_DONE,
87 * otherwise the extent could be dropped from the al
88 * before it has actually been written on the peer.
89 * if we crash before our peer knows about the request,
90 * but after the extent has been dropped from the al,
91 * we would forget to resync the corresponding extent.
92 */
93 if (s & RQ_LOCAL_MASK) {
94 if (get_ldev_if_state(mdev, D_FAILED)) {
95 drbd_al_complete_io(mdev, req->sector);
96 put_ldev(mdev);
97 } else if (__ratelimit(&drbd_ratelimit_state)) {
98 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
99 "but my Disk seems to have failed :(\n",
100 (unsigned long long) req->sector);
101 }
102 }
103 }
104
105 /* if it was a local io error, we want to notify our
106 * peer about that, and see if we need to
107 * detach the disk and stuff.
108 * to avoid allocating some special work
109 * struct, reuse the request. */
110
111 /* THINK
112 * why do we do this not when we detect the error,
113 * but delay it until it is "done", i.e. possibly
114 * until the next barrier ack? */
115
116 if (rw == WRITE &&
117 ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
118 if (!(req->w.list.next == LIST_POISON1 ||
119 list_empty(&req->w.list))) {
120 /* DEBUG ASSERT only; if this triggers, we
121 * probably corrupt the worker list here */
122 dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
123 dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
124 }
125 req->w.cb = w_io_error;
126 drbd_queue_work(&mdev->data.work, &req->w);
127 /* drbd_req_free() is done in w_io_error */
128 } else {
129 drbd_req_free(req);
130 }
131}
132
133static void queue_barrier(struct drbd_conf *mdev)
134{
135 struct drbd_tl_epoch *b;
136
137 /* We are within the req_lock. Once we queued the barrier for sending,
138 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
139 * barrier/epoch object is added. This is the only place this bit is
140 * set. It indicates that the barrier for this epoch is already queued,
141 * and no new epoch has been created yet. */
142 if (test_bit(CREATE_BARRIER, &mdev->flags))
143 return;
144
145 b = mdev->newest_tle;
146 b->w.cb = w_send_barrier;
147 /* inc_ap_pending done here, so we won't
148 * get imbalanced on connection loss.
149 * dec_ap_pending will be done in got_BarrierAck
150 * or (on connection loss) in tl_clear. */
151 inc_ap_pending(mdev);
152 drbd_queue_work(&mdev->data.work, &b->w);
153 set_bit(CREATE_BARRIER, &mdev->flags);
154}
155
156static void _about_to_complete_local_write(struct drbd_conf *mdev,
157 struct drbd_request *req)
158{
159 const unsigned long s = req->rq_state;
160 struct drbd_request *i;
161 struct drbd_epoch_entry *e;
162 struct hlist_node *n;
163 struct hlist_head *slot;
164
165 /* before we can signal completion to the upper layers,
166 * we may need to close the current epoch */
167 if (mdev->state.conn >= C_CONNECTED &&
168 req->epoch == mdev->newest_tle->br_number)
169 queue_barrier(mdev);
170
171 /* we need to do the conflict detection stuff,
172 * if we have the ee_hash (two_primaries) and
173 * this has been on the network */
174 if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
175 const sector_t sector = req->sector;
176 const int size = req->size;
177
178 /* ASSERT:
179 * there must be no conflicting requests, since
180 * they must have been failed on the spot */
181#define OVERLAPS overlaps(sector, size, i->sector, i->size)
182 slot = tl_hash_slot(mdev, sector);
183 hlist_for_each_entry(i, n, slot, colision) {
184 if (OVERLAPS) {
185 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
186 "other: %p %llus +%u\n",
187 req, (unsigned long long)sector, size,
188 i, (unsigned long long)i->sector, i->size);
189 }
190 }
191
192 /* maybe "wake" those conflicting epoch entries
193 * that wait for this request to finish.
194 *
195 * currently, there can be only _one_ such ee
196 * (well, or some more, which would be pending
197 * P_DISCARD_ACK not yet sent by the asender...),
198 * since we block the receiver thread upon the
199 * first conflict detection, which will wait on
200 * misc_wait. maybe we want to assert that?
201 *
202 * anyways, if we found one,
203 * we just have to do a wake_up. */
204#undef OVERLAPS
205#define OVERLAPS overlaps(sector, size, e->sector, e->size)
206 slot = ee_hash_slot(mdev, req->sector);
207 hlist_for_each_entry(e, n, slot, colision) {
208 if (OVERLAPS) {
209 wake_up(&mdev->misc_wait);
210 break;
211 }
212 }
213 }
214#undef OVERLAPS
215}
216
217void complete_master_bio(struct drbd_conf *mdev,
218 struct bio_and_error *m)
219{
220 bio_endio(m->bio, m->error);
221 dec_ap_bio(mdev);
222}
223
224/* Helper for __req_mod().
225 * Set m->bio to the master bio, if it is fit to be completed,
226 * or leave it alone (it is initialized to NULL in __req_mod),
227 * if it has already been completed, or cannot be completed yet.
228 * If m->bio is set, the error status to be returned is placed in m->error.
229 */
230void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
231{
232 const unsigned long s = req->rq_state;
233 struct drbd_conf *mdev = req->mdev;
234 /* only WRITES may end up here without a master bio (on barrier ack) */
235 int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
236
237 /* we must not complete the master bio, while it is
238 * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
239 * not yet acknowledged by the peer
240 * not yet completed by the local io subsystem
241 * these flags may get cleared in any order by
242 * the worker,
243 * the receiver,
244 * the bio_endio completion callbacks.
245 */
246 if (s & RQ_NET_QUEUED)
247 return;
248 if (s & RQ_NET_PENDING)
249 return;
250 if (s & RQ_LOCAL_PENDING)
251 return;
252
253 if (req->master_bio) {
254 /* this is data_received (remote read)
255 * or protocol C P_WRITE_ACK
256 * or protocol B P_RECV_ACK
257 * or protocol A "handed_over_to_network" (SendAck)
258 * or canceled or failed,
259 * or killed from the transfer log due to connection loss.
260 */
261
262 /*
263 * figure out whether to report success or failure.
264 *
265 * report success when at least one of the operations succeeded.
266 * or, to put the other way,
267 * only report failure, when both operations failed.
268 *
269 * what to do about the failures is handled elsewhere.
270 * what we need to do here is just: complete the master_bio.
271 *
272 * local completion error, if any, has been stored as ERR_PTR
273 * in private_bio within drbd_endio_pri.
274 */
275 int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
276 int error = PTR_ERR(req->private_bio);
277
278 /* remove the request from the conflict detection
279 * respective block_id verification hash */
280 if (!hlist_unhashed(&req->colision))
281 hlist_del(&req->colision);
282 else
283 D_ASSERT((s & RQ_NET_MASK) == 0);
284
285 /* for writes we need to do some extra housekeeping */
286 if (rw == WRITE)
287 _about_to_complete_local_write(mdev, req);
288
289 /* Update disk stats */
290 _drbd_end_io_acct(mdev, req);
291
292 m->error = ok ? 0 : (error ?: -EIO);
293 m->bio = req->master_bio;
294 req->master_bio = NULL;
295 }
296
297 if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
298 /* this is disconnected (local only) operation,
299 * or protocol C P_WRITE_ACK,
300 * or protocol A or B P_BARRIER_ACK,
301 * or killed from the transfer log due to connection loss. */
302 _req_is_done(mdev, req, rw);
303 }
304 /* else: network part and not DONE yet. that is
305 * protocol A or B, barrier ack still pending... */
306}
307
308/*
309 * checks whether there was an overlapping request
310 * or ee already registered.
311 *
312 * if so, return 1, in which case this request is completed on the spot,
313 * without ever being submitted or send.
314 *
315 * return 0 if it is ok to submit this request.
316 *
317 * NOTE:
318 * paranoia: assume something above us is broken, and issues different write
319 * requests for the same block simultaneously...
320 *
321 * To ensure these won't be reordered differently on both nodes, resulting in
322 * diverging data sets, we discard the later one(s). Not that this is supposed
323 * to happen, but this is the rationale why we also have to check for
324 * conflicting requests with local origin, and why we have to do so regardless
325 * of whether we allowed multiple primaries.
326 *
327 * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
328 * second hlist_for_each_entry becomes a noop. This is even simpler than to
329 * grab a reference on the net_conf, and check for the two_primaries flag...
330 */
331static int _req_conflicts(struct drbd_request *req)
332{
333 struct drbd_conf *mdev = req->mdev;
334 const sector_t sector = req->sector;
335 const int size = req->size;
336 struct drbd_request *i;
337 struct drbd_epoch_entry *e;
338 struct hlist_node *n;
339 struct hlist_head *slot;
340
341 D_ASSERT(hlist_unhashed(&req->colision));
342
343 if (!get_net_conf(mdev))
344 return 0;
345
346 /* BUG_ON */
347 ERR_IF (mdev->tl_hash_s == 0)
348 goto out_no_conflict;
349 BUG_ON(mdev->tl_hash == NULL);
350
351#define OVERLAPS overlaps(i->sector, i->size, sector, size)
352 slot = tl_hash_slot(mdev, sector);
353 hlist_for_each_entry(i, n, slot, colision) {
354 if (OVERLAPS) {
355 dev_alert(DEV, "%s[%u] Concurrent local write detected! "
356 "[DISCARD L] new: %llus +%u; "
357 "pending: %llus +%u\n",
358 current->comm, current->pid,
359 (unsigned long long)sector, size,
360 (unsigned long long)i->sector, i->size);
361 goto out_conflict;
362 }
363 }
364
365 if (mdev->ee_hash_s) {
366 /* now, check for overlapping requests with remote origin */
367 BUG_ON(mdev->ee_hash == NULL);
368#undef OVERLAPS
369#define OVERLAPS overlaps(e->sector, e->size, sector, size)
370 slot = ee_hash_slot(mdev, sector);
371 hlist_for_each_entry(e, n, slot, colision) {
372 if (OVERLAPS) {
373 dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
374 " [DISCARD L] new: %llus +%u; "
375 "pending: %llus +%u\n",
376 current->comm, current->pid,
377 (unsigned long long)sector, size,
378 (unsigned long long)e->sector, e->size);
379 goto out_conflict;
380 }
381 }
382 }
383#undef OVERLAPS
384
385out_no_conflict:
386 /* this is like it should be, and what we expected.
387 * our users do behave after all... */
388 put_net_conf(mdev);
389 return 0;
390
391out_conflict:
392 put_net_conf(mdev);
393 return 1;
394}
395
396/* obviously this could be coded as many single functions
397 * instead of one huge switch,
398 * or by putting the code directly in the respective locations
399 * (as it has been before).
400 *
401 * but having it this way
402 * enforces that it is all in this one place, where it is easier to audit,
403 * it makes it obvious that whatever "event" "happens" to a request should
404 * happen "atomically" within the req_lock,
405 * and it enforces that we have to think in a very structured manner
406 * about the "events" that may happen to a request during its life time ...
407 */
408void __req_mod(struct drbd_request *req, enum drbd_req_event what,
409 struct bio_and_error *m)
410{
411 struct drbd_conf *mdev = req->mdev;
412 m->bio = NULL;
413
414 switch (what) {
415 default:
416 dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
417 break;
418
419 /* does not happen...
420 * initialization done in drbd_req_new
421 case created:
422 break;
423 */
424
425 case to_be_send: /* via network */
426 /* reached via drbd_make_request_common
427 * and from w_read_retry_remote */
428 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
429 req->rq_state |= RQ_NET_PENDING;
430 inc_ap_pending(mdev);
431 break;
432
433 case to_be_submitted: /* locally */
434 /* reached via drbd_make_request_common */
435 D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
436 req->rq_state |= RQ_LOCAL_PENDING;
437 break;
438
439 case completed_ok:
440 if (bio_data_dir(req->master_bio) == WRITE)
441 mdev->writ_cnt += req->size>>9;
442 else
443 mdev->read_cnt += req->size>>9;
444
445 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
446 req->rq_state &= ~RQ_LOCAL_PENDING;
447
448 _req_may_be_done(req, m);
449 put_ldev(mdev);
450 break;
451
452 case write_completed_with_error:
453 req->rq_state |= RQ_LOCAL_COMPLETED;
454 req->rq_state &= ~RQ_LOCAL_PENDING;
455
456 dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
457 (unsigned long long)req->sector, req->size);
458 /* and now: check how to handle local io error. */
459 __drbd_chk_io_error(mdev, FALSE);
460 _req_may_be_done(req, m);
461 put_ldev(mdev);
462 break;
463
464 case read_ahead_completed_with_error:
465 /* it is legal to fail READA */
466 req->rq_state |= RQ_LOCAL_COMPLETED;
467 req->rq_state &= ~RQ_LOCAL_PENDING;
468 _req_may_be_done(req, m);
469 put_ldev(mdev);
470 break;
471
472 case read_completed_with_error:
473 drbd_set_out_of_sync(mdev, req->sector, req->size);
474
475 req->rq_state |= RQ_LOCAL_COMPLETED;
476 req->rq_state &= ~RQ_LOCAL_PENDING;
477
478 dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
479 (unsigned long long)req->sector, req->size);
480 /* _req_mod(req,to_be_send); oops, recursion... */
481 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
482 req->rq_state |= RQ_NET_PENDING;
483 inc_ap_pending(mdev);
484
485 __drbd_chk_io_error(mdev, FALSE);
486 put_ldev(mdev);
487 /* NOTE: if we have no connection,
488 * or know the peer has no good data either,
489 * then we don't actually need to "queue_for_net_read",
490 * but we do so anyways, since the drbd_io_error()
491 * and the potential state change to "Diskless"
492 * needs to be done from process context */
493
494 /* fall through: _req_mod(req,queue_for_net_read); */
495
496 case queue_for_net_read:
497 /* READ or READA, and
498 * no local disk,
499 * or target area marked as invalid,
500 * or just got an io-error. */
501 /* from drbd_make_request_common
502 * or from bio_endio during read io-error recovery */
503
504 /* so we can verify the handle in the answer packet
505 * corresponding hlist_del is in _req_may_be_done() */
506 hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
507
508 set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */
509
510 D_ASSERT(req->rq_state & RQ_NET_PENDING);
511 req->rq_state |= RQ_NET_QUEUED;
512 req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
513 ? w_read_retry_remote
514 : w_send_read_req;
515 drbd_queue_work(&mdev->data.work, &req->w);
516 break;
517
518 case queue_for_net_write:
519 /* assert something? */
520 /* from drbd_make_request_common only */
521
522 hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
523 /* corresponding hlist_del is in _req_may_be_done() */
524
525 /* NOTE
526 * In case the req ended up on the transfer log before being
527 * queued on the worker, it could lead to this request being
528 * missed during cleanup after connection loss.
529 * So we have to do both operations here,
530 * within the same lock that protects the transfer log.
531 *
532 * _req_add_to_epoch(req); this has to be after the
533 * _maybe_start_new_epoch(req); which happened in
534 * drbd_make_request_common, because we now may set the bit
535 * again ourselves to close the current epoch.
536 *
537 * Add req to the (now) current epoch (barrier). */
538
539 /* see drbd_make_request_common,
540 * just after it grabs the req_lock */
541 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
542
543 req->epoch = mdev->newest_tle->br_number;
544 list_add_tail(&req->tl_requests,
545 &mdev->newest_tle->requests);
546
547 /* increment size of current epoch */
548 mdev->newest_tle->n_req++;
549
550 /* queue work item to send data */
551 D_ASSERT(req->rq_state & RQ_NET_PENDING);
552 req->rq_state |= RQ_NET_QUEUED;
553 req->w.cb = w_send_dblock;
554 drbd_queue_work(&mdev->data.work, &req->w);
555
556 /* close the epoch, in case it outgrew the limit */
557 if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
558 queue_barrier(mdev);
559
560 break;
561
562 case send_canceled:
563 /* treat it the same */
564 case send_failed:
565 /* real cleanup will be done from tl_clear. just update flags
566 * so it is no longer marked as on the worker queue */
567 req->rq_state &= ~RQ_NET_QUEUED;
568 /* if we did it right, tl_clear should be scheduled only after
569 * this, so this should not be necessary! */
570 _req_may_be_done(req, m);
571 break;
572
573 case handed_over_to_network:
574 /* assert something? */
575 if (bio_data_dir(req->master_bio) == WRITE &&
576 mdev->net_conf->wire_protocol == DRBD_PROT_A) {
577 /* this is what is dangerous about protocol A:
578 * pretend it was successfully written on the peer. */
579 if (req->rq_state & RQ_NET_PENDING) {
580 dec_ap_pending(mdev);
581 req->rq_state &= ~RQ_NET_PENDING;
582 req->rq_state |= RQ_NET_OK;
583 } /* else: neg-ack was faster... */
584 /* it is still not yet RQ_NET_DONE until the
585 * corresponding epoch barrier got acked as well,
586 * so we know what to dirty on connection loss */
587 }
588 req->rq_state &= ~RQ_NET_QUEUED;
589 req->rq_state |= RQ_NET_SENT;
590 /* because _drbd_send_zc_bio could sleep, and may want to
591 * dereference the bio even after the "write_acked_by_peer" and
592 * "completed_ok" events came in, once we return from
593 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
594 * whether it is done already, and end it. */
595 _req_may_be_done(req, m);
596 break;
597
598 case connection_lost_while_pending:
599 /* transfer log cleanup after connection loss */
600 /* assert something? */
601 if (req->rq_state & RQ_NET_PENDING)
602 dec_ap_pending(mdev);
603 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
604 req->rq_state |= RQ_NET_DONE;
605 /* if it is still queued, we may not complete it here.
606 * it will be canceled soon. */
607 if (!(req->rq_state & RQ_NET_QUEUED))
608 _req_may_be_done(req, m);
609 break;
610
611 case write_acked_by_peer_and_sis:
612 req->rq_state |= RQ_NET_SIS;
613 case conflict_discarded_by_peer:
614 /* for discarded conflicting writes of multiple primaries,
615 * there is no need to keep anything in the tl, potential
616 * node crashes are covered by the activity log. */
617 if (what == conflict_discarded_by_peer)
618 dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
619 " DRBD is not a random data generator!\n",
620 (unsigned long long)req->sector, req->size);
621 req->rq_state |= RQ_NET_DONE;
622 /* fall through */
623 case write_acked_by_peer:
624 /* protocol C; successfully written on peer.
625 * Nothing to do here.
626 * We want to keep the tl in place for all protocols, to cater
627 * for volatile write-back caches on lower level devices.
628 *
629 * A barrier request is expected to have forced all prior
630 * requests onto stable storage, so completion of a barrier
631 * request could set NET_DONE right here, and not wait for the
632 * P_BARRIER_ACK, but that is an unnecessary optimization. */
633
634 /* this makes it effectively the same as for: */
635 case recv_acked_by_peer:
636 /* protocol B; pretends to be successfully written on peer.
637 * see also notes above in handed_over_to_network about
638 * protocol != C */
639 req->rq_state |= RQ_NET_OK;
640 D_ASSERT(req->rq_state & RQ_NET_PENDING);
641 dec_ap_pending(mdev);
642 req->rq_state &= ~RQ_NET_PENDING;
643 _req_may_be_done(req, m);
644 break;
645
646 case neg_acked:
647 /* assert something? */
648 if (req->rq_state & RQ_NET_PENDING)
649 dec_ap_pending(mdev);
650 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
651
652 req->rq_state |= RQ_NET_DONE;
653 _req_may_be_done(req, m);
654 /* else: done by handed_over_to_network */
655 break;
656
657 case barrier_acked:
658 if (req->rq_state & RQ_NET_PENDING) {
659 /* barrier came in before all requests have been acked.
660 * this is bad, because if the connection is lost now,
661 * we won't be able to clean them up... */
662 dev_err(DEV, "FIXME (barrier_acked but pending)\n");
663 list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
664 }
665 D_ASSERT(req->rq_state & RQ_NET_SENT);
666 req->rq_state |= RQ_NET_DONE;
667 _req_may_be_done(req, m);
668 break;
669
670 case data_received:
671 D_ASSERT(req->rq_state & RQ_NET_PENDING);
672 dec_ap_pending(mdev);
673 req->rq_state &= ~RQ_NET_PENDING;
674 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
675 _req_may_be_done(req, m);
676 break;
677 };
678}
679
680/* we may do a local read if:
681 * - we are consistent (of course),
682 * - or we are generally inconsistent,
683 * BUT we are still/already IN SYNC for this area.
684 * since size may be bigger than BM_BLOCK_SIZE,
685 * we may need to check several bits.
686 */
687static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
688{
689 unsigned long sbnr, ebnr;
690 sector_t esector, nr_sectors;
691
692 if (mdev->state.disk == D_UP_TO_DATE)
693 return 1;
694 if (mdev->state.disk >= D_OUTDATED)
695 return 0;
696 if (mdev->state.disk < D_INCONSISTENT)
697 return 0;
698 /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
699 nr_sectors = drbd_get_capacity(mdev->this_bdev);
700 esector = sector + (size >> 9) - 1;
701
702 D_ASSERT(sector < nr_sectors);
703 D_ASSERT(esector < nr_sectors);
704
705 sbnr = BM_SECT_TO_BIT(sector);
706 ebnr = BM_SECT_TO_BIT(esector);
707
708 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
709}
710
/*
 * drbd_make_request_common() - route one application bio to disk and/or peer
 * @mdev: device the bio was submitted to
 * @bio:  the master bio handed in by the caller
 *
 * Allocates a drbd_request for @bio, decides whether it is to be served by
 * the local backing device ("local"), by the peer ("remote"), or both,
 * registers the request under req_lock and finally submits the private bio
 * to the backing device.
 *
 * The allocation-failure, conflict and fail_* paths all complete the bio via
 * bio_endio() and call dec_ap_bio(); drbd_make_request_26() takes that
 * reference with inc_ap_bio() before calling here.
 *
 * Always returns 0; errors are reported through bio_endio().
 */
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
{
	const int rw = bio_rw(bio);
	const int size = bio->bi_size;
	const sector_t sector = bio->bi_sector;
	/* spare epoch object, allocated only if a new barrier may be needed */
	struct drbd_tl_epoch *b = NULL;
	struct drbd_request *req;
	/* local: we hold a get_ldev() reference and will submit private_bio;
	 * remote: the request is to be queued for the peer */
	int local, remote;
	int err = -EIO;

	/* allocate outside of all locks; */
	req = drbd_req_new(mdev, bio);
	if (!req) {
		dec_ap_bio(mdev);
		/* only pass the error to the upper layers.
		 * if user cannot handle io errors, that's not our business. */
		dev_err(DEV, "could not kmalloc() req\n");
		bio_endio(bio, -ENOMEM);
		return 0;
	}

	local = get_ldev(mdev);
	if (!local) {
		bio_put(req->private_bio); /* or we get a bio leak */
		req->private_bio = NULL;
	}
	if (rw == WRITE) {
		remote = 1;
	} else {
		/* READ || READA */
		if (local) {
			if (!drbd_may_do_local_read(mdev, sector, size)) {
				/* we could kick the syncer to
				 * sync this extent asap, wait for
				 * it, then continue locally.
				 * Or just issue the request remotely.
				 */
				local = 0;
				bio_put(req->private_bio);
				req->private_bio = NULL;
				put_ldev(mdev);
			}
		}
		/* reads go remote only if they cannot be served locally */
		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
	}

	/* If we have a disk, but a READA request is mapped to remote,
	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
	 * Just fail that READA request right here.
	 *
	 * THINK: maybe fail all READA when not local?
	 *        or make this configurable...
	 *        if network is slow, READA won't do any good.
	 */
	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
		err = -EWOULDBLOCK;
		goto fail_and_free_req;
	}

	/* For WRITES going to the local disk, grab a reference on the target
	 * extent.  This waits for any resync activity in the corresponding
	 * resync extent to finish, and, if necessary, pulls in the target
	 * extent into the activity log, which involves further disk io because
	 * of transactional on-disk meta data updates. */
	if (rw == WRITE && local)
		drbd_al_begin_io(mdev, sector);

	/* first, unlocked look at the peer state; repeated under req_lock below */
	remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
			    (mdev->state.pdsk == D_INCONSISTENT &&
			     mdev->state.conn >= C_CONNECTED));

	if (!(local || remote)) {
		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
		goto fail_free_complete;
	}

	/* For WRITE request, we have to make sure that we have an
	 * unused_spare_tle, in case we need to start a new epoch.
	 * I try to be smart and avoid to pre-allocate always "just in case",
	 * but there is a race between testing the bit and pointer outside the
	 * spinlock, and grabbing the spinlock.
	 * if we lost that race, we retry. */
	if (rw == WRITE && remote &&
	    mdev->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->flags)) {
		/* also entered by goto from below, after req_lock was dropped */
allocate_barrier:
		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
		if (!b) {
			dev_err(DEV, "Failed to alloc barrier.\n");
			err = -ENOMEM;
			goto fail_free_complete;
		}
	}

	/* GOOD, everything prepared, grab the spin_lock */
	spin_lock_irq(&mdev->req_lock);

	if (remote) {
		/* same peer-state test as above, now under req_lock */
		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
			    (mdev->state.pdsk == D_INCONSISTENT &&
			     mdev->state.conn >= C_CONNECTED));
		if (!remote)
			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
		if (!(local || remote)) {
			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
			spin_unlock_irq(&mdev->req_lock);
			goto fail_free_complete;
		}
	}

	/* hand our pre-allocated epoch over, unless a spare is already there;
	 * in that case b stays set and is kfree()d further down */
	if (b && mdev->unused_spare_tle == NULL) {
		mdev->unused_spare_tle = b;
		b = NULL;
	}
	if (rw == WRITE && remote &&
	    mdev->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->flags)) {
		/* someone closed the current epoch
		 * while we were grabbing the spinlock */
		spin_unlock_irq(&mdev->req_lock);
		goto allocate_barrier;
	}


	/* Update disk stats */
	_drbd_start_io_acct(mdev, req, bio);

	/* _maybe_start_new_epoch(mdev);
	 * If we need to generate a write barrier packet, we have to add the
	 * new epoch (barrier) object, and queue the barrier packet for sending,
	 * and queue the req's data after it _within the same lock_, otherwise
	 * we have race conditions where the reorder domains could be mixed up.
	 *
	 * Even read requests may start a new epoch and queue the corresponding
	 * barrier packet.  To get the write ordering right, we only have to
	 * make sure that, if this is a write request and it triggered a
	 * barrier packet, this request is queued within the same spinlock. */
	if (remote && mdev->unused_spare_tle &&
	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, mdev->unused_spare_tle);
		mdev->unused_spare_tle = NULL;
	} else {
		D_ASSERT(!(remote && rw == WRITE &&
			   test_bit(CREATE_BARRIER, &mdev->flags)));
	}

	/* NOTE
	 * Actually, 'local' may be wrong here already, since we may have failed
	 * to write to the meta data, and may become wrong anytime because of
	 * local io-error for some other request, which would lead to us
	 * "detaching" the local disk.
	 *
	 * 'remote' may become wrong any time because the network could fail.
	 *
	 * This is a harmless race condition, though, since it is handled
	 * correctly at the appropriate places; so it just defers the failure
	 * of the respective operation.
	 */

	/* mark them early for readability.
	 * this just sets some state flags. */
	if (remote)
		_req_mod(req, to_be_send);
	if (local)
		_req_mod(req, to_be_submitted);

	/* check this request on the collision detection hash tables.
	 * if we have a conflict, just complete it here.
	 * THINK do we want to check reads, too? (I don't think so...) */
	if (rw == WRITE && _req_conflicts(req)) {
		/* this is a conflicting request.
		 * even though it may have been only _partially_
		 * overlapping with one of the currently pending requests,
		 * without even submitting or sending it, we will
		 * pretend that it was successfully served right now.
		 */
		if (local) {
			bio_put(req->private_bio);
			req->private_bio = NULL;
			drbd_al_complete_io(mdev, req->sector);
			put_ldev(mdev);
			local = 0;
		}
		if (remote)
			dec_ap_pending(mdev);
		_drbd_end_io_acct(mdev, req);
		/* THINK: do we want to fail it (-EIO), or pretend success? */
		bio_endio(req->master_bio, 0);
		req->master_bio = NULL;
		dec_ap_bio(mdev);
		drbd_req_free(req);
		/* req is gone: with local and remote both cleared, nothing
		 * below dereferences it any more */
		remote = 0;
	}

	/* NOTE remote first: to get the concurrent write detection right,
	 * we must register the request before start of local IO.  */
	if (remote) {
		/* either WRITE and C_CONNECTED,
		 * or READ, and no local disk,
		 * or READ, but not in sync.
		 */
		_req_mod(req, (rw == WRITE)
				? queue_for_net_write
				: queue_for_net_read);
	}
	spin_unlock_irq(&mdev->req_lock);
	kfree(b); /* if someone else has beaten us to it... */

	if (local) {
		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;

		/* fault injection: complete the private bio with -EIO
		 * instead of submitting it */
		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
				     : rw == READ  ? DRBD_FAULT_DT_RD
				     :               DRBD_FAULT_DT_RA))
			bio_endio(req->private_bio, -EIO);
		else
			generic_make_request(req->private_bio);
	}

	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
	 * we plug after submit, so we won't miss an unplug event */
	drbd_plug_device(mdev);

	return 0;

	/* undo drbd_al_begin_io(), then fall through to the common cleanup */
fail_free_complete:
	if (rw == WRITE && local)
		drbd_al_complete_io(mdev, sector);
	/* drop private bio and ldev reference (if held), fail the master bio
	 * with err, free the request and any unused spare epoch */
fail_and_free_req:
	if (local) {
		bio_put(req->private_bio);
		req->private_bio = NULL;
		put_ldev(mdev);
	}
	bio_endio(bio, err);
	drbd_req_free(req);
	dec_ap_bio(mdev);
	kfree(b);

	return 0;
}
952
953/* helper function for drbd_make_request
954 * if we can determine just by the mdev (state) that this request will fail,
955 * return 1
956 * otherwise return 0
957 */
958static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
959{
960 /* Unconfigured */
961 if (mdev->state.conn == C_DISCONNECTING &&
962 mdev->state.disk == D_DISKLESS)
963 return 1;
964
965 if (mdev->state.role != R_PRIMARY &&
966 (!allow_oos || is_write)) {
967 if (__ratelimit(&drbd_ratelimit_state)) {
968 dev_err(DEV, "Process %s[%u] tried to %s; "
969 "since we are not in Primary state, "
970 "we cannot allow this\n",
971 current->comm, current->pid,
972 is_write ? "WRITE" : "READ");
973 }
974 return 1;
975 }
976
977 /*
978 * Paranoia: we might have been primary, but sync target, or
979 * even diskless, then lost the connection.
980 * This should have been handled (panic? suspend?) somewhere
981 * else. But maybe it was not, so check again here.
982 * Caution: as long as we do not have a read/write lock on mdev,
983 * to serialize state changes, this is racy, since we may lose
984 * the connection *after* we test for the cstate.
985 */
986 if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
987 if (__ratelimit(&drbd_ratelimit_state))
988 dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
989 return 1;
990 }
991
992 return 0;
993}
994
995int drbd_make_request_26(struct request_queue *q, struct bio *bio)
996{
997 unsigned int s_enr, e_enr;
998 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
999
1000 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
1001 bio_endio(bio, -EPERM);
1002 return 0;
1003 }
1004
1005 /* Reject barrier requests if we know the underlying device does
1006 * not support them.
1007 * XXX: Need to get this info from peer as well some how so we
1008 * XXX: reject if EITHER side/data/metadata area does not support them.
1009 *
1010 * because of those XXX, this is not yet enabled,
1011 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
1012 */
1013 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
1014 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1015 bio_endio(bio, -EOPNOTSUPP);
1016 return 0;
1017 }
1018
1019 /*
1020 * what we "blindly" assume:
1021 */
1022 D_ASSERT(bio->bi_size > 0);
1023 D_ASSERT((bio->bi_size & 0x1ff) == 0);
1024 D_ASSERT(bio->bi_idx == 0);
1025
1026 /* to make some things easier, force alignment of requests within the
1027 * granularity of our hash tables */
1028 s_enr = bio->bi_sector >> HT_SHIFT;
1029 e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
1030
1031 if (likely(s_enr == e_enr)) {
1032 inc_ap_bio(mdev, 1);
1033 return drbd_make_request_common(mdev, bio);
1034 }
1035
1036 /* can this bio be split generically?
1037 * Maybe add our own split-arbitrary-bios function. */
1038 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
1039 /* rather error out here than BUG in bio_split */
1040 dev_err(DEV, "bio would need to, but cannot, be split: "
1041 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
1042 bio->bi_vcnt, bio->bi_idx, bio->bi_size,
1043 (unsigned long long)bio->bi_sector);
1044 bio_endio(bio, -EINVAL);
1045 } else {
1046 /* This bio crosses some boundary, so we have to split it. */
1047 struct bio_pair *bp;
1048 /* works for the "do not cross hash slot boundaries" case
1049 * e.g. sector 262269, size 4096
1050 * s_enr = 262269 >> 6 = 4097
1051 * e_enr = (262269+8-1) >> 6 = 4098
1052 * HT_SHIFT = 6
1053 * sps = 64, mask = 63
1054 * first_sectors = 64 - (262269 & 63) = 3
1055 */
1056 const sector_t sect = bio->bi_sector;
1057 const int sps = 1 << HT_SHIFT; /* sectors per slot */
1058 const int mask = sps - 1;
1059 const sector_t first_sectors = sps - (sect & mask);
1060 bp = bio_split(bio,
1061#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
1062 bio_split_pool,
1063#endif
1064 first_sectors);
1065
1066 /* we need to get a "reference count" (ap_bio_cnt)
1067 * to avoid races with the disconnect/reconnect/suspend code.
1068 * In case we need to split the bio here, we need to get two references
1069 * atomically, otherwise we might deadlock when trying to submit the
1070 * second one! */
1071 inc_ap_bio(mdev, 2);
1072
1073 D_ASSERT(e_enr == s_enr + 1);
1074
1075 drbd_make_request_common(mdev, &bp->bio1);
1076 drbd_make_request_common(mdev, &bp->bio2);
1077 bio_pair_release(bp);
1078 }
1079 return 0;
1080}
1081
1082/* This is called by bio_add_page(). With this function we reduce
1083 * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
1084 * units (was AL_EXTENTs).
1085 *
1086 * we do the calculation within the lower 32bit of the byte offsets,
1087 * since we don't care for actual offset, but only check whether it
1088 * would cross "activity log extent" boundaries.
1089 *
1090 * As long as the BIO is empty we have to allow at least one bvec,
1091 * regardless of size and offset. so the resulting bio may still
1092 * cross extent boundaries. those are dealt with (bio_split) in
1093 * drbd_make_request_26.
1094 */
1095int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1096{
1097 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1098 unsigned int bio_offset =
1099 (unsigned int)bvm->bi_sector << 9; /* 32 bit */
1100 unsigned int bio_size = bvm->bi_size;
1101 int limit, backing_limit;
1102
1103 limit = DRBD_MAX_SEGMENT_SIZE
1104 - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
1105 if (limit < 0)
1106 limit = 0;
1107 if (bio_size == 0) {
1108 if (limit <= bvec->bv_len)
1109 limit = bvec->bv_len;
1110 } else if (limit && get_ldev(mdev)) {
1111 struct request_queue * const b =
1112 mdev->ldev->backing_bdev->bd_disk->queue;
1113 if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
1114 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1115 limit = min(limit, backing_limit);
1116 }
1117 put_ldev(mdev);
1118 }
1119 return limit;
1120}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000000..f22c1bc8ec7e
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,326 @@
1/*
2 drbd_req.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
8 Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9
10 DRBD is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 DRBD is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_REQ_H
26#define _DRBD_REQ_H
27
28#include <linux/module.h>
29
30#include <linux/slab.h>
31#include <linux/drbd.h>
32#include "drbd_int.h"
33#include "drbd_wrappers.h"
34
35/* The request callbacks will be called in irq context by the IDE drivers,
36 and in Softirqs/Tasklets/BH context by the SCSI drivers,
37 and by the receiver and worker in kernel-thread context.
38 Try to get the locking right :) */
39
40/*
41 * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
42 * associated with IO requests originating from the block layer above us.
43 *
44 * There are quite a few things that may happen to a drbd request
45 * during its lifetime.
46 *
47 * It will be created.
48 * It will be marked with the intention to be
49 * submitted to local disk and/or
50 * sent via the network.
51 *
52 * It has to be placed on the transfer log and other housekeeping lists,
53 * In case we have a network connection.
54 *
55 * It may be identified as a concurrent (write) request
56 * and be handled accordingly.
57 *
58 * It may be handed over to the local disk subsystem.
59 * It may be completed by the local disk subsystem,
60 * either successfully or with io-error.
61 * In case it is a READ request, and it failed locally,
62 * it may be retried remotely.
63 *
64 * It may be queued for sending.
65 * It may be handed over to the network stack,
66 * which may fail.
67 * It may be acknowledged by the "peer" according to the wire_protocol in use.
68 * this may be a negative ack.
69 * It may receive a faked ack when the network connection is lost and the
70 * transfer log is cleaned up.
71 * Sending may be canceled due to network connection loss.
72 * When it finally has outlived its time,
73 * corresponding dirty bits in the resync-bitmap may be cleared or set,
74 * it will be destroyed,
75 * and completion will be signalled to the originator,
76 * with or without "success".
77 */
78
/*
 * What may happen to a struct drbd_request; passed as the "what" argument
 * to __req_mod(), _req_mod() and req_mod().
 */
enum drbd_req_event {
	created,

	/* intentions, marked before anything is queued or submitted */
	to_be_send,
	to_be_submitted,

	/* XXX yes, now I am inconsistent...
	 * the following two are not "events" but "actions"
	 * oh, well... */
	queue_for_net_write,
	queue_for_net_read,

	/* network side */
	send_canceled,
	send_failed,
	handed_over_to_network,
	connection_lost_while_pending,
	recv_acked_by_peer,
	write_acked_by_peer,
	write_acked_by_peer_and_sis,	/* and set_in_sync */
	conflict_discarded_by_peer,
	neg_acked,
	barrier_acked,			/* in protocol A and B */
	data_received,			/* (remote read) */

	/* local disk side */
	read_completed_with_error,
	read_ahead_completed_with_error,
	write_completed_with_error,
	completed_ok,

	nothing,			/* for tracing only */
};
108
/*
 * Bit numbers of the request state flags, and the RQ_* masks built from
 * them.  We don't actually need that many bits.  No atomic bit operations
 * are needed either: most of the time the connection state has to be looked
 * at and/or some lists have to be manipulated at the same time, so the
 * request lock should be held anyways.
 */
enum drbd_req_state_bits {
	/*
	 * Local disk part, bits 2..0:
	 *
	 *   210
	 *   000: no local possible
	 *   001: to be submitted
	 *        UNUSED, we could map: 011: submitted, completion still pending
	 *   110: completed ok
	 *   010: completed with error
	 */
	__RQ_LOCAL_PENDING,
	__RQ_LOCAL_COMPLETED,
	__RQ_LOCAL_OK,

	/*
	 * Network part:
	 *
	 *   76543
	 *   00000: no network possible
	 *   00001: to be send
	 *   00011: to be send, on worker queue
	 *   00101: sent, expecting recv_ack (B) or write_ack (C)
	 *   11101: sent,
	 *          recv_ack (B) or implicit "ack" (A),
	 *          still waiting for the barrier ack.
	 *          master_bio may already be completed and invalidated.
	 *   11100: write_acked (C),
	 *          data_received (for remote read, any protocol)
	 *          or finally the barrier ack has arrived (B,A)...
	 *          request can be freed
	 *   01100: neg-acked (write, protocol C)
	 *          or neg-d-acked (read, any protocol)
	 *          or killed from the transfer log
	 *          during cleanup after connection loss
	 *          request can be freed
	 *   01000: canceled or send failed...
	 *          request can be freed
	 *
	 * NOTE(review): the columns of this table do not obviously line up
	 * with the order of the enumerators below (PENDING is the lowest
	 * network bit, OK and SIS the highest) - confirm before relying
	 * on the bit patterns.
	 */

	/* as long as "SENT" is not set, this can still fail or be canceled.
	 * once "SENT" is set, we still wait for an Ack packet.
	 * when cleared, the master_bio may be completed.
	 * in (B,A) the request object may still linger on the transaction log
	 * until the corresponding barrier ack comes in */
	__RQ_NET_PENDING,

	/* If it is QUEUED, and it is a WRITE, it is also registered in the
	 * transfer log.  Currently this flag is needed to avoid conflicts
	 * between the worker canceling the request and tl_clear_barrier
	 * killing it from the transfer log.  The code should be restructured
	 * so this conflict does no longer occur. */
	__RQ_NET_QUEUED,

	/* well, actually only "handed over to the network stack".
	 *
	 * TODO can potentially be dropped because of the similar meaning
	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
	 * however it is not exactly the same.  before we drop it
	 * we must ensure that we can tell a request with network part
	 * from a request without, regardless of what happens to it. */
	__RQ_NET_SENT,

	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
	 * basically this means the corresponding P_BARRIER_ACK was received */
	__RQ_NET_DONE,

	/* whether or not we know (C) or pretend (B,A) that the write
	 * was successfully written on the peer. */
	__RQ_NET_OK,

	/* peer called drbd_set_in_sync() for this write */
	__RQ_NET_SIS,

	/* keep this last, its for the RQ_NET_MASK */
	__RQ_NET_MAX,
};

/* flag values for the local disk part */
#define RQ_LOCAL_PENDING	(1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED	(1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK		(1UL << __RQ_LOCAL_OK)

/* all local bits: 0x07 */
#define RQ_LOCAL_MASK		((RQ_LOCAL_OK << 1)-1)

/* flag values for the network part */
#define RQ_NET_PENDING		(1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED		(1UL << __RQ_NET_QUEUED)
#define RQ_NET_SENT		(1UL << __RQ_NET_SENT)
#define RQ_NET_DONE		(1UL << __RQ_NET_DONE)
#define RQ_NET_OK		(1UL << __RQ_NET_OK)
#define RQ_NET_SIS		(1UL << __RQ_NET_SIS)

/* all network bits, i.e. everything below __RQ_NET_MAX that is not local: 0x1f8 */
#define RQ_NET_MASK		(((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
202
203/* epoch entries */
204static inline
205struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
206{
207 BUG_ON(mdev->ee_hash_s == 0);
208 return mdev->ee_hash +
209 ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
210}
211
212/* transfer log (drbd_request objects) */
213static inline
214struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
215{
216 BUG_ON(mdev->tl_hash_s == 0);
217 return mdev->tl_hash +
218 ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
219}
220
221/* application reads (drbd_request objects) */
222static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
223{
224 return mdev->app_reads_hash
225 + ((unsigned int)(sector) % APP_R_HSIZE);
226}
227
228/* when we receive the answer for a read request,
229 * verify that we actually know about it */
230static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
231 u64 id, sector_t sector)
232{
233 struct hlist_head *slot = ar_hash_slot(mdev, sector);
234 struct hlist_node *n;
235 struct drbd_request *req;
236
237 hlist_for_each_entry(req, n, slot, colision) {
238 if ((unsigned long)req == (unsigned long)id) {
239 D_ASSERT(req->sector == sector);
240 return req;
241 }
242 }
243 return NULL;
244}
245
246static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
247 struct bio *bio_src)
248{
249 struct bio *bio;
250 struct drbd_request *req =
251 mempool_alloc(drbd_request_mempool, GFP_NOIO);
252 if (likely(req)) {
253 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
254
255 req->rq_state = 0;
256 req->mdev = mdev;
257 req->master_bio = bio_src;
258 req->private_bio = bio;
259 req->epoch = 0;
260 req->sector = bio->bi_sector;
261 req->size = bio->bi_size;
262 req->start_time = jiffies;
263 INIT_HLIST_NODE(&req->colision);
264 INIT_LIST_HEAD(&req->tl_requests);
265 INIT_LIST_HEAD(&req->w.list);
266
267 bio->bi_private = req;
268 bio->bi_end_io = drbd_endio_pri;
269 bio->bi_next = NULL;
270 }
271 return req;
272}
273
/*
 * drbd_req_free() - give a request back to drbd_request_mempool
 *
 * Only returns the request object itself to the pool; the bios it points
 * to are not touched here.
 */
static inline void drbd_req_free(struct drbd_request *req)
{
	mempool_free(req, drbd_request_mempool);
}
278
279static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
280{
281 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
282}
283
/* Short lived temporary struct on the stack.
 * Carries a master bio out of __req_mod() together with the error code to
 * complete it with; _req_mod() and req_mod() pass it on to
 * complete_master_bio() when bio is set.
 * We could squirrel the error to be returned into
 * bio->bi_size, or similar. But that would be too ugly. */
struct bio_and_error {
	struct bio *bio;	/* master bio to be completed, or NULL */
	int error;		/* error code to complete it with */
};

/* implemented outside of this header */
extern void _req_may_be_done(struct drbd_request *req,
		struct bio_and_error *m);
extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
		struct bio_and_error *m);
extern void complete_master_bio(struct drbd_conf *mdev,
		struct bio_and_error *m);
299/* use this if you don't want to deal with calling complete_master_bio()
300 * outside the spinlock, e.g. when walking some list on cleanup. */
301static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
302{
303 struct drbd_conf *mdev = req->mdev;
304 struct bio_and_error m;
305
306 /* __req_mod possibly frees req, do not touch req after that! */
307 __req_mod(req, what, &m);
308 if (m.bio)
309 complete_master_bio(mdev, &m);
310}
311
312/* completion of master bio is outside of spinlock.
313 * If you need it irqsave, do it your self! */
314static inline void req_mod(struct drbd_request *req,
315 enum drbd_req_event what)
316{
317 struct drbd_conf *mdev = req->mdev;
318 struct bio_and_error m;
319 spin_lock_irq(&mdev->req_lock);
320 __req_mod(req, what, &m);
321 spin_unlock_irq(&mdev->req_lock);
322
323 if (m.bio)
324 complete_master_bio(mdev, &m);
325}
326#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000000..76863e3f05be
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
1/*
2 drbd_strings.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#include <linux/drbd.h>
27
/* Printable names of the connection states, indexed by the C_* constants.
 * Looked up by drbd_conn_str(). */
static const char *drbd_conn_s_names[] = {
	[C_STANDALONE]       = "StandAlone",
	[C_DISCONNECTING]    = "Disconnecting",
	[C_UNCONNECTED]      = "Unconnected",
	[C_TIMEOUT]          = "Timeout",
	[C_BROKEN_PIPE]      = "BrokenPipe",
	[C_NETWORK_FAILURE]  = "NetworkFailure",
	[C_PROTOCOL_ERROR]   = "ProtocolError",
	[C_WF_CONNECTION]    = "WFConnection",
	[C_WF_REPORT_PARAMS] = "WFReportParams",
	[C_TEAR_DOWN]        = "TearDown",
	[C_CONNECTED]        = "Connected",
	[C_STARTING_SYNC_S]  = "StartingSyncS",
	[C_STARTING_SYNC_T]  = "StartingSyncT",
	[C_WF_BITMAP_S]      = "WFBitMapS",
	[C_WF_BITMAP_T]      = "WFBitMapT",
	[C_WF_SYNC_UUID]     = "WFSyncUUID",
	[C_SYNC_SOURCE]      = "SyncSource",
	[C_SYNC_TARGET]      = "SyncTarget",
	[C_PAUSED_SYNC_S]    = "PausedSyncS",
	[C_PAUSED_SYNC_T]    = "PausedSyncT",
	[C_VERIFY_S]         = "VerifyS",
	[C_VERIFY_T]         = "VerifyT",
};
52
/* Printable names of the node roles, indexed by the R_* constants.
 * Looked up by drbd_role_str(). */
static const char *drbd_role_s_names[] = {
	[R_PRIMARY]   = "Primary",
	[R_SECONDARY] = "Secondary",
	[R_UNKNOWN]   = "Unknown"
};
58
/* Printable names of the disk states, indexed by the D_* constants.
 * Looked up by drbd_disk_str(). */
static const char *drbd_disk_s_names[] = {
	[D_DISKLESS]     = "Diskless",
	[D_ATTACHING]    = "Attaching",
	[D_FAILED]       = "Failed",
	[D_NEGOTIATING]  = "Negotiating",
	[D_INCONSISTENT] = "Inconsistent",
	[D_OUTDATED]     = "Outdated",
	[D_UNKNOWN]      = "DUnknown",
	[D_CONSISTENT]   = "Consistent",
	[D_UP_TO_DATE]   = "UpToDate",
};
70
/* Explanations for failed state changes.
 * The index is the NEGATED SS_* code (see the designators below);
 * drbd_set_st_err_str() does the lookup with [-err]. */
static const char *drbd_state_sw_errors[] = {
	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
	[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
	[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
	[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
	[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
	[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
	[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
	[-SS_DEVICE_IN_USE] = "Device is held open by someone",
	[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
	[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
	[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
	[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
	[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
	[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
	[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
};
91
92const char *drbd_conn_str(enum drbd_conns s)
93{
94 /* enums are unsigned... */
95 return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
96}
97
98const char *drbd_role_str(enum drbd_role s)
99{
100 return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
101}
102
103const char *drbd_disk_str(enum drbd_disk_state s)
104{
105 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
106}
107
108const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
109{
110 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
111 err > SS_TWO_PRIMARIES ? "TOO_LARGE"
112 : drbd_state_sw_errors[-err];
113}
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000000..fc824006e721
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
1/*
2-*- linux-c -*-
3 drbd_vli.h
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_VLI_H
26#define _DRBD_VLI_H
27
28/*
29 * At a granularity of 4KiB storage represented per bit,
30 * and storage sizes of several TiB,
31 * and possibly small-bandwidth replication,
32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text.
34 *
35 * We try to reduce the transferred bitmap information
36 * by encoding runlengths of bit polarity.
37 *
38 * We never actually need to encode a "zero" (runlengths are positive).
39 * But then we have to store the value of the first bit.
40 * The first bit of information thus shall encode if the first runlength
41 * gives the number of set or unset bits.
42 *
43 * We assume that large areas are either completely set or unset,
44 * which gives good compression with any runlength method,
45 * even when encoding the runlength as fixed size 32bit/64bit integers.
46 *
47 * Still, there may be areas where the polarity flips every few bits,
48 * and encoding the runlength sequence of those areas with fix size
49 * integers would be much worse than plaintext.
50 *
51 * We want to encode small runlength values with minimum code length,
52 * while still being able to encode a Huge run of all zeros.
53 *
54 * Thus we need a Variable Length Integer encoding, VLI.
55 *
56 * For some cases, we produce more code bits than plaintext input.
57 * We need to send incompressible chunks as plaintext, skip over them
58 * and then see if the next chunk compresses better.
59 *
60 * We don't care too much about "excellent" compression ratio for large
61 * runlengths (all set/all clear): whether we achieve a factor of 100
62 * or 1000 is not that much of an issue.
63 * We do not want to waste too much on short runlengths in the "noisy"
64 * parts of the bitmap, though.
65 *
66 * There are endless variants of VLI, we experimented with:
67 * * simple byte-based
68 * * various bit based with different code word length.
69 *
70 * To avoid yet another configuration parameter (choice of bitmap compression
71 * algorithm) which was difficult to explain and tune, we just chose the one
72 * variant that turned out best in all test cases.
73 * Based on real world usage patterns, with device sizes ranging from a few GiB
74 * to several TiB, file server/mailserver/webserver/mysql/postgres,
75 * mostly idle to really busy, the all time winner (though sometimes only
76 * marginally better) is:
77 */
78
79/*
80 * encoding is "visualised" as
81 * __little endian__ bitstream, least significant bit first (left most)
82 *
83 * this particular encoding is chosen so that the prefix code
84 * starts as unary encoding the level, then modified so that
85 * 10 levels can be described in 8bit, with minimal overhead
86 * for the smaller levels.
87 *
88 * Number of data bits follow fibonacci sequence, with the exception of the
89 * last level (+1 data bit, so it makes 64bit total). The only worse code when
90 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
91prefix data bits max val Nº data bits
920 x 0x2 1
9310 x 0x4 1
94110 xx 0x8 2
951110 xxx 0x10 3
9611110 xxx xx 0x30 5
97111110 xx xxxxxx 0x130 8
9811111100 xxxxxxxx xxxxx 0x2130 13
9911111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
10011111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
10111111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
102 * maximum encodable value: 0x100000400202130 == 2**56 + some */
103
104/* compression "table":
105 transmitted x 0.29
106 as plaintext x ........................
107 x ........................
108 x ........................
109 x 0.59 0.21........................
110 x ........................................................
111 x .. c ...................................................
112 x 0.44.. o ...................................................
113 x .......... d ...................................................
114 x .......... e ...................................................
115 X............. ...................................................
116 x.............. b ...................................................
1172.0x............... i ...................................................
118 #X................ t ...................................................
119 #................. s ........................... plain bits ..........
120-+-----------------------------------------------------------------------
121 1 16 32 64
122*/
123
124/* LEVEL: (total bits, prefix bits, prefix value),
125 * sorted ascending by number of total bits.
126 * The rest of the code table is calculated at compiletime from this. */
127
/* fibonacci data 1, 1, ...
 *
 * The code table: one LEVEL(total bits, prefix bits, prefix value) line per
 * code length, shortest first.  LEVEL itself is not defined here; both
 * vli_decode_bits() and __vli_encode_bits() define their own LEVEL before
 * expanding this table, and #undef it afterwards. */
#define VLI_L_1_1() do { \
	LEVEL( 2, 1, 0x00); \
	LEVEL( 3, 2, 0x01); \
	LEVEL( 5, 3, 0x03); \
	LEVEL( 7, 4, 0x07); \
	LEVEL(10, 5, 0x0f); \
	LEVEL(14, 6, 0x1f); \
	LEVEL(21, 8, 0x3f); \
	LEVEL(29, 8, 0x7f); \
	LEVEL(42, 8, 0xbf); \
	LEVEL(64, 8, 0xff); \
	} while (0)
141
142/* finds a suitable level to decode the least significant part of in.
143 * returns number of bits consumed.
144 *
145 * BUG() for bad input, as that would mean a buggy code table. */
146static inline int vli_decode_bits(u64 *out, const u64 in)
147{
148 u64 adj = 1;
149
150#define LEVEL(t,b,v) \
151 do { \
152 if ((in & ((1 << b) -1)) == v) { \
153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
154 return t; \
155 } \
156 adj += 1ULL << (t - b); \
157 } while (0)
158
159 VLI_L_1_1();
160
161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
162 BUG();
163#undef LEVEL
164}
165
166/* return number of code bits needed,
167 * or negative error number */
168static inline int __vli_encode_bits(u64 *out, const u64 in)
169{
170 u64 max = 0;
171 u64 adj = 1;
172
173 if (in == 0)
174 return -EINVAL;
175
176#define LEVEL(t,b,v) do { \
177 max += 1ULL << (t - b); \
178 if (in <= max) { \
179 if (out) \
180 *out = ((in - adj) << b) | v; \
181 return t; \
182 } \
183 adj = max + 1; \
184 } while (0)
185
186 VLI_L_1_1();
187
188 return -EOVERFLOW;
189#undef LEVEL
190}
191
192#undef VLI_L_1_1
193
194/* code from here down is independent of actually used bit code */
195
196/*
197 * Code length is determined by some unique (e.g. unary) prefix.
198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
199 * not a byte stream.
200 */
201
/* for the bitstream, we need a cursor:
 * a byte pointer plus a bit offset within that byte */
struct bitstream_cursor {
	/* the current byte */
	u8 *b;
	/* the current bit within *b, normalized: 0..7 */
	unsigned int bit;
};
209
210/* initialize cursor to point to first bit of stream */
211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
212{
213 cur->b = s;
214 cur->bit = 0;
215}
216
217/* advance cursor by that many bits; maximum expected input value: 64,
218 * but depending on VLI implementation, it may be more. */
219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
220{
221 bits += cur->bit;
222 cur->b = cur->b + (bits >> 3);
223 cur->bit = bits & 7;
224}
225
/* the bitstream itself knows its length:
 * a cursor, plus the buffer the cursor walks over */
struct bitstream {
	struct bitstream_cursor cur;	/* current read/write position */
	unsigned char *buf;		/* start of the buffer */
	size_t buf_len;		/* in bytes */

	/* for input stream:
	 * number of trailing 0 bits for padding
	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
	unsigned int pad_bits;
};
237
238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
239{
240 bs->buf = s;
241 bs->buf_len = len;
242 bs->pad_bits = pad_bits;
243 bitstream_cursor_reset(&bs->cur, bs->buf);
244}
245
246static inline void bitstream_rewind(struct bitstream *bs)
247{
248 bitstream_cursor_reset(&bs->cur, bs->buf);
249 memset(bs->buf, 0, bs->buf_len);
250}
251
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 *
 * The bits are OR-ed into the buffer, so whatever is already stored at the
 * target position is kept (presumably the buffer is expected to be zeroed
 * there, e.g. by bitstream_rewind() -- confirm with the callers).
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
	unsigned char *b = bs->cur.b;
	unsigned int tmp;

	if (bits == 0)
		return 0;

	/* would the byte holding the last new bit be outside the buffer? */
	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
		return -ENOBUFS;

	/* paranoia: strip off hi bits; they should not be set anyways. */
	if (bits < 64)
		val &= ~0ULL >> (64 - bits);

	/* first byte: low bits of val go above the cur.bit bits already in use */
	*b++ |= (val & 0xff) << bs->cur.bit;

	/* following bytes: tmp is the number of bits of val already stored */
	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
		*b++ |= (val >> tmp) & 0xff;

	bitstream_cursor_advance(&bs->cur, bits);
	return bits;
}
283
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
 *
 * If there are less than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 * Valid bits are those before the trailing bs->pad_bits padding bits.
 *
 * Returns number of actually fetched bits;
 * if that is zero, *out is set to 0.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
	u64 val;
	unsigned int n;

	if (bits > 64)
		return -EINVAL;

	/* request reaches into the padding or past the buffer:
	 * clamp it to what is left of the valid bits */
	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
			- bs->cur.bit - bs->pad_bits;

	if (bits == 0) {
		*out = 0;
		return 0;
	}

	/* get the high bits */
	val = 0;
	/* n: number of bytes touched, counting the current one */
	n = (bs->cur.bit + bits + 7) >> 3;
	/* n may be at most 9, if cur.bit + bits > 64 */
	/* which means this copies at most 8 byte */
	if (n) {
		/* everything after the current byte, moved up to make room
		 * for the (8 - cur.bit) bits taken from the current byte */
		memcpy(&val, bs->cur.b+1, n - 1);
		val = le64_to_cpu(val) << (8 - bs->cur.bit);
	}

	/* we still need the low bits */
	val |= bs->cur.b[0] >> bs->cur.bit;

	/* and mask out bits we don't want */
	val &= ~0ULL >> (64 - bits);

	bitstream_cursor_advance(&bs->cur, bits);
	*out = val;

	return bits;
}
331
332/* encodes @in as vli into @bs;
333
334 * return values
335 * > 0: number of bits successfully stored in bitstream
336 * -ENOBUFS @bs is full
337 * -EINVAL input zero (invalid)
338 * -EOVERFLOW input too large for this vli code (invalid)
339 */
340static inline int vli_encode_bits(struct bitstream *bs, u64 in)
341{
342 u64 code = code;
343 int bits = __vli_encode_bits(&code, in);
344
345 if (bits <= 0)
346 return bits;
347
348 return bitstream_put_bits(bs, code, bits);
349}
350
351#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000000..ed8796f1112d
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1512 @@
1/*
2 drbd_worker.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/version.h>
28#include <linux/drbd.h>
29#include <linux/sched.h>
30#include <linux/smp_lock.h>
31#include <linux/wait.h>
32#include <linux/mm.h>
33#include <linux/memcontrol.h>
34#include <linux/mm_inline.h>
35#include <linux/slab.h>
36#include <linux/random.h>
37#include <linux/mm.h>
38#include <linux/string.h>
39#include <linux/scatterlist.h>
40
41#include "drbd_int.h"
42#include "drbd_req.h"
43
44#define SLEEP_TIME (HZ/10)
45
46static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
47
48
49
50/* defined here:
51 drbd_md_io_complete
52 drbd_endio_write_sec
53 drbd_endio_read_sec
54 drbd_endio_pri
55
56 * more endio handlers:
57 atodb_endio in drbd_actlog.c
58 drbd_bm_async_io_complete in drbd_bitmap.c
59
60 * For all these callbacks, note the following:
61 * The callbacks will be called in irq context by the IDE drivers,
62 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
63 * Try to get the locking right :)
64 *
65 */
66
67
68/* About the global_state_lock
69 Each state transition on a device holds a read lock. In case we have
70 to evaluate the sync after dependencies, we grab a write lock, because
71 we need stable states on all devices for that. */
72rwlock_t global_state_lock;
73
74/* used for synchronous meta data and bitmap IO
75 * submitted by drbd_md_sync_page_io()
76 */
77void drbd_md_io_complete(struct bio *bio, int error)
78{
79 struct drbd_md_io *md_io;
80
81 md_io = (struct drbd_md_io *)bio->bi_private;
82 md_io->error = error;
83
84 complete(&md_io->event);
85}
86
87/* reads on behalf of the partner,
88 * "submitted" by the receiver
89 */
90void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
91{
92 unsigned long flags = 0;
93 struct drbd_epoch_entry *e = NULL;
94 struct drbd_conf *mdev;
95 int uptodate = bio_flagged(bio, BIO_UPTODATE);
96
97 e = bio->bi_private;
98 mdev = e->mdev;
99
100 if (error)
101 dev_warn(DEV, "read: error=%d s=%llus\n", error,
102 (unsigned long long)e->sector);
103 if (!error && !uptodate) {
104 dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
105 (unsigned long long)e->sector);
106 /* strange behavior of some lower level drivers...
107 * fail the request by clearing the uptodate flag,
108 * but do not return any error?! */
109 error = -EIO;
110 }
111
112 D_ASSERT(e->block_id != ID_VACANT);
113
114 spin_lock_irqsave(&mdev->req_lock, flags);
115 mdev->read_cnt += e->size >> 9;
116 list_del(&e->w.list);
117 if (list_empty(&mdev->read_ee))
118 wake_up(&mdev->ee_wait);
119 spin_unlock_irqrestore(&mdev->req_lock, flags);
120
121 drbd_chk_io_error(mdev, error, FALSE);
122 drbd_queue_work(&mdev->data.work, &e->w);
123 put_ldev(mdev);
124}
125
/* Completion handler for writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 *
 * A failed barrier write is not completed here: the write ordering method
 * is bumped to WO_bdev_flush and the entry is queued with w_e_reissue as
 * its callback.  Everything else is moved to done_ee, waiters and the
 * asender are woken, and the local disk reference is dropped.
 */
void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_epoch_entry *e = NULL;
	struct drbd_conf *mdev;
	sector_t e_sector;
	int do_wake;
	int is_syncer_req;
	int do_al_complete_io;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);

	e = bio->bi_private;
	mdev = e->mdev;

	if (error)
		dev_warn(DEV, "write: error=%d s=%llus\n", error,
				(unsigned long long)e->sector);
	if (!error && !uptodate) {
		dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
				(unsigned long long)e->sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* error == -ENOTSUPP would be a better test,
	 * alas it is not reliable */
	if (error && is_barrier && e->flags & EE_IS_BARRIER) {
		drbd_bump_write_ordering(mdev, WO_bdev_flush);
		spin_lock_irqsave(&mdev->req_lock, flags);
		list_del(&e->w.list);
		e->w.cb = w_e_reissue;
		/* put_ldev actually happens below, once we come here again. */
		__release(local);
		spin_unlock_irqrestore(&mdev->req_lock, flags);
		drbd_queue_work(&mdev->data.work, &e->w);
		return;
	}

	D_ASSERT(e->block_id != ID_VACANT);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->writ_cnt += e->size >> 9;
	is_syncer_req = is_syncer_block_id(e->block_id);

	/* after we moved e to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	e_sector = e->sector;
	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;

	list_del(&e->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&e->w.list, &mdev->done_ee);

	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
	 * neither did we wake possibly waiting conflicting requests.
	 * done from "drbd_process_done_ee" within the appropriate w.cb
	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */

	/* wake waiters only if the list this entry came from is empty now */
	do_wake = is_syncer_req
		? list_empty(&mdev->sync_ee)
		: list_empty(&mdev->active_ee);

	if (error)
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	/* from here on only the values saved above are used, not e itself */
	if (is_syncer_req)
		drbd_rs_complete_io(mdev, e_sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, e_sector);

	wake_asender(mdev);
	put_ldev(mdev);

}
212
213/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
214 */
215void drbd_endio_pri(struct bio *bio, int error)
216{
217 unsigned long flags;
218 struct drbd_request *req = bio->bi_private;
219 struct drbd_conf *mdev = req->mdev;
220 struct bio_and_error m;
221 enum drbd_req_event what;
222 int uptodate = bio_flagged(bio, BIO_UPTODATE);
223
224 if (error)
225 dev_warn(DEV, "p %s: error=%d\n",
226 bio_data_dir(bio) == WRITE ? "write" : "read", error);
227 if (!error && !uptodate) {
228 dev_warn(DEV, "p %s: setting error to -EIO\n",
229 bio_data_dir(bio) == WRITE ? "write" : "read");
230 /* strange behavior of some lower level drivers...
231 * fail the request by clearing the uptodate flag,
232 * but do not return any error?! */
233 error = -EIO;
234 }
235
236 /* to avoid recursion in __req_mod */
237 if (unlikely(error)) {
238 what = (bio_data_dir(bio) == WRITE)
239 ? write_completed_with_error
240 : (bio_rw(bio) == READA)
241 ? read_completed_with_error
242 : read_ahead_completed_with_error;
243 } else
244 what = completed_ok;
245
246 bio_put(req->private_bio);
247 req->private_bio = ERR_PTR(error);
248
249 spin_lock_irqsave(&mdev->req_lock, flags);
250 __req_mod(req, what, &m);
251 spin_unlock_irqrestore(&mdev->req_lock, flags);
252
253 if (m.bio)
254 complete_master_bio(mdev, &m);
255}
256
257int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
258{
259 struct drbd_request *req = container_of(w, struct drbd_request, w);
260
261 /* NOTE: mdev->ldev can be NULL by the time we get here! */
262 /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
263
264 /* the only way this callback is scheduled is from _req_may_be_done,
265 * when it is done and had a local write error, see comments there */
266 drbd_req_free(req);
267
268 return TRUE;
269}
270
271int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
272{
273 struct drbd_request *req = container_of(w, struct drbd_request, w);
274
275 /* We should not detach for read io-error,
276 * but try to WRITE the P_DATA_REPLY to the failed location,
277 * to give the disk the chance to relocate that block */
278
279 spin_lock_irq(&mdev->req_lock);
280 if (cancel ||
281 mdev->state.conn < C_CONNECTED ||
282 mdev->state.pdsk <= D_INCONSISTENT) {
283 _req_mod(req, send_canceled);
284 spin_unlock_irq(&mdev->req_lock);
285 dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
286 return 1;
287 }
288 spin_unlock_irq(&mdev->req_lock);
289
290 return w_send_read_req(mdev, w, 0);
291}
292
/* Placeholder callback that resync_work.cb is set to while no resync
 * requests are to be generated.  It is not expected to run: it only logs
 * an error and returns 1. */
int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* NOTE(review): ERR_IF is defined elsewhere; presumably it also
	 * logs when its condition holds -- confirm */
	ERR_IF(cancel) return 1;
	dev_err(DEV, "resync inactive, but callback triggered??\n");
	return 1; /* Simply ignore this! */
}
299
300void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
301{
302 struct hash_desc desc;
303 struct scatterlist sg;
304 struct bio_vec *bvec;
305 int i;
306
307 desc.tfm = tfm;
308 desc.flags = 0;
309
310 sg_init_table(&sg, 1);
311 crypto_hash_init(&desc);
312
313 __bio_for_each_segment(bvec, bio, i, 0) {
314 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
315 crypto_hash_update(&desc, &sg, sg.length);
316 }
317 crypto_hash_final(&desc, digest);
318}
319
320static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
321{
322 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
323 int digest_size;
324 void *digest;
325 int ok;
326
327 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
328
329 if (unlikely(cancel)) {
330 drbd_free_ee(mdev, e);
331 return 1;
332 }
333
334 if (likely(drbd_bio_uptodate(e->private_bio))) {
335 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
336 digest = kmalloc(digest_size, GFP_NOIO);
337 if (digest) {
338 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
339
340 inc_rs_pending(mdev);
341 ok = drbd_send_drequest_csum(mdev,
342 e->sector,
343 e->size,
344 digest,
345 digest_size,
346 P_CSUM_RS_REQUEST);
347 kfree(digest);
348 } else {
349 dev_err(DEV, "kmalloc() of digest failed.\n");
350 ok = 0;
351 }
352 } else
353 ok = 1;
354
355 drbd_free_ee(mdev, e);
356
357 if (unlikely(!ok))
358 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
359 return ok;
360}
361
/* Allocation flags used by read_for_csum() for drbd_alloc_ee(): the
 * allocation is allowed to fail, the request is then retried later. */
#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
363
/* Submit a local read of @size bytes at @sector for checksum based resync;
 * once the read is done, w_e_send_csum is run for the epoch entry.
 *
 * Returns
 *  0: no local disk (get_ldev() failed)
 *  2: the epoch entry could not be allocated
 *  1: the read was submitted
 */
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_epoch_entry *e;

	if (!get_ldev(mdev))
		return 0;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
	if (!e) {
		put_ldev(mdev);
		return 2;
	}

	spin_lock_irq(&mdev->req_lock);
	list_add(&e->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	/* the local disk reference taken above is dropped
	 * by drbd_endio_read_sec when the read completes */
	e->private_bio->bi_end_io = drbd_endio_read_sec;
	e->private_bio->bi_rw = READ;
	e->w.cb = w_e_send_csum;

	/* NOTE(review): drbd_endio_read_sec adds e->size >> 9 to read_cnt as
	 * well, and this update is done without req_lock -- confirm intended */
	mdev->read_cnt += size >> 9;
	drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);

	return 1;
}
392
393void resync_timer_fn(unsigned long data)
394{
395 unsigned long flags;
396 struct drbd_conf *mdev = (struct drbd_conf *) data;
397 int queue;
398
399 spin_lock_irqsave(&mdev->req_lock, flags);
400
401 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
402 queue = 1;
403 if (mdev->state.conn == C_VERIFY_S)
404 mdev->resync_work.cb = w_make_ov_request;
405 else
406 mdev->resync_work.cb = w_make_resync_request;
407 } else {
408 queue = 0;
409 mdev->resync_work.cb = w_resync_inactive;
410 }
411
412 spin_unlock_irqrestore(&mdev->req_lock, flags);
413
414 /* harmless race: list_empty outside data.work.q_lock */
415 if (list_empty(&mdev->resync_work.list) && queue)
416 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
417}
418
/* Worker callback that generates the next batch of resync requests.
 *
 * Walks the bitmap starting at mdev->bm_resync_fo and, for each set bit
 * found, either reads the block locally for a checksum request
 * (read_for_csum(), if agreed_pro_version >= 89 and csums_tfm is set) or
 * sends a P_RS_DATA_REQUEST.  The batch size is derived from the
 * configured sync rate and limited by the number of pending requests.
 * Unless it is finished or failed, it re-arms resync_timer to be called
 * again after SLEEP_TIME.
 *
 * Returns 0 on the paths that are logged as failures (not connected,
 * read_for_csum() reporting no disk, sending failed), 1 otherwise.
 */
int w_make_resync_request(struct drbd_conf *mdev,
		struct drbd_work *w, int cancel)
{
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_segment_size = queue_max_segment_size(mdev->rq_queue);
	int number, i, size, pe, mx;
	int align, queued, sndbuf;

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
		return 0;
	}

	if (mdev->state.conn != C_SYNC_TARGET)
		dev_err(DEV, "%s in w_make_resync_request\n",
			drbd_conn_str(mdev->state.conn));

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		mdev->resync_work.cb = w_resync_inactive;
		return 1;
	}

	/* number of BM_BLOCK_SIZE sized requests for one SLEEP_TIME interval
	 * (presumably sync_conf.rate is in KiB per second -- confirm) */
	number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
	pe = atomic_read(&mdev->rs_pending_cnt);

	mutex_lock(&mdev->data.mutex);
	if (mdev->data.socket)
		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
	else
		mx = 1;
	mutex_unlock(&mdev->data.mutex);

	/* For resync rates >160MB/sec, allow more pending RS requests */
	if (number > mx)
		mx = number;

	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
	if ((pe + number) > mx) {
		number = mx - pe;
	}

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&mdev->data.mutex);
		if (mdev->data.socket) {
			queued = mdev->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->data.socket->sk->sk_sndbuf;
		} else {
			/* no socket: these values make the check below requeue */
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		/* no set bit left: nothing more to request */
		if (bit == -1UL) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			mdev->resync_work.cb = w_resync_inactive;
			put_ldev(mdev);
			return 1;
		}

		sector = BM_BIT_TO_SECT(bit);

		/* could not begin resync IO on this sector right now:
		 * remember where we are and try again later */
		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		/* the bit got cleared in the meantime: skip it */
		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 *
		 * we _do_ care about the agreed-upon q->max_segment_size
		 * here, as splitting up the requests on the other side is more
		 * difficult.  the consequence is, that on lvm and md and other
		 * "indirect" devices, this is dead code, since
		 * q->max_segment_size will be PAGE_SIZE.
		 */
		align = 1;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_segment_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			/* each merged block counts against the batch as well */
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case 0: /* Disk failure*/
				put_ldev(mdev);
				return 0;
			case 2: /* Allocation failed */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				goto requeue;
			/* case 1: everything ok */
			}
		} else {
			inc_rs_pending(mdev);
			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
					       sector, size, ID_SYNCER)) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return 0;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		mdev->resync_work.cb = w_resync_inactive;
		put_ldev(mdev);
		return 1;
	}

 requeue:
	/* come back after SLEEP_TIME via the resync timer */
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 1;
}
595
596static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
597{
598 int number, i, size;
599 sector_t sector;
600 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
601
602 if (unlikely(cancel))
603 return 1;
604
605 if (unlikely(mdev->state.conn < C_CONNECTED)) {
606 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
607 return 0;
608 }
609
610 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
611 if (atomic_read(&mdev->rs_pending_cnt) > number)
612 goto requeue;
613
614 number -= atomic_read(&mdev->rs_pending_cnt);
615
616 sector = mdev->ov_position;
617 for (i = 0; i < number; i++) {
618 if (sector >= capacity) {
619 mdev->resync_work.cb = w_resync_inactive;
620 return 1;
621 }
622
623 size = BM_BLOCK_SIZE;
624
625 if (drbd_try_rs_begin_io(mdev, sector)) {
626 mdev->ov_position = sector;
627 goto requeue;
628 }
629
630 if (sector + (size>>9) > capacity)
631 size = (capacity-sector)<<9;
632
633 inc_rs_pending(mdev);
634 if (!drbd_send_ov_request(mdev, sector, size)) {
635 dec_rs_pending(mdev);
636 return 0;
637 }
638 sector += BM_SECT_PER_BIT;
639 }
640 mdev->ov_position = sector;
641
642 requeue:
643 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
644 return 1;
645}
646
647
/*
 * Worker callback run when online verify is done: frees the work object,
 * calls ov_oos_print() and then drbd_resync_finished().
 * @cancel is ignored.  Always returns 1.
 */
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work object is not referenced any more after this point */
	kfree(w);

	ov_oos_print(mdev);
	drbd_resync_finished(mdev);
	return 1;
}
656
/*
 * Worker callback that frees its work object and calls
 * drbd_resync_finished().  drbd_resync_finished() queues it for its own
 * retry.  @cancel is ignored.  Always returns 1.
 */
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work object was kmalloc()ed by whoever queued us */
	kfree(w);
	drbd_resync_finished(mdev);
	return 1;
}
665
666int drbd_resync_finished(struct drbd_conf *mdev)
667{
668 unsigned long db, dt, dbdt;
669 unsigned long n_oos;
670 union drbd_state os, ns;
671 struct drbd_work *w;
672 char *khelper_cmd = NULL;
673
674 /* Remove all elements from the resync LRU. Since future actions
675 * might set bits in the (main) bitmap, then the entries in the
676 * resync LRU would be wrong. */
677 if (drbd_rs_del_all(mdev)) {
678 /* In case this is not possible now, most probably because
679 * there are P_RS_DATA_REPLY Packets lingering on the worker's
680 * queue (or even the read operations for those packets
681 * is not finished by now). Retry in 100ms. */
682
683 drbd_kick_lo(mdev);
684 __set_current_state(TASK_INTERRUPTIBLE);
685 schedule_timeout(HZ / 10);
686 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
687 if (w) {
688 w->cb = w_resync_finished;
689 drbd_queue_work(&mdev->data.work, w);
690 return 1;
691 }
692 dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
693 }
694
695 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
696 if (dt <= 0)
697 dt = 1;
698 db = mdev->rs_total;
699 dbdt = Bit2KB(db/dt);
700 mdev->rs_paused /= HZ;
701
702 if (!get_ldev(mdev))
703 goto out;
704
705 spin_lock_irq(&mdev->req_lock);
706 os = mdev->state;
707
708 /* This protects us against multiple calls (that can happen in the presence
709 of application IO), and against connectivity loss just before we arrive here. */
710 if (os.conn <= C_CONNECTED)
711 goto out_unlock;
712
713 ns = os;
714 ns.conn = C_CONNECTED;
715
716 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
717 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
718 "Online verify " : "Resync",
719 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
720
721 n_oos = drbd_bm_total_weight(mdev);
722
723 if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
724 if (n_oos) {
725 dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
726 n_oos, Bit2KB(1));
727 khelper_cmd = "out-of-sync";
728 }
729 } else {
730 D_ASSERT((n_oos - mdev->rs_failed) == 0);
731
732 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
733 khelper_cmd = "after-resync-target";
734
735 if (mdev->csums_tfm && mdev->rs_total) {
736 const unsigned long s = mdev->rs_same_csum;
737 const unsigned long t = mdev->rs_total;
738 const int ratio =
739 (t == 0) ? 0 :
740 (t < 100000) ? ((s*100)/t) : (s/(t/100));
741 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
742 "transferred %luK total %luK\n",
743 ratio,
744 Bit2KB(mdev->rs_same_csum),
745 Bit2KB(mdev->rs_total - mdev->rs_same_csum),
746 Bit2KB(mdev->rs_total));
747 }
748 }
749
750 if (mdev->rs_failed) {
751 dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
752
753 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
754 ns.disk = D_INCONSISTENT;
755 ns.pdsk = D_UP_TO_DATE;
756 } else {
757 ns.disk = D_UP_TO_DATE;
758 ns.pdsk = D_INCONSISTENT;
759 }
760 } else {
761 ns.disk = D_UP_TO_DATE;
762 ns.pdsk = D_UP_TO_DATE;
763
764 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
765 if (mdev->p_uuid) {
766 int i;
767 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
768 _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
769 drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
770 _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
771 } else {
772 dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
773 }
774 }
775
776 drbd_uuid_set_bm(mdev, 0UL);
777
778 if (mdev->p_uuid) {
779 /* Now the two UUID sets are equal, update what we
780 * know of the peer. */
781 int i;
782 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
783 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
784 }
785 }
786
787 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
788out_unlock:
789 spin_unlock_irq(&mdev->req_lock);
790 put_ldev(mdev);
791out:
792 mdev->rs_total = 0;
793 mdev->rs_failed = 0;
794 mdev->rs_paused = 0;
795 mdev->ov_start_sector = 0;
796
797 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
798 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
799 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
800 }
801
802 if (khelper_cmd)
803 drbd_khelper(mdev, khelper_cmd);
804
805 return 1;
806}
807
808/* helper */
809static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
810{
811 if (drbd_bio_has_active_page(e->private_bio)) {
812 /* This might happen if sendpage() has not finished */
813 spin_lock_irq(&mdev->req_lock);
814 list_add_tail(&e->w.list, &mdev->net_ee);
815 spin_unlock_irq(&mdev->req_lock);
816 } else
817 drbd_free_ee(mdev, e);
818}
819
820/**
821 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
822 * @mdev: DRBD device.
823 * @w: work object.
824 * @cancel: The connection will be closed anyways
825 */
826int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
827{
828 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
829 int ok;
830
831 if (unlikely(cancel)) {
832 drbd_free_ee(mdev, e);
833 dec_unacked(mdev);
834 return 1;
835 }
836
837 if (likely(drbd_bio_uptodate(e->private_bio))) {
838 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
839 } else {
840 if (__ratelimit(&drbd_ratelimit_state))
841 dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
842 (unsigned long long)e->sector);
843
844 ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
845 }
846
847 dec_unacked(mdev);
848
849 move_to_net_ee_or_free(mdev, e);
850
851 if (unlikely(!ok))
852 dev_err(DEV, "drbd_send_block() failed\n");
853 return ok;
854}
855
856/**
857 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
858 * @mdev: DRBD device.
859 * @w: work object.
860 * @cancel: The connection will be closed anyways
861 */
862int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
863{
864 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
865 int ok;
866
867 if (unlikely(cancel)) {
868 drbd_free_ee(mdev, e);
869 dec_unacked(mdev);
870 return 1;
871 }
872
873 if (get_ldev_if_state(mdev, D_FAILED)) {
874 drbd_rs_complete_io(mdev, e->sector);
875 put_ldev(mdev);
876 }
877
878 if (likely(drbd_bio_uptodate(e->private_bio))) {
879 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
880 inc_rs_pending(mdev);
881 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
882 } else {
883 if (__ratelimit(&drbd_ratelimit_state))
884 dev_err(DEV, "Not sending RSDataReply, "
885 "partner DISKLESS!\n");
886 ok = 1;
887 }
888 } else {
889 if (__ratelimit(&drbd_ratelimit_state))
890 dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
891 (unsigned long long)e->sector);
892
893 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
894
895 /* update resync data with failure */
896 drbd_rs_failed_io(mdev, e->sector, e->size);
897 }
898
899 dec_unacked(mdev);
900
901 move_to_net_ee_or_free(mdev, e);
902
903 if (unlikely(!ok))
904 dev_err(DEV, "drbd_send_block() failed\n");
905 return ok;
906}
907
908int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
909{
910 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
911 struct digest_info *di;
912 int digest_size;
913 void *digest = NULL;
914 int ok, eq = 0;
915
916 if (unlikely(cancel)) {
917 drbd_free_ee(mdev, e);
918 dec_unacked(mdev);
919 return 1;
920 }
921
922 drbd_rs_complete_io(mdev, e->sector);
923
924 di = (struct digest_info *)(unsigned long)e->block_id;
925
926 if (likely(drbd_bio_uptodate(e->private_bio))) {
927 /* quick hack to try to avoid a race against reconfiguration.
928 * a real fix would be much more involved,
929 * introducing more locking mechanisms */
930 if (mdev->csums_tfm) {
931 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
932 D_ASSERT(digest_size == di->digest_size);
933 digest = kmalloc(digest_size, GFP_NOIO);
934 }
935 if (digest) {
936 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
937 eq = !memcmp(digest, di->digest, digest_size);
938 kfree(digest);
939 }
940
941 if (eq) {
942 drbd_set_in_sync(mdev, e->sector, e->size);
943 mdev->rs_same_csum++;
944 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
945 } else {
946 inc_rs_pending(mdev);
947 e->block_id = ID_SYNCER;
948 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
949 }
950 } else {
951 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
952 if (__ratelimit(&drbd_ratelimit_state))
953 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
954 }
955
956 dec_unacked(mdev);
957
958 kfree(di);
959
960 move_to_net_ee_or_free(mdev, e);
961
962 if (unlikely(!ok))
963 dev_err(DEV, "drbd_send_block/ack() failed\n");
964 return ok;
965}
966
967int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
968{
969 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
970 int digest_size;
971 void *digest;
972 int ok = 1;
973
974 if (unlikely(cancel))
975 goto out;
976
977 if (unlikely(!drbd_bio_uptodate(e->private_bio)))
978 goto out;
979
980 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
981 /* FIXME if this allocation fails, online verify will not terminate! */
982 digest = kmalloc(digest_size, GFP_NOIO);
983 if (digest) {
984 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
985 inc_rs_pending(mdev);
986 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
987 digest, digest_size, P_OV_REPLY);
988 if (!ok)
989 dec_rs_pending(mdev);
990 kfree(digest);
991 }
992
993out:
994 drbd_free_ee(mdev, e);
995
996 dec_unacked(mdev);
997
998 return ok;
999}
1000
1001void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1002{
1003 if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
1004 mdev->ov_last_oos_size += size>>9;
1005 } else {
1006 mdev->ov_last_oos_start = sector;
1007 mdev->ov_last_oos_size = size>>9;
1008 }
1009 drbd_set_out_of_sync(mdev, sector, size);
1010 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1011}
1012
1013int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1014{
1015 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1016 struct digest_info *di;
1017 int digest_size;
1018 void *digest;
1019 int ok, eq = 0;
1020
1021 if (unlikely(cancel)) {
1022 drbd_free_ee(mdev, e);
1023 dec_unacked(mdev);
1024 return 1;
1025 }
1026
1027 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1028 * the resync lru has been cleaned up already */
1029 drbd_rs_complete_io(mdev, e->sector);
1030
1031 di = (struct digest_info *)(unsigned long)e->block_id;
1032
1033 if (likely(drbd_bio_uptodate(e->private_bio))) {
1034 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1035 digest = kmalloc(digest_size, GFP_NOIO);
1036 if (digest) {
1037 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
1038
1039 D_ASSERT(digest_size == di->digest_size);
1040 eq = !memcmp(digest, di->digest, digest_size);
1041 kfree(digest);
1042 }
1043 } else {
1044 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1045 if (__ratelimit(&drbd_ratelimit_state))
1046 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1047 }
1048
1049 dec_unacked(mdev);
1050
1051 kfree(di);
1052
1053 if (!eq)
1054 drbd_ov_oos_found(mdev, e->sector, e->size);
1055 else
1056 ov_oos_print(mdev);
1057
1058 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
1059 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1060
1061 drbd_free_ee(mdev, e);
1062
1063 if (--mdev->ov_left == 0) {
1064 ov_oos_print(mdev);
1065 drbd_resync_finished(mdev);
1066 }
1067
1068 return ok;
1069}
1070
1071int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1072{
1073 struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1074 complete(&b->done);
1075 return 1;
1076}
1077
1078int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1079{
1080 struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
1081 struct p_barrier *p = &mdev->data.sbuf.barrier;
1082 int ok = 1;
1083
1084 /* really avoid racing with tl_clear. w.cb may have been referenced
1085 * just before it was reassigned and re-queued, so double check that.
1086 * actually, this race was harmless, since we only try to send the
1087 * barrier packet here, and otherwise do nothing with the object.
1088 * but compare with the head of w_clear_epoch */
1089 spin_lock_irq(&mdev->req_lock);
1090 if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
1091 cancel = 1;
1092 spin_unlock_irq(&mdev->req_lock);
1093 if (cancel)
1094 return 1;
1095
1096 if (!drbd_get_data_sock(mdev))
1097 return 0;
1098 p->barrier = b->br_number;
1099 /* inc_ap_pending was done where this was queued.
1100 * dec_ap_pending will be done in got_BarrierAck
1101 * or (on connection loss) in w_clear_epoch. */
1102 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1103 (struct p_header *)p, sizeof(*p), 0);
1104 drbd_put_data_sock(mdev);
1105
1106 return ok;
1107}
1108
1109int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1110{
1111 if (cancel)
1112 return 1;
1113 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1114}
1115
1116/**
1117 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1118 * @mdev: DRBD device.
1119 * @w: work object.
1120 * @cancel: The connection will be closed anyways
1121 */
1122int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1123{
1124 struct drbd_request *req = container_of(w, struct drbd_request, w);
1125 int ok;
1126
1127 if (unlikely(cancel)) {
1128 req_mod(req, send_canceled);
1129 return 1;
1130 }
1131
1132 ok = drbd_send_dblock(mdev, req);
1133 req_mod(req, ok ? handed_over_to_network : send_failed);
1134
1135 return ok;
1136}
1137
1138/**
1139 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1140 * @mdev: DRBD device.
1141 * @w: work object.
1142 * @cancel: The connection will be closed anyways
1143 */
1144int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1145{
1146 struct drbd_request *req = container_of(w, struct drbd_request, w);
1147 int ok;
1148
1149 if (unlikely(cancel)) {
1150 req_mod(req, send_canceled);
1151 return 1;
1152 }
1153
1154 ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
1155 (unsigned long)req);
1156
1157 if (!ok) {
1158 /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
1159 * so this is probably redundant */
1160 if (mdev->state.conn >= C_CONNECTED)
1161 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
1162 }
1163 req_mod(req, ok ? handed_over_to_network : send_failed);
1164
1165 return ok;
1166}
1167
1168static int _drbd_may_sync_now(struct drbd_conf *mdev)
1169{
1170 struct drbd_conf *odev = mdev;
1171
1172 while (1) {
1173 if (odev->sync_conf.after == -1)
1174 return 1;
1175 odev = minor_to_mdev(odev->sync_conf.after);
1176 ERR_IF(!odev) return 1;
1177 if ((odev->state.conn >= C_SYNC_SOURCE &&
1178 odev->state.conn <= C_PAUSED_SYNC_T) ||
1179 odev->state.aftr_isp || odev->state.peer_isp ||
1180 odev->state.user_isp)
1181 return 0;
1182 }
1183}
1184
1185/**
1186 * _drbd_pause_after() - Pause resync on all devices that may not resync now
1187 * @mdev: DRBD device.
1188 *
1189 * Called from process context only (admin command and after_state_ch).
1190 */
1191static int _drbd_pause_after(struct drbd_conf *mdev)
1192{
1193 struct drbd_conf *odev;
1194 int i, rv = 0;
1195
1196 for (i = 0; i < minor_count; i++) {
1197 odev = minor_to_mdev(i);
1198 if (!odev)
1199 continue;
1200 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1201 continue;
1202 if (!_drbd_may_sync_now(odev))
1203 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1204 != SS_NOTHING_TO_DO);
1205 }
1206
1207 return rv;
1208}
1209
1210/**
1211 * _drbd_resume_next() - Resume resync on all devices that may resync now
1212 * @mdev: DRBD device.
1213 *
1214 * Called from process context only (admin command and worker).
1215 */
1216static int _drbd_resume_next(struct drbd_conf *mdev)
1217{
1218 struct drbd_conf *odev;
1219 int i, rv = 0;
1220
1221 for (i = 0; i < minor_count; i++) {
1222 odev = minor_to_mdev(i);
1223 if (!odev)
1224 continue;
1225 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1226 continue;
1227 if (odev->state.aftr_isp) {
1228 if (_drbd_may_sync_now(odev))
1229 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1230 CS_HARD, NULL)
1231 != SS_NOTHING_TO_DO) ;
1232 }
1233 }
1234 return rv;
1235}
1236
1237void resume_next_sg(struct drbd_conf *mdev)
1238{
1239 write_lock_irq(&global_state_lock);
1240 _drbd_resume_next(mdev);
1241 write_unlock_irq(&global_state_lock);
1242}
1243
1244void suspend_other_sg(struct drbd_conf *mdev)
1245{
1246 write_lock_irq(&global_state_lock);
1247 _drbd_pause_after(mdev);
1248 write_unlock_irq(&global_state_lock);
1249}
1250
1251static int sync_after_error(struct drbd_conf *mdev, int o_minor)
1252{
1253 struct drbd_conf *odev;
1254
1255 if (o_minor == -1)
1256 return NO_ERROR;
1257 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
1258 return ERR_SYNC_AFTER;
1259
1260 /* check for loops */
1261 odev = minor_to_mdev(o_minor);
1262 while (1) {
1263 if (odev == mdev)
1264 return ERR_SYNC_AFTER_CYCLE;
1265
1266 /* dependency chain ends here, no cycles. */
1267 if (odev->sync_conf.after == -1)
1268 return NO_ERROR;
1269
1270 /* follow the dependency chain */
1271 odev = minor_to_mdev(odev->sync_conf.after);
1272 }
1273}
1274
1275int drbd_alter_sa(struct drbd_conf *mdev, int na)
1276{
1277 int changes;
1278 int retcode;
1279
1280 write_lock_irq(&global_state_lock);
1281 retcode = sync_after_error(mdev, na);
1282 if (retcode == NO_ERROR) {
1283 mdev->sync_conf.after = na;
1284 do {
1285 changes = _drbd_pause_after(mdev);
1286 changes |= _drbd_resume_next(mdev);
1287 } while (changes);
1288 }
1289 write_unlock_irq(&global_state_lock);
1290 return retcode;
1291}
1292
1293/**
1294 * drbd_start_resync() - Start the resync process
1295 * @mdev: DRBD device.
1296 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1297 *
1298 * This function might bring you directly into one of the
1299 * C_PAUSED_SYNC_* states.
1300 */
1301void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1302{
1303 union drbd_state ns;
1304 int r;
1305
1306 if (mdev->state.conn >= C_SYNC_SOURCE) {
1307 dev_err(DEV, "Resync already running!\n");
1308 return;
1309 }
1310
1311 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1312 drbd_rs_cancel_all(mdev);
1313
1314 if (side == C_SYNC_TARGET) {
1315 /* Since application IO was locked out during C_WF_BITMAP_T and
1316 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1317 we check that we might make the data inconsistent. */
1318 r = drbd_khelper(mdev, "before-resync-target");
1319 r = (r >> 8) & 0xff;
1320 if (r > 0) {
1321 dev_info(DEV, "before-resync-target handler returned %d, "
1322 "dropping connection.\n", r);
1323 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1324 return;
1325 }
1326 }
1327
1328 drbd_state_lock(mdev);
1329
1330 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1331 drbd_state_unlock(mdev);
1332 return;
1333 }
1334
1335 if (side == C_SYNC_TARGET) {
1336 mdev->bm_resync_fo = 0;
1337 } else /* side == C_SYNC_SOURCE */ {
1338 u64 uuid;
1339
1340 get_random_bytes(&uuid, sizeof(u64));
1341 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1342 drbd_send_sync_uuid(mdev, uuid);
1343
1344 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1345 }
1346
1347 write_lock_irq(&global_state_lock);
1348 ns = mdev->state;
1349
1350 ns.aftr_isp = !_drbd_may_sync_now(mdev);
1351
1352 ns.conn = side;
1353
1354 if (side == C_SYNC_TARGET)
1355 ns.disk = D_INCONSISTENT;
1356 else /* side == C_SYNC_SOURCE */
1357 ns.pdsk = D_INCONSISTENT;
1358
1359 r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1360 ns = mdev->state;
1361
1362 if (ns.conn < C_CONNECTED)
1363 r = SS_UNKNOWN_ERROR;
1364
1365 if (r == SS_SUCCESS) {
1366 mdev->rs_total =
1367 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
1368 mdev->rs_failed = 0;
1369 mdev->rs_paused = 0;
1370 mdev->rs_start =
1371 mdev->rs_mark_time = jiffies;
1372 mdev->rs_same_csum = 0;
1373 _drbd_pause_after(mdev);
1374 }
1375 write_unlock_irq(&global_state_lock);
1376 drbd_state_unlock(mdev);
1377 put_ldev(mdev);
1378
1379 if (r == SS_SUCCESS) {
1380 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1381 drbd_conn_str(ns.conn),
1382 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1383 (unsigned long) mdev->rs_total);
1384
1385 if (mdev->rs_total == 0) {
1386 /* Peer still reachable? Beware of failing before-resync-target handlers! */
1387 request_ping(mdev);
1388 __set_current_state(TASK_INTERRUPTIBLE);
1389 schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */
1390 drbd_resync_finished(mdev);
1391 return;
1392 }
1393
1394 /* ns.conn may already be != mdev->state.conn,
1395 * we may have been paused in between, or become paused until
1396 * the timer triggers.
1397 * No matter, that is handled in resync_timer_fn() */
1398 if (ns.conn == C_SYNC_TARGET)
1399 mod_timer(&mdev->resync_timer, jiffies);
1400
1401 drbd_md_sync(mdev);
1402 }
1403}
1404
1405int drbd_worker(struct drbd_thread *thi)
1406{
1407 struct drbd_conf *mdev = thi->mdev;
1408 struct drbd_work *w = NULL;
1409 LIST_HEAD(work_list);
1410 int intr = 0, i;
1411
1412 sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
1413
1414 while (get_t_state(thi) == Running) {
1415 drbd_thread_current_set_cpu(mdev);
1416
1417 if (down_trylock(&mdev->data.work.s)) {
1418 mutex_lock(&mdev->data.mutex);
1419 if (mdev->data.socket && !mdev->net_conf->no_cork)
1420 drbd_tcp_uncork(mdev->data.socket);
1421 mutex_unlock(&mdev->data.mutex);
1422
1423 intr = down_interruptible(&mdev->data.work.s);
1424
1425 mutex_lock(&mdev->data.mutex);
1426 if (mdev->data.socket && !mdev->net_conf->no_cork)
1427 drbd_tcp_cork(mdev->data.socket);
1428 mutex_unlock(&mdev->data.mutex);
1429 }
1430
1431 if (intr) {
1432 D_ASSERT(intr == -EINTR);
1433 flush_signals(current);
1434 ERR_IF (get_t_state(thi) == Running)
1435 continue;
1436 break;
1437 }
1438
1439 if (get_t_state(thi) != Running)
1440 break;
1441 /* With this break, we have done a down() but not consumed
1442 the entry from the list. The cleanup code takes care of
1443 this... */
1444
1445 w = NULL;
1446 spin_lock_irq(&mdev->data.work.q_lock);
1447 ERR_IF(list_empty(&mdev->data.work.q)) {
1448 /* something terribly wrong in our logic.
1449 * we were able to down() the semaphore,
1450 * but the list is empty... doh.
1451 *
1452 * what is the best thing to do now?
1453 * try again from scratch, restarting the receiver,
1454 * asender, whatnot? could break even more ugly,
1455 * e.g. when we are primary, but no good local data.
1456 *
1457 * I'll try to get away just starting over this loop.
1458 */
1459 spin_unlock_irq(&mdev->data.work.q_lock);
1460 continue;
1461 }
1462 w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
1463 list_del_init(&w->list);
1464 spin_unlock_irq(&mdev->data.work.q_lock);
1465
1466 if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
1467 /* dev_warn(DEV, "worker: a callback failed! \n"); */
1468 if (mdev->state.conn >= C_CONNECTED)
1469 drbd_force_state(mdev,
1470 NS(conn, C_NETWORK_FAILURE));
1471 }
1472 }
1473 D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
1474 D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
1475
1476 spin_lock_irq(&mdev->data.work.q_lock);
1477 i = 0;
1478 while (!list_empty(&mdev->data.work.q)) {
1479 list_splice_init(&mdev->data.work.q, &work_list);
1480 spin_unlock_irq(&mdev->data.work.q_lock);
1481
1482 while (!list_empty(&work_list)) {
1483 w = list_entry(work_list.next, struct drbd_work, list);
1484 list_del_init(&w->list);
1485 w->cb(mdev, w, 1);
1486 i++; /* dead debugging code */
1487 }
1488
1489 spin_lock_irq(&mdev->data.work.q_lock);
1490 }
1491 sema_init(&mdev->data.work.s, 0);
1492 /* DANGEROUS race: if someone did queue his work within the spinlock,
1493 * but up() ed outside the spinlock, we could get an up() on the
1494 * semaphore without corresponding list entry.
1495 * So don't do that.
1496 */
1497 spin_unlock_irq(&mdev->data.work.q_lock);
1498
1499 D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
1500 /* _drbd_set_state only uses stop_nowait.
1501 * wait here for the Exiting receiver. */
1502 drbd_thread_stop(&mdev->receiver);
1503 drbd_mdev_cleanup(mdev);
1504
1505 dev_info(DEV, "worker terminated\n");
1506
1507 clear_bit(DEVICE_DYING, &mdev->flags);
1508 clear_bit(CONFIG_PENDING, &mdev->flags);
1509 wake_up(&mdev->state_wait);
1510
1511 return 0;
1512}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
new file mode 100644
index 000000000000..f93fa111ce50
--- /dev/null
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -0,0 +1,91 @@
1#ifndef _DRBD_WRAPPERS_H
2#define _DRBD_WRAPPERS_H
3
4#include <linux/ctype.h>
5#include <linux/mm.h>
6
7/* see get_sb_bdev and bd_claim */
8extern char *drbd_sec_holder;
9
10/* sets the number of 512 byte sectors of our virtual device */
11static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
12 sector_t size)
13{
14 /* set_capacity(mdev->this_bdev->bd_disk, size); */
15 set_capacity(mdev->vdisk, size);
16 mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
17}
18
19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20
21static inline int drbd_bio_has_active_page(struct bio *bio)
22{
23 struct bio_vec *bvec;
24 int i;
25
26 __bio_for_each_segment(bvec, bio, i, 0) {
27 if (page_count(bvec->bv_page) > 1)
28 return 1;
29 }
30
31 return 0;
32}
33
34/* bi_end_io handlers */
35extern void drbd_md_io_complete(struct bio *bio, int error);
36extern void drbd_endio_read_sec(struct bio *bio, int error);
37extern void drbd_endio_write_sec(struct bio *bio, int error);
38extern void drbd_endio_pri(struct bio *bio, int error);
39
40/*
41 * used to submit our private bio
42 */
43static inline void drbd_generic_make_request(struct drbd_conf *mdev,
44 int fault_type, struct bio *bio)
45{
46 __release(local);
47 if (!bio->bi_bdev) {
48 printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
49 "bio->bi_bdev == NULL\n",
50 mdev_to_minor(mdev));
51 dump_stack();
52 bio_endio(bio, -ENODEV);
53 return;
54 }
55
56 if (FAULT_ACTIVE(mdev, fault_type))
57 bio_endio(bio, -EIO);
58 else
59 generic_make_request(bio);
60}
61
62static inline void drbd_plug_device(struct drbd_conf *mdev)
63{
64 struct request_queue *q;
65 q = bdev_get_queue(mdev->this_bdev);
66
67 spin_lock_irq(q->queue_lock);
68
69/* XXX the check on !blk_queue_plugged is redundant,
70 * implicitly checked in blk_plug_device */
71
72 if (!blk_queue_plugged(q)) {
73 blk_plug_device(q);
74 del_timer(&q->unplug_timer);
75 /* unplugging should not happen automatically... */
76 }
77 spin_unlock_irq(q->queue_lock);
78}
79
80static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
81{
82 return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
83 == CRYPTO_ALG_TYPE_HASH;
84}
85
86#ifndef __CHECKER__
87# undef __cond_lock
88# define __cond_lock(x,c) (c)
89#endif
90
91#endif
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..c30dfc006108 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
15#include <linux/aio_abi.h> 15#include <linux/aio_abi.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/backing-dev.h>
18#include <linux/uio.h> 19#include <linux/uio.h>
19 20
20#define DEBUG 0 21#define DEBUG 0
@@ -32,6 +33,9 @@
32#include <linux/workqueue.h> 33#include <linux/workqueue.h>
33#include <linux/security.h> 34#include <linux/security.h>
34#include <linux/eventfd.h> 35#include <linux/eventfd.h>
36#include <linux/blkdev.h>
37#include <linux/mempool.h>
38#include <linux/hash.h>
35 39
36#include <asm/kmap_types.h> 40#include <asm/kmap_types.h>
37#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
60static DEFINE_SPINLOCK(fput_lock); 64static DEFINE_SPINLOCK(fput_lock);
61static LIST_HEAD(fput_head); 65static LIST_HEAD(fput_head);
62 66
67#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
68#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
69struct aio_batch_entry {
70 struct hlist_node list;
71 struct address_space *mapping;
72};
73mempool_t *abe_pool;
74
63static void aio_kick_handler(struct work_struct *); 75static void aio_kick_handler(struct work_struct *);
64static void aio_queue_work(struct kioctx *); 76static void aio_queue_work(struct kioctx *);
65 77
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
73 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 85 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
74 86
75 aio_wq = create_workqueue("aio"); 87 aio_wq = create_workqueue("aio");
88 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
89 BUG_ON(!abe_pool);
76 90
77 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 91 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
78 92
@@ -1531,8 +1545,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
1531 return 1; 1545 return 1;
1532} 1546}
1533 1547
1548static void aio_batch_add(struct address_space *mapping,
1549 struct hlist_head *batch_hash)
1550{
1551 struct aio_batch_entry *abe;
1552 struct hlist_node *pos;
1553 unsigned bucket;
1554
1555 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1556 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1557 if (abe->mapping == mapping)
1558 return;
1559 }
1560
1561 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1562 BUG_ON(!igrab(mapping->host));
1563 abe->mapping = mapping;
1564 hlist_add_head(&abe->list, &batch_hash[bucket]);
1565 return;
1566}
1567
1568static void aio_batch_free(struct hlist_head *batch_hash)
1569{
1570 struct aio_batch_entry *abe;
1571 struct hlist_node *pos, *n;
1572 int i;
1573
1574 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1575 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1576 blk_run_address_space(abe->mapping);
1577 iput(abe->mapping->host);
1578 hlist_del(&abe->list);
1579 mempool_free(abe, abe_pool);
1580 }
1581 }
1582}
1583
1534static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1584static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1535 struct iocb *iocb) 1585 struct iocb *iocb, struct hlist_head *batch_hash)
1536{ 1586{
1537 struct kiocb *req; 1587 struct kiocb *req;
1538 struct file *file; 1588 struct file *file;
@@ -1608,6 +1658,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1608 ; 1658 ;
1609 } 1659 }
1610 spin_unlock_irq(&ctx->ctx_lock); 1660 spin_unlock_irq(&ctx->ctx_lock);
1661 if (req->ki_opcode == IOCB_CMD_PREAD ||
1662 req->ki_opcode == IOCB_CMD_PREADV ||
1663 req->ki_opcode == IOCB_CMD_PWRITE ||
1664 req->ki_opcode == IOCB_CMD_PWRITEV)
1665 aio_batch_add(file->f_mapping, batch_hash);
1666
1611 aio_put_req(req); /* drop extra ref to req */ 1667 aio_put_req(req); /* drop extra ref to req */
1612 return 0; 1668 return 0;
1613 1669
@@ -1635,6 +1691,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1635 struct kioctx *ctx; 1691 struct kioctx *ctx;
1636 long ret = 0; 1692 long ret = 0;
1637 int i; 1693 int i;
1694 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
1638 1695
1639 if (unlikely(nr < 0)) 1696 if (unlikely(nr < 0))
1640 return -EINVAL; 1697 return -EINVAL;
@@ -1666,10 +1723,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1666 break; 1723 break;
1667 } 1724 }
1668 1725
1669 ret = io_submit_one(ctx, user_iocb, &tmp); 1726 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
1670 if (ret) 1727 if (ret)
1671 break; 1728 break;
1672 } 1729 }
1730 aio_batch_free(batch_hash);
1673 1731
1674 put_ioctx(ctx); 1732 put_ioctx(ctx);
1675 return i ? i : ret; 1733 return i ? i : ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9cf4b926f8e4..dde91e7e1c3a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
405 405
406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
407{ 407{
408 return sync_blockdev(I_BDEV(filp->f_mapping->host)); 408 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
409 int error;
410
411 error = sync_blockdev(bdev);
412 if (error)
413 return error;
414
415 error = blkdev_issue_flush(bdev, NULL);
416 if (error == -EOPNOTSUPP)
417 error = 0;
418 return error;
409} 419}
410 420
411/* 421/*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..3af761c8c5cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1028 if (dio->bio) 1028 if (dio->bio)
1029 dio_bio_submit(dio); 1029 dio_bio_submit(dio);
1030 1030
1031 /* All IO is now issued, send it on its way */
1032 blk_run_address_space(inode->i_mapping);
1033
1034 /* 1031 /*
1035 * It is possible that, we return short IO due to end of file. 1032 * It is possible that, we return short IO due to end of file.
1036 * In that case, we need to release all the pages we got hold on. 1033 * In that case, we need to release all the pages we got hold on.
@@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1057 ((rw & READ) || (dio->result == dio->size))) 1054 ((rw & READ) || (dio->result == dio->size)))
1058 ret = -EIOCBQUEUED; 1055 ret = -EIOCBQUEUED;
1059 1056
1060 if (ret != -EIOCBQUEUED) 1057 if (ret != -EIOCBQUEUED) {
1058 /* All IO is now issued, send it on its way */
1059 blk_run_address_space(inode->i_mapping);
1061 dio_await_completion(dio); 1060 dio_await_completion(dio);
1061 }
1062 1062
1063 /* 1063 /*
1064 * Sync will always be dropping the final ref and completing the 1064 * Sync will always be dropping the final ref and completing the
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1124 int acquire_i_mutex = 0; 1124 int acquire_i_mutex = 0;
1125 1125
1126 if (rw & WRITE) 1126 if (rw & WRITE)
1127 rw = WRITE_ODIRECT; 1127 rw = WRITE_SYNC_PLUG;
1128 1128
1129 if (bdev) 1129 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b449e738533a..fcbc26af00e4 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word)
331 return 0; 331 return 0;
332} 332}
333 333
334static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
335 struct page *page)
336{
337 if (bdi && bdi->unplug_io_fn)
338 bdi->unplug_io_fn(bdi, page);
339}
340
341static inline void blk_run_address_space(struct address_space *mapping)
342{
343 if (mapping)
344 blk_run_backing_dev(mapping->backing_dev_info, NULL);
345}
346
334#endif /* _LINUX_BACKING_DEV_H */ 347#endif /* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5be93f18d842..474792b825d0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -450,11 +450,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
450/* 450/*
451 * remember never ever reenable interrupts between a bvec_kmap_irq and 451 * remember never ever reenable interrupts between a bvec_kmap_irq and
452 * bvec_kunmap_irq! 452 * bvec_kunmap_irq!
453 *
454 * This function MUST be inlined - it plays with the CPU interrupt flags.
455 */ 453 */
456static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, 454static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
457 unsigned long *flags)
458{ 455{
459 unsigned long addr; 456 unsigned long addr;
460 457
@@ -470,8 +467,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
470 return (char *) addr + bvec->bv_offset; 467 return (char *) addr + bvec->bv_offset;
471} 468}
472 469
473static __always_inline void bvec_kunmap_irq(char *buffer, 470static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
474 unsigned long *flags)
475{ 471{
476 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 472 unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
477 473
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 221cecd86bd3..39c601f783a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -823,19 +823,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
823 return bdev->bd_disk->queue; 823 return bdev->bd_disk->queue;
824} 824}
825 825
826static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
827 struct page *page)
828{
829 if (bdi && bdi->unplug_io_fn)
830 bdi->unplug_io_fn(bdi, page);
831}
832
833static inline void blk_run_address_space(struct address_space *mapping)
834{
835 if (mapping)
836 blk_run_backing_dev(mapping->backing_dev_info, NULL);
837}
838
839/* 826/*
840 * blk_rq_pos() : the current sector 827 * blk_rq_pos() : the current sector
841 * blk_rq_bytes() : bytes left in the entire request 828 * blk_rq_bytes() : bytes left in the entire request
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
new file mode 100644
index 000000000000..233db5c18b86
--- /dev/null
+++ b/include/linux/drbd.h
@@ -0,0 +1,350 @@
1/*
2 drbd.h
3 Kernel module for 2.6.x Kernels
4
5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6
7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8 Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9 Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10
11 drbd is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 drbd is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with drbd; see the file COPYING. If not, write to
23 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24
25*/
26#ifndef DRBD_H
27#define DRBD_H
28#include <linux/connector.h>
29#include <asm/types.h>
30
31#ifdef __KERNEL__
32#include <linux/types.h>
33#include <asm/byteorder.h>
34#else
35#include <sys/types.h>
36#include <sys/wait.h>
37#include <limits.h>
38
39/* Although the Linux source code makes a difference between
40 generic endianness and the bitfields' endianness, there is no
41 architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
42 does not match the generic endianness. */
43
44#if __BYTE_ORDER == __LITTLE_ENDIAN
45#define __LITTLE_ENDIAN_BITFIELD
46#elif __BYTE_ORDER == __BIG_ENDIAN
47#define __BIG_ENDIAN_BITFIELD
48#else
49# error "sorry, weird endianness on this box"
50#endif
51
52#endif
53
54
55extern const char *drbd_buildtag(void);
56#define REL_VERSION "8.3.3rc2"
57#define API_VERSION 88
58#define PRO_VERSION_MIN 86
59#define PRO_VERSION_MAX 91
60
61
62enum drbd_io_error_p {
63 EP_PASS_ON, /* FIXME should this better be named "Ignore"? */
64 EP_CALL_HELPER,
65 EP_DETACH
66};
67
68enum drbd_fencing_p {
69 FP_DONT_CARE,
70 FP_RESOURCE,
71 FP_STONITH
72};
73
74enum drbd_disconnect_p {
75 DP_RECONNECT,
76 DP_DROP_NET_CONF,
77 DP_FREEZE_IO
78};
79
80enum drbd_after_sb_p {
81 ASB_DISCONNECT,
82 ASB_DISCARD_YOUNGER_PRI,
83 ASB_DISCARD_OLDER_PRI,
84 ASB_DISCARD_ZERO_CHG,
85 ASB_DISCARD_LEAST_CHG,
86 ASB_DISCARD_LOCAL,
87 ASB_DISCARD_REMOTE,
88 ASB_CONSENSUS,
89 ASB_DISCARD_SECONDARY,
90 ASB_CALL_HELPER,
91 ASB_VIOLENTLY
92};
93
94/* KEEP the order, do not delete or insert. Only append. */
95enum drbd_ret_codes {
96 ERR_CODE_BASE = 100,
97 NO_ERROR = 101,
98 ERR_LOCAL_ADDR = 102,
99 ERR_PEER_ADDR = 103,
100 ERR_OPEN_DISK = 104,
101 ERR_OPEN_MD_DISK = 105,
102 ERR_DISK_NOT_BDEV = 107,
103 ERR_MD_NOT_BDEV = 108,
104 ERR_DISK_TO_SMALL = 111,
105 ERR_MD_DISK_TO_SMALL = 112,
106 ERR_BDCLAIM_DISK = 114,
107 ERR_BDCLAIM_MD_DISK = 115,
108 ERR_MD_IDX_INVALID = 116,
109 ERR_IO_MD_DISK = 118,
110 ERR_MD_INVALID = 119,
111 ERR_AUTH_ALG = 120,
112 ERR_AUTH_ALG_ND = 121,
113 ERR_NOMEM = 122,
114 ERR_DISCARD = 123,
115 ERR_DISK_CONFIGURED = 124,
116 ERR_NET_CONFIGURED = 125,
117 ERR_MANDATORY_TAG = 126,
118 ERR_MINOR_INVALID = 127,
119 ERR_INTR = 129, /* EINTR */
120 ERR_RESIZE_RESYNC = 130,
121 ERR_NO_PRIMARY = 131,
122 ERR_SYNC_AFTER = 132,
123 ERR_SYNC_AFTER_CYCLE = 133,
124 ERR_PAUSE_IS_SET = 134,
125 ERR_PAUSE_IS_CLEAR = 135,
126 ERR_PACKET_NR = 137,
127 ERR_NO_DISK = 138,
128 ERR_NOT_PROTO_C = 139,
129 ERR_NOMEM_BITMAP = 140,
130 ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
131 ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
132 ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
133 ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
134 ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
135 ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
136 ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
137 ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
138 ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
139 ERR_DATA_NOT_CURRENT = 150,
140 ERR_CONNECTED = 151, /* DRBD 8.3 only */
141 ERR_PERM = 152,
142
143 /* insert new ones above this line */
144 AFTER_LAST_ERR_CODE
145};
146
147#define DRBD_PROT_A 1
148#define DRBD_PROT_B 2
149#define DRBD_PROT_C 3
150
151enum drbd_role {
152 R_UNKNOWN = 0,
153 R_PRIMARY = 1, /* role */
154 R_SECONDARY = 2, /* role */
155 R_MASK = 3,
156};
157
158/* The order of these constants is important.
159 * The lower ones (<C_WF_REPORT_PARAMS) indicate
160 * that there is no socket!
161 * >=C_WF_REPORT_PARAMS ==> There is a socket
162 */
163enum drbd_conns {
164 C_STANDALONE,
165 C_DISCONNECTING, /* Temporary state on the way to StandAlone. */
166 C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
167
168 /* These temporary states are all used on the way
169 * from >= C_CONNECTED to Unconnected.
170 * They are the 'disconnect reason' states;
171 * I do not allow changing between them. */
172 C_TIMEOUT,
173 C_BROKEN_PIPE,
174 C_NETWORK_FAILURE,
175 C_PROTOCOL_ERROR,
176 C_TEAR_DOWN,
177
178 C_WF_CONNECTION,
179 C_WF_REPORT_PARAMS, /* we have a socket */
180 C_CONNECTED, /* we have introduced each other */
181 C_STARTING_SYNC_S, /* starting full sync by admin request. */
182 C_STARTING_SYNC_T, /* starting full sync by admin request. */
183 C_WF_BITMAP_S,
184 C_WF_BITMAP_T,
185 C_WF_SYNC_UUID,
186
187 /* All SyncStates are tested with this comparison
188 * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
189 C_SYNC_SOURCE,
190 C_SYNC_TARGET,
191 C_VERIFY_S,
192 C_VERIFY_T,
193 C_PAUSED_SYNC_S,
194 C_PAUSED_SYNC_T,
195 C_MASK = 31
196};
197
198enum drbd_disk_state {
199 D_DISKLESS,
200 D_ATTACHING, /* In the process of reading the meta-data */
201 D_FAILED, /* Becomes D_DISKLESS as soon as we have told the peer */
202 /* when >= D_FAILED it is legal to access mdev->bc */
203 D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
204 D_INCONSISTENT,
205 D_OUTDATED,
206 D_UNKNOWN, /* Only used for the peer, never for myself */
207 D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
208 D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */
209 D_MASK = 15
210};
211
212union drbd_state {
213/* According to gcc's docs:
214 * The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1)
215 * is determined by the ABI.
216 * Pointed out by Maxim Uvarov <muvarov@ru.mvista.com>:
217 * even though we transmit as "cpu_to_be32(state)",
218 * the offsets of the bitfields still need to be swapped
219 * on different endianness.
220 */
221 struct {
222#if defined(__LITTLE_ENDIAN_BITFIELD)
223 unsigned role:2 ; /* 3/4 primary/secondary/unknown */
224 unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
225 unsigned conn:5 ; /* 17/32 cstates */
226 unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
227 unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
228 unsigned susp:1 ; /* 2/2 IO suspended no/yes */
229 unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
230 unsigned peer_isp:1 ;
231 unsigned user_isp:1 ;
232 unsigned _pad:11; /* 0 unused */
233#elif defined(__BIG_ENDIAN_BITFIELD)
234 unsigned _pad:11; /* 0 unused */
235 unsigned user_isp:1 ;
236 unsigned peer_isp:1 ;
237 unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
238 unsigned susp:1 ; /* 2/2 IO suspended no/yes */
239 unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
240 unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
241 unsigned conn:5 ; /* 17/32 cstates */
242 unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
243 unsigned role:2 ; /* 3/4 primary/secondary/unknown */
244#else
245# error "this endianess is not supported"
246#endif
247 };
248 unsigned int i;
249};
250
251enum drbd_state_ret_codes {
252 SS_CW_NO_NEED = 4,
253 SS_CW_SUCCESS = 3,
254 SS_NOTHING_TO_DO = 2,
255 SS_SUCCESS = 1,
256 SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
257 SS_TWO_PRIMARIES = -1,
258 SS_NO_UP_TO_DATE_DISK = -2,
259 SS_NO_LOCAL_DISK = -4,
260 SS_NO_REMOTE_DISK = -5,
261 SS_CONNECTED_OUTDATES = -6,
262 SS_PRIMARY_NOP = -7,
263 SS_RESYNC_RUNNING = -8,
264 SS_ALREADY_STANDALONE = -9,
265 SS_CW_FAILED_BY_PEER = -10,
266 SS_IS_DISKLESS = -11,
267 SS_DEVICE_IN_USE = -12,
268 SS_NO_NET_CONFIG = -13,
269 SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
270 SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
271 SS_LOWER_THAN_OUTDATED = -16,
272 SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
273 SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
274 SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
275 SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
276};
277
278/* from drbd_strings.c */
279extern const char *drbd_conn_str(enum drbd_conns);
280extern const char *drbd_role_str(enum drbd_role);
281extern const char *drbd_disk_str(enum drbd_disk_state);
282extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
283
284#define SHARED_SECRET_MAX 64
285
286#define MDF_CONSISTENT (1 << 0)
287#define MDF_PRIMARY_IND (1 << 1)
288#define MDF_CONNECTED_IND (1 << 2)
289#define MDF_FULL_SYNC (1 << 3)
290#define MDF_WAS_UP_TO_DATE (1 << 4)
291#define MDF_PEER_OUT_DATED (1 << 5)
292#define MDF_CRASHED_PRIMARY (1 << 6)
293
294enum drbd_uuid_index {
295 UI_CURRENT,
296 UI_BITMAP,
297 UI_HISTORY_START,
298 UI_HISTORY_END,
299 UI_SIZE, /* nl-packet: number of dirty bits */
300 UI_FLAGS, /* nl-packet: flags */
301 UI_EXTENDED_SIZE /* Everything. */
302};
303
304enum drbd_timeout_flag {
305 UT_DEFAULT = 0,
306 UT_DEGRADED = 1,
307 UT_PEER_OUTDATED = 2,
308};
309
310#define UUID_JUST_CREATED ((__u64)4)
311
312#define DRBD_MAGIC 0x83740267
313#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
314
315/* these are of type "int" */
316#define DRBD_MD_INDEX_INTERNAL -1
317#define DRBD_MD_INDEX_FLEX_EXT -2
318#define DRBD_MD_INDEX_FLEX_INT -3
319
320/* Start of the new netlink/connector stuff */
321
322#define DRBD_NL_CREATE_DEVICE 0x01
323#define DRBD_NL_SET_DEFAULTS 0x02
324
325/* The following line should be moved over to linux/connector.h
326 * when the time comes */
327#ifndef CN_IDX_DRBD
328# define CN_IDX_DRBD 0x4
329/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
330#endif
331#define CN_VAL_DRBD 0x1
332
333/* For searching a vacant cn_idx value */
334#define CN_IDX_STEP 6977
335
336struct drbd_nl_cfg_req {
337 int packet_type;
338 unsigned int drbd_minor;
339 int flags;
340 unsigned short tag_list[];
341};
342
343struct drbd_nl_cfg_reply {
344 int packet_type;
345 unsigned int minor;
346 int ret_code; /* enum ret_code or set_st_err_t */
347 unsigned short tag_list[]; /* only used with get_* calls */
348};
349
350#endif
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
new file mode 100644
index 000000000000..9d067ce46960
--- /dev/null
+++ b/include/linux/drbd_limits.h
@@ -0,0 +1,137 @@
1/*
2 drbd_limits.h
3 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
4*/
5
6/*
7 * Our current limitations.
8 * Some of them are hard limits,
9 * some of them are arbitrary range limits, that make it easier to provide
10 * feedback about nonsense settings for certain configurable values.
11 */
12
13#ifndef DRBD_LIMITS_H
14#define DRBD_LIMITS_H 1
15
16#define DEBUG_RANGE_CHECK 0
17
18#define DRBD_MINOR_COUNT_MIN 1
19#define DRBD_MINOR_COUNT_MAX 255
20
21#define DRBD_DIALOG_REFRESH_MIN 0
22#define DRBD_DIALOG_REFRESH_MAX 600
23
24/* valid port number */
25#define DRBD_PORT_MIN 1
26#define DRBD_PORT_MAX 0xffff
27
28/* startup { */
29 /* if you want more than 3.4 days, disable */
30#define DRBD_WFC_TIMEOUT_MIN 0
31#define DRBD_WFC_TIMEOUT_MAX 300000
32#define DRBD_WFC_TIMEOUT_DEF 0
33
34#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
35#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
36#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
37
38#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
39#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
40#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
41/* }*/
42
43/* net { */
44 /* timeout, unit centiseconds
45 * more than one minute timeout is not useful */
46#define DRBD_TIMEOUT_MIN 1
47#define DRBD_TIMEOUT_MAX 600
48#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
49
50 /* active connection retries when C_WF_CONNECTION */
51#define DRBD_CONNECT_INT_MIN 1
52#define DRBD_CONNECT_INT_MAX 120
53#define DRBD_CONNECT_INT_DEF 10 /* seconds */
54
55 /* keep-alive probes when idle */
56#define DRBD_PING_INT_MIN 1
57#define DRBD_PING_INT_MAX 120
58#define DRBD_PING_INT_DEF 10
59
60 /* timeout for the ping packets.*/
61#define DRBD_PING_TIMEO_MIN 1
62#define DRBD_PING_TIMEO_MAX 100
63#define DRBD_PING_TIMEO_DEF 5
64
65 /* max number of write requests between write barriers */
66#define DRBD_MAX_EPOCH_SIZE_MIN 1
67#define DRBD_MAX_EPOCH_SIZE_MAX 20000
68#define DRBD_MAX_EPOCH_SIZE_DEF 2048
69
70 /* I don't think that a tcp send buffer of more than 10M is useful */
71#define DRBD_SNDBUF_SIZE_MIN 0
72#define DRBD_SNDBUF_SIZE_MAX (10<<20)
73#define DRBD_SNDBUF_SIZE_DEF (2*65535)
74
75#define DRBD_RCVBUF_SIZE_MIN 0
76#define DRBD_RCVBUF_SIZE_MAX (10<<20)
77#define DRBD_RCVBUF_SIZE_DEF (2*65535)
78
79 /* @4k PageSize -> 128kB - 512MB */
80#define DRBD_MAX_BUFFERS_MIN 32
81#define DRBD_MAX_BUFFERS_MAX 131072
82#define DRBD_MAX_BUFFERS_DEF 2048
83
84 /* @4k PageSize -> 4kB - 512MB */
85#define DRBD_UNPLUG_WATERMARK_MIN 1
86#define DRBD_UNPLUG_WATERMARK_MAX 131072
87#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
88
89 /* 0 is disabled.
90 * 200 should be more than enough even for very short timeouts */
91#define DRBD_KO_COUNT_MIN 0
92#define DRBD_KO_COUNT_MAX 200
93#define DRBD_KO_COUNT_DEF 0
94/* } */
95
96/* syncer { */
97 /* FIXME allow rate to be zero? */
98#define DRBD_RATE_MIN 1
99/* channel bonding 10 GbE, or other hardware */
100#define DRBD_RATE_MAX (4 << 20)
101#define DRBD_RATE_DEF 250 /* kb/second */
102
103 /* less than 7 would hit performance unnecessarily.
104 * 3833 is the largest prime that still does fit
105 * into 64 sectors of activity log */
106#define DRBD_AL_EXTENTS_MIN 7
107#define DRBD_AL_EXTENTS_MAX 3833
108#define DRBD_AL_EXTENTS_DEF 127
109
110#define DRBD_AFTER_MIN -1
111#define DRBD_AFTER_MAX 255
112#define DRBD_AFTER_DEF -1
113
114/* } */
115
116/* drbdsetup XY resize -d Z
117 * you are free to reduce the device size to nothing, if you want to.
118 * the upper limit with 64bit kernel, enough ram and flexible meta data
119 * is 16 TB, currently. */
120/* DRBD_MAX_SECTORS */
121#define DRBD_DISK_SIZE_SECT_MIN 0
122#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
123#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
124
125#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
126#define DRBD_FENCING_DEF FP_DONT_CARE
127#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
128#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
129#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
130#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
131
132#define DRBD_MAX_BIO_BVECS_MIN 0
133#define DRBD_MAX_BIO_BVECS_MAX 128
134#define DRBD_MAX_BIO_BVECS_DEF 0
135
136#undef RANGE
137#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
new file mode 100644
index 000000000000..db5721ad50d1
--- /dev/null
+++ b/include/linux/drbd_nl.h
@@ -0,0 +1,137 @@
1/*
2 PACKET( name,
3 TYPE ( pn, pr, member )
4 ...
5 )
6
7 You may never reissue one of the pn arguments
8*/
9
10#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
11#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
12#endif
13
14NL_PACKET(primary, 1,
15 NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
16)
17
18NL_PACKET(secondary, 2, )
19
20NL_PACKET(disk_conf, 3,
21 NL_INT64( 2, T_MAY_IGNORE, disk_size)
22 NL_STRING( 3, T_MANDATORY, backing_dev, 128)
23 NL_STRING( 4, T_MANDATORY, meta_dev, 128)
24 NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
25 NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
26 NL_INTEGER( 7, T_MAY_IGNORE, fencing)
27 NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
28 NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
29 NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
30 /* 55 max_bio_size was available in 8.2.6rc2 */
31 NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
32 NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
33 NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
34)
35
36NL_PACKET(detach, 4, )
37
38NL_PACKET(net_conf, 5,
39 NL_STRING( 8, T_MANDATORY, my_addr, 128)
40 NL_STRING( 9, T_MANDATORY, peer_addr, 128)
41 NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
42 NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
43 NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
44 NL_INTEGER( 14, T_MAY_IGNORE, timeout)
45 NL_INTEGER( 15, T_MANDATORY, wire_protocol)
46 NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
47 NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
48 NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
49 NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
50 NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
51 NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
52 NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
53 NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
54 NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
55 NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
56 NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
57 NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
58 NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
59 /* 59 addr_family was available in GIT, never released */
60 NL_BIT( 60, T_MANDATORY, mind_af)
61 NL_BIT( 27, T_MAY_IGNORE, want_lose)
62 NL_BIT( 28, T_MAY_IGNORE, two_primaries)
63 NL_BIT( 41, T_MAY_IGNORE, always_asbp)
64 NL_BIT( 61, T_MAY_IGNORE, no_cork)
65 NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
66)
67
68NL_PACKET(disconnect, 6, )
69
70NL_PACKET(resize, 7,
71 NL_INT64( 29, T_MAY_IGNORE, resize_size)
72)
73
74NL_PACKET(syncer_conf, 8,
75 NL_INTEGER( 30, T_MAY_IGNORE, rate)
76 NL_INTEGER( 31, T_MAY_IGNORE, after)
77 NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
78 NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
79 NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
80 NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
81 NL_BIT( 65, T_MAY_IGNORE, use_rle)
82)
83
84NL_PACKET(invalidate, 9, )
85NL_PACKET(invalidate_peer, 10, )
86NL_PACKET(pause_sync, 11, )
87NL_PACKET(resume_sync, 12, )
88NL_PACKET(suspend_io, 13, )
89NL_PACKET(resume_io, 14, )
90NL_PACKET(outdate, 15, )
91NL_PACKET(get_config, 16, )
92NL_PACKET(get_state, 17,
93 NL_INTEGER( 33, T_MAY_IGNORE, state_i)
94)
95
96NL_PACKET(get_uuids, 18,
97 NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
98 NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
99)
100
101NL_PACKET(get_timeout_flag, 19,
102 NL_BIT( 36, T_MAY_IGNORE, use_degraded)
103)
104
105NL_PACKET(call_helper, 20,
106 NL_STRING( 38, T_MAY_IGNORE, helper, 32)
107)
108
109/* Tag nr 42 already allocated in drbd-8.1 development. */
110
111NL_PACKET(sync_progress, 23,
112 NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
113)
114
115NL_PACKET(dump_ee, 24,
116 NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
117 NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
118 NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
119 NL_INT64( 48, T_MAY_IGNORE, ee_sector)
120 NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
121 NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
122)
123
124NL_PACKET(start_ov, 25,
125 NL_INT64( 66, T_MAY_IGNORE, start_sector)
126)
127
128NL_PACKET(new_c_uuid, 26,
129 NL_BIT( 63, T_MANDATORY, clear_bm)
130)
131
132#undef NL_PACKET
133#undef NL_INTEGER
134#undef NL_INT64
135#undef NL_BIT
136#undef NL_STRING
137
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
new file mode 100644
index 000000000000..fcdff8410e99
--- /dev/null
+++ b/include/linux/drbd_tag_magic.h
@@ -0,0 +1,83 @@
1#ifndef DRBD_TAG_MAGIC_H
2#define DRBD_TAG_MAGIC_H
3
4#define TT_END 0
5#define TT_REMOVED 0xE000
6
7/* declare packet_type enums */
8enum packet_types {
9#define NL_PACKET(name, number, fields) P_ ## name = number,
10#define NL_INTEGER(pn, pr, member)
11#define NL_INT64(pn, pr, member)
12#define NL_BIT(pn, pr, member)
13#define NL_STRING(pn, pr, member, len)
14#include "drbd_nl.h"
15 P_nl_after_last_packet,
16};
17
18/* These structs are used to deduce the size of the tag lists: */
19#define NL_PACKET(name, number, fields) \
20 struct name ## _tag_len_struct { fields };
21#define NL_INTEGER(pn, pr, member) \
22 int member; int tag_and_len ## member;
23#define NL_INT64(pn, pr, member) \
24 __u64 member; int tag_and_len ## member;
25#define NL_BIT(pn, pr, member) \
26 unsigned char member:1; int tag_and_len ## member;
27#define NL_STRING(pn, pr, member, len) \
28 unsigned char member[len]; int member ## _len; \
29 int tag_and_len ## member;
30#include "linux/drbd_nl.h"
31
32/* declare tag-list-sizes */
33static const int tag_list_sizes[] = {
34#define NL_PACKET(name, number, fields) 2 fields ,
35#define NL_INTEGER(pn, pr, member) + 4 + 4
36#define NL_INT64(pn, pr, member) + 4 + 8
37#define NL_BIT(pn, pr, member) + 4 + 1
38#define NL_STRING(pn, pr, member, len) + 4 + (len)
39#include "drbd_nl.h"
40};
41
42/* The two highest bits are used for the tag type */
43#define TT_MASK 0xC000
44#define TT_INTEGER 0x0000
45#define TT_INT64 0x4000
46#define TT_BIT 0x8000
47#define TT_STRING 0xC000
48/* The next bit indicates if processing of the tag is mandatory */
49#define T_MANDATORY 0x2000
50#define T_MAY_IGNORE 0x0000
51#define TN_MASK 0x1fff
52/* The remaining 13 bits are used to enumerate the tags */
53
54#define tag_type(T) ((T) & TT_MASK)
55#define tag_number(T) ((T) & TN_MASK)
56
57/* declare tag enums */
58#define NL_PACKET(name, number, fields) fields
59enum drbd_tags {
60#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
61#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
62#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
63#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
64#include "drbd_nl.h"
65};
66
67struct tag {
68 const char *name;
69 int type_n_flags;
70 int max_len;
71};
72
73/* declare tag names */
74#define NL_PACKET(name, number, fields) fields
75static const struct tag tag_descriptions[] = {
76#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
77#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
78#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
79#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
80#include "drbd_nl.h"
81};
82
83#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2620a8c63571..2f5fca4147c2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -129,7 +129,6 @@ struct inodes_stat_t {
129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device 129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device
130 * immediately after submission. The write equivalent 130 * immediately after submission. The write equivalent
131 * of READ_SYNC. 131 * of READ_SYNC.
132 * WRITE_ODIRECT Special case write for O_DIRECT only.
133 * SWRITE_SYNC 132 * SWRITE_SYNC
134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. 133 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
135 * See SWRITE. 134 * See SWRITE.
@@ -151,7 +150,6 @@ struct inodes_stat_t {
151#define READ_META (READ | (1 << BIO_RW_META)) 150#define READ_META (READ | (1 << BIO_RW_META))
152#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 151#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
153#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 152#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
154#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
155#define SWRITE_SYNC_PLUG \ 153#define SWRITE_SYNC_PLUG \
156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 154 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
157#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 155#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 4da4a75c3f1e..eb73632440f1 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -40,16 +40,11 @@ struct cfq_io_context {
40 struct io_context *ioc; 40 struct io_context *ioc;
41 41
42 unsigned long last_end_request; 42 unsigned long last_end_request;
43 sector_t last_request_pos;
44 43
45 unsigned long ttime_total; 44 unsigned long ttime_total;
46 unsigned long ttime_samples; 45 unsigned long ttime_samples;
47 unsigned long ttime_mean; 46 unsigned long ttime_mean;
48 47
49 unsigned int seek_samples;
50 u64 seek_total;
51 sector_t seek_mean;
52
53 struct list_head queue_list; 48 struct list_head queue_list;
54 struct hlist_node cic_list; 49 struct hlist_node cic_list;
55 50
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
new file mode 100644
index 000000000000..3a2b2d9b0472
--- /dev/null
+++ b/include/linux/lru_cache.h
@@ -0,0 +1,294 @@
1/*
2 lru_cache.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#ifndef LRU_CACHE_H
27#define LRU_CACHE_H
28
29#include <linux/list.h>
30#include <linux/slab.h>
31#include <linux/bitops.h>
32#include <linux/string.h> /* for memset */
33#include <linux/seq_file.h>
34
35/*
36This header file (and its .c file; kernel-doc of functions see there)
37 define a helper framework to easily keep track of index:label associations,
38 and changes to an "active set" of objects, as well as pending transactions,
39 to persistently record those changes.
40
41 We use an LRU policy if it is necessary to "cool down" a region currently in
42 the active set before we can "heat" a previously unused region.
43
44 Because of this latter property, it is called "lru_cache".
45 As it actually Tracks Objects in an Active SeT, we could also call it
46 toast (incidentally that is what may happen to the data on the
47 backend storage upon next resync, if we don't get it right).
48
49What for?
50
51We replicate IO (more or less synchronously) to local and remote disk.
52
53For crash recovery after replication node failure,
54 we need to resync all regions that have been target of in-flight WRITE IO
55 (in use, or "hot", regions), as we don't know whether or not those WRITEs have
56 made it to stable storage.
57
58 To avoid a "full resync", we need to persistently track these regions.
59
60 This is known as "write intent log", and can be implemented as on-disk
61 (coarse or fine grained) bitmap, or other meta data.
62
63 To avoid the overhead of frequent extra writes to this meta data area,
64 usually the condition is softened to regions that _may_ have been target of
65 in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
66 bitmap, trading frequency of meta data transactions against amount of
67 (possibly unnecessary) resync traffic.
68
69 If we set a hard limit on the area that may be "hot" at any given time, we
70 limit the amount of resync traffic needed for crash recovery.
71
72For recovery after replication link failure,
73 we need to resync all blocks that have been changed on the other replica
74 in the mean time, or, if both replicas have been changed independently [*],
75 all blocks that have been changed on either replica in the mean time.
76 [*] usually as a result of a cluster split-brain and insufficient protection.
77 but there are valid use cases to do this on purpose.
78
79 Tracking those blocks can be implemented as "dirty bitmap".
80 Having it fine-grained reduces the amount of resync traffic.
81 It should also be persistent, to allow for reboots (or crashes)
82 while the replication link is down.
83
84There are various possible implementations for persistently storing
85write intent log information, three of which are mentioned here.
86
87"Chunk dirtying"
88 The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
89 To reduce the frequency of bitmap updates for write-intent log purposes,
90 one could dirty "chunks" (of some size) at a time of the (fine grained)
91 on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
92 possible, flushing it to disk again when a previously "hot" (and on-disk
93 dirtied as full chunk) area "cools down" again (no IO in flight anymore,
94 and none expected in the near future either).
95
96"Explicit (coarse) write intent bitmap"
97 Another implementation could choose a (probably coarse) explicit bitmap,
98 for write-intent log purposes, additionally to the fine grained dirty bitmap.
99
100"Activity log"
101 Yet another implementation may keep track of the hot regions, by starting
102 with an empty set, and writing down a journal of region numbers that have
103 become "hot", or have "cooled down" again.
104
105 To be able to use a ring buffer for this journal of changes to the active
106 set, we not only record the actual changes to that set, but also record the
107 not changing members of the set in a round robin fashion. To do so, we use a
108 fixed (but configurable) number of slots which we can identify by index, and
109 associate region numbers (labels) with these indices.
110 For each transaction recording a change to the active set, we record the
111 change itself (index: -old_label, +new_label), and which index is associated
112 with which label (index: current_label) within a certain sliding window that
113 is moved further over the available indices with each such transaction.
114
115 Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
116 accurately reconstruct the active set.
117
118 Sufficiently large depends only on maximum number of active objects, and the
119 size of the sliding window recording "index: current_label" associations within
120 each transaction.
121
122 This is what we call the "activity log".
123
124 Currently we need one activity log transaction per single label change, which
125 does not give much benefit over the "dirty chunks of bitmap" approach, other
126 than potentially fewer seeks.
127
128 We plan to change the transaction format to support multiple changes per
129 transaction, which then would reduce several (disjoint, "random") updates to
130 the bitmap into one transaction to the activity log ring buffer.
131*/
132
133/* this defines an element in a tracked set
134 * .colision is for hash table lookup.
135 * When we process a new IO request, we know its sector, thus can deduce the
136 * region number (label) easily. To do the label -> object lookup without a
137 * full list walk, we use a simple hash table.
138 *
139 * .list is on one of three lists:
140 * in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
141 * lru: unused but ready to be reused or recycled
142 * (ts_refcnt == 0, lc_number != LC_FREE),
143 * free: unused but ready to be recycled
144 * (ts_refcnt == 0, lc_number == LC_FREE),
145 *
146 * an element is said to be "in the active set",
147 * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
148 *
149 * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
150 * (total memory usage 2 pages), and up to 3833 elements on the act_log
151 * lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages.
152 *
153 * We usually do not actually free these objects again, but only "recycle"
154 * them, as the change "index: -old_label, +LC_FREE" would need a transaction
155 * as well. Which also means that using a kmem_cache to allocate the objects
156 * from wastes some resources.
157 * But it avoids high order page allocations in kmalloc.
158 */
159struct lc_element {
160 struct hlist_node colision;
161 struct list_head list; /* LRU list or free list */
162 unsigned refcnt;
163 /* back "pointer" into ts_cache->element[index],
164 * for paranoia, and for "ts_element_to_index" */
165 unsigned lc_index;
166 /* if we want to track a larger set of objects,
167 * it needs to become arch independend u64 */
168 unsigned lc_number;
169
170 /* special label when on free list */
171#define LC_FREE (~0U)
172};
173
174struct lru_cache {
175 /* the least recently used item is kept at lru->prev */
176 struct list_head lru;
177 struct list_head free;
178 struct list_head in_use;
179
180 /* the pre-created kmem cache to allocate the objects from */
181 struct kmem_cache *lc_cache;
182
183 /* size of tracked objects, used to memset(,0,) them in lc_reset */
184 size_t element_size;
185 /* offset of struct lc_element member in the tracked object */
186 size_t element_off;
187
188 /* number of elements (indices) */
189 unsigned int nr_elements;
190 /* Arbitrary limit on maximum tracked objects. Practical limit is much
191 * lower due to allocation failures, probably. For typical use cases,
192 * nr_elements should be a few thousand at most.
193 * This also limits the maximum value of ts_element.ts_index, allowing the
194 * 8 high bits of .ts_index to be overloaded with flags in the future. */
195#define LC_MAX_ACTIVE (1<<24)
196
197 /* statistics */
198 unsigned used; /* number of lelements currently on in_use list */
199 unsigned long hits, misses, starving, dirty, changed;
200
201 /* see below: flag-bits for lru_cache */
202 unsigned long flags;
203
204 /* when changing the label of an index element */
205 unsigned int new_number;
206
207 /* for paranoia when changing the label of an index element */
208 struct lc_element *changing_element;
209
210 void *lc_private;
211 const char *name;
212
213 /* nr_elements there */
214 struct hlist_head *lc_slot;
215 struct lc_element **lc_element;
216};
217
218
/* Bit numbers within lru_cache.flags. */
enum {
	/* Debugging aid to catch concurrent access early.
	 * The user has to guarantee exclusive access by proper locking! */
	__LC_PARANOIA,

	/* Set while a change of the set is pending.  If another change is
	 * requested in the meantime we are "dirty", and have to defer
	 * further changing requests. */
	__LC_DIRTY,

	/* Set when the set needs to change, but there is neither a free nor
	 * an unused element available.  While "starving", no further
	 * references are given out, to guarantee that eventually some refcnt
	 * drops to zero and progress can be made again (changing the set,
	 * writing the transaction).
	 * If the statistics say we are frequently starving,
	 * nr_elements is too small. */
	__LC_STARVING,
};

/* The same flags as bit masks. */
#define LC_PARANOIA	(1 << __LC_PARANOIA)
#define LC_DIRTY	(1 << __LC_DIRTY)
#define LC_STARVING	(1 << __LC_STARVING)
240
241extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
242 unsigned e_count, size_t e_size, size_t e_off);
243extern void lc_reset(struct lru_cache *lc);
244extern void lc_destroy(struct lru_cache *lc);
245extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
246extern void lc_del(struct lru_cache *lc, struct lc_element *element);
247
248extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
249extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
250extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
251extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
252extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
253
254struct seq_file;
255extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
256
257extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
258 void (*detail) (struct seq_file *, struct lc_element *));
259
260/**
261 * lc_try_lock - can be used to stop lc_get() from changing the tracked set
262 * @lc: the lru cache to operate on
263 *
264 * Note that the reference counts and order on the active and lru lists may
265 * still change. Returns true if we aquired the lock.
266 */
267static inline int lc_try_lock(struct lru_cache *lc)
268{
269 return !test_and_set_bit(__LC_DIRTY, &lc->flags);
270}
271
272/**
273 * lc_unlock - unlock @lc, allow lc_get() to change the set again
274 * @lc: the lru cache to operate on
275 */
276static inline void lc_unlock(struct lru_cache *lc)
277{
278 clear_bit(__LC_DIRTY, &lc->flags);
279 smp_mb__after_clear_bit();
280}
281
282static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
283{
284 struct lc_element *e = lc_find(lc, enr);
285 return e && e->refcnt;
286}
287
/* get from the embedded struct lc_element @ptr (which is the @member of the
 * tracked object) to the tracked object of @type that contains it */
#define lc_entry(ptr, type, member) container_of(ptr, type, member)
290
/* translate between an element and its index in lc->lc_element[] */
extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
extern struct lc_element *lc_element_by_index(struct lru_cache *lc,
					      unsigned i);
293
294#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index bb1326d3839c..1cfe51628e1b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -200,4 +200,7 @@ config NLATTR
200config GENERIC_ATOMIC64 200config GENERIC_ATOMIC64
201 bool 201 bool
202 202
203config LRU_CACHE
204 tristate
205
203endmenu 206endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277eff9d..347ad8db29d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
91 91
92obj-$(CONFIG_NLATTR) += nlattr.o 92obj-$(CONFIG_NLATTR) += nlattr.o
93 93
94obj-$(CONFIG_LRU_CACHE) += lru_cache.o
95
94obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o 96obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
95 97
96obj-$(CONFIG_GENERIC_CSUM) += checksum.o 98obj-$(CONFIG_GENERIC_CSUM) += checksum.o
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
new file mode 100644
index 000000000000..270de9d31b8c
--- /dev/null
+++ b/lib/lru_cache.c
@@ -0,0 +1,560 @@
1/*
2 lru_cache.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/bitops.h>
28#include <linux/slab.h>
29#include <linux/string.h> /* for memset */
30#include <linux/seq_file.h> /* for seq_printf */
31#include <linux/lru_cache.h>
32
33MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
34 "Lars Ellenberg <lars@linbit.com>");
35MODULE_DESCRIPTION("lru_cache - Track sets of hot objects");
36MODULE_LICENSE("GPL");
37
/*
 * Developers aid only: catches concurrent access, i.e. missing locking on
 * the users part.
 *
 * PARANOIA_ENTRY() marks the cache as busy and BUG()s if it already was;
 * RETURN() drops that mark again and returns from the calling function.
 * Both expect a variable named "lc" in the scope they are used in.
 */
#define PARANOIA_ENTRY()					\
do {								\
	BUG_ON(!lc);						\
	BUG_ON(!lc->nr_elements);				\
	BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags));	\
} while (0)

#define RETURN(x...)						\
do {								\
	clear_bit(__LC_PARANOIA, &lc->flags);			\
	smp_mb__after_clear_bit();				\
	return x ;						\
} while (0)
49
/* BUG() unless e is one of the elements tracked by lc: its lc_index has to
 * be in range, and the index table entry has to point right back at e */
#define PARANOIA_LC_ELEMENT(lc, e)				\
do {								\
	struct lru_cache *lc_ = (lc);				\
	struct lc_element *e_ = (e);				\
	unsigned idx_ = e_->lc_index;				\
	BUG_ON(idx_ >= lc_->nr_elements);			\
	BUG_ON(lc_->lc_element[idx_] != e_);			\
} while (0)
57
58/**
59 * lc_create - prepares to track objects in an active set
60 * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
61 * @e_count: number of elements allowed to be active simultaneously
62 * @e_size: size of the tracked objects
63 * @e_off: offset to the &struct lc_element member in a tracked object
64 *
65 * Returns a pointer to a newly initialized struct lru_cache on success,
66 * or NULL on (allocation) failure.
67 */
68struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
69 unsigned e_count, size_t e_size, size_t e_off)
70{
71 struct hlist_head *slot = NULL;
72 struct lc_element **element = NULL;
73 struct lru_cache *lc;
74 struct lc_element *e;
75 unsigned cache_obj_size = kmem_cache_size(cache);
76 unsigned i;
77
78 WARN_ON(cache_obj_size < e_size);
79 if (cache_obj_size < e_size)
80 return NULL;
81
82 /* e_count too big; would probably fail the allocation below anyways.
83 * for typical use cases, e_count should be few thousand at most. */
84 if (e_count > LC_MAX_ACTIVE)
85 return NULL;
86
87 slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL);
88 if (!slot)
89 goto out_fail;
90 element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL);
91 if (!element)
92 goto out_fail;
93
94 lc = kzalloc(sizeof(*lc), GFP_KERNEL);
95 if (!lc)
96 goto out_fail;
97
98 INIT_LIST_HEAD(&lc->in_use);
99 INIT_LIST_HEAD(&lc->lru);
100 INIT_LIST_HEAD(&lc->free);
101
102 lc->name = name;
103 lc->element_size = e_size;
104 lc->element_off = e_off;
105 lc->nr_elements = e_count;
106 lc->new_number = LC_FREE;
107 lc->lc_cache = cache;
108 lc->lc_element = element;
109 lc->lc_slot = slot;
110
111 /* preallocate all objects */
112 for (i = 0; i < e_count; i++) {
113 void *p = kmem_cache_alloc(cache, GFP_KERNEL);
114 if (!p)
115 break;
116 memset(p, 0, lc->element_size);
117 e = p + e_off;
118 e->lc_index = i;
119 e->lc_number = LC_FREE;
120 list_add(&e->list, &lc->free);
121 element[i] = e;
122 }
123 if (i == e_count)
124 return lc;
125
126 /* else: could not allocate all elements, give up */
127 for (i--; i; i--) {
128 void *p = element[i];
129 kmem_cache_free(cache, p - e_off);
130 }
131 kfree(lc);
132out_fail:
133 kfree(element);
134 kfree(slot);
135 return NULL;
136}
137
138void lc_free_by_index(struct lru_cache *lc, unsigned i)
139{
140 void *p = lc->lc_element[i];
141 WARN_ON(!p);
142 if (p) {
143 p -= lc->element_off;
144 kmem_cache_free(lc->lc_cache, p);
145 }
146}
147
148/**
149 * lc_destroy - frees memory allocated by lc_create()
150 * @lc: the lru cache to destroy
151 */
152void lc_destroy(struct lru_cache *lc)
153{
154 unsigned i;
155 if (!lc)
156 return;
157 for (i = 0; i < lc->nr_elements; i++)
158 lc_free_by_index(lc, i);
159 kfree(lc->lc_element);
160 kfree(lc->lc_slot);
161 kfree(lc);
162}
163
164/**
165 * lc_reset - does a full reset for @lc and the hash table slots.
166 * @lc: the lru cache to operate on
167 *
168 * It is roughly the equivalent of re-allocating a fresh lru_cache object,
169 * basically a short cut to lc_destroy(lc); lc = lc_create(...);
170 */
171void lc_reset(struct lru_cache *lc)
172{
173 unsigned i;
174
175 INIT_LIST_HEAD(&lc->in_use);
176 INIT_LIST_HEAD(&lc->lru);
177 INIT_LIST_HEAD(&lc->free);
178 lc->used = 0;
179 lc->hits = 0;
180 lc->misses = 0;
181 lc->starving = 0;
182 lc->dirty = 0;
183 lc->changed = 0;
184 lc->flags = 0;
185 lc->changing_element = NULL;
186 lc->new_number = LC_FREE;
187 memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
188
189 for (i = 0; i < lc->nr_elements; i++) {
190 struct lc_element *e = lc->lc_element[i];
191 void *p = e;
192 p -= lc->element_off;
193 memset(p, 0, lc->element_size);
194 /* re-init it */
195 e->lc_index = i;
196 e->lc_number = LC_FREE;
197 list_add(&e->list, &lc->free);
198 }
199}
200
201/**
202 * lc_seq_printf_stats - print stats about @lc into @seq
203 * @seq: the seq_file to print into
204 * @lc: the lru cache to print statistics of
205 */
206size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
207{
208 /* NOTE:
209 * total calls to lc_get are
210 * (starving + hits + misses)
211 * misses include "dirty" count (update from an other thread in
212 * progress) and "changed", when this in fact lead to an successful
213 * update of the cache.
214 */
215 return seq_printf(seq, "\t%s: used:%u/%u "
216 "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
217 lc->name, lc->used, lc->nr_elements,
218 lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
219}
220
221static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
222{
223 return lc->lc_slot + (enr % lc->nr_elements);
224}
225
226
227/**
228 * lc_find - find element by label, if present in the hash table
229 * @lc: The lru_cache object
230 * @enr: element number
231 *
232 * Returns the pointer to an element, if the element with the requested
233 * "label" or element number is present in the hash table,
234 * or NULL if not found. Does not change the refcnt.
235 */
236struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
237{
238 struct hlist_node *n;
239 struct lc_element *e;
240
241 BUG_ON(!lc);
242 BUG_ON(!lc->nr_elements);
243 hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
244 if (e->lc_number == enr)
245 return e;
246 }
247 return NULL;
248}
249
250/* returned element will be "recycled" immediately */
251static struct lc_element *lc_evict(struct lru_cache *lc)
252{
253 struct list_head *n;
254 struct lc_element *e;
255
256 if (list_empty(&lc->lru))
257 return NULL;
258
259 n = lc->lru.prev;
260 e = list_entry(n, struct lc_element, list);
261
262 PARANOIA_LC_ELEMENT(lc, e);
263
264 list_del(&e->list);
265 hlist_del(&e->colision);
266 return e;
267}
268
269/**
270 * lc_del - removes an element from the cache
271 * @lc: The lru_cache object
272 * @e: The element to remove
273 *
274 * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list,
275 * sets @e->enr to %LC_FREE.
276 */
277void lc_del(struct lru_cache *lc, struct lc_element *e)
278{
279 PARANOIA_ENTRY();
280 PARANOIA_LC_ELEMENT(lc, e);
281 BUG_ON(e->refcnt);
282
283 e->lc_number = LC_FREE;
284 hlist_del_init(&e->colision);
285 list_move(&e->list, &lc->free);
286 RETURN();
287}
288
289static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
290{
291 struct list_head *n;
292
293 if (list_empty(&lc->free))
294 return lc_evict(lc);
295
296 n = lc->free.next;
297 list_del(n);
298 return list_entry(n, struct lc_element, list);
299}
300
301static int lc_unused_element_available(struct lru_cache *lc)
302{
303 if (!list_empty(&lc->free))
304 return 1; /* something on the free list */
305 if (!list_empty(&lc->lru))
306 return 1; /* something to evict */
307
308 return 0;
309}
310
311
312/**
313 * lc_get - get element by label, maybe change the active set
314 * @lc: the lru cache to operate on
315 * @enr: the label to look up
316 *
317 * Finds an element in the cache, increases its usage count,
318 * "touches" and returns it.
319 *
320 * In case the requested number is not present, it needs to be added to the
321 * cache. Therefore it is possible that an other element becomes evicted from
322 * the cache. In either case, the user is notified so he is able to e.g. keep
323 * a persistent log of the cache changes, and therefore the objects in use.
324 *
325 * Return values:
326 * NULL
327 * The cache was marked %LC_STARVING,
328 * or the requested label was not in the active set
329 * and a changing transaction is still pending (@lc was marked %LC_DIRTY).
330 * Or no unused or free element could be recycled (@lc will be marked as
331 * %LC_STARVING, blocking further lc_get() operations).
332 *
333 * pointer to the element with the REQUESTED element number.
334 * In this case, it can be used right away
335 *
336 * pointer to an UNUSED element with some different element number,
337 * where that different number may also be %LC_FREE.
338 *
339 * In this case, the cache is marked %LC_DIRTY (blocking further changes),
340 * and the returned element pointer is removed from the lru list and
341 * hash collision chains. The user now should do whatever housekeeping
342 * is necessary.
343 * Then he must call lc_changed(lc,element_pointer), to finish
344 * the change.
345 *
346 * NOTE: The user needs to check the lc_number on EACH use, so he recognizes
347 * any cache set change.
348 */
349struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
350{
351 struct lc_element *e;
352
353 PARANOIA_ENTRY();
354 if (lc->flags & LC_STARVING) {
355 ++lc->starving;
356 RETURN(NULL);
357 }
358
359 e = lc_find(lc, enr);
360 if (e) {
361 ++lc->hits;
362 if (e->refcnt++ == 0)
363 lc->used++;
364 list_move(&e->list, &lc->in_use); /* Not evictable... */
365 RETURN(e);
366 }
367
368 ++lc->misses;
369
370 /* In case there is nothing available and we can not kick out
371 * the LRU element, we have to wait ...
372 */
373 if (!lc_unused_element_available(lc)) {
374 __set_bit(__LC_STARVING, &lc->flags);
375 RETURN(NULL);
376 }
377
378 /* it was not present in the active set.
379 * we are going to recycle an unused (or even "free") element.
380 * user may need to commit a transaction to record that change.
381 * we serialize on flags & TF_DIRTY */
382 if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
383 ++lc->dirty;
384 RETURN(NULL);
385 }
386
387 e = lc_get_unused_element(lc);
388 BUG_ON(!e);
389
390 clear_bit(__LC_STARVING, &lc->flags);
391 BUG_ON(++e->refcnt != 1);
392 lc->used++;
393
394 lc->changing_element = e;
395 lc->new_number = enr;
396
397 RETURN(e);
398}
399
400/* similar to lc_get,
401 * but only gets a new reference on an existing element.
402 * you either get the requested element, or NULL.
403 * will be consolidated into one function.
404 */
405struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
406{
407 struct lc_element *e;
408
409 PARANOIA_ENTRY();
410 if (lc->flags & LC_STARVING) {
411 ++lc->starving;
412 RETURN(NULL);
413 }
414
415 e = lc_find(lc, enr);
416 if (e) {
417 ++lc->hits;
418 if (e->refcnt++ == 0)
419 lc->used++;
420 list_move(&e->list, &lc->in_use); /* Not evictable... */
421 }
422 RETURN(e);
423}
424
425/**
426 * lc_changed - tell @lc that the change has been recorded
427 * @lc: the lru cache to operate on
428 * @e: the element pending label change
429 */
430void lc_changed(struct lru_cache *lc, struct lc_element *e)
431{
432 PARANOIA_ENTRY();
433 BUG_ON(e != lc->changing_element);
434 PARANOIA_LC_ELEMENT(lc, e);
435 ++lc->changed;
436 e->lc_number = lc->new_number;
437 list_add(&e->list, &lc->in_use);
438 hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
439 lc->changing_element = NULL;
440 lc->new_number = LC_FREE;
441 clear_bit(__LC_DIRTY, &lc->flags);
442 smp_mb__after_clear_bit();
443 RETURN();
444}
445
446
447/**
448 * lc_put - give up refcnt of @e
449 * @lc: the lru cache to operate on
450 * @e: the element to put
451 *
452 * If refcnt reaches zero, the element is moved to the lru list,
453 * and a %LC_STARVING (if set) is cleared.
454 * Returns the new (post-decrement) refcnt.
455 */
456unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
457{
458 PARANOIA_ENTRY();
459 PARANOIA_LC_ELEMENT(lc, e);
460 BUG_ON(e->refcnt == 0);
461 BUG_ON(e == lc->changing_element);
462 if (--e->refcnt == 0) {
463 /* move it to the front of LRU. */
464 list_move(&e->list, &lc->lru);
465 lc->used--;
466 clear_bit(__LC_STARVING, &lc->flags);
467 smp_mb__after_clear_bit();
468 }
469 RETURN(e->refcnt);
470}
471
472/**
473 * lc_element_by_index
474 * @lc: the lru cache to operate on
475 * @i: the index of the element to return
476 */
477struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
478{
479 BUG_ON(i >= lc->nr_elements);
480 BUG_ON(lc->lc_element[i] == NULL);
481 BUG_ON(lc->lc_element[i]->lc_index != i);
482 return lc->lc_element[i];
483}
484
485/**
486 * lc_index_of
487 * @lc: the lru cache to operate on
488 * @e: the element to query for its index position in lc->element
489 */
490unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
491{
492 PARANOIA_LC_ELEMENT(lc, e);
493 return e->lc_index;
494}
495
496/**
497 * lc_set - associate index with label
498 * @lc: the lru cache to operate on
499 * @enr: the label to set
500 * @index: the element index to associate label with.
501 *
502 * Used to initialize the active set to some previously recorded state.
503 */
504void lc_set(struct lru_cache *lc, unsigned int enr, int index)
505{
506 struct lc_element *e;
507
508 if (index < 0 || index >= lc->nr_elements)
509 return;
510
511 e = lc_element_by_index(lc, index);
512 e->lc_number = enr;
513
514 hlist_del_init(&e->colision);
515 hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
516 list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
517}
518
519/**
520 * lc_dump - Dump a complete LRU cache to seq in textual form.
521 * @lc: the lru cache to operate on
522 * @seq: the &struct seq_file pointer to seq_printf into
523 * @utext: user supplied "heading" or other info
524 * @detail: function pointer the user may provide to dump further details
525 * of the object the lc_element is embedded in.
526 */
527void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
528 void (*detail) (struct seq_file *, struct lc_element *))
529{
530 unsigned int nr_elements = lc->nr_elements;
531 struct lc_element *e;
532 int i;
533
534 seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
535 for (i = 0; i < nr_elements; i++) {
536 e = lc_element_by_index(lc, i);
537 if (e->lc_number == LC_FREE) {
538 seq_printf(seq, "\t%2d: FREE\n", i);
539 } else {
540 seq_printf(seq, "\t%2d: %4u %4u ", i,
541 e->lc_number, e->refcnt);
542 detail(seq, e);
543 }
544 }
545}
546
/* life cycle */
EXPORT_SYMBOL(lc_create);
EXPORT_SYMBOL(lc_destroy);
EXPORT_SYMBOL(lc_reset);

/* single index: label associations */
EXPORT_SYMBOL(lc_set);
EXPORT_SYMBOL(lc_del);

/* lookup and reference counting */
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_put);
EXPORT_SYMBOL(lc_changed);

/* element <-> index translation */
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);

/* seq_file output */
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);