aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-12-08 11:19:16 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-08 11:19:16 -0500
commit6035ccd8e9e40bb654fbfdef325902ab531679a5 (patch)
treec1810d8a4d4ef150cdf14af72e6087dfc3f4b6e0
parent23eb3b64b5e44680c867e165fe1cd18e57fba255 (diff)
parent878eaddd05d251cefa9632c2b8046833c5eead66 (diff)
Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits) cfq-iosched: Do not access cfqq after freeing it block: include linux/err.h to use ERR_PTR cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit blkio: Allow CFQ group IO scheduling even when CFQ is a module blkio: Implement dynamic io controlling policy registration blkio: Export some symbols from blkio as its user CFQ can be a module block: Fix io_context leak after failure of clone with CLONE_IO block: Fix io_context leak after clone with CLONE_IO cfq-iosched: make nonrot check logic consistent io controller: quick fix for blk-cgroup and modular CFQ cfq-iosched: move IO controller declerations to a header file cfq-iosched: fix compile problem with !CONFIG_CGROUP blkio: Documentation blkio: Wait on sync-noidle queue even if rq_noidle = 1 blkio: Implement group_isolation tunable blkio: Determine async workload length based on total number of queues blkio: Wait for cfq queue to get backlogged if group is empty blkio: Propagate cgroup weight updation to cfq groups blkio: Drop the reference to queue once the task changes cgroup blkio: Provide some isolation between groups ...
-rw-r--r--Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg588
-rw-r--r--Documentation/blockdev/drbd/DRBD-data-packets.svg459
-rw-r--r--Documentation/blockdev/drbd/README.txt16
-rw-r--r--Documentation/blockdev/drbd/conn-states-8.dot18
-rw-r--r--Documentation/blockdev/drbd/disk-states-8.dot16
-rw-r--r--Documentation/blockdev/drbd/drbd-connection-state-overview.dot85
-rw-r--r--Documentation/blockdev/drbd/node-states-8.dot14
-rw-r--r--Documentation/cgroups/blkio-controller.txt135
-rw-r--r--MAINTAINERS13
-rw-r--r--arch/alpha/include/asm/cacheflush.h1
-rw-r--r--arch/arm/include/asm/cacheflush.h1
-rw-r--r--arch/avr32/include/asm/cacheflush.h1
-rw-r--r--arch/blackfin/include/asm/cacheflush.h2
-rw-r--r--arch/cris/include/asm/cacheflush.h1
-rw-r--r--arch/frv/include/asm/cacheflush.h1
-rw-r--r--arch/h8300/include/asm/cacheflush.h1
-rw-r--r--arch/ia64/include/asm/cacheflush.h1
-rw-r--r--arch/m32r/include/asm/cacheflush.h3
-rw-r--r--arch/m68k/include/asm/cacheflush_mm.h1
-rw-r--r--arch/m68k/include/asm/cacheflush_no.h1
-rw-r--r--arch/microblaze/include/asm/cacheflush.h1
-rw-r--r--arch/mips/include/asm/cacheflush.h1
-rw-r--r--arch/mn10300/include/asm/cacheflush.h1
-rw-r--r--arch/parisc/include/asm/cacheflush.h1
-rw-r--r--arch/powerpc/include/asm/cacheflush.h1
-rw-r--r--arch/s390/include/asm/cacheflush.h1
-rw-r--r--arch/score/include/asm/cacheflush.h1
-rw-r--r--arch/sh/include/asm/cacheflush.h1
-rw-r--r--arch/sparc/include/asm/cacheflush_32.h1
-rw-r--r--arch/sparc/include/asm/cacheflush_64.h1
-rw-r--r--arch/x86/include/asm/cacheflush.h1
-rw-r--r--arch/xtensa/include/asm/cacheflush.h1
-rw-r--r--block/Kconfig22
-rw-r--r--block/Kconfig.iosched43
-rw-r--r--block/Makefile2
-rw-r--r--block/as-iosched.c1520
-rw-r--r--block/blk-cgroup.c361
-rw-r--r--block/blk-cgroup.h127
-rw-r--r--block/blk-core.c19
-rw-r--r--block/blk-ioc.c12
-rw-r--r--block/blk-settings.c51
-rw-r--r--block/blk-sysfs.c33
-rw-r--r--block/bsg.c3
-rw-r--r--block/cfq-iosched.c1493
-rw-r--r--block/compat_ioctl.c2
-rw-r--r--block/elevator.c10
-rw-r--r--block/genhd.c12
-rw-r--r--block/ioctl.c2
-rw-r--r--block/scsi_ioctl.c6
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/cciss.c544
-rw-r--r--drivers/block/cciss.h18
-rw-r--r--drivers/block/cciss_cmd.h7
-rw-r--r--drivers/block/cciss_scsi.c4
-rw-r--r--drivers/block/drbd/Kconfig71
-rw-r--r--drivers/block/drbd/Makefile5
-rw-r--r--drivers/block/drbd/drbd_actlog.c1424
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1327
-rw-r--r--drivers/block/drbd/drbd_int.h2252
-rw-r--r--drivers/block/drbd/drbd_main.c3699
-rw-r--r--drivers/block/drbd/drbd_nl.c2364
-rw-r--r--drivers/block/drbd/drbd_proc.c265
-rw-r--r--drivers/block/drbd/drbd_receiver.c4426
-rw-r--r--drivers/block/drbd/drbd_req.c1125
-rw-r--r--drivers/block/drbd/drbd_req.h326
-rw-r--r--drivers/block/drbd/drbd_strings.c113
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c1512
-rw-r--r--drivers/block/drbd/drbd_wrappers.h91
-rw-r--r--drivers/block/ps3vram.c10
-rw-r--r--drivers/mtd/mtd_blkdevs.c2
-rw-r--r--drivers/staging/pohmelfs/inode.c10
-rw-r--r--fs/aio.c62
-rw-r--r--fs/bio.c12
-rw-r--r--fs/block_dev.c12
-rw-r--r--fs/direct-io.c10
-rw-r--r--fs/fs-writeback.c28
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/partitions/check.c12
-rw-r--r--fs/partitions/efi.c30
-rw-r--r--fs/partitions/efi.h8
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/splice.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c9
-rw-r--r--include/asm-generic/cacheflush.h1
-rw-r--r--include/linux/backing-dev.h13
-rw-r--r--include/linux/bio.h20
-rw-r--r--include/linux/blkdev.h56
-rw-r--r--include/linux/cgroup_subsys.h6
-rw-r--r--include/linux/connector.h2
-rw-r--r--include/linux/drbd.h343
-rw-r--r--include/linux/drbd_limits.h137
-rw-r--r--include/linux/drbd_nl.h137
-rw-r--r--include/linux/drbd_tag_magic.h83
-rw-r--r--include/linux/fs.h5
-rw-r--r--include/linux/genhd.h1
-rw-r--r--include/linux/iocontext.h14
-rw-r--r--include/linux/lru_cache.h294
-rw-r--r--include/linux/writeback.h1
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c3
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile2
-rw-r--r--lib/lru_cache.c560
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/page-writeback.c12
107 files changed, 24813 insertions, 2132 deletions
diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
new file mode 100644
index 000000000000..f87cfa0dc2fb
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
@@ -0,0 +1,588 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3<svg
4 xmlns:svg="http://www.w3.org/2000/svg"
5 xmlns="http://www.w3.org/2000/svg"
6 version="1.0"
7 width="210mm"
8 height="297mm"
9 viewBox="0 0 21000 29700"
10 id="svg2"
11 style="fill-rule:evenodd">
12 <defs
13 id="defs4" />
14 <g
15 id="Default"
16 style="visibility:visible">
17 <desc
18 id="desc180">Master slide</desc>
19 </g>
20 <path
21 d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
22 id="path193"
23 style="fill:#008000;visibility:visible" />
24 <path
25 d="M 11999,7801 L 11999,8361"
26 id="path197"
27 style="fill:none;stroke:#008000;visibility:visible" />
28 <path
29 d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
30 id="path209"
31 style="fill:#008000;visibility:visible" />
32 <path
33 d="M 7999,9601 L 7999,10161"
34 id="path213"
35 style="fill:none;stroke:#008000;visibility:visible" />
36 <path
37 d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
38 id="path225"
39 style="fill:#008000;visibility:visible" />
40 <path
41 d="M 7999,7001 L 11764,7754"
42 id="path229"
43 style="fill:none;stroke:#008000;visibility:visible" />
44 <g
45 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
46 id="g245"
47 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
48 <text
49 id="text247">
50 <tspan
51 x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
52 y="9284"
53 id="tspan249">RSDataReply</tspan>
54 </text>
55 </g>
56 <path
57 d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
58 id="path259"
59 style="fill:#008000;visibility:visible" />
60 <path
61 d="M 11999,9001 L 8236,9565"
62 id="path263"
63 style="fill:none;stroke:#008000;visibility:visible" />
64 <g
65 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
66 id="g279"
67 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
68 <text
69 id="text281">
70 <tspan
71 x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
72 y="7023"
73 id="tspan283">CsumRSRequest</tspan>
74 </text>
75 </g>
76 <text
77 id="text297"
78 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
79 <tspan
80 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
81 y="5707"
82 id="tspan299">w_make_resync_request()</tspan>
83 </text>
84 <text
85 id="text313"
86 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
87 <tspan
88 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
89 y="7806"
90 id="tspan315">receive_DataRequest()</tspan>
91 </text>
92 <text
93 id="text329"
94 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
95 <tspan
96 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
97 y="8606"
98 id="tspan331">drbd_endio_read_sec()</tspan>
99 </text>
100 <text
101 id="text345"
102 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
103 <tspan
104 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
105 y="9007"
106 id="tspan347">w_e_end_csum_rs_req()</tspan>
107 </text>
108 <text
109 id="text361"
110 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
111 <tspan
112 x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
113 y="9507"
114 id="tspan363">receive_RSDataReply()</tspan>
115 </text>
116 <text
117 id="text377"
118 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
119 <tspan
120 x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
121 y="10407"
122 id="tspan379">drbd_endio_write_sec()</tspan>
123 </text>
124 <text
125 id="text393"
126 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
127 <tspan
128 x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
129 y="10907"
130 id="tspan395">e_end_resync_block()</tspan>
131 </text>
132 <path
133 d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
134 id="path405"
135 style="fill:#000080;visibility:visible" />
136 <path
137 d="M 7999,10801 L 11764,11554"
138 id="path409"
139 style="fill:none;stroke:#000080;visibility:visible" />
140 <g
141 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
142 id="g425"
143 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
144 <text
145 id="text427">
146 <tspan
147 x="9320 9621 9726 9798 9887 10065 10277 10438"
148 y="10943"
149 id="tspan429">WriteAck</tspan>
150 </text>
151 </g>
152 <text
153 id="text443"
154 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
155 <tspan
156 x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
157 y="11559"
158 id="tspan445">got_BlockAck()</tspan>
159 </text>
160 <text
161 id="text459"
162 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
163 <tspan
164 x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
165 y="4877"
166 id="tspan461">Checksum based Resync, case not in sync</tspan>
167 </text>
168 <text
169 id="text475"
170 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
171 <tspan
172 x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
173 y="2806"
174 id="tspan477">DRBD-8.3 data flow</tspan>
175 </text>
176 <text
177 id="text491"
178 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
179 <tspan
180 x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
181 y="7005"
182 id="tspan493">w_e_send_csum()</tspan>
183 </text>
184 <path
185 d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
186 id="path503"
187 style="fill:#008000;visibility:visible" />
188 <path
189 d="M 11999,16801 L 11999,17361"
190 id="path507"
191 style="fill:none;stroke:#008000;visibility:visible" />
192 <path
193 d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
194 id="path519"
195 style="fill:#008000;visibility:visible" />
196 <path
197 d="M 7999,16001 L 11764,16754"
198 id="path523"
199 style="fill:none;stroke:#008000;visibility:visible" />
200 <g
201 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
202 id="g539"
203 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
204 <text
205 id="text541">
206 <tspan
207 x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
208 y="18265"
209 id="tspan543">RSIsInSync</tspan>
210 </text>
211 </g>
212 <path
213 d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
214 id="path553"
215 style="fill:#000080;visibility:visible" />
216 <path
217 d="M 11999,18001 L 8236,18565"
218 id="path557"
219 style="fill:none;stroke:#000080;visibility:visible" />
220 <g
221 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
222 id="g573"
223 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
224 <text
225 id="text575">
226 <tspan
227 x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
228 y="16023"
229 id="tspan577">CsumRSRequest</tspan>
230 </text>
231 </g>
232 <text
233 id="text591"
234 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
235 <tspan
236 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
237 y="16806"
238 id="tspan593">receive_DataRequest()</tspan>
239 </text>
240 <text
241 id="text607"
242 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
243 <tspan
244 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
245 y="17606"
246 id="tspan609">drbd_endio_read_sec()</tspan>
247 </text>
248 <text
249 id="text623"
250 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
251 <tspan
252 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
253 y="18007"
254 id="tspan625">w_e_end_csum_rs_req()</tspan>
255 </text>
256 <text
257 id="text639"
258 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
259 <tspan
260 x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
261 y="18507"
262 id="tspan641">got_IsInSync()</tspan>
263 </text>
264 <text
265 id="text655"
266 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
267 <tspan
268 x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
269 y="13877"
270 id="tspan657">Checksum based Resync, case in sync</tspan>
271 </text>
272 <path
273 d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
274 id="path667"
275 style="fill:#008000;visibility:visible" />
276 <path
277 d="M 12000,23801 L 12000,24361"
278 id="path671"
279 style="fill:none;stroke:#008000;visibility:visible" />
280 <path
281 d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
282 id="path683"
283 style="fill:#008000;visibility:visible" />
284 <path
285 d="M 8000,25601 L 8000,26161"
286 id="path687"
287 style="fill:none;stroke:#008000;visibility:visible" />
288 <path
289 d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
290 id="path699"
291 style="fill:#008000;visibility:visible" />
292 <path
293 d="M 8000,23001 L 11765,23754"
294 id="path703"
295 style="fill:none;stroke:#008000;visibility:visible" />
296 <g
297 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
298 id="g719"
299 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
300 <text
301 id="text721">
302 <tspan
303 x="9464 9710 9921 10150 10328 10505 10577"
304 y="25236"
305 id="tspan723">OVReply</tspan>
306 </text>
307 </g>
308 <path
309 d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
310 id="path733"
311 style="fill:#008000;visibility:visible" />
312 <path
313 d="M 12000,25001 L 8237,25565"
314 id="path737"
315 style="fill:none;stroke:#008000;visibility:visible" />
316 <g
317 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
318 id="g753"
319 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
320 <text
321 id="text755">
322 <tspan
323 x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
324 y="23106"
325 id="tspan757">OVRequest</tspan>
326 </text>
327 </g>
328 <text
329 id="text771"
330 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
331 <tspan
332 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
333 y="23806"
334 id="tspan773">receive_OVRequest()</tspan>
335 </text>
336 <text
337 id="text787"
338 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
339 <tspan
340 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
341 y="24606"
342 id="tspan789">drbd_endio_read_sec()</tspan>
343 </text>
344 <text
345 id="text803"
346 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
347 <tspan
348 x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
349 y="25007"
350 id="tspan805">w_e_end_ov_req()</tspan>
351 </text>
352 <text
353 id="text819"
354 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
355 <tspan
356 x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
357 y="25507"
358 id="tspan821">receive_OVReply()</tspan>
359 </text>
360 <text
361 id="text835"
362 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
363 <tspan
364 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
365 y="26407"
366 id="tspan837">drbd_endio_read_sec()</tspan>
367 </text>
368 <text
369 id="text851"
370 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
371 <tspan
372 x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
373 y="26907"
374 id="tspan853">w_e_end_ov_reply()</tspan>
375 </text>
376 <path
377 d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
378 id="path863"
379 style="fill:#000080;visibility:visible" />
380 <path
381 d="M 8000,26801 L 11765,27554"
382 id="path867"
383 style="fill:none;stroke:#000080;visibility:visible" />
384 <g
385 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
386 id="g883"
387 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
388 <text
389 id="text885">
390 <tspan
391 x="9279 9525 9736 9965 10143 10303 10481 10553"
392 y="26935"
393 id="tspan887">OVResult</tspan>
394 </text>
395 </g>
396 <text
397 id="text901"
398 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
399 <tspan
400 x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
401 y="27559"
402 id="tspan903">got_OVResult()</tspan>
403 </text>
404 <text
405 id="text917"
406 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
407 <tspan
408 x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
409 y="21877"
410 id="tspan919">Online verify</tspan>
411 </text>
412 <text
413 id="text933"
414 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
415 <tspan
416 x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
417 y="23005"
418 id="tspan935">w_make_ov_request()</tspan>
419 </text>
420 <path
421 d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
422 id="path945"
423 style="fill:#008000;visibility:visible" />
424 <path
425 d="M 8000,5700 L 8000,6260"
426 id="path949"
427 style="fill:none;stroke:#008000;visibility:visible" />
428 <path
429 d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
430 id="path961"
431 style="fill:none;stroke:#000000;visibility:visible" />
432 <path
433 d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
434 id="path973"
435 style="fill:none;stroke:#000000;visibility:visible" />
436 <path
437 d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
438 id="path985"
439 style="fill:none;stroke:#000000;visibility:visible" />
440 <text
441 id="text1001"
442 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
443 <tspan
444 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
445 y="6506"
446 id="tspan1003">drbd_endio_read_sec()</tspan>
447 </text>
448 <text
449 id="text1017"
450 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
451 <tspan
452 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
453 y="14708"
454 id="tspan1019">w_make_resync_request()</tspan>
455 </text>
456 <text
457 id="text1033"
458 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
459 <tspan
460 x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
461 y="16006"
462 id="tspan1035">w_e_send_csum()</tspan>
463 </text>
464 <path
465 d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
466 id="path1045"
467 style="fill:#008000;visibility:visible" />
468 <path
469 d="M 8000,14701 L 8000,15261"
470 id="path1049"
471 style="fill:none;stroke:#008000;visibility:visible" />
472 <text
473 id="text1065"
474 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
475 <tspan
476 x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
477 y="15507"
478 id="tspan1067">drbd_endio_read_sec()</tspan>
479 </text>
480 <path
481 d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
482 id="path1077"
483 style="fill:none;stroke:#000000;visibility:visible" />
484 <path
485 d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
486 id="path1089"
487 style="fill:none;stroke:#000000;visibility:visible" />
488 <path
489 d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
490 id="path1101"
491 style="fill:none;stroke:#000000;visibility:visible" />
492 <text
493 id="text1117"
494 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
495 <tspan
496 x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
497 y="5402"
498 id="tspan1119">rs_begin_io()</tspan>
499 </text>
500 <text
501 id="text1133"
502 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
503 <tspan
504 x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
505 y="14402"
506 id="tspan1135">rs_begin_io()</tspan>
507 </text>
508 <text
509 id="text1149"
510 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
511 <tspan
512 x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
513 y="22602"
514 id="tspan1151">rs_begin_io()</tspan>
515 </text>
516 <text
517 id="text1165"
518 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
519 <tspan
520 x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
521 y="11302"
522 id="tspan1167">rs_complete_io()</tspan>
523 </text>
524 <text
525 id="text1181"
526 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
527 <tspan
528 x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
529 y="18931"
530 id="tspan1183">rs_complete_io()</tspan>
531 </text>
532 <text
533 id="text1197"
534 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
535 <tspan
536 x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
537 y="27231"
538 id="tspan1199">rs_complete_io()</tspan>
539 </text>
540 <text
541 id="text1213"
542 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
543 <tspan
544 x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
545 y="7402"
546 id="tspan1215">rs_begin_io()</tspan>
547 </text>
548 <text
549 id="text1229"
550 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
551 <tspan
552 x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
553 y="16331"
554 id="tspan1231">rs_begin_io()</tspan>
555 </text>
556 <text
557 id="text1245"
558 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
559 <tspan
560 x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
561 y="23302"
562 id="tspan1247">rs_begin_io()</tspan>
563 </text>
564 <text
565 id="text1261"
566 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
567 <tspan
568 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
569 y="9302"
570 id="tspan1263">rs_complete_io()</tspan>
571 </text>
572 <text
573 id="text1277"
574 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
575 <tspan
576 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
577 y="18331"
578 id="tspan1279">rs_complete_io()</tspan>
579 </text>
580 <text
581 id="text1293"
582 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
583 <tspan
584 x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
585 y="25302"
586 id="tspan1295">rs_complete_io()</tspan>
587 </text>
588</svg>
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg
new file mode 100644
index 000000000000..48a1e2165fec
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-data-packets.svg
@@ -0,0 +1,459 @@
1<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2<!-- Created with Inkscape (http://www.inkscape.org/) -->
3<svg
4 xmlns:svg="http://www.w3.org/2000/svg"
5 xmlns="http://www.w3.org/2000/svg"
6 version="1.0"
7 width="210mm"
8 height="297mm"
9 viewBox="0 0 21000 29700"
10 id="svg2"
11 style="fill-rule:evenodd">
12 <defs
13 id="defs4" />
14 <g
15 id="Default"
16 style="visibility:visible">
17 <desc
18 id="desc176">Master slide</desc>
19 </g>
20 <path
21 d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
22 id="path189"
23 style="fill:#008000;visibility:visible" />
24 <path
25 d="M 11999,18801 L 11999,19361"
26 id="path193"
27 style="fill:none;stroke:#008000;visibility:visible" />
28 <path
29 d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
30 id="path205"
31 style="fill:#008000;visibility:visible" />
32 <path
33 d="M 7999,20601 L 7999,21161"
34 id="path209"
35 style="fill:none;stroke:#008000;visibility:visible" />
36 <path
37 d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
38 id="path221"
39 style="fill:#008000;visibility:visible" />
40 <path
41 d="M 7999,18001 L 11764,18754"
42 id="path225"
43 style="fill:none;stroke:#008000;visibility:visible" />
44 <text
45 x="-3023.845"
46 y="1106.8124"
47 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
48 id="text243"
49 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
50 <tspan
51 x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
52 y="21390.812"
53 id="tspan245">RSDataReply</tspan>
54 </text>
55 <path
56 d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
57 id="path255"
58 style="fill:#008000;visibility:visible" />
59 <path
60 d="M 11999,20001 L 8236,20565"
61 id="path259"
62 style="fill:none;stroke:#008000;visibility:visible" />
63 <text
64 x="3502.5356"
65 y="-2184.6621"
66 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
67 id="text277"
68 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
69 <tspan
70 x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
71 y="15854.338"
72 id="tspan279">RSDataRequest</tspan>
73 </text>
74 <text
75 id="text293"
76 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
77 <tspan
78 x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
79 y="17807"
80 id="tspan295">w_make_resync_request()</tspan>
81 </text>
82 <text
83 id="text309"
84 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
85 <tspan
86 x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
87 y="18806"
88 id="tspan311">receive_DataRequest()</tspan>
89 </text>
90 <text
91 id="text325"
92 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
93 <tspan
94 x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
95 y="19606"
96 id="tspan327">drbd_endio_read_sec()</tspan>
97 </text>
98 <text
99 id="text341"
100 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
101 <tspan
102 x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
103 y="20007"
104 id="tspan343">w_e_end_rsdata_req()</tspan>
105 </text>
106 <text
107 id="text357"
108 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
109 <tspan
110 x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
111 y="20507"
112 id="tspan359">receive_RSDataReply()</tspan>
113 </text>
114 <text
115 id="text373"
116 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
117 <tspan
118 x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
119 y="21407"
120 id="tspan375">drbd_endio_write_sec()</tspan>
121 </text>
122 <text
123 id="text389"
124 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
125 <tspan
126 x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
127 y="21907"
128 id="tspan391">e_end_resync_block()</tspan>
129 </text>
130 <path
131 d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
132 id="path401"
133 style="fill:#000080;visibility:visible" />
134 <path
135 d="M 7999,21801 L 11764,22554"
136 id="path405"
137 style="fill:none;stroke:#000080;visibility:visible" />
138 <text
139 x="4290.3008"
140 y="-2369.6162"
141 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
142 id="text423"
143 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
144 <tspan
145 x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
146 y="19573.385"
147 id="tspan425">WriteAck</tspan>
148 </text>
149 <text
150 id="text439"
151 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
152 <tspan
153 x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
154 y="22559"
155 id="tspan441">got_BlockAck()</tspan>
156 </text>
157 <text
158 id="text455"
159 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
160 <tspan
161 x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
162 y="16877"
163 id="tspan457">Resync blocks, 4-32K</tspan>
164 </text>
165 <path
166 d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
167 id="path467"
168 style="fill:#008000;visibility:visible" />
169 <path
170 d="M 12000,6801 L 12000,7361"
171 id="path471"
172 style="fill:none;stroke:#008000;visibility:visible" />
173 <path
174 d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
175 id="path483"
176 style="fill:#008000;visibility:visible" />
177 <path
178 d="M 8000,6001 L 11765,6754"
179 id="path487"
180 style="fill:none;stroke:#008000;visibility:visible" />
181 <text
182 x="-1288.1796"
183 y="1279.7666"
184 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
185 id="text505"
186 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
187 <tspan
188 x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
189 y="9516.7666"
190 id="tspan507">WriteAck</tspan>
191 </text>
192 <path
193 d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
194 id="path517"
195 style="fill:#000080;visibility:visible" />
196 <path
197 d="M 12000,8001 L 8237,8565"
198 id="path521"
199 style="fill:none;stroke:#000080;visibility:visible" />
200 <text
201 x="1065.6655"
202 y="-2097.7664"
203 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
204 id="text539"
205 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
206 <tspan
207 x="10682.666 10911.666 11088.666 11177.666"
208 y="4107.2339"
209 id="tspan541">Data</tspan>
210 </text>
211 <text
212 id="text555"
213 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
214 <tspan
215 x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
216 y="5505"
217 id="tspan557">drbd_make_request()</tspan>
218 </text>
219 <text
220 id="text571"
221 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
222 <tspan
223 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
224 y="6806"
225 id="tspan573">receive_Data()</tspan>
226 </text>
227 <text
228 id="text587"
229 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
230 <tspan
231 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
232 y="7606"
233 id="tspan589">drbd_endio_write_sec()</tspan>
234 </text>
235 <text
236 id="text603"
237 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
238 <tspan
239 x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
240 y="8007"
241 id="tspan605">e_end_block()</tspan>
242 </text>
243 <text
244 id="text619"
245 style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
246 <tspan
247 x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
248 y="8606"
249 id="tspan621">got_BlockAck()</tspan>
250 </text>
251 <text
252 id="text635"
253 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
254 <tspan
255 x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
256 y="4877"
257 id="tspan637">Regular mirrored write, 512-32K</tspan>
258 </text>
259 <text
260 id="text651"
261 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
262 <tspan
263 x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
264 y="6003"
265 id="tspan653">w_send_dblock()</tspan>
266 </text>
267 <path
268 d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
269 id="path663"
270 style="fill:#008000;visibility:visible" />
271 <path
272 d="M 8000,6000 L 8000,6560"
273 id="path667"
274 style="fill:none;stroke:#008000;visibility:visible" />
275 <text
276 id="text683"
277 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
278 <tspan
279 x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
280 y="6905"
281 id="tspan685">drbd_endio_write_pri()</tspan>
282 </text>
283 <path
284 d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
285 id="path695"
286 style="fill:#008000;visibility:visible" />
287 <path
288 d="M 12000,12802 L 12000,13362"
289 id="path699"
290 style="fill:none;stroke:#008000;visibility:visible" />
291 <path
292 d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
293 id="path711"
294 style="fill:#008000;visibility:visible" />
295 <path
296 d="M 8000,12002 L 11765,12755"
297 id="path715"
298 style="fill:none;stroke:#008000;visibility:visible" />
299 <text
300 x="-2155.5266"
301 y="1201.5964"
302 transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
303 id="text733"
304 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
305 <tspan
306 x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
307 y="15454.597"
308 id="tspan735">DataReply</tspan>
309 </text>
310 <path
311 d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
312 id="path745"
313 style="fill:#008000;visibility:visible" />
314 <path
315 d="M 12000,14002 L 8237,14566"
316 id="path749"
317 style="fill:none;stroke:#008000;visibility:visible" />
318 <text
319 x="2280.3804"
320 y="-2103.2141"
321 transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
322 id="text767"
323 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
324 <tspan
325 x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
326 y="9981.7861"
327 id="tspan769">DataRequest</tspan>
328 </text>
329 <text
330 id="text783"
331 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
332 <tspan
333 x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
334 y="11506"
335 id="tspan785">drbd_make_request()</tspan>
336 </text>
337 <text
338 id="text799"
339 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
340 <tspan
341 x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
342 y="12807"
343 id="tspan801">receive_DataRequest()</tspan>
344 </text>
345 <text
346 id="text815"
347 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
348 <tspan
349 x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
350 y="13607"
351 id="tspan817">drbd_endio_read_sec()</tspan>
352 </text>
353 <text
354 id="text831"
355 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
356 <tspan
357 x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
358 y="14008"
359 id="tspan833">w_e_end_data_req()</tspan>
360 </text>
361 <g
362 id="g835"
363 style="visibility:visible">
364 <desc
365 id="desc837">Drawing</desc>
366 <text
367 id="text847"
368 style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
369 <tspan
370 x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
371 y="14607"
372 id="tspan849">receive_DataReply()</tspan>
373 </text>
374 </g>
375 <text
376 id="text863"
377 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
378 <tspan
379 x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
380 y="10878"
381 id="tspan865">Diskless read, 512-32K</tspan>
382 </text>
383 <text
384 id="text879"
385 style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
386 <tspan
387 x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
388 y="12004"
389 id="tspan881">w_send_read_req()</tspan>
390 </text>
391 <text
392 id="text895"
393 style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
394 <tspan
395 x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
396 y="2806"
397 id="tspan897">DRBD 8 data flow</tspan>
398 </text>
399 <path
400 d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
401 id="path907"
402 style="fill:none;stroke:#000000;visibility:visible" />
403 <path
404 d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
405 id="path919"
406 style="fill:none;stroke:#000000;visibility:visible" />
407 <path
408 d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
409 id="path931"
410 style="fill:none;stroke:#000000;visibility:visible" />
411 <text
412 id="text947"
413 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
414 <tspan
415 x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
416 y="5202"
417 id="tspan949">al_begin_io()</tspan>
418 </text>
419 <text
420 id="text963"
421 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
422 <tspan
423 x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
424 y="7331"
425 id="tspan965">al_complete_io()</tspan>
426 </text>
427 <text
428 id="text979"
429 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
430 <tspan
431 x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
432 y="17431"
433 id="tspan981">rs_begin_io()</tspan>
434 </text>
435 <text
436 id="text995"
437 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
438 <tspan
439 x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
440 y="22331"
441 id="tspan997">rs_complete_io()</tspan>
442 </text>
443 <text
444 id="text1011"
445 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
446 <tspan
447 x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
448 y="18402"
449 id="tspan1013">rs_begin_io()</tspan>
450 </text>
451 <text
452 id="text1027"
453 style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
454 <tspan
455 x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
456 y="20331"
457 id="tspan1029">rs_complete_io()</tspan>
458 </text>
459</svg>
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
new file mode 100644
index 000000000000..627b0a1bf35e
--- /dev/null
+++ b/Documentation/blockdev/drbd/README.txt
@@ -0,0 +1,16 @@
1Description
2
3 DRBD is a shared-nothing, synchronously replicated block device. It
4 is designed to serve as a building block for high availability
5 clusters and in this context, is a "drop-in" replacement for shared
6 storage. Simplistically, you could see it as a network RAID 1.
7
8 Please visit http://www.drbd.org to find out more.
9
10The files included here are intended to help understand the implementation
11
12DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
13 relates some functions, and write packets.
14
15conn-states-8.dot, disk-states-8.dot, node-states-8.dot
16 The sub graphs of DRBD's state transitions
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot
new file mode 100644
index 000000000000..025e8cf5e64a
--- /dev/null
+++ b/Documentation/blockdev/drbd/conn-states-8.dot
@@ -0,0 +1,18 @@
1digraph conn_states {
2 StandAllone -> WFConnection [ label = "ioctl_set_net()" ]
3 WFConnection -> Unconnected [ label = "unable to bind()" ]
4 WFConnection -> WFReportParams [ label = "in connect() after accept" ]
5 WFReportParams -> StandAllone [ label = "checks in receive_param()" ]
6 WFReportParams -> Connected [ label = "in receive_param()" ]
7 WFReportParams -> WFBitMapS [ label = "sync_handshake()" ]
8 WFReportParams -> WFBitMapT [ label = "sync_handshake()" ]
9 WFBitMapS -> SyncSource [ label = "receive_bitmap()" ]
10 WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ]
11 SyncSource -> Connected
12 SyncTarget -> Connected
13 SyncSource -> PausedSyncS
14 SyncTarget -> PausedSyncT
15 PausedSyncS -> SyncSource
16 PausedSyncT -> SyncTarget
17 Connected -> WFConnection [ label = "* on network error" ]
18}
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot
new file mode 100644
index 000000000000..d06cfb46fb98
--- /dev/null
+++ b/Documentation/blockdev/drbd/disk-states-8.dot
@@ -0,0 +1,16 @@
1digraph disk_states {
2 Diskless -> Inconsistent [ label = "ioctl_set_disk()" ]
3 Diskless -> Consistent [ label = "ioctl_set_disk()" ]
4 Diskless -> Outdated [ label = "ioctl_set_disk()" ]
5 Consistent -> Outdated [ label = "receive_param()" ]
6 Consistent -> UpToDate [ label = "receive_param()" ]
7 Consistent -> Inconsistent [ label = "start resync" ]
8 Outdated -> Inconsistent [ label = "start resync" ]
9 UpToDate -> Inconsistent [ label = "ioctl_replicate" ]
10 Inconsistent -> UpToDate [ label = "resync completed" ]
11 Consistent -> Failed [ label = "io completion error" ]
12 Outdated -> Failed [ label = "io completion error" ]
13 UpToDate -> Failed [ label = "io completion error" ]
14 Inconsistent -> Failed [ label = "io completion error" ]
15 Failed -> Diskless [ label = "sending notify to peer" ]
16}
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
new file mode 100644
index 000000000000..6d9cf0a7b11d
--- /dev/null
+++ b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
@@ -0,0 +1,85 @@
1// vim: set sw=2 sts=2 :
2digraph {
3 rankdir=BT
4 bgcolor=white
5
6 node [shape=plaintext]
7 node [fontcolor=black]
8
9 StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]
10
11 node [fontcolor=lightgray]
12
13 Unconnected [ label=Unconnected ]
14
15 CommTrouble [ shape=record,
16 label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
17
18 node [fontcolor=gray]
19
20 subgraph cluster_try_connect {
21 label="try to connect, handshake"
22 rank=max
23 WFConnection [ label=WFConnection ]
24 WFReportParams [ label=WFReportParams ]
25 }
26
27 TearDown [ label=TearDown ]
28
29 Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
30
31 node [fontcolor=lightblue]
32
33 StartingSyncS [ label=StartingSyncS ]
34 StartingSyncT [ label=StartingSyncT ]
35
36 subgraph cluster_bitmap_exchange {
37 node [fontcolor=red]
38 fontcolor=red
39 label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
40
41 WFBitMapT [ label=WFBitMapT ]
42 WFSyncUUID [ label=WFSyncUUID ]
43 WFBitMapS [ label=WFBitMapS ]
44 }
45
46 node [fontcolor=blue]
47
48 cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
49
50 node [shape=box,fontcolor=black]
51
52 // drbdadm [label="drbdadm connect"]
53 // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
54 // comm_error [label="communication trouble"]
55
56 //
57 // edges
58 // --------------------------------------
59
60 StandAlone -> Unconnected [ label="drbdadm connect" ]
61 Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
62 Unconnected -> WFConnection [ label="receiver thread is started" ]
63 WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]
64
65 WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
66 WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
67
68 WFReportParams -> WFBitMapS
69 WFReportParams -> WFBitMapT
70 WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
71
72 WFBitMapS -> cluster_resync:S
73 WFSyncUUID -> cluster_resync:T
74
75 edge [color=green]
76 cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ]
77
78 edge [color=red]
79 WFReportParams -> CommTrouble
80 Connected -> CommTrouble
81 cluster_resync:any -> CommTrouble
82 edge [color=black]
83 CommTrouble -> Unconnected [label="receiver thread is stopped" ]
84
85}
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot
new file mode 100644
index 000000000000..4a2b00c23547
--- /dev/null
+++ b/Documentation/blockdev/drbd/node-states-8.dot
@@ -0,0 +1,14 @@
1digraph node_states {
2 Secondary -> Primary [ label = "ioctl_set_state()" ]
3 Primary -> Secondary [ label = "ioctl_set_state()" ]
4}
5
6digraph peer_states {
7 Secondary -> Primary [ label = "recv state packet" ]
8 Primary -> Secondary [ label = "recv state packet" ]
9 Primary -> Unknown [ label = "connection lost" ]
10 Secondary -> Unknown [ label = "connection lost" ]
11 Unknown -> Primary [ label = "connected" ]
12 Unknown -> Secondary [ label = "connected" ]
13}
14
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
new file mode 100644
index 000000000000..630879cd9a42
--- /dev/null
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -0,0 +1,135 @@
1 Block IO Controller
2 ===================
3Overview
4========
5cgroup subsys "blkio" implements the block io controller. There seems to be
6a need of various kinds of IO control policies (like proportional BW, max BW)
7both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
8The plan is to use the same cgroup-based management interface for the blkio
9controller and, based on user options, switch IO policies in the background.
10
11In the first phase, this patchset implements proportional weight time based
12division of disk policy. It is implemented in CFQ. Hence this policy takes
13effect only on leaf nodes when CFQ is being used.
14
15HOWTO
16=====
17You can do a very simple test by running two dd threads in two different
18cgroups. Here is what you can do.
19
20- Enable group scheduling in CFQ
21 CONFIG_CFQ_GROUP_IOSCHED=y
22
23- Compile and boot into kernel and mount IO controller (blkio).
24
25 mount -t cgroup -o blkio none /cgroup
26
27- Create two cgroups
28 mkdir -p /cgroup/test1/ /cgroup/test2
29
30- Set weights of group test1 and test2
31 echo 1000 > /cgroup/test1/blkio.weight
32 echo 500 > /cgroup/test2/blkio.weight
33
34- Create two files of the same size (say 512MB each) on the same disk (file1, file2) and
35 launch two dd threads in different cgroups to read those files.
36
37 sync
38 echo 3 > /proc/sys/vm/drop_caches
39
40 dd if=/mnt/sdb/zerofile1 of=/dev/null &
41 echo $! > /cgroup/test1/tasks
42 cat /cgroup/test1/tasks
43
44 dd if=/mnt/sdb/zerofile2 of=/dev/null &
45 echo $! > /cgroup/test2/tasks
46 cat /cgroup/test2/tasks
47
48- At macro level, first dd should finish first. To get more precise data, keep
49 looking (with the help of a script) at the blkio.time and
50 blkio.sectors files of both test1 and test2 groups. This will tell how
51 much disk time (in milliseconds) each group got and how many sectors each
52 group dispatched to the disk. We provide fairness in terms of disk time, so
53 ideally blkio.time of cgroups should be in proportion to the weight.
54
55Various user visible config options
56===================================
57CONFIG_CFQ_GROUP_IOSCHED
58 - Enables group scheduling in CFQ. Currently only 1 level of group
59 creation is allowed.
60
61CONFIG_DEBUG_CFQ_IOSCHED
62 - Enables some debugging messages in blktrace. Also creates extra
63 cgroup file blkio.dequeue.
64
65Config options selected automatically
66=====================================
67These config options are not user visible and are selected/deselected
68automatically based on IO scheduler configuration.
69
70CONFIG_BLK_CGROUP
71 - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
72
73CONFIG_DEBUG_BLK_CGROUP
74 - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
75
76Details of cgroup files
77=======================
78- blkio.weight
79 - Specifies per cgroup weight.
80
81 Currently allowed range of weights is from 100 to 1000.
82
83- blkio.time
84 - disk time allocated to cgroup per device in milliseconds. First
85 two fields specify the major and minor number of the device and
86 third field specifies the disk time allocated to group in
87 milliseconds.
88
89- blkio.sectors
90 - number of sectors transferred to/from disk by the group. First
91 two fields specify the major and minor number of the device and
92 third field specifies the number of sectors transferred by the
93 group to/from the device.
94
95- blkio.dequeue
96 - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
97 gives the statistics about how many times a group was dequeued
98 from service tree of the device. First two fields specify the major
99 and minor number of the device and third field specifies the number
100 of times a group was dequeued from a particular device.
101
102CFQ sysfs tunable
103=================
104/sys/block/<disk>/queue/iosched/group_isolation
105
106If group_isolation=1, it provides stronger isolation between groups at the
107expense of throughput. By default group_isolation is 0. In general that
108means that if group_isolation=0, expect fairness for sequential workload
109only. Set group_isolation=1 to see fairness for random IO workload also.
110
111Generally CFQ will put random seeky workload in sync-noidle category. CFQ
112will disable idling on these queues and it does a collective idling on group
113of such queues. Generally these are slow moving queues and if there is a
114sync-noidle service tree in each group, that group gets exclusive access to
115disk for certain period. That means it will bring the throughput down if
116group does not have enough IO to drive deeper queue depths and utilize disk
117capacity to the fullest in the slice allocated to it. But the flip side is
118that even a random reader should get better latencies and overall throughput
119if there are lots of sequential readers/sync-idle workload running in the
120system.
121
122If group_isolation=0, then CFQ automatically moves all the random seeky queues
123into the root group. That means there will be no service differentiation for
124that kind of workload. This leads to better throughput as we do collective
125idling on root sync-noidle tree.
126
127By default one should run with group_isolation=0. If that is not sufficient
128and one wants stronger isolation between groups, then set group_isolation=1
129but this will come at cost of reduced throughput.
130
131What works
132==========
133- Currently only sync IO queues are supported. All the buffered writes are
134 still system wide and not per group. Hence we will not see service
135 differentiation for buffered writes between groups.
diff --git a/MAINTAINERS b/MAINTAINERS
index 4a5d67620a51..ea781c1cfb5a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1848,6 +1848,19 @@ S: Maintained
1848F: drivers/scsi/dpt* 1848F: drivers/scsi/dpt*
1849F: drivers/scsi/dpt/ 1849F: drivers/scsi/dpt/
1850 1850
1851DRBD DRIVER
1852P: Philipp Reisner
1853P: Lars Ellenberg
1854M: drbd-dev@lists.linbit.com
1855L: drbd-user@lists.linbit.com
1856W: http://www.drbd.org
1857T: git git://git.drbd.org/linux-2.6-drbd.git drbd
1858T: git git://git.drbd.org/drbd-8.3.git
1859S: Supported
1860F: drivers/block/drbd/
1861F: lib/lru_cache.c
1862F: Documentation/blockdev/drbd/
1863
1851DRIVER CORE, KOBJECTS, AND SYSFS 1864DRIVER CORE, KOBJECTS, AND SYSFS
1852M: Greg Kroah-Hartman <gregkh@suse.de> 1865M: Greg Kroah-Hartman <gregkh@suse.de>
1853T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ 1866T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h
index b686cc7fc44e..01d71e1c8a9e 100644
--- a/arch/alpha/include/asm/cacheflush.h
+++ b/arch/alpha/include/asm/cacheflush.h
@@ -9,6 +9,7 @@
9#define flush_cache_dup_mm(mm) do { } while (0) 9#define flush_cache_dup_mm(mm) do { } while (0)
10#define flush_cache_range(vma, start, end) do { } while (0) 10#define flush_cache_range(vma, start, end) do { } while (0)
11#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 11#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
12#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
12#define flush_dcache_page(page) do { } while (0) 13#define flush_dcache_page(page) do { } while (0)
13#define flush_dcache_mmap_lock(mapping) do { } while (0) 14#define flush_dcache_mmap_lock(mapping) do { } while (0)
14#define flush_dcache_mmap_unlock(mapping) do { } while (0) 15#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 9fd6d3ab68c0..73eceb87e588 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -418,6 +418,7 @@ extern void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
418 * about to change to user space. This is the same method as used on SPARC64. 418 * about to change to user space. This is the same method as used on SPARC64.
419 * See update_mmu_cache for the user space part. 419 * See update_mmu_cache for the user space part.
420 */ 420 */
421#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
421extern void flush_dcache_page(struct page *); 422extern void flush_dcache_page(struct page *);
422 423
423static inline void __flush_icache_all(void) 424static inline void __flush_icache_all(void)
diff --git a/arch/avr32/include/asm/cacheflush.h b/arch/avr32/include/asm/cacheflush.h
index 670674749b20..96e53820bbbd 100644
--- a/arch/avr32/include/asm/cacheflush.h
+++ b/arch/avr32/include/asm/cacheflush.h
@@ -107,6 +107,7 @@ extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
107 * do something here, but only for certain configurations. No such 107 * do something here, but only for certain configurations. No such
108 * configurations exist at this time. 108 * configurations exist at this time.
109 */ 109 */
110#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
110#define flush_dcache_page(page) do { } while (0) 111#define flush_dcache_page(page) do { } while (0)
111#define flush_dcache_mmap_lock(page) do { } while (0) 112#define flush_dcache_mmap_lock(page) do { } while (0)
112#define flush_dcache_mmap_unlock(page) do { } while (0) 113#define flush_dcache_mmap_unlock(page) do { } while (0)
diff --git a/arch/blackfin/include/asm/cacheflush.h b/arch/blackfin/include/asm/cacheflush.h
index af03a36c7a4e..417eaac7fe99 100644
--- a/arch/blackfin/include/asm/cacheflush.h
+++ b/arch/blackfin/include/asm/cacheflush.h
@@ -68,9 +68,11 @@ do { memcpy(dst, src, len); \
68#endif 68#endif
69#if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK) 69#if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
70# define flush_dcache_range(start,end) blackfin_dcache_flush_range((start), (end)) 70# define flush_dcache_range(start,end) blackfin_dcache_flush_range((start), (end))
71#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
71# define flush_dcache_page(page) blackfin_dflush_page(page_address(page)) 72# define flush_dcache_page(page) blackfin_dflush_page(page_address(page))
72#else 73#else
73# define flush_dcache_range(start,end) do { } while (0) 74# define flush_dcache_range(start,end) do { } while (0)
75#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
74# define flush_dcache_page(page) do { } while (0) 76# define flush_dcache_page(page) do { } while (0)
75#endif 77#endif
76 78
diff --git a/arch/cris/include/asm/cacheflush.h b/arch/cris/include/asm/cacheflush.h
index cf60e3f69f8d..36795bca605e 100644
--- a/arch/cris/include/asm/cacheflush.h
+++ b/arch/cris/include/asm/cacheflush.h
@@ -12,6 +12,7 @@
12#define flush_cache_dup_mm(mm) do { } while (0) 12#define flush_cache_dup_mm(mm) do { } while (0)
13#define flush_cache_range(vma, start, end) do { } while (0) 13#define flush_cache_range(vma, start, end) do { } while (0)
14#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 14#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15#define flush_dcache_page(page) do { } while (0) 16#define flush_dcache_page(page) do { } while (0)
16#define flush_dcache_mmap_lock(mapping) do { } while (0) 17#define flush_dcache_mmap_lock(mapping) do { } while (0)
17#define flush_dcache_mmap_unlock(mapping) do { } while (0) 18#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/frv/include/asm/cacheflush.h b/arch/frv/include/asm/cacheflush.h
index 432a69e7f3d4..edbac54ae015 100644
--- a/arch/frv/include/asm/cacheflush.h
+++ b/arch/frv/include/asm/cacheflush.h
@@ -47,6 +47,7 @@ static inline void __flush_cache_all(void)
47} 47}
48 48
49/* dcache/icache coherency... */ 49/* dcache/icache coherency... */
50#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
50#ifdef CONFIG_MMU 51#ifdef CONFIG_MMU
51extern void flush_dcache_page(struct page *page); 52extern void flush_dcache_page(struct page *page);
52#else 53#else
diff --git a/arch/h8300/include/asm/cacheflush.h b/arch/h8300/include/asm/cacheflush.h
index 5ffdca217b95..4cf2df20c1ce 100644
--- a/arch/h8300/include/asm/cacheflush.h
+++ b/arch/h8300/include/asm/cacheflush.h
@@ -15,6 +15,7 @@
15#define flush_cache_dup_mm(mm) do { } while (0) 15#define flush_cache_dup_mm(mm) do { } while (0)
16#define flush_cache_range(vma,a,b) 16#define flush_cache_range(vma,a,b)
17#define flush_cache_page(vma,p,pfn) 17#define flush_cache_page(vma,p,pfn)
18#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
18#define flush_dcache_page(page) 19#define flush_dcache_page(page)
19#define flush_dcache_mmap_lock(mapping) 20#define flush_dcache_mmap_lock(mapping)
20#define flush_dcache_mmap_unlock(mapping) 21#define flush_dcache_mmap_unlock(mapping)
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
index c8ce2719fee8..429eefc93ee7 100644
--- a/arch/ia64/include/asm/cacheflush.h
+++ b/arch/ia64/include/asm/cacheflush.h
@@ -25,6 +25,7 @@
25#define flush_cache_vmap(start, end) do { } while (0) 25#define flush_cache_vmap(start, end) do { } while (0)
26#define flush_cache_vunmap(start, end) do { } while (0) 26#define flush_cache_vunmap(start, end) do { } while (0)
27 27
28#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
28#define flush_dcache_page(page) \ 29#define flush_dcache_page(page) \
29do { \ 30do { \
30 clear_bit(PG_arch_1, &(page)->flags); \ 31 clear_bit(PG_arch_1, &(page)->flags); \
diff --git a/arch/m32r/include/asm/cacheflush.h b/arch/m32r/include/asm/cacheflush.h
index 78587c958146..8e8e04516c39 100644
--- a/arch/m32r/include/asm/cacheflush.h
+++ b/arch/m32r/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ extern void _flush_cache_copyback_all(void);
12#define flush_cache_dup_mm(mm) do { } while (0) 12#define flush_cache_dup_mm(mm) do { } while (0)
13#define flush_cache_range(vma, start, end) do { } while (0) 13#define flush_cache_range(vma, start, end) do { } while (0)
14#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 14#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15#define flush_dcache_page(page) do { } while (0) 16#define flush_dcache_page(page) do { } while (0)
16#define flush_dcache_mmap_lock(mapping) do { } while (0) 17#define flush_dcache_mmap_lock(mapping) do { } while (0)
17#define flush_dcache_mmap_unlock(mapping) do { } while (0) 18#define flush_dcache_mmap_unlock(mapping) do { } while (0)
@@ -33,6 +34,7 @@ extern void smp_flush_cache_all(void);
33#define flush_cache_dup_mm(mm) do { } while (0) 34#define flush_cache_dup_mm(mm) do { } while (0)
34#define flush_cache_range(vma, start, end) do { } while (0) 35#define flush_cache_range(vma, start, end) do { } while (0)
35#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 36#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
37#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
36#define flush_dcache_page(page) do { } while (0) 38#define flush_dcache_page(page) do { } while (0)
37#define flush_dcache_mmap_lock(mapping) do { } while (0) 39#define flush_dcache_mmap_lock(mapping) do { } while (0)
38#define flush_dcache_mmap_unlock(mapping) do { } while (0) 40#define flush_dcache_mmap_unlock(mapping) do { } while (0)
@@ -46,6 +48,7 @@ extern void smp_flush_cache_all(void);
46#define flush_cache_dup_mm(mm) do { } while (0) 48#define flush_cache_dup_mm(mm) do { } while (0)
47#define flush_cache_range(vma, start, end) do { } while (0) 49#define flush_cache_range(vma, start, end) do { } while (0)
48#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 50#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
51#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
49#define flush_dcache_page(page) do { } while (0) 52#define flush_dcache_page(page) do { } while (0)
50#define flush_dcache_mmap_lock(mapping) do { } while (0) 53#define flush_dcache_mmap_lock(mapping) do { } while (0)
51#define flush_dcache_mmap_unlock(mapping) do { } while (0) 54#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h
index 16bf375fdbe1..73de7c89d8e0 100644
--- a/arch/m68k/include/asm/cacheflush_mm.h
+++ b/arch/m68k/include/asm/cacheflush_mm.h
@@ -128,6 +128,7 @@ static inline void __flush_page_to_ram(void *vaddr)
128 } 128 }
129} 129}
130 130
131#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
131#define flush_dcache_page(page) __flush_page_to_ram(page_address(page)) 132#define flush_dcache_page(page) __flush_page_to_ram(page_address(page))
132#define flush_dcache_mmap_lock(mapping) do { } while (0) 133#define flush_dcache_mmap_lock(mapping) do { } while (0)
133#define flush_dcache_mmap_unlock(mapping) do { } while (0) 134#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/m68k/include/asm/cacheflush_no.h b/arch/m68k/include/asm/cacheflush_no.h
index c65f00a94553..89f195656be7 100644
--- a/arch/m68k/include/asm/cacheflush_no.h
+++ b/arch/m68k/include/asm/cacheflush_no.h
@@ -12,6 +12,7 @@
12#define flush_cache_range(vma, start, end) __flush_cache_all() 12#define flush_cache_range(vma, start, end) __flush_cache_all()
13#define flush_cache_page(vma, vmaddr) do { } while (0) 13#define flush_cache_page(vma, vmaddr) do { } while (0)
14#define flush_dcache_range(start,len) __flush_cache_all() 14#define flush_dcache_range(start,len) __flush_cache_all()
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15#define flush_dcache_page(page) do { } while (0) 16#define flush_dcache_page(page) do { } while (0)
16#define flush_dcache_mmap_lock(mapping) do { } while (0) 17#define flush_dcache_mmap_lock(mapping) do { } while (0)
17#define flush_dcache_mmap_unlock(mapping) do { } while (0) 18#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h
index f989d6aad648..088076e657b3 100644
--- a/arch/microblaze/include/asm/cacheflush.h
+++ b/arch/microblaze/include/asm/cacheflush.h
@@ -37,6 +37,7 @@
37#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 37#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
38 38
39#define flush_dcache_range(start, end) __invalidate_dcache_range(start, end) 39#define flush_dcache_range(start, end) __invalidate_dcache_range(start, end)
40#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
40#define flush_dcache_page(page) do { } while (0) 41#define flush_dcache_page(page) do { } while (0)
41#define flush_dcache_mmap_lock(mapping) do { } while (0) 42#define flush_dcache_mmap_lock(mapping) do { } while (0)
42#define flush_dcache_mmap_unlock(mapping) do { } while (0) 43#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h
index 03b1d69b142f..40bb9fde205f 100644
--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -38,6 +38,7 @@ extern void (*flush_cache_range)(struct vm_area_struct *vma,
38extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); 38extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn);
39extern void __flush_dcache_page(struct page *page); 39extern void __flush_dcache_page(struct page *page);
40 40
41#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
41static inline void flush_dcache_page(struct page *page) 42static inline void flush_dcache_page(struct page *page)
42{ 43{
43 if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc) 44 if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc)
diff --git a/arch/mn10300/include/asm/cacheflush.h b/arch/mn10300/include/asm/cacheflush.h
index 1a55d61f0d06..29e692f7f030 100644
--- a/arch/mn10300/include/asm/cacheflush.h
+++ b/arch/mn10300/include/asm/cacheflush.h
@@ -26,6 +26,7 @@
26#define flush_cache_page(vma, vmaddr, pfn) do {} while (0) 26#define flush_cache_page(vma, vmaddr, pfn) do {} while (0)
27#define flush_cache_vmap(start, end) do {} while (0) 27#define flush_cache_vmap(start, end) do {} while (0)
28#define flush_cache_vunmap(start, end) do {} while (0) 28#define flush_cache_vunmap(start, end) do {} while (0)
29#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
29#define flush_dcache_page(page) do {} while (0) 30#define flush_dcache_page(page) do {} while (0)
30#define flush_dcache_mmap_lock(mapping) do {} while (0) 31#define flush_dcache_mmap_lock(mapping) do {} while (0)
31#define flush_dcache_mmap_unlock(mapping) do {} while (0) 32#define flush_dcache_mmap_unlock(mapping) do {} while (0)
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 724395143f26..7a73b615c23d 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -42,6 +42,7 @@ void flush_cache_mm(struct mm_struct *mm);
42#define flush_cache_vmap(start, end) flush_cache_all() 42#define flush_cache_vmap(start, end) flush_cache_all()
43#define flush_cache_vunmap(start, end) flush_cache_all() 43#define flush_cache_vunmap(start, end) flush_cache_all()
44 44
45#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
45extern void flush_dcache_page(struct page *page); 46extern void flush_dcache_page(struct page *page);
46 47
47#define flush_dcache_mmap_lock(mapping) \ 48#define flush_dcache_mmap_lock(mapping) \
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index ba667a383b8c..ab9e402518e8 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -25,6 +25,7 @@
25#define flush_cache_vmap(start, end) do { } while (0) 25#define flush_cache_vmap(start, end) do { } while (0)
26#define flush_cache_vunmap(start, end) do { } while (0) 26#define flush_cache_vunmap(start, end) do { } while (0)
27 27
28#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
28extern void flush_dcache_page(struct page *page); 29extern void flush_dcache_page(struct page *page);
29#define flush_dcache_mmap_lock(mapping) do { } while (0) 30#define flush_dcache_mmap_lock(mapping) do { } while (0)
30#define flush_dcache_mmap_unlock(mapping) do { } while (0) 31#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/s390/include/asm/cacheflush.h b/arch/s390/include/asm/cacheflush.h
index 49d5af916d01..405cc97c6249 100644
--- a/arch/s390/include/asm/cacheflush.h
+++ b/arch/s390/include/asm/cacheflush.h
@@ -10,6 +10,7 @@
10#define flush_cache_dup_mm(mm) do { } while (0) 10#define flush_cache_dup_mm(mm) do { } while (0)
11#define flush_cache_range(vma, start, end) do { } while (0) 11#define flush_cache_range(vma, start, end) do { } while (0)
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
13#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
13#define flush_dcache_page(page) do { } while (0) 14#define flush_dcache_page(page) do { } while (0)
14#define flush_dcache_mmap_lock(mapping) do { } while (0) 15#define flush_dcache_mmap_lock(mapping) do { } while (0)
15#define flush_dcache_mmap_unlock(mapping) do { } while (0) 16#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/score/include/asm/cacheflush.h b/arch/score/include/asm/cacheflush.h
index 07cc8fc457cd..caaba24036e3 100644
--- a/arch/score/include/asm/cacheflush.h
+++ b/arch/score/include/asm/cacheflush.h
@@ -16,6 +16,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end);
16extern void flush_dcache_range(unsigned long start, unsigned long end); 16extern void flush_dcache_range(unsigned long start, unsigned long end);
17 17
18#define flush_cache_dup_mm(mm) do {} while (0) 18#define flush_cache_dup_mm(mm) do {} while (0)
19#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
19#define flush_dcache_page(page) do {} while (0) 20#define flush_dcache_page(page) do {} while (0)
20#define flush_dcache_mmap_lock(mapping) do {} while (0) 21#define flush_dcache_mmap_lock(mapping) do {} while (0)
21#define flush_dcache_mmap_unlock(mapping) do {} while (0) 22#define flush_dcache_mmap_unlock(mapping) do {} while (0)
diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index c29918f3c819..dda96eb3e7c0 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -42,6 +42,7 @@ extern void flush_cache_page(struct vm_area_struct *vma,
42 unsigned long addr, unsigned long pfn); 42 unsigned long addr, unsigned long pfn);
43extern void flush_cache_range(struct vm_area_struct *vma, 43extern void flush_cache_range(struct vm_area_struct *vma,
44 unsigned long start, unsigned long end); 44 unsigned long start, unsigned long end);
45#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
45extern void flush_dcache_page(struct page *page); 46extern void flush_dcache_page(struct page *page);
46extern void flush_icache_range(unsigned long start, unsigned long end); 47extern void flush_icache_range(unsigned long start, unsigned long end);
47extern void flush_icache_page(struct vm_area_struct *vma, 48extern void flush_icache_page(struct vm_area_struct *vma,
diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h
index 68ac10910271..2e468773f250 100644
--- a/arch/sparc/include/asm/cacheflush_32.h
+++ b/arch/sparc/include/asm/cacheflush_32.h
@@ -75,6 +75,7 @@ BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long)
75 75
76extern void sparc_flush_page_to_ram(struct page *page); 76extern void sparc_flush_page_to_ram(struct page *page);
77 77
78#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
78#define flush_dcache_page(page) sparc_flush_page_to_ram(page) 79#define flush_dcache_page(page) sparc_flush_page_to_ram(page)
79#define flush_dcache_mmap_lock(mapping) do { } while (0) 80#define flush_dcache_mmap_lock(mapping) do { } while (0)
80#define flush_dcache_mmap_unlock(mapping) do { } while (0) 81#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h
index c43321729b3b..b95384033e89 100644
--- a/arch/sparc/include/asm/cacheflush_64.h
+++ b/arch/sparc/include/asm/cacheflush_64.h
@@ -37,6 +37,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page);
37#endif 37#endif
38 38
39extern void __flush_dcache_range(unsigned long start, unsigned long end); 39extern void __flush_dcache_range(unsigned long start, unsigned long end);
40#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
40extern void flush_dcache_page(struct page *page); 41extern void flush_dcache_page(struct page *page);
41 42
42#define flush_icache_page(vma, pg) do { } while(0) 43#define flush_icache_page(vma, pg) do { } while(0)
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..9076add593a8 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h
index b7b8fbe47c77..a508f2f73bd7 100644
--- a/arch/xtensa/include/asm/cacheflush.h
+++ b/arch/xtensa/include/asm/cacheflush.h
@@ -101,6 +101,7 @@ static inline void __invalidate_icache_page_alias(unsigned long virt,
101#define flush_cache_vmap(start,end) flush_cache_all() 101#define flush_cache_vmap(start,end) flush_cache_all()
102#define flush_cache_vunmap(start,end) flush_cache_all() 102#define flush_cache_vunmap(start,end) flush_cache_all()
103 103
104#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
104extern void flush_dcache_page(struct page*); 105extern void flush_dcache_page(struct page*);
105extern void flush_cache_range(struct vm_area_struct*, ulong, ulong); 106extern void flush_cache_range(struct vm_area_struct*, ulong, ulong);
106extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long); 107extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long);
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..e20fbde0875c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_CGROUP
81 bool
82 depends on CGROUPS
83 default n
84 ---help---
85 Generic block IO controller cgroup interface. This is the common
86 cgroup interface which should be used by various IO controlling
87 policies.
88
89 Currently, CFQ IO scheduler uses it to recognize task groups and
90 control disk bandwidth allocation (proportional time slice allocation)
91 to such task groups.
92
93config DEBUG_BLK_CGROUP
94 bool
95 depends on BLK_CGROUP
96 default n
97 ---help---
98 Enable some debugging help. Currently it stores the cgroup path
99 in the blk group which can be used by cfq for tracing various
100 group related activity.
101
80endif # BLOCK 102endif # BLOCK
81 103
82config BLOCK_COMPAT 104config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc88770..b71abfb0d726 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -12,24 +12,14 @@ config IOSCHED_NOOP
12 that do their own scheduling and require only minimal assistance from 12 that do their own scheduling and require only minimal assistance from
13 the kernel. 13 the kernel.
14 14
15config IOSCHED_AS
16 tristate "Anticipatory I/O scheduler"
17 default y
18 ---help---
19 The anticipatory I/O scheduler is generally a good choice for most
20 environments, but is quite large and complex when compared to the
21 deadline I/O scheduler, it can also be slower in some cases
22 especially some database loads.
23
24config IOSCHED_DEADLINE 15config IOSCHED_DEADLINE
25 tristate "Deadline I/O scheduler" 16 tristate "Deadline I/O scheduler"
26 default y 17 default y
27 ---help--- 18 ---help---
28 The deadline I/O scheduler is simple and compact, and is often as 19 The deadline I/O scheduler is simple and compact. It will provide
29 good as the anticipatory I/O scheduler, and in some database 20 CSCAN service with FIFO expiration of requests, switching to
30 workloads, better. In the case of a single process performing I/O to 21 a new point in the service tree and doing a batch of IO from there
31 a disk at any one time, its behaviour is almost identical to the 22 in case of expiry.
32 anticipatory I/O scheduler and so is a good choice.
33 23
34config IOSCHED_CFQ 24config IOSCHED_CFQ
35 tristate "CFQ I/O scheduler" 25 tristate "CFQ I/O scheduler"
@@ -37,9 +27,28 @@ config IOSCHED_CFQ
37 ---help--- 27 ---help---
38 The CFQ I/O scheduler tries to distribute bandwidth equally 28 The CFQ I/O scheduler tries to distribute bandwidth equally
39 among all processes in the system. It should provide a fair 29 among all processes in the system. It should provide a fair
40 working environment, suitable for desktop systems. 30 and low latency working environment, suitable for both desktop
31 and server systems.
32
41 This is the default I/O scheduler. 33 This is the default I/O scheduler.
42 34
35config CFQ_GROUP_IOSCHED
36 bool "CFQ Group Scheduling support"
37 depends on IOSCHED_CFQ && CGROUPS
38 select BLK_CGROUP
39 default n
40 ---help---
41 Enable group IO scheduling in CFQ.
42
43config DEBUG_CFQ_IOSCHED
44 bool "Debug CFQ Scheduling"
45 depends on CFQ_GROUP_IOSCHED
46 select DEBUG_BLK_CGROUP
47 default n
48 ---help---
49 Enable CFQ IO scheduling debugging in CFQ. Currently it makes
50 blktrace output more verbose.
51
43choice 52choice
44 prompt "Default I/O scheduler" 53 prompt "Default I/O scheduler"
45 default DEFAULT_CFQ 54 default DEFAULT_CFQ
@@ -47,9 +56,6 @@ choice
47 Select the I/O scheduler which will be used by default for all 56 Select the I/O scheduler which will be used by default for all
48 block devices. 57 block devices.
49 58
50 config DEFAULT_AS
51 bool "Anticipatory" if IOSCHED_AS=y
52
53 config DEFAULT_DEADLINE 59 config DEFAULT_DEADLINE
54 bool "Deadline" if IOSCHED_DEADLINE=y 60 bool "Deadline" if IOSCHED_DEADLINE=y
55 61
@@ -63,7 +69,6 @@ endchoice
63 69
64config DEFAULT_IOSCHED 70config DEFAULT_IOSCHED
65 string 71 string
66 default "anticipatory" if DEFAULT_AS
67 default "deadline" if DEFAULT_DEADLINE 72 default "deadline" if DEFAULT_DEADLINE
68 default "cfq" if DEFAULT_CFQ 73 default "cfq" if DEFAULT_CFQ
69 default "noop" if DEFAULT_NOOP 74 default "noop" if DEFAULT_NOOP
diff --git a/block/Makefile b/block/Makefile
index ba74ca6bfa14..cb2d515ebd6e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,8 +8,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 12obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
12obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
15 15
diff --git a/block/as-iosched.c b/block/as-iosched.c
deleted file mode 100644
index ce8ba57c6557..000000000000
--- a/block/as-iosched.c
+++ /dev/null
@@ -1,1520 +0,0 @@
1/*
2 * Anticipatory & deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 * Nick Piggin <nickpiggin@yahoo.com.au>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/bio.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/init.h>
16#include <linux/compiler.h>
17#include <linux/rbtree.h>
18#include <linux/interrupt.h>
19
20/*
21 * See Documentation/block/as-iosched.txt
22 */
23
/*
 * Default time a read may sit in the fifo before it counts as expired.
 */
#define default_read_expire (HZ / 8)

/*
 * Default expiry for writes.  As with reads this is a soft limit: it is
 * not enforced as a hard deadline, even when the disk could meet it.
 */
#define default_write_expire (HZ / 4)

/*
 * How long a stream of reads is allowed to persist before we check
 * whether it is time to switch over to writes.
 */
#define default_read_batch_expire (HZ / 2)

/*
 * How long we would like a stream of writes to run.  This is a target
 * for the write batch auto-tuning rather than a hard limit, because a
 * lot of writes can be pushed into the disk cache / TCQ in a short time.
 */
#define default_write_batch_expire (HZ / 8)

/*
 * Longest time we will wait while anticipating a read (around 6ms by
 * default).  The value is clamped so that it is never zero.
 */
#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)

/*
 * Thinktimes are tracked up to 20ms.  Larger values could be used, but
 * huge values tend to interfere and do not decay fast enough.  A program
 * may be in a non-io phase (waiting on user input, doing a lengthy
 * computation); a small penalty is acceptable there, and processes that
 * constantly have large thinktimes are still caught.
 */
#define MAX_THINKTIME (HZ/50UL)
62
/* Bit numbers used in as_io_context.state */
enum as_io_states {
	AS_TASK_RUNNING   = 0,	/* the process has not exited */
	AS_TASK_IOSTARTED = 1,	/* the process has started some IO */
	AS_TASK_IORUNNING = 2,	/* the process has completed some IO */
};
69
/* State of the anticipation state machine kept in as_data.antic_status */
enum anticipation_status {
	/* Not anticipating (normal operation) */
	ANTIC_OFF = 0,
	/* The last read has not yet completed */
	ANTIC_WAIT_REQ = 1,
	/* Anticipating a request vs the last read (which has completed) */
	ANTIC_WAIT_NEXT = 2,
	/* Was anticipating, but found a candidate or timed out */
	ANTIC_FINISHED = 3,
};
78
79struct as_data {
80 /*
81 * run time data
82 */
83
84 struct request_queue *q; /* the "owner" queue */
85
86 /*
87 * requests (as_rq s) are present on both sort_list and fifo_list
88 */
89 struct rb_root sort_list[2];
90 struct list_head fifo_list[2];
91
92 struct request *next_rq[2]; /* next in sort order */
93 sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
94
95 unsigned long exit_prob; /* probability a task will exit while
96 being waited on */
97 unsigned long exit_no_coop; /* probablility an exited task will
98 not be part of a later cooperating
99 request */
100 unsigned long new_ttime_total; /* mean thinktime on new proc */
101 unsigned long new_ttime_mean;
102 u64 new_seek_total; /* mean seek on new proc */
103 sector_t new_seek_mean;
104
105 unsigned long current_batch_expires;
106 unsigned long last_check_fifo[2];
107 int changed_batch; /* 1: waiting for old batch to end */
108 int new_batch; /* 1: waiting on first read complete */
109 int batch_data_dir; /* current batch SYNC / ASYNC */
110 int write_batch_count; /* max # of reqs in a write batch */
111 int current_write_count; /* how many requests left this batch */
112 int write_batch_idled; /* has the write batch gone idle? */
113
114 enum anticipation_status antic_status;
115 unsigned long antic_start; /* jiffies: when it started */
116 struct timer_list antic_timer; /* anticipatory scheduling timer */
117 struct work_struct antic_work; /* Deferred unplugging */
118 struct io_context *io_context; /* Identify the expected process */
119 int ioc_finished; /* IO associated with io_context is finished */
120 int nr_dispatched;
121
122 /*
123 * settings that change how the i/o scheduler behaves
124 */
125 unsigned long fifo_expire[2];
126 unsigned long batch_expire[2];
127 unsigned long antic_expire;
128};
129
/*
 * per-request data.
 *
 * The scheduler keeps two pieces of state in each request: the io_context
 * of the submitter in ->elevator_private and one of the arq_state values
 * below, stored as an integer in the ->elevator_private2 pointer.
 */
enum arq_state {
	AS_RQ_NEW=0,		/* New - not referenced and not on any lists */
	AS_RQ_QUEUED,		/* In the request queue. It belongs to the
				   scheduler */
	AS_RQ_DISPATCHED,	/* On the dispatch list. It belongs to the
				   driver now */
	AS_RQ_PRESCHED,		/* Debug poisoning for requests being used */
	AS_RQ_REMOVED,
	AS_RQ_MERGED,
	AS_RQ_POSTSCHED,	/* when they shouldn't be */
};

/* io_context attached to @rq (may be NULL) */
#define RQ_IOC(rq)	((struct io_context *) (rq)->elevator_private)

/*
 * Read / write the arq_state stored in @rq.  The value goes through
 * unsigned long so the pointer <-> integer conversion is explicit, and
 * the @state argument is parenthesized so that a compound expression
 * (e.g. "cond ? A : B") is converted as a whole instead of having the
 * cast bind to its first operand only.
 */
#define RQ_STATE(rq)	\
	((enum arq_state)(unsigned long)((rq)->elevator_private2))
#define RQ_SET_STATE(rq, state)	\
	((rq)->elevator_private2 = (void *)(unsigned long)(state))
148
149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock);
152
153static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
154static void as_antic_stop(struct as_data *ad);
155
156/*
157 * IO Context helper functions
158 */
159
160/* Called to deallocate the as_io_context */
161static void free_as_io_context(struct as_io_context *aic)
162{
163 kfree(aic);
164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) {
166 /*
167 * AS scheduler is exiting, grab exit lock and check
168 * the pending io context count. If it hits zero,
169 * complete ioc_gone and set it back to NULL.
170 */
171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone);
174 ioc_gone = NULL;
175 }
176 spin_unlock(&ioc_gone_lock);
177 }
178}
179
180static void as_trim(struct io_context *ioc)
181{
182 spin_lock_irq(&ioc->lock);
183 if (ioc->aic)
184 free_as_io_context(ioc->aic);
185 ioc->aic = NULL;
186 spin_unlock_irq(&ioc->lock);
187}
188
189/* Called when the task exits */
190static void exit_as_io_context(struct as_io_context *aic)
191{
192 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
193 clear_bit(AS_TASK_RUNNING, &aic->state);
194}
195
196static struct as_io_context *alloc_as_io_context(void)
197{
198 struct as_io_context *ret;
199
200 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
201 if (ret) {
202 ret->dtor = free_as_io_context;
203 ret->exit = exit_as_io_context;
204 ret->state = 1 << AS_TASK_RUNNING;
205 atomic_set(&ret->nr_queued, 0);
206 atomic_set(&ret->nr_dispatched, 0);
207 spin_lock_init(&ret->lock);
208 ret->ttime_total = 0;
209 ret->ttime_samples = 0;
210 ret->ttime_mean = 0;
211 ret->seek_total = 0;
212 ret->seek_samples = 0;
213 ret->seek_mean = 0;
214 elv_ioc_count_inc(as_ioc_count);
215 }
216
217 return ret;
218}
219
220/*
221 * If the current task has no AS IO context then create one and initialise it.
222 * Then take a ref on the task's io context and return it.
223 */
224static struct io_context *as_get_io_context(int node)
225{
226 struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
227 if (ioc && !ioc->aic) {
228 ioc->aic = alloc_as_io_context();
229 if (!ioc->aic) {
230 put_io_context(ioc);
231 ioc = NULL;
232 }
233 }
234 return ioc;
235}
236
237static void as_put_io_context(struct request *rq)
238{
239 struct as_io_context *aic;
240
241 if (unlikely(!RQ_IOC(rq)))
242 return;
243
244 aic = RQ_IOC(rq)->aic;
245
246 if (rq_is_sync(rq) && aic) {
247 unsigned long flags;
248
249 spin_lock_irqsave(&aic->lock, flags);
250 set_bit(AS_TASK_IORUNNING, &aic->state);
251 aic->last_end_request = jiffies;
252 spin_unlock_irqrestore(&aic->lock, flags);
253 }
254
255 put_io_context(RQ_IOC(rq));
256}
257
258/*
259 * rb tree support functions
260 */
261#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))])
262
263static void as_add_rq_rb(struct as_data *ad, struct request *rq)
264{
265 struct request *alias;
266
267 while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) {
268 as_move_to_dispatch(ad, alias);
269 as_antic_stop(ad);
270 }
271}
272
/* Remove @rq from the sort tree of its direction. */
static inline void as_del_rq_rb(struct as_data *ad, struct request *rq)
{
	struct rb_root *root = RQ_RB_ROOT(ad, rq);

	elv_rb_del(root, rq);
}
277
/*
 * IO Scheduler proper
 */

/* Maximum distance (in sectors) the disk will go backward for a request. */
#define MAXBACK (1024 * 1024)

/* Cost multiplier applied to a backward seek relative to a forward one. */
#define BACK_PENALTY 2
288
289/*
290 * as_choose_req selects the preferred one of two requests of the same data_dir
291 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
292 */
293static struct request *
294as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
295{
296 int data_dir;
297 sector_t last, s1, s2, d1, d2;
298 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
299 const sector_t maxback = MAXBACK;
300
301 if (rq1 == NULL || rq1 == rq2)
302 return rq2;
303 if (rq2 == NULL)
304 return rq1;
305
306 data_dir = rq_is_sync(rq1);
307
308 last = ad->last_sector[data_dir];
309 s1 = blk_rq_pos(rq1);
310 s2 = blk_rq_pos(rq2);
311
312 BUG_ON(data_dir != rq_is_sync(rq2));
313
314 /*
315 * Strict one way elevator _except_ in the case where we allow
316 * short backward seeks which are biased as twice the cost of a
317 * similar forward seek.
318 */
319 if (s1 >= last)
320 d1 = s1 - last;
321 else if (s1+maxback >= last)
322 d1 = (last - s1)*BACK_PENALTY;
323 else {
324 r1_wrap = 1;
325 d1 = 0; /* shut up, gcc */
326 }
327
328 if (s2 >= last)
329 d2 = s2 - last;
330 else if (s2+maxback >= last)
331 d2 = (last - s2)*BACK_PENALTY;
332 else {
333 r2_wrap = 1;
334 d2 = 0;
335 }
336
337 /* Found required data */
338 if (!r1_wrap && r2_wrap)
339 return rq1;
340 else if (!r2_wrap && r1_wrap)
341 return rq2;
342 else if (r1_wrap && r2_wrap) {
343 /* both behind the head */
344 if (s1 <= s2)
345 return rq1;
346 else
347 return rq2;
348 }
349
350 /* Both requests in front of the head */
351 if (d1 < d2)
352 return rq1;
353 else if (d2 < d1)
354 return rq2;
355 else {
356 if (s1 >= s2)
357 return rq1;
358 else
359 return rq2;
360 }
361}
362
363/*
364 * as_find_next_rq finds the next request after @prev in elevator order.
365 * this with as_choose_req form the basis for how the scheduler chooses
366 * what request to process next. Anticipation works on top of this.
367 */
368static struct request *
369as_find_next_rq(struct as_data *ad, struct request *last)
370{
371 struct rb_node *rbnext = rb_next(&last->rb_node);
372 struct rb_node *rbprev = rb_prev(&last->rb_node);
373 struct request *next = NULL, *prev = NULL;
374
375 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
376
377 if (rbprev)
378 prev = rb_entry_rq(rbprev);
379
380 if (rbnext)
381 next = rb_entry_rq(rbnext);
382 else {
383 const int data_dir = rq_is_sync(last);
384
385 rbnext = rb_first(&ad->sort_list[data_dir]);
386 if (rbnext && rbnext != &last->rb_node)
387 next = rb_entry_rq(rbnext);
388 }
389
390 return as_choose_req(ad, next, prev);
391}
392
393/*
394 * anticipatory scheduling functions follow
395 */
396
397/*
398 * as_antic_expired tells us when we have anticipated too long.
399 * The funny "absolute difference" math on the elapsed time is to handle
400 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
401 */
402static int as_antic_expired(struct as_data *ad)
403{
404 long delta_jif;
405
406 delta_jif = jiffies - ad->antic_start;
407 if (unlikely(delta_jif < 0))
408 delta_jif = -delta_jif;
409 if (delta_jif < ad->antic_expire)
410 return 0;
411
412 return 1;
413}
414
415/*
416 * as_antic_waitnext starts anticipating that a nice request will soon be
417 * submitted. See also as_antic_waitreq
418 */
419static void as_antic_waitnext(struct as_data *ad)
420{
421 unsigned long timeout;
422
423 BUG_ON(ad->antic_status != ANTIC_OFF
424 && ad->antic_status != ANTIC_WAIT_REQ);
425
426 timeout = ad->antic_start + ad->antic_expire;
427
428 mod_timer(&ad->antic_timer, timeout);
429
430 ad->antic_status = ANTIC_WAIT_NEXT;
431}
432
433/*
434 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
435 * until the request that we're anticipating on has finished. This means we
436 * are timing from when the candidate process wakes up hopefully.
437 */
438static void as_antic_waitreq(struct as_data *ad)
439{
440 BUG_ON(ad->antic_status == ANTIC_FINISHED);
441 if (ad->antic_status == ANTIC_OFF) {
442 if (!ad->io_context || ad->ioc_finished)
443 as_antic_waitnext(ad);
444 else
445 ad->antic_status = ANTIC_WAIT_REQ;
446 }
447}
448
449/*
450 * This is called directly by the functions in this file to stop anticipation.
451 * We kill the timer and schedule a call to the request_fn asap.
452 */
453static void as_antic_stop(struct as_data *ad)
454{
455 int status = ad->antic_status;
456
457 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
458 if (status == ANTIC_WAIT_NEXT)
459 del_timer(&ad->antic_timer);
460 ad->antic_status = ANTIC_FINISHED;
461 /* see as_work_handler */
462 kblockd_schedule_work(ad->q, &ad->antic_work);
463 }
464}
465
466/*
467 * as_antic_timeout is the timer function set by as_antic_waitnext.
468 */
469static void as_antic_timeout(unsigned long data)
470{
471 struct request_queue *q = (struct request_queue *)data;
472 struct as_data *ad = q->elevator->elevator_data;
473 unsigned long flags;
474
475 spin_lock_irqsave(q->queue_lock, flags);
476 if (ad->antic_status == ANTIC_WAIT_REQ
477 || ad->antic_status == ANTIC_WAIT_NEXT) {
478 struct as_io_context *aic;
479 spin_lock(&ad->io_context->lock);
480 aic = ad->io_context->aic;
481
482 ad->antic_status = ANTIC_FINISHED;
483 kblockd_schedule_work(q, &ad->antic_work);
484
485 if (aic->ttime_samples == 0) {
486 /* process anticipated on has exited or timed out*/
487 ad->exit_prob = (7*ad->exit_prob + 256)/8;
488 }
489 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
490 /* process not "saved" by a cooperating request */
491 ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
492 }
493 spin_unlock(&ad->io_context->lock);
494 }
495 spin_unlock_irqrestore(q->queue_lock, flags);
496}
497
498static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
499 unsigned long ttime)
500{
501 /* fixed point: 1.0 == 1<<8 */
502 if (aic->ttime_samples == 0) {
503 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
504 ad->new_ttime_mean = ad->new_ttime_total / 256;
505
506 ad->exit_prob = (7*ad->exit_prob)/8;
507 }
508 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
509 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
510 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
511}
512
513static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
514 sector_t sdist)
515{
516 u64 total;
517
518 if (aic->seek_samples == 0) {
519 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
520 ad->new_seek_mean = ad->new_seek_total / 256;
521 }
522
523 /*
524 * Don't allow the seek distance to get too large from the
525 * odd fragment, pagein, etc
526 */
527 if (aic->seek_samples <= 60) /* second&third seek */
528 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
529 else
530 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
531
532 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
533 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
534 total = aic->seek_total + (aic->seek_samples/2);
535 do_div(total, aic->seek_samples);
536 aic->seek_mean = (sector_t)total;
537}
538
539/*
540 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
541 * updates @aic->ttime_mean based on that. It is called when a new
542 * request is queued.
543 */
544static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
545 struct request *rq)
546{
547 int data_dir = rq_is_sync(rq);
548 unsigned long thinktime = 0;
549 sector_t seek_dist;
550
551 if (aic == NULL)
552 return;
553
554 if (data_dir == BLK_RW_SYNC) {
555 unsigned long in_flight = atomic_read(&aic->nr_queued)
556 + atomic_read(&aic->nr_dispatched);
557 spin_lock(&aic->lock);
558 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
559 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
560 /* Calculate read -> read thinktime */
561 if (test_bit(AS_TASK_IORUNNING, &aic->state)
562 && in_flight == 0) {
563 thinktime = jiffies - aic->last_end_request;
564 thinktime = min(thinktime, MAX_THINKTIME-1);
565 }
566 as_update_thinktime(ad, aic, thinktime);
567
568 /* Calculate read -> read seek distance */
569 if (aic->last_request_pos < blk_rq_pos(rq))
570 seek_dist = blk_rq_pos(rq) -
571 aic->last_request_pos;
572 else
573 seek_dist = aic->last_request_pos -
574 blk_rq_pos(rq);
575 as_update_seekdist(ad, aic, seek_dist);
576 }
577 aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
578 set_bit(AS_TASK_IOSTARTED, &aic->state);
579 spin_unlock(&aic->lock);
580 }
581}
582
583/*
584 * as_close_req decides if one request is considered "close" to the
585 * previous one issued.
586 */
587static int as_close_req(struct as_data *ad, struct as_io_context *aic,
588 struct request *rq)
589{
590 unsigned long delay; /* jiffies */
591 sector_t last = ad->last_sector[ad->batch_data_dir];
592 sector_t next = blk_rq_pos(rq);
593 sector_t delta; /* acceptable close offset (in sectors) */
594 sector_t s;
595
596 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
597 delay = 0;
598 else
599 delay = jiffies - ad->antic_start;
600
601 if (delay == 0)
602 delta = 8192;
603 else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire)
604 delta = 8192 << delay;
605 else
606 return 1;
607
608 if ((last <= next + (delta>>1)) && (next <= last + delta))
609 return 1;
610
611 if (last < next)
612 s = next - last;
613 else
614 s = last - next;
615
616 if (aic->seek_samples == 0) {
617 /*
618 * Process has just started IO. Use past statistics to
619 * gauge success possibility
620 */
621 if (ad->new_seek_mean > s) {
622 /* this request is better than what we're expecting */
623 return 1;
624 }
625
626 } else {
627 if (aic->seek_mean > s) {
628 /* this request is better than what we're expecting */
629 return 1;
630 }
631 }
632
633 return 0;
634}
635
636/*
637 * as_can_break_anticipation returns true if we have been anticipating this
638 * request.
639 *
640 * It also returns true if the process against which we are anticipating
641 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
642 * dispatch it ASAP, because we know that application will not be submitting
643 * any new reads.
644 *
645 * If the task which has submitted the request has exited, break anticipation.
646 *
647 * If this task has queued some other IO, do not enter enticipation.
648 */
649static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
650{
651 struct io_context *ioc;
652 struct as_io_context *aic;
653
654 ioc = ad->io_context;
655 BUG_ON(!ioc);
656 spin_lock(&ioc->lock);
657
658 if (rq && ioc == RQ_IOC(rq)) {
659 /* request from same process */
660 spin_unlock(&ioc->lock);
661 return 1;
662 }
663
664 if (ad->ioc_finished && as_antic_expired(ad)) {
665 /*
666 * In this situation status should really be FINISHED,
667 * however the timer hasn't had the chance to run yet.
668 */
669 spin_unlock(&ioc->lock);
670 return 1;
671 }
672
673 aic = ioc->aic;
674 if (!aic) {
675 spin_unlock(&ioc->lock);
676 return 0;
677 }
678
679 if (atomic_read(&aic->nr_queued) > 0) {
680 /* process has more requests queued */
681 spin_unlock(&ioc->lock);
682 return 1;
683 }
684
685 if (atomic_read(&aic->nr_dispatched) > 0) {
686 /* process has more requests dispatched */
687 spin_unlock(&ioc->lock);
688 return 1;
689 }
690
691 if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
692 /*
693 * Found a close request that is not one of ours.
694 *
695 * This makes close requests from another process update
696 * our IO history. Is generally useful when there are
697 * two or more cooperating processes working in the same
698 * area.
699 */
700 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
701 if (aic->ttime_samples == 0)
702 ad->exit_prob = (7*ad->exit_prob + 256)/8;
703
704 ad->exit_no_coop = (7*ad->exit_no_coop)/8;
705 }
706
707 as_update_iohist(ad, aic, rq);
708 spin_unlock(&ioc->lock);
709 return 1;
710 }
711
712 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
713 /* process anticipated on has exited */
714 if (aic->ttime_samples == 0)
715 ad->exit_prob = (7*ad->exit_prob + 256)/8;
716
717 if (ad->exit_no_coop > 128) {
718 spin_unlock(&ioc->lock);
719 return 1;
720 }
721 }
722
723 if (aic->ttime_samples == 0) {
724 if (ad->new_ttime_mean > ad->antic_expire) {
725 spin_unlock(&ioc->lock);
726 return 1;
727 }
728 if (ad->exit_prob * ad->exit_no_coop > 128*256) {
729 spin_unlock(&ioc->lock);
730 return 1;
731 }
732 } else if (aic->ttime_mean > ad->antic_expire) {
733 /* the process thinks too much between requests */
734 spin_unlock(&ioc->lock);
735 return 1;
736 }
737 spin_unlock(&ioc->lock);
738 return 0;
739}
740
741/*
742 * as_can_anticipate indicates whether we should either run rq
743 * or keep anticipating a better request.
744 */
745static int as_can_anticipate(struct as_data *ad, struct request *rq)
746{
747#if 0 /* disable for now, we need to check tag level as well */
748 /*
749 * SSD device without seek penalty, disable idling
750 */
751 if (blk_queue_nonrot(ad->q)) axman
752 return 0;
753#endif
754
755 if (!ad->io_context)
756 /*
757 * Last request submitted was a write
758 */
759 return 0;
760
761 if (ad->antic_status == ANTIC_FINISHED)
762 /*
763 * Don't restart if we have just finished. Run the next request
764 */
765 return 0;
766
767 if (as_can_break_anticipation(ad, rq))
768 /*
769 * This request is a good candidate. Don't keep anticipating,
770 * run it.
771 */
772 return 0;
773
774 /*
775 * OK from here, we haven't finished, and don't have a decent request!
776 * Status is either ANTIC_OFF so start waiting,
777 * ANTIC_WAIT_REQ so continue waiting for request to finish
778 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
779 */
780
781 return 1;
782}
783
784/*
785 * as_update_rq must be called whenever a request (rq) is added to
786 * the sort_list. This function keeps caches up to date, and checks if the
787 * request might be one we are "anticipating"
788 */
789static void as_update_rq(struct as_data *ad, struct request *rq)
790{
791 const int data_dir = rq_is_sync(rq);
792
793 /* keep the next_rq cache up to date */
794 ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);
795
796 /*
797 * have we been anticipating this request?
798 * or does it come from the same process as the one we are anticipating
799 * for?
800 */
801 if (ad->antic_status == ANTIC_WAIT_REQ
802 || ad->antic_status == ANTIC_WAIT_NEXT) {
803 if (as_can_break_anticipation(ad, rq))
804 as_antic_stop(ad);
805 }
806}
807
808/*
809 * Gathers timings and resizes the write batch automatically
810 */
811static void update_write_batch(struct as_data *ad)
812{
813 unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
814 long write_time;
815
816 write_time = (jiffies - ad->current_batch_expires) + batch;
817 if (write_time < 0)
818 write_time = 0;
819
820 if (write_time > batch && !ad->write_batch_idled) {
821 if (write_time > batch * 3)
822 ad->write_batch_count /= 2;
823 else
824 ad->write_batch_count--;
825 } else if (write_time < batch && ad->current_write_count == 0) {
826 if (batch > write_time * 3)
827 ad->write_batch_count *= 2;
828 else
829 ad->write_batch_count++;
830 }
831
832 if (ad->write_batch_count < 1)
833 ad->write_batch_count = 1;
834}
835
836/*
837 * as_completed_request is to be called when a request has completed and
838 * returned something to the requesting process, be it an error or data.
839 */
840static void as_completed_request(struct request_queue *q, struct request *rq)
841{
842 struct as_data *ad = q->elevator->elevator_data;
843
844 WARN_ON(!list_empty(&rq->queuelist));
845
846 if (RQ_STATE(rq) != AS_RQ_REMOVED) {
847 WARN(1, "rq->state %d\n", RQ_STATE(rq));
848 goto out;
849 }
850
851 if (ad->changed_batch && ad->nr_dispatched == 1) {
852 ad->current_batch_expires = jiffies +
853 ad->batch_expire[ad->batch_data_dir];
854 kblockd_schedule_work(q, &ad->antic_work);
855 ad->changed_batch = 0;
856
857 if (ad->batch_data_dir == BLK_RW_SYNC)
858 ad->new_batch = 1;
859 }
860 WARN_ON(ad->nr_dispatched == 0);
861 ad->nr_dispatched--;
862
863 /*
864 * Start counting the batch from when a request of that direction is
865 * actually serviced. This should help devices with big TCQ windows
866 * and writeback caches
867 */
868 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
869 update_write_batch(ad);
870 ad->current_batch_expires = jiffies +
871 ad->batch_expire[BLK_RW_SYNC];
872 ad->new_batch = 0;
873 }
874
875 if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
876 ad->antic_start = jiffies;
877 ad->ioc_finished = 1;
878 if (ad->antic_status == ANTIC_WAIT_REQ) {
879 /*
880 * We were waiting on this request, now anticipate
881 * the next one
882 */
883 as_antic_waitnext(ad);
884 }
885 }
886
887 as_put_io_context(rq);
888out:
889 RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
890}
891
892/*
893 * as_remove_queued_request removes a request from the pre dispatch queue
894 * without updating refcounts. It is expected the caller will drop the
895 * reference unless it replaces the request at somepart of the elevator
896 * (ie. the dispatch queue)
897 */
898static void as_remove_queued_request(struct request_queue *q,
899 struct request *rq)
900{
901 const int data_dir = rq_is_sync(rq);
902 struct as_data *ad = q->elevator->elevator_data;
903 struct io_context *ioc;
904
905 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
906
907 ioc = RQ_IOC(rq);
908 if (ioc && ioc->aic) {
909 BUG_ON(!atomic_read(&ioc->aic->nr_queued));
910 atomic_dec(&ioc->aic->nr_queued);
911 }
912
913 /*
914 * Update the "next_rq" cache if we are about to remove its
915 * entry
916 */
917 if (ad->next_rq[data_dir] == rq)
918 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
919
920 rq_fifo_clear(rq);
921 as_del_rq_rb(ad, rq);
922}
923
924/*
925 * as_fifo_expired returns 0 if there are no expired requests on the fifo,
926 * 1 otherwise. It is ratelimited so that we only perform the check once per
927 * `fifo_expire' interval. Otherwise a large number of expired requests
928 * would create a hopeless seekstorm.
929 *
930 * See as_antic_expired comment.
931 */
932static int as_fifo_expired(struct as_data *ad, int adir)
933{
934 struct request *rq;
935 long delta_jif;
936
937 delta_jif = jiffies - ad->last_check_fifo[adir];
938 if (unlikely(delta_jif < 0))
939 delta_jif = -delta_jif;
940 if (delta_jif < ad->fifo_expire[adir])
941 return 0;
942
943 ad->last_check_fifo[adir] = jiffies;
944
945 if (list_empty(&ad->fifo_list[adir]))
946 return 0;
947
948 rq = rq_entry_fifo(ad->fifo_list[adir].next);
949
950 return time_after(jiffies, rq_fifo_time(rq));
951}
952
953/*
954 * as_batch_expired returns true if the current batch has expired. A batch
955 * is a set of reads or a set of writes.
956 */
957static inline int as_batch_expired(struct as_data *ad)
958{
959 if (ad->changed_batch || ad->new_batch)
960 return 0;
961
962 if (ad->batch_data_dir == BLK_RW_SYNC)
963 /* TODO! add a check so a complete fifo gets written? */
964 return time_after(jiffies, ad->current_batch_expires);
965
966 return time_after(jiffies, ad->current_batch_expires)
967 || ad->current_write_count == 0;
968}
969
970/*
971 * move an entry to dispatch queue
972 */
973static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
974{
975 const int data_dir = rq_is_sync(rq);
976
977 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
978
979 as_antic_stop(ad);
980 ad->antic_status = ANTIC_OFF;
981
982 /*
983 * This has to be set in order to be correctly updated by
984 * as_find_next_rq
985 */
986 ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
987
988 if (data_dir == BLK_RW_SYNC) {
989 struct io_context *ioc = RQ_IOC(rq);
990 /* In case we have to anticipate after this */
991 copy_io_context(&ad->io_context, &ioc);
992 } else {
993 if (ad->io_context) {
994 put_io_context(ad->io_context);
995 ad->io_context = NULL;
996 }
997
998 if (ad->current_write_count != 0)
999 ad->current_write_count--;
1000 }
1001 ad->ioc_finished = 0;
1002
1003 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
1004
1005 /*
1006 * take it off the sort and fifo list, add to dispatch queue
1007 */
1008 as_remove_queued_request(ad->q, rq);
1009 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
1010
1011 elv_dispatch_sort(ad->q, rq);
1012
1013 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1014 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1015 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1016 ad->nr_dispatched++;
1017}
1018
1019/*
1020 * as_dispatch_request selects the best request according to
1021 * read/write expire, batch expire, etc, and moves it to the dispatch
1022 * queue. Returns 1 if a request was found, 0 otherwise.
1023 */
1024static int as_dispatch_request(struct request_queue *q, int force)
1025{
1026 struct as_data *ad = q->elevator->elevator_data;
1027 const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1028 const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
1029 struct request *rq;
1030
1031 if (unlikely(force)) {
1032 /*
1033 * Forced dispatch, accounting is useless. Reset
1034 * accounting states and dump fifo_lists. Note that
1035 * batch_data_dir is reset to BLK_RW_SYNC to avoid
1036 * screwing write batch accounting as write batch
1037 * accounting occurs on W->R transition.
1038 */
1039 int dispatched = 0;
1040
1041 ad->batch_data_dir = BLK_RW_SYNC;
1042 ad->changed_batch = 0;
1043 ad->new_batch = 0;
1044
1045 while (ad->next_rq[BLK_RW_SYNC]) {
1046 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
1047 dispatched++;
1048 }
1049 ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
1050
1051 while (ad->next_rq[BLK_RW_ASYNC]) {
1052 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
1053 dispatched++;
1054 }
1055 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1056
1057 return dispatched;
1058 }
1059
1060 /* Signal that the write batch was uncontended, so we can't time it */
1061 if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
1062 if (ad->current_write_count == 0 || !writes)
1063 ad->write_batch_idled = 1;
1064 }
1065
1066 if (!(reads || writes)
1067 || ad->antic_status == ANTIC_WAIT_REQ
1068 || ad->antic_status == ANTIC_WAIT_NEXT
1069 || ad->changed_batch)
1070 return 0;
1071
1072 if (!(reads && writes && as_batch_expired(ad))) {
1073 /*
1074 * batch is still running or no reads or no writes
1075 */
1076 rq = ad->next_rq[ad->batch_data_dir];
1077
1078 if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
1079 if (as_fifo_expired(ad, BLK_RW_SYNC))
1080 goto fifo_expired;
1081
1082 if (as_can_anticipate(ad, rq)) {
1083 as_antic_waitreq(ad);
1084 return 0;
1085 }
1086 }
1087
1088 if (rq) {
1089 /* we have a "next request" */
1090 if (reads && !writes)
1091 ad->current_batch_expires =
1092 jiffies + ad->batch_expire[BLK_RW_SYNC];
1093 goto dispatch_request;
1094 }
1095 }
1096
1097 /*
1098 * at this point we are not running a batch. select the appropriate
1099 * data direction (read / write)
1100 */
1101
1102 if (reads) {
1103 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
1104
1105 if (writes && ad->batch_data_dir == BLK_RW_SYNC)
1106 /*
1107 * Last batch was a read, switch to writes
1108 */
1109 goto dispatch_writes;
1110
1111 if (ad->batch_data_dir == BLK_RW_ASYNC) {
1112 WARN_ON(ad->new_batch);
1113 ad->changed_batch = 1;
1114 }
1115 ad->batch_data_dir = BLK_RW_SYNC;
1116 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
1117 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1118 goto dispatch_request;
1119 }
1120
1121 /*
1122 * the last batch was a read
1123 */
1124
1125 if (writes) {
1126dispatch_writes:
1127 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
1128
1129 if (ad->batch_data_dir == BLK_RW_SYNC) {
1130 ad->changed_batch = 1;
1131
1132 /*
1133 * new_batch might be 1 when the queue runs out of
1134 * reads. A subsequent submission of a write might
1135 * cause a change of batch before the read is finished.
1136 */
1137 ad->new_batch = 0;
1138 }
1139 ad->batch_data_dir = BLK_RW_ASYNC;
1140 ad->current_write_count = ad->write_batch_count;
1141 ad->write_batch_idled = 0;
1142 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
1143 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1144 goto dispatch_request;
1145 }
1146
1147 BUG();
1148 return 0;
1149
1150dispatch_request:
1151 /*
1152 * If a request has expired, service it.
1153 */
1154
1155 if (as_fifo_expired(ad, ad->batch_data_dir)) {
1156fifo_expired:
1157 rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1158 }
1159
1160 if (ad->changed_batch) {
1161 WARN_ON(ad->new_batch);
1162
1163 if (ad->nr_dispatched)
1164 return 0;
1165
1166 if (ad->batch_data_dir == BLK_RW_ASYNC)
1167 ad->current_batch_expires = jiffies +
1168 ad->batch_expire[BLK_RW_ASYNC];
1169 else
1170 ad->new_batch = 1;
1171
1172 ad->changed_batch = 0;
1173 }
1174
1175 /*
1176 * rq is the selected appropriate request.
1177 */
1178 as_move_to_dispatch(ad, rq);
1179
1180 return 1;
1181}
1182
1183/*
1184 * add rq to rbtree and fifo
1185 */
1186static void as_add_request(struct request_queue *q, struct request *rq)
1187{
1188 struct as_data *ad = q->elevator->elevator_data;
1189 int data_dir;
1190
1191 RQ_SET_STATE(rq, AS_RQ_NEW);
1192
1193 data_dir = rq_is_sync(rq);
1194
1195 rq->elevator_private = as_get_io_context(q->node);
1196
1197 if (RQ_IOC(rq)) {
1198 as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
1199 atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
1200 }
1201
1202 as_add_rq_rb(ad, rq);
1203
1204 /*
1205 * set expire time and add to fifo list
1206 */
1207 rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
1208 list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);
1209
1210 as_update_rq(ad, rq); /* keep state machine up to date */
1211 RQ_SET_STATE(rq, AS_RQ_QUEUED);
1212}
1213
1214static void as_activate_request(struct request_queue *q, struct request *rq)
1215{
1216 WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
1217 RQ_SET_STATE(rq, AS_RQ_REMOVED);
1218 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1219 atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
1220}
1221
1222static void as_deactivate_request(struct request_queue *q, struct request *rq)
1223{
1224 WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
1225 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1226 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1227 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1228}
1229
1230/*
1231 * as_queue_empty tells us if there are requests left in the device. It may
1232 * not be the case that a driver can get the next request even if the queue
1233 * is not empty - it is used in the block layer to check for plugging and
1234 * merging opportunities
1235 */
1236static int as_queue_empty(struct request_queue *q)
1237{
1238 struct as_data *ad = q->elevator->elevator_data;
1239
1240 return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
1241 && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1242}
1243
1244static int
1245as_merge(struct request_queue *q, struct request **req, struct bio *bio)
1246{
1247 struct as_data *ad = q->elevator->elevator_data;
1248 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1249 struct request *__rq;
1250
1251 /*
1252 * check for front merge
1253 */
1254 __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key);
1255 if (__rq && elv_rq_merge_ok(__rq, bio)) {
1256 *req = __rq;
1257 return ELEVATOR_FRONT_MERGE;
1258 }
1259
1260 return ELEVATOR_NO_MERGE;
1261}
1262
1263static void as_merged_request(struct request_queue *q, struct request *req,
1264 int type)
1265{
1266 struct as_data *ad = q->elevator->elevator_data;
1267
1268 /*
1269 * if the merge was a front merge, we need to reposition request
1270 */
1271 if (type == ELEVATOR_FRONT_MERGE) {
1272 as_del_rq_rb(ad, req);
1273 as_add_rq_rb(ad, req);
1274 /*
1275 * Note! At this stage of this and the next function, our next
1276 * request may not be optimal - eg the request may have "grown"
1277 * behind the disk head. We currently don't bother adjusting.
1278 */
1279 }
1280}
1281
1282static void as_merged_requests(struct request_queue *q, struct request *req,
1283 struct request *next)
1284{
1285 /*
1286 * if next expires before rq, assign its expire time to arq
1287 * and move into next position (next will be deleted) in fifo
1288 */
1289 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
1290 if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
1291 list_move(&req->queuelist, &next->queuelist);
1292 rq_set_fifo_time(req, rq_fifo_time(next));
1293 }
1294 }
1295
1296 /*
1297 * kill knowledge of next, this one is a goner
1298 */
1299 as_remove_queued_request(q, next);
1300 as_put_io_context(next);
1301
1302 RQ_SET_STATE(next, AS_RQ_MERGED);
1303}
1304
1305/*
1306 * This is executed in a "deferred" process context, by kblockd. It calls the
1307 * driver's request_fn so the driver can submit that request.
1308 *
1309 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1310 * state before calling, and don't rely on any state over calls.
1311 *
1312 * FIXME! dispatch queue is not a queue at all!
1313 */
1314static void as_work_handler(struct work_struct *work)
1315{
1316 struct as_data *ad = container_of(work, struct as_data, antic_work);
1317
1318 blk_run_queue(ad->q);
1319}
1320
1321static int as_may_queue(struct request_queue *q, int rw)
1322{
1323 int ret = ELV_MQUEUE_MAY;
1324 struct as_data *ad = q->elevator->elevator_data;
1325 struct io_context *ioc;
1326 if (ad->antic_status == ANTIC_WAIT_REQ ||
1327 ad->antic_status == ANTIC_WAIT_NEXT) {
1328 ioc = as_get_io_context(q->node);
1329 if (ad->io_context == ioc)
1330 ret = ELV_MQUEUE_MUST;
1331 put_io_context(ioc);
1332 }
1333
1334 return ret;
1335}
1336
/*
 * Tear down the per-queue scheduler data created by as_init_queue().
 */
1337static void as_exit_queue(struct elevator_queue *e)
1338{
1339 struct as_data *ad = e->elevator_data;
1340
/* stop the anticipation timer and its deferred work before freeing */
1341 del_timer_sync(&ad->antic_timer);
1342 cancel_work_sync(&ad->antic_work);
1343
/* both fifos must already be empty at this point */
1344 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
1345 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
1346
/* drop the reference held in ad->io_context, then free the data itself */
1347 put_io_context(ad->io_context);
1348 kfree(ad);
1349}
1350
1351/*
1352 * initialize elevator private data (as_data).
1353 */
1354static void *as_init_queue(struct request_queue *q)
1355{
1356 struct as_data *ad;
1357
1358 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node);
1359 if (!ad)
1360 return NULL;
1361
1362 ad->q = q; /* Identify what queue the data belongs to */
1363
1364 /* anticipatory scheduling helpers */
1365 ad->antic_timer.function = as_antic_timeout;
1366 ad->antic_timer.data = (unsigned long)q;
1367 init_timer(&ad->antic_timer);
1368 INIT_WORK(&ad->antic_work, as_work_handler);
1369
1370 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
1371 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
1372 ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
1373 ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
1374 ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
1375 ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
1376 ad->antic_expire = default_antic_expire;
1377 ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
1378 ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
1379
1380 ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
1381 ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
1382 if (ad->write_batch_count < 2)
1383 ad->write_batch_count = 2;
1384
1385 return ad;
1386}
1387
1388/*
1389 * sysfs parts below
1390 */
1391
/*
 * Format @var as a decimal number followed by a newline into @page.
 *
 * @var is unsigned, so it is printed with "%u"; "%d" would show values
 * above INT_MAX as negative numbers.
 *
 * Returns the number of characters written (sprintf()'s return value).
 */
static ssize_t
as_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
1397
1398static ssize_t
1399as_var_store(unsigned long *var, const char *page, size_t count)
1400{
1401 char *p = (char *) page;
1402
1403 *var = simple_strtoul(p, &p, 10);
1404 return count;
1405}
1406
1407static ssize_t est_time_show(struct elevator_queue *e, char *page)
1408{
1409 struct as_data *ad = e->elevator_data;
1410 int pos = 0;
1411
1412 pos += sprintf(page+pos, "%lu %% exit probability\n",
1413 100*ad->exit_prob/256);
1414 pos += sprintf(page+pos, "%lu %% probability of exiting without a "
1415 "cooperating process submitting IO\n",
1416 100*ad->exit_no_coop/256);
1417 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1418 pos += sprintf(page+pos, "%llu sectors new seek distance\n",
1419 (unsigned long long)ad->new_seek_mean);
1420
1421 return pos;
1422}
1423
/*
 * Generate a sysfs show handler named __FUNC. The handler converts the
 * jiffies value __VAR (an expression that may use the local "ad") to
 * milliseconds and prints it with as_var_show().
 */
1424#define SHOW_FUNCTION(__FUNC, __VAR) \
1425static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1426{ \
1427 struct as_data *ad = e->elevator_data; \
1428 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1429}
/* one show handler per tunable */
1430SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
1431SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
1432SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1433SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
1434SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
1435#undef SHOW_FUNCTION
1436
/*
 * Generate a sysfs store handler named __FUNC. The handler parses the
 * written text into *__PTR with as_var_store(), clamps that value to
 * [MIN, MAX] and only then converts it from milliseconds to jiffies, so
 * the limits are expressed in milliseconds.
 *
 * The handler returns what as_var_store() returned, which is the byte
 * count it was given.
 */
1437#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1438static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
1439{ \
1440 struct as_data *ad = e->elevator_data; \
1441 int ret = as_var_store(__PTR, (page), count); \
1442 if (*(__PTR) < (MIN)) \
1443 *(__PTR) = (MIN); \
1444 else if (*(__PTR) > (MAX)) \
1445 *(__PTR) = (MAX); \
1446 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1447 return ret; \
1448}
/* one store handler per tunable; every tunable accepts 0..INT_MAX ms */
1449STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
1450STORE_FUNCTION(as_write_expire_store,
1451 &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
1452STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1453STORE_FUNCTION(as_read_batch_expire_store,
1454 &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
1455STORE_FUNCTION(as_write_batch_expire_store,
1456 &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
1457#undef STORE_FUNCTION
1458
/*
 * Declare a tunable "name" that is world-readable and owner-writable,
 * wired to the generated as_<name>_show / as_<name>_store handlers.
 */
1459#define AS_ATTR(name) \
1460 __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
1461
/* sysfs attributes of the scheduler; the list ends with __ATTR_NULL */
1462static struct elv_fs_entry as_attrs[] = {
/* read-only statistics, served by est_time_show() */
1463 __ATTR_RO(est_time),
1464 AS_ATTR(read_expire),
1465 AS_ATTR(write_expire),
1466 AS_ATTR(antic_expire),
1467 AS_ATTR(read_batch_expire),
1468 AS_ATTR(write_batch_expire),
1469 __ATTR_NULL
1470};
1471
/*
 * Elevator type descriptor for the "anticipatory" scheduler: the table of
 * callbacks plus the sysfs attributes, name and owning module. It is
 * registered in as_init() and unregistered in as_exit().
 */
1472static struct elevator_type iosched_as = {
1473 .ops = {
1474 .elevator_merge_fn = as_merge,
1475 .elevator_merged_fn = as_merged_request,
1476 .elevator_merge_req_fn = as_merged_requests,
1477 .elevator_dispatch_fn = as_dispatch_request,
1478 .elevator_add_req_fn = as_add_request,
1479 .elevator_activate_req_fn = as_activate_request,
1480 .elevator_deactivate_req_fn = as_deactivate_request,
1481 .elevator_queue_empty_fn = as_queue_empty,
1482 .elevator_completed_req_fn = as_completed_request,
/* neighbour lookups use the generic rbtree helpers */
1483 .elevator_former_req_fn = elv_rb_former_request,
1484 .elevator_latter_req_fn = elv_rb_latter_request,
1485 .elevator_may_queue_fn = as_may_queue,
1486 .elevator_init_fn = as_init_queue,
1487 .elevator_exit_fn = as_exit_queue,
1488 .trim = as_trim,
1489 },
1490
1491 .elevator_attrs = as_attrs,
1492 .elevator_name = "anticipatory",
1493 .elevator_owner = THIS_MODULE,
1494};
1495
/*
 * Module init: register the anticipatory elevator type. No result of the
 * registration is checked here; the function always returns 0.
 */
1496static int __init as_init(void)
1497{
1498 elv_register(&iosched_as);
1499
1500 return 0;
1501}
1502
/*
 * Module exit: unregister the elevator type, then wait until no io
 * contexts counted in as_ioc_count remain before the module goes away.
 */
1503static void __exit as_exit(void)
1504{
1505 DECLARE_COMPLETION_ONSTACK(all_gone);
1506 elv_unregister(&iosched_as);
/*
 * Publish the on-stack completion through ioc_gone.
 * NOTE(review): presumably the code that frees the last counted context
 * completes ioc_gone - confirm against the io context free path.
 */
1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb();
/* only sleep if some contexts are still counted */
1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone);
1512 synchronize_rcu();
1513}
1514
1515module_init(as_init);
1516module_exit(as_exit);
1517
1518MODULE_AUTHOR("Nick Piggin");
1519MODULE_LICENSE("GPL");
1520MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
new file mode 100644
index 000000000000..1fa2654db0a6
--- /dev/null
+++ b/block/blk-cgroup.c
@@ -0,0 +1,361 @@
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
16#include <linux/module.h>
17#include <linux/err.h>
18#include "blk-cgroup.h"
19
/* list of registered IO control policies, guarded by blkio_list_lock */
20static DEFINE_SPINLOCK(blkio_list_lock);
21static LIST_HEAD(blkio_list);
22
/*
 * The root cgroup is statically allocated (blkiocg_create() hands it out
 * for the cgroup without a parent) and starts with twice the default
 * weight.
 */
23struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
24EXPORT_SYMBOL_GPL(blkio_root_cgroup);
25
26bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
27{
28 if (!css_tryget(&blkcg->css))
29 return false;
30 return true;
31}
32EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
33
/* Drop a reference on the cgroup subsystem state embedded in @blkcg. */
34void blkiocg_css_put(struct blkio_cgroup *blkcg)
35{
36 css_put(&blkcg->css);
37}
38EXPORT_SYMBOL_GPL(blkiocg_css_put);
39
40struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
41{
42 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
43 struct blkio_cgroup, css);
44}
45EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
46
/*
 * Add @time and @sectors to the running totals kept in @blkg. No lock is
 * taken here.
 */
47void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
48 unsigned long time, unsigned long sectors)
49{
50 blkg->time += time;
51 blkg->sectors += sectors;
52}
53EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
54
55void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
56 struct blkio_group *blkg, void *key, dev_t dev)
57{
58 unsigned long flags;
59
60 spin_lock_irqsave(&blkcg->lock, flags);
61 rcu_assign_pointer(blkg->key, key);
62 blkg->blkcg_id = css_id(&blkcg->css);
63 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
64 spin_unlock_irqrestore(&blkcg->lock, flags);
65#ifdef CONFIG_DEBUG_BLK_CGROUP
66 /* Need to take css reference ? */
67 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
68#endif
69 blkg->dev = dev;
70}
71EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
72
/*
 * Unlink @blkg from its cgroup's list and clear the stored cgroup id.
 * Both callers in this file hold the owning blkcg->lock.
 */
73static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
74{
75 hlist_del_init_rcu(&blkg->blkcg_node);
76 blkg->blkcg_id = 0;
77}
78
79/*
80 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
81 * indicating that blk_group was unhashed by the time we got to it.
82 */
83int blkiocg_del_blkio_group(struct blkio_group *blkg)
84{
85 struct blkio_cgroup *blkcg;
86 unsigned long flags;
87 struct cgroup_subsys_state *css;
88 int ret = 1;
89
90 rcu_read_lock();
91 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
92 if (!css)
93 goto out;
94
95 blkcg = container_of(css, struct blkio_cgroup, css);
96 spin_lock_irqsave(&blkcg->lock, flags);
97 if (!hlist_unhashed(&blkg->blkcg_node)) {
98 __blkiocg_del_blkio_group(blkg);
99 ret = 0;
100 }
101 spin_unlock_irqrestore(&blkcg->lock, flags);
102out:
103 rcu_read_unlock();
104 return ret;
105}
106EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
107
108/* called under rcu_read_lock(). */
109struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
110{
111 struct blkio_group *blkg;
112 struct hlist_node *n;
113 void *__key;
114
115 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
116 __key = blkg->key;
117 if (__key == key)
118 return blkg;
119 }
120
121 return NULL;
122}
123EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
124
/*
 * Generate blkiocg_<__VAR>_read(), a cgroup read handler that returns the
 * field __VAR of the cgroup's blkio_cgroup as a u64.
 */
125#define SHOW_FUNCTION(__VAR) \
126static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
127 struct cftype *cftype) \
128{ \
129 struct blkio_cgroup *blkcg; \
130 \
131 blkcg = cgroup_to_blkio_cgroup(cgroup); \
132 return (u64)blkcg->__VAR; \
133}
134
/* instantiates blkiocg_weight_read() */
135SHOW_FUNCTION(weight);
136#undef SHOW_FUNCTION
137
138static int
139blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
140{
141 struct blkio_cgroup *blkcg;
142 struct blkio_group *blkg;
143 struct hlist_node *n;
144 struct blkio_policy_type *blkiop;
145
146 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
147 return -EINVAL;
148
149 blkcg = cgroup_to_blkio_cgroup(cgroup);
150 spin_lock_irq(&blkcg->lock);
151 blkcg->weight = (unsigned int)val;
152 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
153 spin_lock(&blkio_list_lock);
154 list_for_each_entry(blkiop, &blkio_list, list)
155 blkiop->ops.blkio_update_group_weight_fn(blkg,
156 blkcg->weight);
157 spin_unlock(&blkio_list_lock);
158 }
159 spin_unlock_irq(&blkcg->lock);
160 return 0;
161}
162
/*
 * Generate blkiocg_<__VAR>_read(), a seq_file read handler that prints one
 * "major:minor value" line per group of the cgroup, where value is the
 * group's field __VAR. Groups whose dev is still 0 are skipped.
 *
 * The handler returns -ENODEV if cgroup_lock_live_group() fails and 0
 * otherwise. The list is walked under rcu_read_lock().
 */
163#define SHOW_FUNCTION_PER_GROUP(__VAR) \
164static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
165 struct cftype *cftype, struct seq_file *m) \
166{ \
167 struct blkio_cgroup *blkcg; \
168 struct blkio_group *blkg; \
169 struct hlist_node *n; \
170 \
171 if (!cgroup_lock_live_group(cgroup)) \
172 return -ENODEV; \
173 \
174 blkcg = cgroup_to_blkio_cgroup(cgroup); \
175 rcu_read_lock(); \
176 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
177 if (blkg->dev) \
178 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
179 MINOR(blkg->dev), blkg->__VAR); \
180 } \
181 rcu_read_unlock(); \
182 cgroup_unlock(); \
183 return 0; \
184}
185
/* per-group statistics; "dequeue" exists only in debug builds */
186SHOW_FUNCTION_PER_GROUP(time);
187SHOW_FUNCTION_PER_GROUP(sectors);
188#ifdef CONFIG_DEBUG_BLK_CGROUP
189SHOW_FUNCTION_PER_GROUP(dequeue);
190#endif
191#undef SHOW_FUNCTION_PER_GROUP
192
193#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Add @dequeue to the dequeue counter of @blkg (debug builds only). */
194void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
195 unsigned long dequeue)
196{
197 blkg->dequeue += dequeue;
198}
199EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
200#endif
201
/*
 * Control files of the blkio subsystem; blkiocg_populate() adds them to
 * each cgroup.
 */
202struct cftype blkio_files[] = {
/* the only writable file; range-checked in blkiocg_weight_write() */
203 {
204 .name = "weight",
205 .read_u64 = blkiocg_weight_read,
206 .write_u64 = blkiocg_weight_write,
207 },
/* read-only per-group statistics */
208 {
209 .name = "time",
210 .read_seq_string = blkiocg_time_read,
211 },
212 {
213 .name = "sectors",
214 .read_seq_string = blkiocg_sectors_read,
215 },
216#ifdef CONFIG_DEBUG_BLK_CGROUP
217 {
218 .name = "dequeue",
219 .read_seq_string = blkiocg_dequeue_read,
220 },
221#endif
222};
223
/*
 * Subsystem "populate" callback: add every entry of blkio_files to
 * @cgroup and return the result of cgroup_add_files().
 */
224static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
225{
226 return cgroup_add_files(cgroup, subsys, blkio_files,
227 ARRAY_SIZE(blkio_files));
228}
229
230static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
231{
232 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
233 unsigned long flags;
234 struct blkio_group *blkg;
235 void *key;
236 struct blkio_policy_type *blkiop;
237
238 rcu_read_lock();
239remove_entry:
240 spin_lock_irqsave(&blkcg->lock, flags);
241
242 if (hlist_empty(&blkcg->blkg_list)) {
243 spin_unlock_irqrestore(&blkcg->lock, flags);
244 goto done;
245 }
246
247 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
248 blkcg_node);
249 key = rcu_dereference(blkg->key);
250 __blkiocg_del_blkio_group(blkg);
251
252 spin_unlock_irqrestore(&blkcg->lock, flags);
253
254 /*
255 * This blkio_group is being unlinked as associated cgroup is going
256 * away. Let all the IO controlling policies know about this event.
257 *
258 * Currently this is static call to one io controlling policy. Once
259 * we have more policies in place, we need some dynamic registration
260 * of callback function.
261 */
262 spin_lock(&blkio_list_lock);
263 list_for_each_entry(blkiop, &blkio_list, list)
264 blkiop->ops.blkio_unlink_group_fn(key, blkg);
265 spin_unlock(&blkio_list_lock);
266 goto remove_entry;
267done:
268 free_css_id(&blkio_subsys, &blkcg->css);
269 rcu_read_unlock();
270 kfree(blkcg);
271}
272
273static struct cgroup_subsys_state *
274blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
275{
276 struct blkio_cgroup *blkcg, *parent_blkcg;
277
278 if (!cgroup->parent) {
279 blkcg = &blkio_root_cgroup;
280 goto done;
281 }
282
283 /* Currently we do not support hierarchy deeper than two level (0,1) */
284 parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
285 if (css_depth(&parent_blkcg->css) > 0)
286 return ERR_PTR(-EINVAL);
287
288 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
289 if (!blkcg)
290 return ERR_PTR(-ENOMEM);
291
292 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
293done:
294 spin_lock_init(&blkcg->lock);
295 INIT_HLIST_HEAD(&blkcg->blkg_list);
296
297 return &blkcg->css;
298}
299
300/*
301 * We cannot support shared io contexts, as we have no mean to support
302 * two tasks with the same ioc in two different groups without major rework
303 * of the main cic data structures. For now we allow a task to change
304 * its cgroup only if it's the only owner of its ioc.
305 */
306static int blkiocg_can_attach(struct cgroup_subsys *subsys,
307 struct cgroup *cgroup, struct task_struct *tsk,
308 bool threadgroup)
309{
310 struct io_context *ioc;
311 int ret = 0;
312
313 /* task_lock() is needed to avoid races with exit_io_context() */
314 task_lock(tsk);
315 ioc = tsk->io_context;
316 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
317 ret = -EINVAL;
318 task_unlock(tsk);
319
320 return ret;
321}
322
323static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
324 struct cgroup *prev, struct task_struct *tsk,
325 bool threadgroup)
326{
327 struct io_context *ioc;
328
329 task_lock(tsk);
330 ioc = tsk->io_context;
331 if (ioc)
332 ioc->cgroup_changed = 1;
333 task_unlock(tsk);
334}
335
/*
 * The "blkio" cgroup subsystem: ties the callbacks defined in this file to
 * the cgroup core.
 */
336struct cgroup_subsys blkio_subsys = {
337 .name = "blkio",
338 .create = blkiocg_create,
339 .can_attach = blkiocg_can_attach,
340 .attach = blkiocg_attach,
341 .destroy = blkiocg_destroy,
342 .populate = blkiocg_populate,
343 .subsys_id = blkio_subsys_id,
/* css ids are used: see css_id(), css_lookup() and free_css_id() callers */
344 .use_id = 1,
345};
346
/*
 * Append @blkiop to the list of IO control policies that get the unlink
 * and weight-update callbacks.
 */
347void blkio_policy_register(struct blkio_policy_type *blkiop)
348{
349 spin_lock(&blkio_list_lock);
350 list_add_tail(&blkiop->list, &blkio_list);
351 spin_unlock(&blkio_list_lock);
352}
353EXPORT_SYMBOL_GPL(blkio_policy_register);
354
/* Remove @blkiop from the list of registered IO control policies. */
355void blkio_policy_unregister(struct blkio_policy_type *blkiop)
356{
357 spin_lock(&blkio_list_lock);
358 list_del_init(&blkiop->list);
359 spin_unlock(&blkio_list_lock);
360}
361EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..4d316df863b4
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,127 @@
1#ifndef _BLK_CGROUP_H
2#define _BLK_CGROUP_H
3/*
4 * Common Block IO controller cgroup interface
5 *
6 * Based on ideas and code from CFQ, CFS and BFQ:
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 *
9 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
10 * Paolo Valente <paolo.valente@unimore.it>
11 *
12 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
13 * Nauman Rafique <nauman@google.com>
14 */
15
16#include <linux/cgroup.h>
17
18#ifdef CONFIG_BLK_CGROUP
19
/* Per-cgroup state of the block IO controller. */
20struct blkio_cgroup {
/* embedded cgroup subsystem state */
21 struct cgroup_subsys_state css;
/* IO weight of the cgroup (see the BLKIO_WEIGHT_* limits) */
22 unsigned int weight;
/* taken around additions to and removals from blkg_list */
23 spinlock_t lock;
/* groups of this cgroup, linked through blkio_group.blkcg_node */
24 struct hlist_head blkg_list;
25};
26
/* One group of a blkio_cgroup, linked on that cgroup's blkg_list. */
27struct blkio_group {
28 /* An rcu protected unique identifier for the group */
29 void *key;
/* link in the owning blkio_cgroup's blkg_list */
30 struct hlist_node blkcg_node;
/* css id of the owning cgroup */
31 unsigned short blkcg_id;
32#ifdef CONFIG_DEBUG_BLK_CGROUP
33 /* Store cgroup path */
34 char path[128];
35 /* How many times this group has been removed from service tree */
36 unsigned long dequeue;
37#endif
38 /* The device MKDEV(major, minor), this group has been created for */
39 dev_t dev;
40
41 /* total disk time and nr sectors dispatched by this group */
42 unsigned long time;
43 unsigned long sectors;
44};
45
46extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
47extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
48
/* Callback types an IO control policy implements. */
49typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
50typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
51 unsigned int weight);
52
/* Callbacks of one IO control policy. */
53struct blkio_policy_ops {
54 blkio_unlink_group_fn *blkio_unlink_group_fn;
55 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
56};
57
/* An IO control policy as passed to blkio_policy_register(). */
58struct blkio_policy_type {
/* link in the list of registered policies */
59 struct list_head list;
60 struct blkio_policy_ops ops;
61};
62
63/* Blkio controller policy registration */
64extern void blkio_policy_register(struct blkio_policy_type *);
65extern void blkio_policy_unregister(struct blkio_policy_type *);
66
67#else
68
69struct blkio_group {
70};
71
72struct blkio_policy_type {
73};
74
75static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
76static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
77
78#endif
79
80#define BLKIO_WEIGHT_MIN 100
81#define BLKIO_WEIGHT_MAX 1000
82#define BLKIO_WEIGHT_DEFAULT 500
83
84#ifdef CONFIG_DEBUG_BLK_CGROUP
85static inline char *blkg_path(struct blkio_group *blkg)
86{
87 return blkg->path;
88}
89void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
90 unsigned long dequeue);
91#else
92static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
93static inline void blkiocg_update_blkio_group_dequeue_stats(
94 struct blkio_group *blkg, unsigned long dequeue) {}
95#endif
96
97#ifdef CONFIG_BLK_CGROUP
98extern struct blkio_cgroup blkio_root_cgroup;
99extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
100extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
101 struct blkio_group *blkg, void *key, dev_t dev);
102extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
103extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
104 void *key);
105void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
106 unsigned long time, unsigned long sectors);
107#else
108struct cgroup;
109static inline struct blkio_cgroup *
110cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
111
112static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
113 struct blkio_group *blkg, void *key, dev_t dev)
114{
115}
116
117static inline int
118blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
119
120static inline struct blkio_group *
121blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
122static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
123 unsigned long time, unsigned long sectors)
124{
125}
126#endif
127#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da5111120c..718897e6d37f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2358 rq->rq_disk = bio->bi_bdev->bd_disk; 2358 rq->rq_disk = bio->bi_bdev->bd_disk;
2359} 2359}
2360 2360
2361#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
2362/**
2363 * rq_flush_dcache_pages - Helper function to flush all pages in a request
2364 * @rq: the request to be flushed
2365 *
2366 * Description:
2367 * Flush all pages in @rq.
2368 */
2369void rq_flush_dcache_pages(struct request *rq)
2370{
2371 struct req_iterator iter;
2372 struct bio_vec *bvec;
2373
2374 rq_for_each_segment(bvec, rq, iter)
2375 flush_dcache_page(bvec->bv_page);
2376}
2377EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
2378#endif
2379
2361/** 2380/**
2362 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2381 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2363 * @q : the queue of the device being checked 2382 * @q : the queue of the device being checked
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index d4ed6000147d..cbdabb0dd6d7 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -66,22 +66,22 @@ static void cfq_exit(struct io_context *ioc)
66} 66}
67 67
68/* Called by the exitting task */ 68/* Called by the exitting task */
69void exit_io_context(void) 69void exit_io_context(struct task_struct *task)
70{ 70{
71 struct io_context *ioc; 71 struct io_context *ioc;
72 72
73 task_lock(current); 73 task_lock(task);
74 ioc = current->io_context; 74 ioc = task->io_context;
75 current->io_context = NULL; 75 task->io_context = NULL;
76 task_unlock(current); 76 task_unlock(task);
77 77
78 if (atomic_dec_and_test(&ioc->nr_tasks)) { 78 if (atomic_dec_and_test(&ioc->nr_tasks)) {
79 if (ioc->aic && ioc->aic->exit) 79 if (ioc->aic && ioc->aic->exit)
80 ioc->aic->exit(ioc->aic); 80 ioc->aic->exit(ioc->aic);
81 cfq_exit(ioc); 81 cfq_exit(ioc);
82 82
83 put_io_context(ioc);
84 } 83 }
84 put_io_context(ioc);
85} 85}
86 86
87struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 87struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8799b7..dd1f1e0e196f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -8,6 +8,7 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10#include <linux/gcd.h> 10#include <linux/gcd.h>
11#include <linux/jiffies.h>
11 12
12#include "blk.h" 13#include "blk.h"
13 14
@@ -96,7 +97,11 @@ void blk_set_default_limits(struct queue_limits *lim)
96 lim->max_segment_size = MAX_SEGMENT_SIZE; 97 lim->max_segment_size = MAX_SEGMENT_SIZE;
97 lim->max_sectors = BLK_DEF_MAX_SECTORS; 98 lim->max_sectors = BLK_DEF_MAX_SECTORS;
98 lim->max_hw_sectors = INT_MAX; 99 lim->max_hw_sectors = INT_MAX;
99 lim->max_discard_sectors = SAFE_MAX_SECTORS; 100 lim->max_discard_sectors = 0;
101 lim->discard_granularity = 0;
102 lim->discard_alignment = 0;
103 lim->discard_misaligned = 0;
104 lim->discard_zeroes_data = -1;
100 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 105 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
101 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 106 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
102 lim->alignment_offset = 0; 107 lim->alignment_offset = 0;
@@ -141,7 +146,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
141 q->nr_batching = BLK_BATCH_REQ; 146 q->nr_batching = BLK_BATCH_REQ;
142 147
143 q->unplug_thresh = 4; /* hmm */ 148 q->unplug_thresh = 4; /* hmm */
144 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 149 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
145 if (q->unplug_delay == 0) 150 if (q->unplug_delay == 0)
146 q->unplug_delay = 1; 151 q->unplug_delay = 1;
147 152
@@ -488,6 +493,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
488} 493}
489EXPORT_SYMBOL(blk_queue_stack_limits); 494EXPORT_SYMBOL(blk_queue_stack_limits);
490 495
496static unsigned int lcm(unsigned int a, unsigned int b)
497{
498 if (a && b)
499 return (a * b) / gcd(a, b);
500 else if (b)
501 return b;
502
503 return a;
504}
505
491/** 506/**
492 * blk_stack_limits - adjust queue_limits for stacked devices 507 * blk_stack_limits - adjust queue_limits for stacked devices
493 * @t: the stacking driver limits (top) 508 * @t: the stacking driver limits (top)
@@ -502,6 +517,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
502int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 517int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
503 sector_t offset) 518 sector_t offset)
504{ 519{
520 int ret;
521
522 ret = 0;
523
505 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 524 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
506 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 525 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
507 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 526 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -526,12 +545,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
526 545
527 t->io_min = max(t->io_min, b->io_min); 546 t->io_min = max(t->io_min, b->io_min);
528 t->no_cluster |= b->no_cluster; 547 t->no_cluster |= b->no_cluster;
548 t->discard_zeroes_data &= b->discard_zeroes_data;
529 549
530 /* Bottom device offset aligned? */ 550 /* Bottom device offset aligned? */
531 if (offset && 551 if (offset &&
532 (offset & (b->physical_block_size - 1)) != b->alignment_offset) { 552 (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
533 t->misaligned = 1; 553 t->misaligned = 1;
534 return -1; 554 ret = -1;
555 }
556
557 if (offset &&
558 (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
559 t->discard_misaligned = 1;
560 ret = -1;
535 } 561 }
536 562
537 /* If top has no alignment offset, inherit from bottom */ 563 /* If top has no alignment offset, inherit from bottom */
@@ -539,23 +565,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
539 t->alignment_offset = 565 t->alignment_offset =
540 b->alignment_offset & (b->physical_block_size - 1); 566 b->alignment_offset & (b->physical_block_size - 1);
541 567
568 if (!t->discard_alignment)
569 t->discard_alignment =
570 b->discard_alignment & (b->discard_granularity - 1);
571
542 /* Top device aligned on logical block boundary? */ 572 /* Top device aligned on logical block boundary? */
543 if (t->alignment_offset & (t->logical_block_size - 1)) { 573 if (t->alignment_offset & (t->logical_block_size - 1)) {
544 t->misaligned = 1; 574 t->misaligned = 1;
545 return -1; 575 ret = -1;
546 } 576 }
547 577
548 /* Find lcm() of optimal I/O size */ 578 /* Find lcm() of optimal I/O size and granularity */
549 if (t->io_opt && b->io_opt) 579 t->io_opt = lcm(t->io_opt, b->io_opt);
550 t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); 580 t->discard_granularity = lcm(t->discard_granularity,
551 else if (b->io_opt) 581 b->discard_granularity);
552 t->io_opt = b->io_opt;
553 582
554 /* Verify that optimal I/O size is a multiple of io_min */ 583 /* Verify that optimal I/O size is a multiple of io_min */
555 if (t->io_min && t->io_opt % t->io_min) 584 if (t->io_min && t->io_opt % t->io_min)
556 return -1; 585 ret = -1;
557 586
558 return 0; 587 return ret;
559} 588}
560EXPORT_SYMBOL(blk_stack_limits); 589EXPORT_SYMBOL(blk_stack_limits);
561 590
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81afb284..8606c9543fdd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -126,6 +126,21 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
126 return queue_var_show(queue_io_opt(q), page); 126 return queue_var_show(queue_io_opt(q), page);
127} 127}
128 128
129static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
130{
131 return queue_var_show(q->limits.discard_granularity, page);
132}
133
134static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
135{
136 return queue_var_show(q->limits.max_discard_sectors << 9, page);
137}
138
139static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
140{
141 return queue_var_show(queue_discard_zeroes_data(q), page);
142}
143
129static ssize_t 144static ssize_t
130queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 145queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
131{ 146{
@@ -293,6 +308,21 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
293 .show = queue_io_opt_show, 308 .show = queue_io_opt_show,
294}; 309};
295 310
311static struct queue_sysfs_entry queue_discard_granularity_entry = {
312 .attr = {.name = "discard_granularity", .mode = S_IRUGO },
313 .show = queue_discard_granularity_show,
314};
315
316static struct queue_sysfs_entry queue_discard_max_entry = {
317 .attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
318 .show = queue_discard_max_show,
319};
320
321static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
322 .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
323 .show = queue_discard_zeroes_data_show,
324};
325
296static struct queue_sysfs_entry queue_nonrot_entry = { 326static struct queue_sysfs_entry queue_nonrot_entry = {
297 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 327 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
298 .show = queue_nonrot_show, 328 .show = queue_nonrot_show,
@@ -328,6 +358,9 @@ static struct attribute *default_attrs[] = {
328 &queue_physical_block_size_entry.attr, 358 &queue_physical_block_size_entry.attr,
329 &queue_io_min_entry.attr, 359 &queue_io_min_entry.attr,
330 &queue_io_opt_entry.attr, 360 &queue_io_opt_entry.attr,
361 &queue_discard_granularity_entry.attr,
362 &queue_discard_max_entry.attr,
363 &queue_discard_zeroes_data_entry.attr,
331 &queue_nonrot_entry.attr, 364 &queue_nonrot_entry.attr,
332 &queue_nomerges_entry.attr, 365 &queue_nomerges_entry.attr,
333 &queue_rq_affinity_entry.attr, 366 &queue_rq_affinity_entry.attr,
diff --git a/block/bsg.c b/block/bsg.c
index 0676301f16d0..a9fd2d84b53a 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -15,6 +15,7 @@
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/poll.h> 16#include <linux/poll.h>
17#include <linux/cdev.h> 17#include <linux/cdev.h>
18#include <linux/jiffies.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
19#include <linux/uio.h> 20#include <linux/uio.h>
20#include <linux/idr.h> 21#include <linux/idr.h>
@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
197 rq->cmd_len = hdr->request_len; 198 rq->cmd_len = hdr->request_len;
198 rq->cmd_type = REQ_TYPE_BLOCK_PC; 199 rq->cmd_type = REQ_TYPE_BLOCK_PC;
199 200
200 rq->timeout = (hdr->timeout * HZ) / 1000; 201 rq->timeout = msecs_to_jiffies(hdr->timeout);
201 if (!rq->timeout) 202 if (!rq->timeout)
202 rq->timeout = q->sg_timeout; 203 rq->timeout = q->sg_timeout;
203 if (!rq->timeout) 204 if (!rq->timeout)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa1e9535e358..cfb0b2f5f63d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -9,9 +9,11 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/elevator.h> 11#include <linux/elevator.h>
12#include <linux/jiffies.h>
12#include <linux/rbtree.h> 13#include <linux/rbtree.h>
13#include <linux/ioprio.h> 14#include <linux/ioprio.h>
14#include <linux/blktrace_api.h> 15#include <linux/blktrace_api.h>
16#include "blk-cgroup.h"
15 17
16/* 18/*
17 * tunables 19 * tunables
@@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 10;
27static int cfq_slice_async = HZ / 25; 29static int cfq_slice_async = HZ / 25;
28static const int cfq_slice_async_rq = 2; 30static const int cfq_slice_async_rq = 2;
29static int cfq_slice_idle = HZ / 125; 31static int cfq_slice_idle = HZ / 125;
32static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
33static const int cfq_hist_divisor = 4;
30 34
31/* 35/*
32 * offset from end of service tree 36 * offset from end of service tree
@@ -38,8 +42,15 @@ static int cfq_slice_idle = HZ / 125;
38 */ 42 */
39#define CFQ_MIN_TT (2) 43#define CFQ_MIN_TT (2)
40 44
45/*
46 * Allow merged cfqqs to perform this amount of seeky I/O before
47 * deciding to break the queues up again.
48 */
49#define CFQQ_COOP_TOUT (HZ)
50
41#define CFQ_SLICE_SCALE (5) 51#define CFQ_SLICE_SCALE (5)
42#define CFQ_HW_QUEUE_MIN (5) 52#define CFQ_HW_QUEUE_MIN (5)
53#define CFQ_SERVICE_SHIFT 12
43 54
44#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
45 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
@@ -57,6 +68,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 68#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
58 69
59#define sample_valid(samples) ((samples) > 80) 70#define sample_valid(samples) ((samples) > 80)
71#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
60 72
61/* 73/*
62 * Most of our rbtree usage is for sorting with min extraction, so 74 * Most of our rbtree usage is for sorting with min extraction, so
@@ -67,8 +79,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
67struct cfq_rb_root { 79struct cfq_rb_root {
68 struct rb_root rb; 80 struct rb_root rb;
69 struct rb_node *left; 81 struct rb_node *left;
82 unsigned count;
83 u64 min_vdisktime;
84 struct rb_node *active;
85 unsigned total_weight;
70}; 86};
71#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } 87#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
72 88
73/* 89/*
74 * Per process-grouping structure 90 * Per process-grouping structure
@@ -99,6 +115,11 @@ struct cfq_queue {
99 /* fifo list of requests in sort_list */ 115 /* fifo list of requests in sort_list */
100 struct list_head fifo; 116 struct list_head fifo;
101 117
118 /* time when queue got scheduled in to dispatch first request. */
119 unsigned long dispatch_start;
120 unsigned int allocated_slice;
121 /* time when first request from queue completed and slice started. */
122 unsigned long slice_start;
102 unsigned long slice_end; 123 unsigned long slice_end;
103 long slice_resid; 124 long slice_resid;
104 unsigned int slice_dispatch; 125 unsigned int slice_dispatch;
@@ -112,7 +133,71 @@ struct cfq_queue {
112 unsigned short ioprio, org_ioprio; 133 unsigned short ioprio, org_ioprio;
113 unsigned short ioprio_class, org_ioprio_class; 134 unsigned short ioprio_class, org_ioprio_class;
114 135
136 unsigned int seek_samples;
137 u64 seek_total;
138 sector_t seek_mean;
139 sector_t last_request_pos;
140 unsigned long seeky_start;
141
115 pid_t pid; 142 pid_t pid;
143
144 struct cfq_rb_root *service_tree;
145 struct cfq_queue *new_cfqq;
146 struct cfq_group *cfqg;
147 struct cfq_group *orig_cfqg;
148 /* Sectors dispatched in current dispatch round */
149 unsigned long nr_sectors;
150};
151
152/*
153 * First index in the service_trees.
154 * IDLE is handled separately, so it has negative index
155 */
156enum wl_prio_t {
157 BE_WORKLOAD = 0,
158 RT_WORKLOAD = 1,
159 IDLE_WORKLOAD = 2,
160};
161
162/*
163 * Second index in the service_trees.
164 */
165enum wl_type_t {
166 ASYNC_WORKLOAD = 0,
167 SYNC_NOIDLE_WORKLOAD = 1,
168 SYNC_WORKLOAD = 2
169};
170
171/* This is per cgroup per device grouping structure */
172struct cfq_group {
173 /* group service_tree member */
174 struct rb_node rb_node;
175
176 /* group service_tree key */
177 u64 vdisktime;
178 unsigned int weight;
179 bool on_st;
180
181 /* number of cfqq currently on this group */
182 int nr_cfqq;
183
184 /* Per group busy queus average. Useful for workload slice calc. */
185 unsigned int busy_queues_avg[2];
186 /*
187 * rr lists of queues with requests, onle rr for each priority class.
188 * Counts are embedded in the cfq_rb_root
189 */
190 struct cfq_rb_root service_trees[2][3];
191 struct cfq_rb_root service_tree_idle;
192
193 unsigned long saved_workload_slice;
194 enum wl_type_t saved_workload;
195 enum wl_prio_t saved_serving_prio;
196 struct blkio_group blkg;
197#ifdef CONFIG_CFQ_GROUP_IOSCHED
198 struct hlist_node cfqd_node;
199 atomic_t ref;
200#endif
116}; 201};
117 202
118/* 203/*
@@ -120,11 +205,20 @@ struct cfq_queue {
120 */ 205 */
121struct cfq_data { 206struct cfq_data {
122 struct request_queue *queue; 207 struct request_queue *queue;
208 /* Root service tree for cfq_groups */
209 struct cfq_rb_root grp_service_tree;
210 struct cfq_group root_group;
211 /* Number of active cfq groups on group service tree */
212 int nr_groups;
123 213
124 /* 214 /*
125 * rr list of queues with requests and the count of them 215 * The priority currently being served
126 */ 216 */
127 struct cfq_rb_root service_tree; 217 enum wl_prio_t serving_prio;
218 enum wl_type_t serving_type;
219 unsigned long workload_expires;
220 struct cfq_group *serving_group;
221 bool noidle_tree_requires_idle;
128 222
129 /* 223 /*
130 * Each priority tree is sorted by next_request position. These 224 * Each priority tree is sorted by next_request position. These
@@ -143,8 +237,14 @@ struct cfq_data {
143 */ 237 */
144 int rq_queued; 238 int rq_queued;
145 int hw_tag; 239 int hw_tag;
146 int hw_tag_samples; 240 /*
147 int rq_in_driver_peak; 241 * hw_tag can be
242 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
243 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
244 * 0 => no NCQ
245 */
246 int hw_tag_est_depth;
247 unsigned int hw_tag_samples;
148 248
149 /* 249 /*
150 * idle window management 250 * idle window management
@@ -174,6 +274,7 @@ struct cfq_data {
174 unsigned int cfq_slice_async_rq; 274 unsigned int cfq_slice_async_rq;
175 unsigned int cfq_slice_idle; 275 unsigned int cfq_slice_idle;
176 unsigned int cfq_latency; 276 unsigned int cfq_latency;
277 unsigned int cfq_group_isolation;
177 278
178 struct list_head cic_list; 279 struct list_head cic_list;
179 280
@@ -183,8 +284,28 @@ struct cfq_data {
183 struct cfq_queue oom_cfqq; 284 struct cfq_queue oom_cfqq;
184 285
185 unsigned long last_end_sync_rq; 286 unsigned long last_end_sync_rq;
287
288 /* List of cfq groups being managed on this device*/
289 struct hlist_head cfqg_list;
290 struct rcu_head rcu;
186}; 291};
187 292
293static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
294
295static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
296 enum wl_prio_t prio,
297 enum wl_type_t type,
298 struct cfq_data *cfqd)
299{
300 if (!cfqg)
301 return NULL;
302
303 if (prio == IDLE_WORKLOAD)
304 return &cfqg->service_tree_idle;
305
306 return &cfqg->service_trees[prio][type];
307}
308
188enum cfqq_state_flags { 309enum cfqq_state_flags {
189 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 310 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
190 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 311 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
@@ -195,8 +316,10 @@ enum cfqq_state_flags {
195 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 316 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
196 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 317 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
197 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 318 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
198 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 319 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
199 CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ 320 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
321 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
322 CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */
200}; 323};
201 324
202#define CFQ_CFQQ_FNS(name) \ 325#define CFQ_CFQQ_FNS(name) \
@@ -223,14 +346,78 @@ CFQ_CFQQ_FNS(prio_changed);
223CFQ_CFQQ_FNS(slice_new); 346CFQ_CFQQ_FNS(slice_new);
224CFQ_CFQQ_FNS(sync); 347CFQ_CFQQ_FNS(sync);
225CFQ_CFQQ_FNS(coop); 348CFQ_CFQQ_FNS(coop);
226CFQ_CFQQ_FNS(coop_preempt); 349CFQ_CFQQ_FNS(deep);
350CFQ_CFQQ_FNS(wait_busy);
351CFQ_CFQQ_FNS(wait_busy_done);
227#undef CFQ_CFQQ_FNS 352#undef CFQ_CFQQ_FNS
228 353
354#ifdef CONFIG_DEBUG_CFQ_IOSCHED
355#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
356 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
357 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
358 blkg_path(&(cfqq)->cfqg->blkg), ##args);
359
360#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
361 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
362 blkg_path(&(cfqg)->blkg), ##args); \
363
364#else
229#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 365#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
230 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 366 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
367#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
368#endif
231#define cfq_log(cfqd, fmt, args...) \ 369#define cfq_log(cfqd, fmt, args...) \
232 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 370 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
233 371
372/* Traverses through cfq group service trees */
373#define for_each_cfqg_st(cfqg, i, j, st) \
374 for (i = 0; i <= IDLE_WORKLOAD; i++) \
375 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
376 : &cfqg->service_tree_idle; \
377 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
378 (i == IDLE_WORKLOAD && j == 0); \
379 j++, st = i < IDLE_WORKLOAD ? \
380 &cfqg->service_trees[i][j]: NULL) \
381
382
383static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
384{
385 if (cfq_class_idle(cfqq))
386 return IDLE_WORKLOAD;
387 if (cfq_class_rt(cfqq))
388 return RT_WORKLOAD;
389 return BE_WORKLOAD;
390}
391
392
393static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
394{
395 if (!cfq_cfqq_sync(cfqq))
396 return ASYNC_WORKLOAD;
397 if (!cfq_cfqq_idle_window(cfqq))
398 return SYNC_NOIDLE_WORKLOAD;
399 return SYNC_WORKLOAD;
400}
401
402static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
403 struct cfq_data *cfqd,
404 struct cfq_group *cfqg)
405{
406 if (wl == IDLE_WORKLOAD)
407 return cfqg->service_tree_idle.count;
408
409 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
410 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
411 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
412}
413
414static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
415 struct cfq_group *cfqg)
416{
417 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
418 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
419}
420
234static void cfq_dispatch_insert(struct request_queue *, struct request *); 421static void cfq_dispatch_insert(struct request_queue *, struct request *);
235static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 422static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
236 struct io_context *, gfp_t); 423 struct io_context *, gfp_t);
@@ -279,7 +466,7 @@ static int cfq_queue_empty(struct request_queue *q)
279{ 466{
280 struct cfq_data *cfqd = q->elevator->elevator_data; 467 struct cfq_data *cfqd = q->elevator->elevator_data;
281 468
282 return !cfqd->busy_queues; 469 return !cfqd->rq_queued;
283} 470}
284 471
285/* 472/*
@@ -303,10 +490,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
303 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 490 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
304} 491}
305 492
493static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
494{
495 u64 d = delta << CFQ_SERVICE_SHIFT;
496
497 d = d * BLKIO_WEIGHT_DEFAULT;
498 do_div(d, cfqg->weight);
499 return d;
500}
501
502static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
503{
504 s64 delta = (s64)(vdisktime - min_vdisktime);
505 if (delta > 0)
506 min_vdisktime = vdisktime;
507
508 return min_vdisktime;
509}
510
511static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
512{
513 s64 delta = (s64)(vdisktime - min_vdisktime);
514 if (delta < 0)
515 min_vdisktime = vdisktime;
516
517 return min_vdisktime;
518}
519
520static void update_min_vdisktime(struct cfq_rb_root *st)
521{
522 u64 vdisktime = st->min_vdisktime;
523 struct cfq_group *cfqg;
524
525 if (st->active) {
526 cfqg = rb_entry_cfqg(st->active);
527 vdisktime = cfqg->vdisktime;
528 }
529
530 if (st->left) {
531 cfqg = rb_entry_cfqg(st->left);
532 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
533 }
534
535 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
536}
537
538/*
539 * get averaged number of queues of RT/BE priority.
540 * average is updated, with a formula that gives more weight to higher numbers,
541 * to quickly follows sudden increases and decrease slowly
542 */
543
544static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
545 struct cfq_group *cfqg, bool rt)
546{
547 unsigned min_q, max_q;
548 unsigned mult = cfq_hist_divisor - 1;
549 unsigned round = cfq_hist_divisor / 2;
550 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
551
552 min_q = min(cfqg->busy_queues_avg[rt], busy);
553 max_q = max(cfqg->busy_queues_avg[rt], busy);
554 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
555 cfq_hist_divisor;
556 return cfqg->busy_queues_avg[rt];
557}
558
559static inline unsigned
560cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
561{
562 struct cfq_rb_root *st = &cfqd->grp_service_tree;
563
564 return cfq_target_latency * cfqg->weight / st->total_weight;
565}
566
306static inline void 567static inline void
307cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 568cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
308{ 569{
309 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 570 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
571 if (cfqd->cfq_latency) {
572 /*
573 * interested queues (we consider only the ones with the same
574 * priority class in the cfq group)
575 */
576 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
577 cfq_class_rt(cfqq));
578 unsigned sync_slice = cfqd->cfq_slice[1];
579 unsigned expect_latency = sync_slice * iq;
580 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
581
582 if (expect_latency > group_slice) {
583 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
584 /* scale low_slice according to IO priority
585 * and sync vs async */
586 unsigned low_slice =
587 min(slice, base_low_slice * slice / sync_slice);
588 /* the adapted slice value is scaled to fit all iqs
589 * into the target latency */
590 slice = max(slice * group_slice / expect_latency,
591 low_slice);
592 }
593 }
594 cfqq->slice_start = jiffies;
595 cfqq->slice_end = jiffies + slice;
596 cfqq->allocated_slice = slice;
310 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 597 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
311} 598}
312 599
@@ -331,9 +618,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
331 * behind the head is penalized and only allowed to a certain extent. 618 * behind the head is penalized and only allowed to a certain extent.
332 */ 619 */
333static struct request * 620static struct request *
334cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 621cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
335{ 622{
336 sector_t last, s1, s2, d1 = 0, d2 = 0; 623 sector_t s1, s2, d1 = 0, d2 = 0;
337 unsigned long back_max; 624 unsigned long back_max;
338#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 625#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
339#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 626#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
@@ -356,8 +643,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
356 s1 = blk_rq_pos(rq1); 643 s1 = blk_rq_pos(rq1);
357 s2 = blk_rq_pos(rq2); 644 s2 = blk_rq_pos(rq2);
358 645
359 last = cfqd->last_position;
360
361 /* 646 /*
362 * by definition, 1KiB is 2 sectors 647 * by definition, 1KiB is 2 sectors
363 */ 648 */
@@ -425,6 +710,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
425 */ 710 */
426static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 711static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
427{ 712{
713 /* Service tree is empty */
714 if (!root->count)
715 return NULL;
716
428 if (!root->left) 717 if (!root->left)
429 root->left = rb_first(&root->rb); 718 root->left = rb_first(&root->rb);
430 719
@@ -434,6 +723,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
434 return NULL; 723 return NULL;
435} 724}
436 725
726static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
727{
728 if (!root->left)
729 root->left = rb_first(&root->rb);
730
731 if (root->left)
732 return rb_entry_cfqg(root->left);
733
734 return NULL;
735}
736
437static void rb_erase_init(struct rb_node *n, struct rb_root *root) 737static void rb_erase_init(struct rb_node *n, struct rb_root *root)
438{ 738{
439 rb_erase(n, root); 739 rb_erase(n, root);
@@ -445,6 +745,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
445 if (root->left == n) 745 if (root->left == n)
446 root->left = NULL; 746 root->left = NULL;
447 rb_erase_init(n, &root->rb); 747 rb_erase_init(n, &root->rb);
748 --root->count;
448} 749}
449 750
450/* 751/*
@@ -471,7 +772,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
471 next = rb_entry_rq(rbnext); 772 next = rb_entry_rq(rbnext);
472 } 773 }
473 774
474 return cfq_choose_req(cfqd, next, prev); 775 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
475} 776}
476 777
477static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 778static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
@@ -480,12 +781,336 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
480 /* 781 /*
481 * just an approximation, should be ok. 782 * just an approximation, should be ok.
482 */ 783 */
483 return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - 784 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
484 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 785 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
485} 786}
486 787
788static inline s64
789cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
790{
791 return cfqg->vdisktime - st->min_vdisktime;
792}
793
794static void
795__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
796{
797 struct rb_node **node = &st->rb.rb_node;
798 struct rb_node *parent = NULL;
799 struct cfq_group *__cfqg;
800 s64 key = cfqg_key(st, cfqg);
801 int left = 1;
802
803 while (*node != NULL) {
804 parent = *node;
805 __cfqg = rb_entry_cfqg(parent);
806
807 if (key < cfqg_key(st, __cfqg))
808 node = &parent->rb_left;
809 else {
810 node = &parent->rb_right;
811 left = 0;
812 }
813 }
814
815 if (left)
816 st->left = &cfqg->rb_node;
817
818 rb_link_node(&cfqg->rb_node, parent, node);
819 rb_insert_color(&cfqg->rb_node, &st->rb);
820}
821
822static void
823cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
824{
825 struct cfq_rb_root *st = &cfqd->grp_service_tree;
826 struct cfq_group *__cfqg;
827 struct rb_node *n;
828
829 cfqg->nr_cfqq++;
830 if (cfqg->on_st)
831 return;
832
833 /*
834 * Currently put the group at the end. Later implement something
835 * so that groups get lesser vtime based on their weights, so that
836 * if group does not loose all if it was not continously backlogged.
837 */
838 n = rb_last(&st->rb);
839 if (n) {
840 __cfqg = rb_entry_cfqg(n);
841 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
842 } else
843 cfqg->vdisktime = st->min_vdisktime;
844
845 __cfq_group_service_tree_add(st, cfqg);
846 cfqg->on_st = true;
847 cfqd->nr_groups++;
848 st->total_weight += cfqg->weight;
849}
850
851static void
852cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
853{
854 struct cfq_rb_root *st = &cfqd->grp_service_tree;
855
856 if (st->active == &cfqg->rb_node)
857 st->active = NULL;
858
859 BUG_ON(cfqg->nr_cfqq < 1);
860 cfqg->nr_cfqq--;
861
862 /* If there are other cfq queues under this group, don't delete it */
863 if (cfqg->nr_cfqq)
864 return;
865
866 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
867 cfqg->on_st = false;
868 cfqd->nr_groups--;
869 st->total_weight -= cfqg->weight;
870 if (!RB_EMPTY_NODE(&cfqg->rb_node))
871 cfq_rb_erase(&cfqg->rb_node, st);
872 cfqg->saved_workload_slice = 0;
873 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
874}
875
876static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
877{
878 unsigned int slice_used;
879
880 /*
881 * Queue got expired before even a single request completed or
882 * got expired immediately after first request completion.
883 */
884 if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
885 /*
886 * Also charge the seek time incurred to the group, otherwise
887 * if there are mutiple queues in the group, each can dispatch
888 * a single request on seeky media and cause lots of seek time
889 * and group will never know it.
890 */
891 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
892 1);
893 } else {
894 slice_used = jiffies - cfqq->slice_start;
895 if (slice_used > cfqq->allocated_slice)
896 slice_used = cfqq->allocated_slice;
897 }
898
899 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
900 cfqq->nr_sectors);
901 return slice_used;
902}
903
904static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
905 struct cfq_queue *cfqq)
906{
907 struct cfq_rb_root *st = &cfqd->grp_service_tree;
908 unsigned int used_sl, charge_sl;
909 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
910 - cfqg->service_tree_idle.count;
911
912 BUG_ON(nr_sync < 0);
913 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
914
915 if (!cfq_cfqq_sync(cfqq) && !nr_sync)
916 charge_sl = cfqq->allocated_slice;
917
918 /* Can't update vdisktime while group is on service tree */
919 cfq_rb_erase(&cfqg->rb_node, st);
920 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
921 __cfq_group_service_tree_add(st, cfqg);
922
923 /* This group is being expired. Save the context */
924 if (time_after(cfqd->workload_expires, jiffies)) {
925 cfqg->saved_workload_slice = cfqd->workload_expires
926 - jiffies;
927 cfqg->saved_workload = cfqd->serving_type;
928 cfqg->saved_serving_prio = cfqd->serving_prio;
929 } else
930 cfqg->saved_workload_slice = 0;
931
932 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
933 st->min_vdisktime);
934 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
935 cfqq->nr_sectors);
936}
937
938#ifdef CONFIG_CFQ_GROUP_IOSCHED
939static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
940{
941 if (blkg)
942 return container_of(blkg, struct cfq_group, blkg);
943 return NULL;
944}
945
946void
947cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
948{
949 cfqg_of_blkg(blkg)->weight = weight;
950}
951
952static struct cfq_group *
953cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
954{
955 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
956 struct cfq_group *cfqg = NULL;
957 void *key = cfqd;
958 int i, j;
959 struct cfq_rb_root *st;
960 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
961 unsigned int major, minor;
962
963 /* Do we need to take this reference */
964 if (!blkiocg_css_tryget(blkcg))
965 return NULL;;
966
967 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
968 if (cfqg || !create)
969 goto done;
970
971 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
972 if (!cfqg)
973 goto done;
974
975 cfqg->weight = blkcg->weight;
976 for_each_cfqg_st(cfqg, i, j, st)
977 *st = CFQ_RB_ROOT;
978 RB_CLEAR_NODE(&cfqg->rb_node);
979
980 /*
981 * Take the initial reference that will be released on destroy
982 * This can be thought of a joint reference by cgroup and
983 * elevator which will be dropped by either elevator exit
984 * or cgroup deletion path depending on who is exiting first.
985 */
986 atomic_set(&cfqg->ref, 1);
987
988 /* Add group onto cgroup list */
989 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
990 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
991 MKDEV(major, minor));
992
993 /* Add group on cfqd list */
994 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
995
996done:
997 blkiocg_css_put(blkcg);
998 return cfqg;
999}
1000
487/* 1001/*
488 * The cfqd->service_tree holds all pending cfq_queue's that have 1002 * Search for the cfq group current task belongs to. If create = 1, then also
1003 * create the cfq group if it does not exist. request_queue lock must be held.
1004 */
1005static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1006{
1007 struct cgroup *cgroup;
1008 struct cfq_group *cfqg = NULL;
1009
1010 rcu_read_lock();
1011 cgroup = task_cgroup(current, blkio_subsys_id);
1012 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1013 if (!cfqg && create)
1014 cfqg = &cfqd->root_group;
1015 rcu_read_unlock();
1016 return cfqg;
1017}
1018
1019static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1020{
1021 /* Currently, all async queues are mapped to root group */
1022 if (!cfq_cfqq_sync(cfqq))
1023 cfqg = &cfqq->cfqd->root_group;
1024
1025 cfqq->cfqg = cfqg;
1026 /* cfqq reference on cfqg */
1027 atomic_inc(&cfqq->cfqg->ref);
1028}
1029
1030static void cfq_put_cfqg(struct cfq_group *cfqg)
1031{
1032 struct cfq_rb_root *st;
1033 int i, j;
1034
1035 BUG_ON(atomic_read(&cfqg->ref) <= 0);
1036 if (!atomic_dec_and_test(&cfqg->ref))
1037 return;
1038 for_each_cfqg_st(cfqg, i, j, st)
1039 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1040 kfree(cfqg);
1041}
1042
1043static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1044{
1045 /* Something wrong if we are trying to remove same group twice */
1046 BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1047
1048 hlist_del_init(&cfqg->cfqd_node);
1049
1050 /*
1051 * Put the reference taken at the time of creation so that when all
1052 * queues are gone, group can be destroyed.
1053 */
1054 cfq_put_cfqg(cfqg);
1055}
1056
1057static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1058{
1059 struct hlist_node *pos, *n;
1060 struct cfq_group *cfqg;
1061
1062 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1063 /*
1064 * If cgroup removal path got to blk_group first and removed
1065 * it from cgroup list, then it will take care of destroying
1066 * cfqg also.
1067 */
1068 if (!blkiocg_del_blkio_group(&cfqg->blkg))
1069 cfq_destroy_cfqg(cfqd, cfqg);
1070 }
1071}
1072
1073/*
1074 * Blk cgroup controller notification saying that blkio_group object is being
1075 * delinked as associated cgroup object is going away. That also means that
1076 * no new IO will come in this group. So get rid of this group as soon as
1077 * any pending IO in the group is finished.
1078 *
1079 * This function is called under rcu_read_lock(). key is the rcu protected
1080 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1081 * read lock.
1082 *
1083 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1084 * it should not be NULL as even if elevator was exiting, cgroup deltion
1085 * path got to it first.
1086 */
1087void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1088{
1089 unsigned long flags;
1090 struct cfq_data *cfqd = key;
1091
1092 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1093 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1094 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1095}
1096
1097#else /* GROUP_IOSCHED */
1098static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1099{
1100 return &cfqd->root_group;
1101}
1102static inline void
1103cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1104 cfqq->cfqg = cfqg;
1105}
1106
1107static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1108static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1109
1110#endif /* GROUP_IOSCHED */
1111
1112/*
1113 * The cfqd->service_trees holds all pending cfq_queue's that have
489 * requests waiting to be processed. It is sorted in the order that 1114 * requests waiting to be processed. It is sorted in the order that
490 * we will service the queues. 1115 * we will service the queues.
491 */ 1116 */
@@ -495,11 +1120,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
495 struct rb_node **p, *parent; 1120 struct rb_node **p, *parent;
496 struct cfq_queue *__cfqq; 1121 struct cfq_queue *__cfqq;
497 unsigned long rb_key; 1122 unsigned long rb_key;
1123 struct cfq_rb_root *service_tree;
498 int left; 1124 int left;
1125 int new_cfqq = 1;
1126 int group_changed = 0;
1127
1128#ifdef CONFIG_CFQ_GROUP_IOSCHED
1129 if (!cfqd->cfq_group_isolation
1130 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1131 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1132 /* Move this cfq to root group */
1133 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1134 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1135 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1136 cfqq->orig_cfqg = cfqq->cfqg;
1137 cfqq->cfqg = &cfqd->root_group;
1138 atomic_inc(&cfqd->root_group.ref);
1139 group_changed = 1;
1140 } else if (!cfqd->cfq_group_isolation
1141 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1142 /* cfqq is sequential now, so it needs to go back to its original group */
1143 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1144 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1145 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1146 cfq_put_cfqg(cfqq->cfqg);
1147 cfqq->cfqg = cfqq->orig_cfqg;
1148 cfqq->orig_cfqg = NULL;
1149 group_changed = 1;
1150 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1151 }
1152#endif
499 1153
1154 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1155 cfqq_type(cfqq), cfqd);
500 if (cfq_class_idle(cfqq)) { 1156 if (cfq_class_idle(cfqq)) {
501 rb_key = CFQ_IDLE_DELAY; 1157 rb_key = CFQ_IDLE_DELAY;
502 parent = rb_last(&cfqd->service_tree.rb); 1158 parent = rb_last(&service_tree->rb);
503 if (parent && parent != &cfqq->rb_node) { 1159 if (parent && parent != &cfqq->rb_node) {
504 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1160 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
505 rb_key += __cfqq->rb_key; 1161 rb_key += __cfqq->rb_key;
@@ -517,23 +1173,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
517 cfqq->slice_resid = 0; 1173 cfqq->slice_resid = 0;
518 } else { 1174 } else {
519 rb_key = -HZ; 1175 rb_key = -HZ;
520 __cfqq = cfq_rb_first(&cfqd->service_tree); 1176 __cfqq = cfq_rb_first(service_tree);
521 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1177 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
522 } 1178 }
523 1179
524 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1180 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1181 new_cfqq = 0;
525 /* 1182 /*
526 * same position, nothing more to do 1183 * same position, nothing more to do
527 */ 1184 */
528 if (rb_key == cfqq->rb_key) 1185 if (rb_key == cfqq->rb_key &&
1186 cfqq->service_tree == service_tree)
529 return; 1187 return;
530 1188
531 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 1189 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1190 cfqq->service_tree = NULL;
532 } 1191 }
533 1192
534 left = 1; 1193 left = 1;
535 parent = NULL; 1194 parent = NULL;
536 p = &cfqd->service_tree.rb.rb_node; 1195 cfqq->service_tree = service_tree;
1196 p = &service_tree->rb.rb_node;
537 while (*p) { 1197 while (*p) {
538 struct rb_node **n; 1198 struct rb_node **n;
539 1199
@@ -541,35 +1201,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
541 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1201 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
542 1202
543 /* 1203 /*
544 * sort RT queues first, we always want to give 1204 * sort by key, that represents service time.
545 * preference to them. IDLE queues goes to the back.
546 * after that, sort on the next service time.
547 */ 1205 */
548 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) 1206 if (time_before(rb_key, __cfqq->rb_key))
549 n = &(*p)->rb_left;
550 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
551 n = &(*p)->rb_right;
552 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
553 n = &(*p)->rb_left;
554 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
555 n = &(*p)->rb_right;
556 else if (time_before(rb_key, __cfqq->rb_key))
557 n = &(*p)->rb_left; 1207 n = &(*p)->rb_left;
558 else 1208 else {
559 n = &(*p)->rb_right; 1209 n = &(*p)->rb_right;
560
561 if (n == &(*p)->rb_right)
562 left = 0; 1210 left = 0;
1211 }
563 1212
564 p = n; 1213 p = n;
565 } 1214 }
566 1215
567 if (left) 1216 if (left)
568 cfqd->service_tree.left = &cfqq->rb_node; 1217 service_tree->left = &cfqq->rb_node;
569 1218
570 cfqq->rb_key = rb_key; 1219 cfqq->rb_key = rb_key;
571 rb_link_node(&cfqq->rb_node, parent, p); 1220 rb_link_node(&cfqq->rb_node, parent, p);
572 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 1221 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1222 service_tree->count++;
1223 if ((add_front || !new_cfqq) && !group_changed)
1224 return;
1225 cfq_group_service_tree_add(cfqd, cfqq->cfqg);
573} 1226}
574 1227
575static struct cfq_queue * 1228static struct cfq_queue *
@@ -671,13 +1324,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
671 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1324 BUG_ON(!cfq_cfqq_on_rr(cfqq));
672 cfq_clear_cfqq_on_rr(cfqq); 1325 cfq_clear_cfqq_on_rr(cfqq);
673 1326
674 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1327 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
675 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 1328 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1329 cfqq->service_tree = NULL;
1330 }
676 if (cfqq->p_root) { 1331 if (cfqq->p_root) {
677 rb_erase(&cfqq->p_node, cfqq->p_root); 1332 rb_erase(&cfqq->p_node, cfqq->p_root);
678 cfqq->p_root = NULL; 1333 cfqq->p_root = NULL;
679 } 1334 }
680 1335
1336 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
681 BUG_ON(!cfqd->busy_queues); 1337 BUG_ON(!cfqd->busy_queues);
682 cfqd->busy_queues--; 1338 cfqd->busy_queues--;
683} 1339}
@@ -688,7 +1344,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
688static void cfq_del_rq_rb(struct request *rq) 1344static void cfq_del_rq_rb(struct request *rq)
689{ 1345{
690 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1346 struct cfq_queue *cfqq = RQ_CFQQ(rq);
691 struct cfq_data *cfqd = cfqq->cfqd;
692 const int sync = rq_is_sync(rq); 1347 const int sync = rq_is_sync(rq);
693 1348
694 BUG_ON(!cfqq->queued[sync]); 1349 BUG_ON(!cfqq->queued[sync]);
@@ -696,8 +1351,17 @@ static void cfq_del_rq_rb(struct request *rq)
696 1351
697 elv_rb_del(&cfqq->sort_list, rq); 1352 elv_rb_del(&cfqq->sort_list, rq);
698 1353
699 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1354 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
700 cfq_del_cfqq_rr(cfqd, cfqq); 1355 /*
1356 * Queue will be deleted from service tree when we actually
1357 * expire it later. Right now just remove it from prio tree
1358 * as it is empty.
1359 */
1360 if (cfqq->p_root) {
1361 rb_erase(&cfqq->p_node, cfqq->p_root);
1362 cfqq->p_root = NULL;
1363 }
1364 }
701} 1365}
702 1366
703static void cfq_add_rq_rb(struct request *rq) 1367static void cfq_add_rq_rb(struct request *rq)
@@ -722,7 +1386,7 @@ static void cfq_add_rq_rb(struct request *rq)
722 * check if this request is a better next-serve candidate 1386 * check if this request is a better next-serve candidate
723 */ 1387 */
724 prev = cfqq->next_rq; 1388 prev = cfqq->next_rq;
725 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 1389 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
726 1390
727 /* 1391 /*
728 * adjust priority tree position, if ->next_rq changes 1392 * adjust priority tree position, if ->next_rq changes
@@ -829,6 +1493,7 @@ static void
829cfq_merged_requests(struct request_queue *q, struct request *rq, 1493cfq_merged_requests(struct request_queue *q, struct request *rq,
830 struct request *next) 1494 struct request *next)
831{ 1495{
1496 struct cfq_queue *cfqq = RQ_CFQQ(rq);
832 /* 1497 /*
833 * reposition in fifo if next is older than rq 1498 * reposition in fifo if next is older than rq
834 */ 1499 */
@@ -838,6 +1503,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
838 rq_set_fifo_time(rq, rq_fifo_time(next)); 1503 rq_set_fifo_time(rq, rq_fifo_time(next));
839 } 1504 }
840 1505
1506 if (cfqq->next_rq == next)
1507 cfqq->next_rq = rq;
841 cfq_remove_request(next); 1508 cfq_remove_request(next);
842} 1509}
843 1510
@@ -848,6 +1515,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
848 struct cfq_io_context *cic; 1515 struct cfq_io_context *cic;
849 struct cfq_queue *cfqq; 1516 struct cfq_queue *cfqq;
850 1517
1518 /* Deny merge if bio and rq don't belong to same cfq group */
1519 if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0))
1520 return false;
851 /* 1521 /*
852 * Disallow merge of a sync bio into an async request. 1522 * Disallow merge of a sync bio into an async request.
853 */ 1523 */
@@ -871,8 +1541,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
871{ 1541{
872 if (cfqq) { 1542 if (cfqq) {
873 cfq_log_cfqq(cfqd, cfqq, "set_active"); 1543 cfq_log_cfqq(cfqd, cfqq, "set_active");
1544 cfqq->slice_start = 0;
1545 cfqq->dispatch_start = jiffies;
1546 cfqq->allocated_slice = 0;
874 cfqq->slice_end = 0; 1547 cfqq->slice_end = 0;
875 cfqq->slice_dispatch = 0; 1548 cfqq->slice_dispatch = 0;
1549 cfqq->nr_sectors = 0;
876 1550
877 cfq_clear_cfqq_wait_request(cfqq); 1551 cfq_clear_cfqq_wait_request(cfqq);
878 cfq_clear_cfqq_must_dispatch(cfqq); 1552 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -899,6 +1573,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
899 del_timer(&cfqd->idle_slice_timer); 1573 del_timer(&cfqd->idle_slice_timer);
900 1574
901 cfq_clear_cfqq_wait_request(cfqq); 1575 cfq_clear_cfqq_wait_request(cfqq);
1576 cfq_clear_cfqq_wait_busy(cfqq);
1577 cfq_clear_cfqq_wait_busy_done(cfqq);
902 1578
903 /* 1579 /*
904 * store what was left of this slice, if the queue idled/timed out 1580 * store what was left of this slice, if the queue idled/timed out
@@ -908,11 +1584,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
908 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1584 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
909 } 1585 }
910 1586
1587 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1588
1589 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1590 cfq_del_cfqq_rr(cfqd, cfqq);
1591
911 cfq_resort_rr_list(cfqd, cfqq); 1592 cfq_resort_rr_list(cfqd, cfqq);
912 1593
913 if (cfqq == cfqd->active_queue) 1594 if (cfqq == cfqd->active_queue)
914 cfqd->active_queue = NULL; 1595 cfqd->active_queue = NULL;
915 1596
1597 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1598 cfqd->grp_service_tree.active = NULL;
1599
916 if (cfqd->active_cic) { 1600 if (cfqd->active_cic) {
917 put_io_context(cfqd->active_cic->ioc); 1601 put_io_context(cfqd->active_cic->ioc);
918 cfqd->active_cic = NULL; 1602 cfqd->active_cic = NULL;
@@ -933,10 +1617,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
933 */ 1617 */
934static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1618static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
935{ 1619{
936 if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) 1620 struct cfq_rb_root *service_tree =
1621 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1622 cfqd->serving_type, cfqd);
1623
1624 if (!cfqd->rq_queued)
937 return NULL; 1625 return NULL;
938 1626
939 return cfq_rb_first(&cfqd->service_tree); 1627 /* There is nothing to dispatch */
1628 if (!service_tree)
1629 return NULL;
1630 if (RB_EMPTY_ROOT(&service_tree->rb))
1631 return NULL;
1632 return cfq_rb_first(service_tree);
1633}
1634
1635static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1636{
1637 struct cfq_group *cfqg;
1638 struct cfq_queue *cfqq;
1639 int i, j;
1640 struct cfq_rb_root *st;
1641
1642 if (!cfqd->rq_queued)
1643 return NULL;
1644
1645 cfqg = cfq_get_next_cfqg(cfqd);
1646 if (!cfqg)
1647 return NULL;
1648
1649 for_each_cfqg_st(cfqg, i, j, st)
1650 if ((cfqq = cfq_rb_first(st)) != NULL)
1651 return cfqq;
1652 return NULL;
940} 1653}
941 1654
942/* 1655/*
@@ -945,14 +1658,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
945static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1658static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
946 struct cfq_queue *cfqq) 1659 struct cfq_queue *cfqq)
947{ 1660{
948 if (!cfqq) { 1661 if (!cfqq)
949 cfqq = cfq_get_next_queue(cfqd); 1662 cfqq = cfq_get_next_queue(cfqd);
950 if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
951 cfq_clear_cfqq_coop(cfqq);
952 }
953
954 if (cfqq)
955 cfq_clear_cfqq_coop_preempt(cfqq);
956 1663
957 __cfq_set_active_queue(cfqd, cfqq); 1664 __cfq_set_active_queue(cfqd, cfqq);
958 return cfqq; 1665 return cfqq;
@@ -967,16 +1674,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
967 return cfqd->last_position - blk_rq_pos(rq); 1674 return cfqd->last_position - blk_rq_pos(rq);
968} 1675}
969 1676
970#define CIC_SEEK_THR 8 * 1024 1677#define CFQQ_SEEK_THR 8 * 1024
971#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) 1678#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
972 1679
973static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) 1680static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1681 struct request *rq)
974{ 1682{
975 struct cfq_io_context *cic = cfqd->active_cic; 1683 sector_t sdist = cfqq->seek_mean;
976 sector_t sdist = cic->seek_mean;
977 1684
978 if (!sample_valid(cic->seek_samples)) 1685 if (!sample_valid(cfqq->seek_samples))
979 sdist = CIC_SEEK_THR; 1686 sdist = CFQQ_SEEK_THR;
980 1687
981 return cfq_dist_from_last(cfqd, rq) <= sdist; 1688 return cfq_dist_from_last(cfqd, rq) <= sdist;
982} 1689}
@@ -1005,7 +1712,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1005 * will contain the closest sector. 1712 * will contain the closest sector.
1006 */ 1713 */
1007 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1714 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1008 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1715 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1009 return __cfqq; 1716 return __cfqq;
1010 1717
1011 if (blk_rq_pos(__cfqq->next_rq) < sector) 1718 if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1016,7 +1723,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1016 return NULL; 1723 return NULL;
1017 1724
1018 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1725 __cfqq = rb_entry(node, struct cfq_queue, p_node);
1019 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1726 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1020 return __cfqq; 1727 return __cfqq;
1021 1728
1022 return NULL; 1729 return NULL;
@@ -1033,16 +1740,13 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1033 * assumption. 1740 * assumption.
1034 */ 1741 */
1035static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1742static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1036 struct cfq_queue *cur_cfqq, 1743 struct cfq_queue *cur_cfqq)
1037 bool probe)
1038{ 1744{
1039 struct cfq_queue *cfqq; 1745 struct cfq_queue *cfqq;
1040 1746
1041 /* 1747 if (!cfq_cfqq_sync(cur_cfqq))
1042 * A valid cfq_io_context is necessary to compare requests against 1748 return NULL;
1043 * the seek_mean of the current cfqq. 1749 if (CFQQ_SEEKY(cur_cfqq))
1044 */
1045 if (!cfqd->active_cic)
1046 return NULL; 1750 return NULL;
1047 1751
1048 /* 1752 /*
@@ -1054,14 +1758,55 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1054 if (!cfqq) 1758 if (!cfqq)
1055 return NULL; 1759 return NULL;
1056 1760
1057 if (cfq_cfqq_coop(cfqq)) 1761 /* If new queue belongs to different cfq_group, don't choose it */
1762 if (cur_cfqq->cfqg != cfqq->cfqg)
1763 return NULL;
1764
1765 /*
1766 * It only makes sense to merge sync queues.
1767 */
1768 if (!cfq_cfqq_sync(cfqq))
1769 return NULL;
1770 if (CFQQ_SEEKY(cfqq))
1771 return NULL;
1772
1773 /*
1774 * Do not merge queues of different priority classes
1775 */
1776 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1058 return NULL; 1777 return NULL;
1059 1778
1060 if (!probe)
1061 cfq_mark_cfqq_coop(cfqq);
1062 return cfqq; 1779 return cfqq;
1063} 1780}
1064 1781
1782/*
1783 * Determine whether we should enforce idle window for this queue.
1784 */
1785
1786static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1787{
1788 enum wl_prio_t prio = cfqq_prio(cfqq);
1789 struct cfq_rb_root *service_tree = cfqq->service_tree;
1790
1791 BUG_ON(!service_tree);
1792 BUG_ON(!service_tree->count);
1793
1794 /* We never do for idle class queues. */
1795 if (prio == IDLE_WORKLOAD)
1796 return false;
1797
1798 /* We do for queues that were marked with idle window flag. */
1799 if (cfq_cfqq_idle_window(cfqq) &&
1800 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1801 return true;
1802
1803 /*
1804 * Otherwise, we do only if they are the last ones
1805 * in their service tree.
1806 */
1807 return service_tree->count == 1;
1808}
1809
1065static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1810static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1066{ 1811{
1067 struct cfq_queue *cfqq = cfqd->active_queue; 1812 struct cfq_queue *cfqq = cfqd->active_queue;
@@ -1082,13 +1827,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1082 /* 1827 /*
1083 * idle is disabled, either manually or by past process history 1828 * idle is disabled, either manually or by past process history
1084 */ 1829 */
1085 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 1830 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
1086 return; 1831 return;
1087 1832
1088 /* 1833 /*
1089 * still requests with the driver, don't idle 1834 * still active requests from this queue, don't idle
1090 */ 1835 */
1091 if (rq_in_driver(cfqd)) 1836 if (cfqq->dispatched)
1092 return; 1837 return;
1093 1838
1094 /* 1839 /*
@@ -1109,14 +1854,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1109 1854
1110 cfq_mark_cfqq_wait_request(cfqq); 1855 cfq_mark_cfqq_wait_request(cfqq);
1111 1856
1112 /*
1113 * we don't want to idle for seeks, but we do want to allow
1114 * fair distribution of slice time for a process doing back-to-back
1115 * seeks. so allow a little bit of time for him to submit a new rq
1116 */
1117 sl = cfqd->cfq_slice_idle; 1857 sl = cfqd->cfq_slice_idle;
1118 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
1119 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
1120 1858
1121 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1122 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1139,6 +1877,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1139 1877
1140 if (cfq_cfqq_sync(cfqq)) 1878 if (cfq_cfqq_sync(cfqq))
1141 cfqd->sync_flight++; 1879 cfqd->sync_flight++;
1880 cfqq->nr_sectors += blk_rq_sectors(rq);
1142} 1881}
1143 1882
1144/* 1883/*
@@ -1175,6 +1914,207 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1175} 1914}
1176 1915
1177/* 1916/*
1917 * Must be called with the queue_lock held.
1918 */
1919static int cfqq_process_refs(struct cfq_queue *cfqq)
1920{
1921 int process_refs, io_refs;
1922
1923 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1924 process_refs = atomic_read(&cfqq->ref) - io_refs;
1925 BUG_ON(process_refs < 0);
1926 return process_refs;
1927}
1928
1929static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1930{
1931 int process_refs, new_process_refs;
1932 struct cfq_queue *__cfqq;
1933
1934 /* Avoid a circular list and skip interim queue merges */
1935 while ((__cfqq = new_cfqq->new_cfqq)) {
1936 if (__cfqq == cfqq)
1937 return;
1938 new_cfqq = __cfqq;
1939 }
1940
1941 process_refs = cfqq_process_refs(cfqq);
1942 /*
1943 * If the process for the cfqq has gone away, there is no
1944 * sense in merging the queues.
1945 */
1946 if (process_refs == 0)
1947 return;
1948
1949 /*
1950 * Merge in the direction of the lesser amount of work.
1951 */
1952 new_process_refs = cfqq_process_refs(new_cfqq);
1953 if (new_process_refs >= process_refs) {
1954 cfqq->new_cfqq = new_cfqq;
1955 atomic_add(process_refs, &new_cfqq->ref);
1956 } else {
1957 new_cfqq->new_cfqq = cfqq;
1958 atomic_add(new_process_refs, &cfqq->ref);
1959 }
1960}
1961
1962static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
1963 struct cfq_group *cfqg, enum wl_prio_t prio,
1964 bool prio_changed)
1965{
1966 struct cfq_queue *queue;
1967 int i;
1968 bool key_valid = false;
1969 unsigned long lowest_key = 0;
1970 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
1971
1972 if (prio_changed) {
1973 /*
1974 * When priorities switched, we prefer starting
1975 * from SYNC_NOIDLE (first choice), or just SYNC
1976 * over ASYNC
1977 */
1978 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count)
1979 return cur_best;
1980 cur_best = SYNC_WORKLOAD;
1981 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count)
1982 return cur_best;
1983
1984 return ASYNC_WORKLOAD;
1985 }
1986
1987 for (i = 0; i < 3; ++i) {
1988 /* otherwise, select the one with lowest rb_key */
1989 queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd));
1990 if (queue &&
1991 (!key_valid || time_before(queue->rb_key, lowest_key))) {
1992 lowest_key = queue->rb_key;
1993 cur_best = i;
1994 key_valid = true;
1995 }
1996 }
1997
1998 return cur_best;
1999}
2000
2001static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2002{
2003 enum wl_prio_t previous_prio = cfqd->serving_prio;
2004 bool prio_changed;
2005 unsigned slice;
2006 unsigned count;
2007 struct cfq_rb_root *st;
2008 unsigned group_slice;
2009
2010 if (!cfqg) {
2011 cfqd->serving_prio = IDLE_WORKLOAD;
2012 cfqd->workload_expires = jiffies + 1;
2013 return;
2014 }
2015
2016 /* Choose next priority. RT > BE > IDLE */
2017 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2018 cfqd->serving_prio = RT_WORKLOAD;
2019 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2020 cfqd->serving_prio = BE_WORKLOAD;
2021 else {
2022 cfqd->serving_prio = IDLE_WORKLOAD;
2023 cfqd->workload_expires = jiffies + 1;
2024 return;
2025 }
2026
2027 /*
2028 * For RT and BE, we have to choose also the type
2029 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2030 * expiration time
2031 */
2032 prio_changed = (cfqd->serving_prio != previous_prio);
2033 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type,
2034 cfqd);
2035 count = st->count;
2036
2037 /*
2038 * If priority didn't change, check workload expiration,
2039 * and that we still have other queues ready
2040 */
2041 if (!prio_changed && count &&
2042 !time_after(jiffies, cfqd->workload_expires))
2043 return;
2044
2045 /* otherwise select new workload type */
2046 cfqd->serving_type =
2047 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed);
2048 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type,
2049 cfqd);
2050 count = st->count;
2051
2052 /*
2053 * the workload slice is computed as a fraction of target latency
2054 * proportional to the number of queues in that workload, over
2055 * all the queues in the same priority class
2056 */
2057 group_slice = cfq_group_slice(cfqd, cfqg);
2058
2059 slice = group_slice * count /
2060 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2061 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2062
2063 if (cfqd->serving_type == ASYNC_WORKLOAD) {
2064 unsigned int tmp;
2065
2066 /*
2067 * Async queues are currently system wide. Just taking
2068 * proportion of queues with-in same group will lead to higher
2069 * async ratio system wide as generally root group is going
2070 * to have higher weight. A more accurate thing would be to
1071 * calculate system wide async/sync ratio.
2072 */
2073 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2074 tmp = tmp/cfqd->busy_queues;
2075 slice = min_t(unsigned, slice, tmp);
2076
2077 /* async workload slice is scaled down according to
2078 * the sync/async slice ratio. */
2079 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2080 } else
2081 /* sync workload slice is at least 2 * cfq_slice_idle */
2082 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2083
2084 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2085 cfqd->workload_expires = jiffies + slice;
2086 cfqd->noidle_tree_requires_idle = false;
2087}
2088
2089static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2090{
2091 struct cfq_rb_root *st = &cfqd->grp_service_tree;
2092 struct cfq_group *cfqg;
2093
2094 if (RB_EMPTY_ROOT(&st->rb))
2095 return NULL;
2096 cfqg = cfq_rb_first_group(st);
2097 st->active = &cfqg->rb_node;
2098 update_min_vdisktime(st);
2099 return cfqg;
2100}
2101
2102static void cfq_choose_cfqg(struct cfq_data *cfqd)
2103{
2104 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2105
2106 cfqd->serving_group = cfqg;
2107
2108 /* Restore the workload type data */
2109 if (cfqg->saved_workload_slice) {
2110 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2111 cfqd->serving_type = cfqg->saved_workload;
2112 cfqd->serving_prio = cfqg->saved_serving_prio;
2113 }
2114 choose_service_tree(cfqd, cfqg);
2115}
2116
2117/*
1178 * Select a queue for service. If we have a current active queue, 2118 * Select a queue for service. If we have a current active queue,
1179 * check whether to continue servicing it, or retrieve and set a new one. 2119 * check whether to continue servicing it, or retrieve and set a new one.
1180 */ 2120 */
@@ -1186,10 +2126,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1186 if (!cfqq) 2126 if (!cfqq)
1187 goto new_queue; 2127 goto new_queue;
1188 2128
2129 if (!cfqd->rq_queued)
2130 return NULL;
1189 /* 2131 /*
1190 * The active queue has run out of time, expire it and select new. 2132 * The active queue has run out of time, expire it and select new.
1191 */ 2133 */
1192 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 2134 if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq))
2135 && !cfq_cfqq_must_dispatch(cfqq))
1193 goto expire; 2136 goto expire;
1194 2137
1195 /* 2138 /*
@@ -1203,11 +2146,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1203 * If another queue has a request waiting within our mean seek 2146 * If another queue has a request waiting within our mean seek
1204 * distance, let it run. The expire code will check for close 2147 * distance, let it run. The expire code will check for close
1205 * cooperators and put the close queue at the front of the service 2148 * cooperators and put the close queue at the front of the service
1206 * tree. 2149 * tree. If possible, merge the expiring queue with the new cfqq.
1207 */ 2150 */
1208 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 2151 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
1209 if (new_cfqq) 2152 if (new_cfqq) {
2153 if (!cfqq->new_cfqq)
2154 cfq_setup_merge(cfqq, new_cfqq);
1210 goto expire; 2155 goto expire;
2156 }
1211 2157
1212 /* 2158 /*
1213 * No requests pending. If the active queue still has requests in 2159 * No requests pending. If the active queue still has requests in
@@ -1215,7 +2161,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1215 * conditions to happen (or time out) before selecting a new queue. 2161 * conditions to happen (or time out) before selecting a new queue.
1216 */ 2162 */
1217 if (timer_pending(&cfqd->idle_slice_timer) || 2163 if (timer_pending(&cfqd->idle_slice_timer) ||
1218 (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { 2164 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
1219 cfqq = NULL; 2165 cfqq = NULL;
1220 goto keep_queue; 2166 goto keep_queue;
1221 } 2167 }
@@ -1223,6 +2169,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1223expire: 2169expire:
1224 cfq_slice_expired(cfqd, 0); 2170 cfq_slice_expired(cfqd, 0);
1225new_queue: 2171new_queue:
2172 /*
2173 * Current queue expired. Check if we have to switch to a new
2174 * service tree
2175 */
2176 if (!new_cfqq)
2177 cfq_choose_cfqg(cfqd);
2178
1226 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2179 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1227keep_queue: 2180keep_queue:
1228 return cfqq; 2181 return cfqq;
@@ -1238,6 +2191,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1238 } 2191 }
1239 2192
1240 BUG_ON(!list_empty(&cfqq->fifo)); 2193 BUG_ON(!list_empty(&cfqq->fifo));
2194
2195 /* By default cfqq is not expired if it is empty. Do it explicitly */
2196 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
1241 return dispatched; 2197 return dispatched;
1242} 2198}
1243 2199
@@ -1250,11 +2206,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
1250 struct cfq_queue *cfqq; 2206 struct cfq_queue *cfqq;
1251 int dispatched = 0; 2207 int dispatched = 0;
1252 2208
1253 while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) 2209 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
1254 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2210 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1255 2211
1256 cfq_slice_expired(cfqd, 0); 2212 cfq_slice_expired(cfqd, 0);
1257
1258 BUG_ON(cfqd->busy_queues); 2213 BUG_ON(cfqd->busy_queues);
1259 2214
1260 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2215 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
@@ -1268,7 +2223,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1268 /* 2223 /*
1269 * Drain async requests before we start sync IO 2224 * Drain async requests before we start sync IO
1270 */ 2225 */
1271 if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 2226 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
1272 return false; 2227 return false;
1273 2228
1274 /* 2229 /*
@@ -1298,9 +2253,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1298 return false; 2253 return false;
1299 2254
1300 /* 2255 /*
1301 * Sole queue user, allow bigger slice 2256 * Sole queue user, no limit
1302 */ 2257 */
1303 max_dispatch *= 4; 2258 max_dispatch = -1;
1304 } 2259 }
1305 2260
1306 /* 2261 /*
@@ -1407,11 +2362,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
1407 * task holds one reference to the queue, dropped when task exits. each rq 2362 * task holds one reference to the queue, dropped when task exits. each rq
1408 * in-flight on this queue also holds a reference, dropped when rq is freed. 2363 * in-flight on this queue also holds a reference, dropped when rq is freed.
1409 * 2364 *
2365 * Each cfq queue took a reference on the parent group. Drop it now.
1410 * queue lock must be held here. 2366 * queue lock must be held here.
1411 */ 2367 */
1412static void cfq_put_queue(struct cfq_queue *cfqq) 2368static void cfq_put_queue(struct cfq_queue *cfqq)
1413{ 2369{
1414 struct cfq_data *cfqd = cfqq->cfqd; 2370 struct cfq_data *cfqd = cfqq->cfqd;
2371 struct cfq_group *cfqg, *orig_cfqg;
1415 2372
1416 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2373 BUG_ON(atomic_read(&cfqq->ref) <= 0);
1417 2374
@@ -1421,14 +2378,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
1421 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2378 cfq_log_cfqq(cfqd, cfqq, "put_queue");
1422 BUG_ON(rb_first(&cfqq->sort_list)); 2379 BUG_ON(rb_first(&cfqq->sort_list));
1423 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2380 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1424 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2381 cfqg = cfqq->cfqg;
2382 orig_cfqg = cfqq->orig_cfqg;
1425 2383
1426 if (unlikely(cfqd->active_queue == cfqq)) { 2384 if (unlikely(cfqd->active_queue == cfqq)) {
1427 __cfq_slice_expired(cfqd, cfqq, 0); 2385 __cfq_slice_expired(cfqd, cfqq, 0);
1428 cfq_schedule_dispatch(cfqd); 2386 cfq_schedule_dispatch(cfqd);
1429 } 2387 }
1430 2388
2389 BUG_ON(cfq_cfqq_on_rr(cfqq));
1431 kmem_cache_free(cfq_pool, cfqq); 2390 kmem_cache_free(cfq_pool, cfqq);
2391 cfq_put_cfqg(cfqg);
2392 if (orig_cfqg)
2393 cfq_put_cfqg(orig_cfqg);
1432} 2394}
1433 2395
1434/* 2396/*
@@ -1518,11 +2480,29 @@ static void cfq_free_io_context(struct io_context *ioc)
1518 2480
1519static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2481static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1520{ 2482{
2483 struct cfq_queue *__cfqq, *next;
2484
1521 if (unlikely(cfqq == cfqd->active_queue)) { 2485 if (unlikely(cfqq == cfqd->active_queue)) {
1522 __cfq_slice_expired(cfqd, cfqq, 0); 2486 __cfq_slice_expired(cfqd, cfqq, 0);
1523 cfq_schedule_dispatch(cfqd); 2487 cfq_schedule_dispatch(cfqd);
1524 } 2488 }
1525 2489
2490 /*
2491 * If this queue was scheduled to merge with another queue, be
2492 * sure to drop the reference taken on that queue (and others in
2493 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
2494 */
2495 __cfqq = cfqq->new_cfqq;
2496 while (__cfqq) {
2497 if (__cfqq == cfqq) {
2498 WARN(1, "cfqq->new_cfqq loop detected\n");
2499 break;
2500 }
2501 next = __cfqq->new_cfqq;
2502 cfq_put_queue(__cfqq);
2503 __cfqq = next;
2504 }
2505
1526 cfq_put_queue(cfqq); 2506 cfq_put_queue(cfqq);
1527} 2507}
1528 2508
@@ -1703,14 +2683,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1703 cfqq->pid = pid; 2683 cfqq->pid = pid;
1704} 2684}
1705 2685
2686#ifdef CONFIG_CFQ_GROUP_IOSCHED
2687static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2688{
2689 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2690 struct cfq_data *cfqd = cic->key;
2691 unsigned long flags;
2692 struct request_queue *q;
2693
2694 if (unlikely(!cfqd))
2695 return;
2696
2697 q = cfqd->queue;
2698
2699 spin_lock_irqsave(q->queue_lock, flags);
2700
2701 if (sync_cfqq) {
2702 /*
2703 * Drop reference to sync queue. A new sync queue will be
2704 * assigned in new group upon arrival of a fresh request.
2705 */
2706 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2707 cic_set_cfqq(cic, NULL, 1);
2708 cfq_put_queue(sync_cfqq);
2709 }
2710
2711 spin_unlock_irqrestore(q->queue_lock, flags);
2712}
2713
2714static void cfq_ioc_set_cgroup(struct io_context *ioc)
2715{
2716 call_for_each_cic(ioc, changed_cgroup);
2717 ioc->cgroup_changed = 0;
2718}
2719#endif /* CONFIG_CFQ_GROUP_IOSCHED */
2720
1706static struct cfq_queue * 2721static struct cfq_queue *
1707cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2722cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
1708 struct io_context *ioc, gfp_t gfp_mask) 2723 struct io_context *ioc, gfp_t gfp_mask)
1709{ 2724{
1710 struct cfq_queue *cfqq, *new_cfqq = NULL; 2725 struct cfq_queue *cfqq, *new_cfqq = NULL;
1711 struct cfq_io_context *cic; 2726 struct cfq_io_context *cic;
2727 struct cfq_group *cfqg;
1712 2728
1713retry: 2729retry:
2730 cfqg = cfq_get_cfqg(cfqd, 1);
1714 cic = cfq_cic_lookup(cfqd, ioc); 2731 cic = cfq_cic_lookup(cfqd, ioc);
1715 /* cic always exists here */ 2732 /* cic always exists here */
1716 cfqq = cic_to_cfqq(cic, is_sync); 2733 cfqq = cic_to_cfqq(cic, is_sync);
@@ -1741,6 +2758,7 @@ retry:
1741 if (cfqq) { 2758 if (cfqq) {
1742 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2759 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
1743 cfq_init_prio_data(cfqq, ioc); 2760 cfq_init_prio_data(cfqq, ioc);
2761 cfq_link_cfqq_cfqg(cfqq, cfqg);
1744 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2762 cfq_log_cfqq(cfqd, cfqq, "alloced");
1745 } else 2763 } else
1746 cfqq = &cfqd->oom_cfqq; 2764 cfqq = &cfqd->oom_cfqq;
@@ -1932,6 +2950,10 @@ out:
1932 if (unlikely(ioc->ioprio_changed)) 2950 if (unlikely(ioc->ioprio_changed))
1933 cfq_ioc_set_ioprio(ioc); 2951 cfq_ioc_set_ioprio(ioc);
1934 2952
2953#ifdef CONFIG_CFQ_GROUP_IOSCHED
2954 if (unlikely(ioc->cgroup_changed))
2955 cfq_ioc_set_cgroup(ioc);
2956#endif
1935 return cic; 2957 return cic;
1936err_free: 2958err_free:
1937 cfq_cic_free(cic); 2959 cfq_cic_free(cic);
@@ -1952,33 +2974,46 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1952} 2974}
1953 2975
1954static void 2976static void
1955cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 2977cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1956 struct request *rq) 2978 struct request *rq)
1957{ 2979{
1958 sector_t sdist; 2980 sector_t sdist;
1959 u64 total; 2981 u64 total;
1960 2982
1961 if (!cic->last_request_pos) 2983 if (!cfqq->last_request_pos)
1962 sdist = 0; 2984 sdist = 0;
1963 else if (cic->last_request_pos < blk_rq_pos(rq)) 2985 else if (cfqq->last_request_pos < blk_rq_pos(rq))
1964 sdist = blk_rq_pos(rq) - cic->last_request_pos; 2986 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
1965 else 2987 else
1966 sdist = cic->last_request_pos - blk_rq_pos(rq); 2988 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
1967 2989
1968 /* 2990 /*
1969 * Don't allow the seek distance to get too large from the 2991 * Don't allow the seek distance to get too large from the
1970 * odd fragment, pagein, etc 2992 * odd fragment, pagein, etc
1971 */ 2993 */
1972 if (cic->seek_samples <= 60) /* second&third seek */ 2994 if (cfqq->seek_samples <= 60) /* second&third seek */
1973 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 2995 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
1974 else 2996 else
1975 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 2997 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
1976 2998
1977 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 2999 cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
1978 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 3000 cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
1979 total = cic->seek_total + (cic->seek_samples/2); 3001 total = cfqq->seek_total + (cfqq->seek_samples/2);
1980 do_div(total, cic->seek_samples); 3002 do_div(total, cfqq->seek_samples);
1981 cic->seek_mean = (sector_t)total; 3003 cfqq->seek_mean = (sector_t)total;
3004
3005 /*
3006 * If this cfqq is shared between multiple processes, check to
3007 * make sure that those processes are still issuing I/Os within
3008 * the mean seek distance. If not, it may be time to break the
3009 * queues apart again.
3010 */
3011 if (cfq_cfqq_coop(cfqq)) {
3012 if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
3013 cfqq->seeky_start = jiffies;
3014 else if (!CFQQ_SEEKY(cfqq))
3015 cfqq->seeky_start = 0;
3016 }
1982} 3017}
1983 3018
1984/* 3019/*
@@ -1999,14 +3034,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1999 3034
2000 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3035 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
2001 3036
3037 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3038 cfq_mark_cfqq_deep(cfqq);
3039
2002 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3040 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
2003 (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) 3041 (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
3042 && CFQQ_SEEKY(cfqq)))
2004 enable_idle = 0; 3043 enable_idle = 0;
2005 else if (sample_valid(cic->ttime_samples)) { 3044 else if (sample_valid(cic->ttime_samples)) {
2006 unsigned int slice_idle = cfqd->cfq_slice_idle; 3045 if (cic->ttime_mean > cfqd->cfq_slice_idle)
2007 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
2008 slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
2009 if (cic->ttime_mean > slice_idle)
2010 enable_idle = 0; 3046 enable_idle = 0;
2011 else 3047 else
2012 enable_idle = 1; 3048 enable_idle = 1;
@@ -2035,9 +3071,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2035 if (!cfqq) 3071 if (!cfqq)
2036 return false; 3072 return false;
2037 3073
2038 if (cfq_slice_used(cfqq))
2039 return true;
2040
2041 if (cfq_class_idle(new_cfqq)) 3074 if (cfq_class_idle(new_cfqq))
2042 return false; 3075 return false;
2043 3076
@@ -2051,6 +3084,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2051 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3084 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
2052 return true; 3085 return true;
2053 3086
3087 if (new_cfqq->cfqg != cfqq->cfqg)
3088 return false;
3089
3090 if (cfq_slice_used(cfqq))
3091 return true;
3092
3093 /* Allow preemption only if we are idling on sync-noidle tree */
3094 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3095 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3096 new_cfqq->service_tree->count == 2 &&
3097 RB_EMPTY_ROOT(&cfqq->sort_list))
3098 return true;
3099
2054 /* 3100 /*
2055 * So both queues are sync. Let the new request get disk time if 3101 * So both queues are sync. Let the new request get disk time if
2056 * it's a metadata request and the current queue is doing regular IO. 3102 * it's a metadata request and the current queue is doing regular IO.
@@ -2071,16 +3117,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2071 * if this request is as-good as one we would expect from the 3117 * if this request is as-good as one we would expect from the
2072 * current cfqq, let it preempt 3118 * current cfqq, let it preempt
2073 */ 3119 */
2074 if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) || 3120 if (cfq_rq_close(cfqd, cfqq, rq))
2075 cfqd->busy_queues == 1)) {
2076 /*
2077 * Mark new queue coop_preempt, so its coop flag will not be
2078 * cleared when new queue gets scheduled at the very first time
2079 */
2080 cfq_mark_cfqq_coop_preempt(new_cfqq);
2081 cfq_mark_cfqq_coop(new_cfqq);
2082 return true; 3121 return true;
2083 }
2084 3122
2085 return false; 3123 return false;
2086} 3124}
@@ -2121,12 +3159,16 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2121 cfqq->meta_pending++; 3159 cfqq->meta_pending++;
2122 3160
2123 cfq_update_io_thinktime(cfqd, cic); 3161 cfq_update_io_thinktime(cfqd, cic);
2124 cfq_update_io_seektime(cfqd, cic, rq); 3162 cfq_update_io_seektime(cfqd, cfqq, rq);
2125 cfq_update_idle_window(cfqd, cfqq, cic); 3163 cfq_update_idle_window(cfqd, cfqq, cic);
2126 3164
2127 cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3165 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2128 3166
2129 if (cfqq == cfqd->active_queue) { 3167 if (cfqq == cfqd->active_queue) {
3168 if (cfq_cfqq_wait_busy(cfqq)) {
3169 cfq_clear_cfqq_wait_busy(cfqq);
3170 cfq_mark_cfqq_wait_busy_done(cfqq);
3171 }
2130 /* 3172 /*
2131 * Remember that we saw a request from this process, but 3173 * Remember that we saw a request from this process, but
2132 * don't start queuing just yet. Otherwise we risk seeing lots 3174 * don't start queuing just yet. Otherwise we risk seeing lots
@@ -2141,9 +3183,9 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2141 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3183 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2142 cfqd->busy_queues > 1) { 3184 cfqd->busy_queues > 1) {
2143 del_timer(&cfqd->idle_slice_timer); 3185 del_timer(&cfqd->idle_slice_timer);
2144 __blk_run_queue(cfqd->queue); 3186 __blk_run_queue(cfqd->queue);
2145 } 3187 } else
2146 cfq_mark_cfqq_must_dispatch(cfqq); 3188 cfq_mark_cfqq_must_dispatch(cfqq);
2147 } 3189 }
2148 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3190 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
2149 /* 3191 /*
@@ -2165,10 +3207,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2165 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3207 cfq_log_cfqq(cfqd, cfqq, "insert_request");
2166 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3208 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
2167 3209
2168 cfq_add_rq_rb(rq);
2169
2170 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3210 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
2171 list_add_tail(&rq->queuelist, &cfqq->fifo); 3211 list_add_tail(&rq->queuelist, &cfqq->fifo);
3212 cfq_add_rq_rb(rq);
2172 3213
2173 cfq_rq_enqueued(cfqd, cfqq, rq); 3214 cfq_rq_enqueued(cfqd, cfqq, rq);
2174} 3215}
@@ -2179,23 +3220,35 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2179 */ 3220 */
2180static void cfq_update_hw_tag(struct cfq_data *cfqd) 3221static void cfq_update_hw_tag(struct cfq_data *cfqd)
2181{ 3222{
2182 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) 3223 struct cfq_queue *cfqq = cfqd->active_queue;
2183 cfqd->rq_in_driver_peak = rq_in_driver(cfqd); 3224
3225 if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
3226 cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
3227
3228 if (cfqd->hw_tag == 1)
3229 return;
2184 3230
2185 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3231 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
2186 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 3232 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
2187 return; 3233 return;
2188 3234
3235 /*
3236 * If active queue hasn't enough requests and can idle, cfq might not
3237 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3238 * case
3239 */
3240 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3241 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3242 CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
3243 return;
3244
2189 if (cfqd->hw_tag_samples++ < 50) 3245 if (cfqd->hw_tag_samples++ < 50)
2190 return; 3246 return;
2191 3247
2192 if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) 3248 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
2193 cfqd->hw_tag = 1; 3249 cfqd->hw_tag = 1;
2194 else 3250 else
2195 cfqd->hw_tag = 0; 3251 cfqd->hw_tag = 0;
2196
2197 cfqd->hw_tag_samples = 0;
2198 cfqd->rq_in_driver_peak = 0;
2199} 3252}
2200 3253
2201static void cfq_completed_request(struct request_queue *q, struct request *rq) 3254static void cfq_completed_request(struct request_queue *q, struct request *rq)
@@ -2206,7 +3259,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2206 unsigned long now; 3259 unsigned long now;
2207 3260
2208 now = jiffies; 3261 now = jiffies;
2209 cfq_log_cfqq(cfqd, cfqq, "complete"); 3262 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
2210 3263
2211 cfq_update_hw_tag(cfqd); 3264 cfq_update_hw_tag(cfqd);
2212 3265
@@ -2234,18 +3287,40 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2234 cfq_set_prio_slice(cfqd, cfqq); 3287 cfq_set_prio_slice(cfqd, cfqq);
2235 cfq_clear_cfqq_slice_new(cfqq); 3288 cfq_clear_cfqq_slice_new(cfqq);
2236 } 3289 }
3290
3291 /*
3292 * If this queue consumed its slice and this is last queue
3293 * in the group, wait for next request before we expire
3294 * the queue
3295 */
3296 if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) {
3297 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3298 cfq_mark_cfqq_wait_busy(cfqq);
3299 }
3300
2237 /* 3301 /*
2238 * If there are no requests waiting in this queue, and 3302 * Idling is not enabled on:
2239 * there are other queues ready to issue requests, AND 3303 * - expired queues
2240 * those other queues are issuing requests within our 3304 * - idle-priority queues
2241 * mean seek distance, give them a chance to run instead 3305 * - async queues
2242 * of idling. 3306 * - queues with still some requests queued
3307 * - when there is a close cooperator
2243 */ 3308 */
2244 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3309 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2245 cfq_slice_expired(cfqd, 1); 3310 cfq_slice_expired(cfqd, 1);
2246 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 3311 else if (sync && cfqq_empty &&
2247 sync && !rq_noidle(rq)) 3312 !cfq_close_cooperator(cfqd, cfqq)) {
2248 cfq_arm_slice_timer(cfqd); 3313 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3314 /*
3315 * Idling is enabled for SYNC_WORKLOAD.
3316 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3317 * only if we processed at least one !rq_noidle request
3318 */
3319 if (cfqd->serving_type == SYNC_WORKLOAD
3320 || cfqd->noidle_tree_requires_idle
3321 || cfqq->cfqg->nr_cfqq == 1)
3322 cfq_arm_slice_timer(cfqd);
3323 }
2249 } 3324 }
2250 3325
2251 if (!rq_in_driver(cfqd)) 3326 if (!rq_in_driver(cfqd))
@@ -2269,12 +3344,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
2269 cfqq->ioprio = IOPRIO_NORM; 3344 cfqq->ioprio = IOPRIO_NORM;
2270 } else { 3345 } else {
2271 /* 3346 /*
2272 * check if we need to unboost the queue 3347 * unboost the queue (if needed)
2273 */ 3348 */
2274 if (cfqq->ioprio_class != cfqq->org_ioprio_class) 3349 cfqq->ioprio_class = cfqq->org_ioprio_class;
2275 cfqq->ioprio_class = cfqq->org_ioprio_class; 3350 cfqq->ioprio = cfqq->org_ioprio;
2276 if (cfqq->ioprio != cfqq->org_ioprio)
2277 cfqq->ioprio = cfqq->org_ioprio;
2278 } 3351 }
2279} 3352}
2280 3353
@@ -2338,6 +3411,43 @@ static void cfq_put_request(struct request *rq)
2338 } 3411 }
2339} 3412}
2340 3413
3414static struct cfq_queue *
3415cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3416 struct cfq_queue *cfqq)
3417{
3418 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3419 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3420 cfq_mark_cfqq_coop(cfqq->new_cfqq);
3421 cfq_put_queue(cfqq);
3422 return cic_to_cfqq(cic, 1);
3423}
3424
3425static int should_split_cfqq(struct cfq_queue *cfqq)
3426{
3427 if (cfqq->seeky_start &&
3428 time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
3429 return 1;
3430 return 0;
3431}
3432
3433/*
3434 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3435 * was the last process referring to said cfqq.
3436 */
3437static struct cfq_queue *
3438split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3439{
3440 if (cfqq_process_refs(cfqq) == 1) {
3441 cfqq->seeky_start = 0;
3442 cfqq->pid = current->pid;
3443 cfq_clear_cfqq_coop(cfqq);
3444 return cfqq;
3445 }
3446
3447 cic_set_cfqq(cic, NULL, 1);
3448 cfq_put_queue(cfqq);
3449 return NULL;
3450}
2341/* 3451/*
2342 * Allocate cfq data structures associated with this request. 3452 * Allocate cfq data structures associated with this request.
2343 */ 3453 */
@@ -2360,10 +3470,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2360 if (!cic) 3470 if (!cic)
2361 goto queue_fail; 3471 goto queue_fail;
2362 3472
3473new_queue:
2363 cfqq = cic_to_cfqq(cic, is_sync); 3474 cfqq = cic_to_cfqq(cic, is_sync);
2364 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3475 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2365 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3476 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2366 cic_set_cfqq(cic, cfqq, is_sync); 3477 cic_set_cfqq(cic, cfqq, is_sync);
3478 } else {
3479 /*
3480 * If the queue was seeky for too long, break it apart.
3481 */
3482 if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
3483 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3484 cfqq = split_cfqq(cic, cfqq);
3485 if (!cfqq)
3486 goto new_queue;
3487 }
3488
3489 /*
3490 * Check to see if this queue is scheduled to merge with
3491 * another, closely cooperating queue. The merging of
3492 * queues happens here as it must be done in process context.
3493 * The reference on new_cfqq was taken in merge_cfqqs.
3494 */
3495 if (cfqq->new_cfqq)
3496 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
2367 } 3497 }
2368 3498
2369 cfqq->allocated[rw]++; 3499 cfqq->allocated[rw]++;
@@ -2438,6 +3568,11 @@ static void cfq_idle_slice_timer(unsigned long data)
2438 */ 3568 */
2439 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3569 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2440 goto out_kick; 3570 goto out_kick;
3571
3572 /*
3573 * Queue depth flag is reset only when the idle didn't succeed
3574 */
3575 cfq_clear_cfqq_deep(cfqq);
2441 } 3576 }
2442expire: 3577expire:
2443 cfq_slice_expired(cfqd, timed_out); 3578 cfq_slice_expired(cfqd, timed_out);
@@ -2468,6 +3603,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
2468 cfq_put_queue(cfqd->async_idle_cfqq); 3603 cfq_put_queue(cfqd->async_idle_cfqq);
2469} 3604}
2470 3605
3606static void cfq_cfqd_free(struct rcu_head *head)
3607{
3608 kfree(container_of(head, struct cfq_data, rcu));
3609}
3610
2471static void cfq_exit_queue(struct elevator_queue *e) 3611static void cfq_exit_queue(struct elevator_queue *e)
2472{ 3612{
2473 struct cfq_data *cfqd = e->elevator_data; 3613 struct cfq_data *cfqd = e->elevator_data;
@@ -2489,25 +3629,49 @@ static void cfq_exit_queue(struct elevator_queue *e)
2489 } 3629 }
2490 3630
2491 cfq_put_async_queues(cfqd); 3631 cfq_put_async_queues(cfqd);
3632 cfq_release_cfq_groups(cfqd);
3633 blkiocg_del_blkio_group(&cfqd->root_group.blkg);
2492 3634
2493 spin_unlock_irq(q->queue_lock); 3635 spin_unlock_irq(q->queue_lock);
2494 3636
2495 cfq_shutdown_timer_wq(cfqd); 3637 cfq_shutdown_timer_wq(cfqd);
2496 3638
2497 kfree(cfqd); 3639 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3640 call_rcu(&cfqd->rcu, cfq_cfqd_free);
2498} 3641}
2499 3642
2500static void *cfq_init_queue(struct request_queue *q) 3643static void *cfq_init_queue(struct request_queue *q)
2501{ 3644{
2502 struct cfq_data *cfqd; 3645 struct cfq_data *cfqd;
2503 int i; 3646 int i, j;
3647 struct cfq_group *cfqg;
3648 struct cfq_rb_root *st;
2504 3649
2505 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3650 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2506 if (!cfqd) 3651 if (!cfqd)
2507 return NULL; 3652 return NULL;
2508 3653
2509 cfqd->service_tree = CFQ_RB_ROOT; 3654 /* Init root service tree */
3655 cfqd->grp_service_tree = CFQ_RB_ROOT;
3656
3657 /* Init root group */
3658 cfqg = &cfqd->root_group;
3659 for_each_cfqg_st(cfqg, i, j, st)
3660 *st = CFQ_RB_ROOT;
3661 RB_CLEAR_NODE(&cfqg->rb_node);
2510 3662
3663 /* Give preference to root group over other groups */
3664 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3665
3666#ifdef CONFIG_CFQ_GROUP_IOSCHED
3667 /*
3668 * Take a reference to root group which we never drop. This is just
3669 * to make sure that cfq_put_cfqg() does not try to kfree root group
3670 */
3671 atomic_set(&cfqg->ref, 1);
3672 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3673 0);
3674#endif
2511 /* 3675 /*
2512 * Not strictly needed (since RB_ROOT just clears the node and we 3676 * Not strictly needed (since RB_ROOT just clears the node and we
2513 * zeroed cfqd on alloc), but better be safe in case someone decides 3677 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -2523,6 +3687,7 @@ static void *cfq_init_queue(struct request_queue *q)
2523 */ 3687 */
2524 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3688 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
2525 atomic_inc(&cfqd->oom_cfqq.ref); 3689 atomic_inc(&cfqd->oom_cfqq.ref);
3690 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
2526 3691
2527 INIT_LIST_HEAD(&cfqd->cic_list); 3692 INIT_LIST_HEAD(&cfqd->cic_list);
2528 3693
@@ -2544,8 +3709,10 @@ static void *cfq_init_queue(struct request_queue *q)
2544 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3709 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2545 cfqd->cfq_slice_idle = cfq_slice_idle; 3710 cfqd->cfq_slice_idle = cfq_slice_idle;
2546 cfqd->cfq_latency = 1; 3711 cfqd->cfq_latency = 1;
2547 cfqd->hw_tag = 1; 3712 cfqd->cfq_group_isolation = 0;
3713 cfqd->hw_tag = -1;
2548 cfqd->last_end_sync_rq = jiffies; 3714 cfqd->last_end_sync_rq = jiffies;
3715 INIT_RCU_HEAD(&cfqd->rcu);
2549 return cfqd; 3716 return cfqd;
2550} 3717}
2551 3718
@@ -2614,6 +3781,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2614SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3781SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2615SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3782SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2616SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3783SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3784SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
2617#undef SHOW_FUNCTION 3785#undef SHOW_FUNCTION
2618 3786
2619#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 3787#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -2646,6 +3814,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2646STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3814STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
2647 UINT_MAX, 0); 3815 UINT_MAX, 0);
2648STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 3816STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3817STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
2649#undef STORE_FUNCTION 3818#undef STORE_FUNCTION
2650 3819
2651#define CFQ_ATTR(name) \ 3820#define CFQ_ATTR(name) \
@@ -2662,6 +3831,7 @@ static struct elv_fs_entry cfq_attrs[] = {
2662 CFQ_ATTR(slice_async_rq), 3831 CFQ_ATTR(slice_async_rq),
2663 CFQ_ATTR(slice_idle), 3832 CFQ_ATTR(slice_idle),
2664 CFQ_ATTR(low_latency), 3833 CFQ_ATTR(low_latency),
3834 CFQ_ATTR(group_isolation),
2665 __ATTR_NULL 3835 __ATTR_NULL
2666}; 3836};
2667 3837
@@ -2691,6 +3861,17 @@ static struct elevator_type iosched_cfq = {
2691 .elevator_owner = THIS_MODULE, 3861 .elevator_owner = THIS_MODULE,
2692}; 3862};
2693 3863
3864#ifdef CONFIG_CFQ_GROUP_IOSCHED
3865static struct blkio_policy_type blkio_policy_cfq = {
3866 .ops = {
3867 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
3868 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
3869 },
3870};
3871#else
3872static struct blkio_policy_type blkio_policy_cfq;
3873#endif
3874
2694static int __init cfq_init(void) 3875static int __init cfq_init(void)
2695{ 3876{
2696 /* 3877 /*
@@ -2705,6 +3886,7 @@ static int __init cfq_init(void)
2705 return -ENOMEM; 3886 return -ENOMEM;
2706 3887
2707 elv_register(&iosched_cfq); 3888 elv_register(&iosched_cfq);
3889 blkio_policy_register(&blkio_policy_cfq);
2708 3890
2709 return 0; 3891 return 0;
2710} 3892}
@@ -2712,6 +3894,7 @@ static int __init cfq_init(void)
2712static void __exit cfq_exit(void) 3894static void __exit cfq_exit(void)
2713{ 3895{
2714 DECLARE_COMPLETION_ONSTACK(all_gone); 3896 DECLARE_COMPLETION_ONSTACK(all_gone);
3897 blkio_policy_unregister(&blkio_policy_cfq);
2715 elv_unregister(&iosched_cfq); 3898 elv_unregister(&iosched_cfq);
2716 ioc_gone = &all_gone; 3899 ioc_gone = &all_gone;
2717 /* ioc_gone's update must be visible before reading ioc_count */ 3900 /* ioc_gone's update must be visible before reading ioc_count */
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9bd086c1a4d5..4eb8e9ea4af5 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
747 return compat_put_uint(arg, bdev_io_opt(bdev)); 747 return compat_put_uint(arg, bdev_io_opt(bdev));
748 case BLKALIGNOFF: 748 case BLKALIGNOFF:
749 return compat_put_int(arg, bdev_alignment_offset(bdev)); 749 return compat_put_int(arg, bdev_alignment_offset(bdev));
750 case BLKDISCARDZEROES:
751 return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
750 case BLKFLSBUF: 752 case BLKFLSBUF:
751 case BLKROSET: 753 case BLKROSET:
752 case BLKDISCARD: 754 case BLKDISCARD:
diff --git a/block/elevator.c b/block/elevator.c
index a847046c6e53..9ad5ccc4c5ee 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
154 154
155 spin_unlock(&elv_list_lock); 155 spin_unlock(&elv_list_lock);
156 156
157 if (!strcmp(name, "anticipatory")) 157 sprintf(elv, "%s-iosched", name);
158 sprintf(elv, "as-iosched");
159 else
160 sprintf(elv, "%s-iosched", name);
161 158
162 request_module("%s", elv); 159 request_module("%s", elv);
163 spin_lock(&elv_list_lock); 160 spin_lock(&elv_list_lock);
@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
193 * Be backwards-compatible with previous kernels, so users 190 * Be backwards-compatible with previous kernels, so users
194 * won't get the wrong elevator. 191 * won't get the wrong elevator.
195 */ 192 */
196 if (!strcmp(str, "as")) 193 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
197 strcpy(chosen_elevator, "anticipatory");
198 else
199 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
200 return 1; 194 return 1;
201} 195}
202 196
diff --git a/block/genhd.c b/block/genhd.c
index 517e4332cb37..b11a4ad7d571 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); 861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
862} 862}
863 863
864static ssize_t disk_discard_alignment_show(struct device *dev,
865 struct device_attribute *attr,
866 char *buf)
867{
868 struct gendisk *disk = dev_to_disk(dev);
869
870 return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
871}
872
864static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); 873static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
865static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); 874static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
866static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); 875static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
867static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); 876static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
868static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 877static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
869static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); 878static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
879static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
880 NULL);
870static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); 881static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
871static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 882static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
872static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 883static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
887 &dev_attr_ro.attr, 898 &dev_attr_ro.attr,
888 &dev_attr_size.attr, 899 &dev_attr_size.attr,
889 &dev_attr_alignment_offset.attr, 900 &dev_attr_alignment_offset.attr,
901 &dev_attr_discard_alignment.attr,
890 &dev_attr_capability.attr, 902 &dev_attr_capability.attr,
891 &dev_attr_stat.attr, 903 &dev_attr_stat.attr,
892 &dev_attr_inflight.attr, 904 &dev_attr_inflight.attr,
diff --git a/block/ioctl.c b/block/ioctl.c
index 1f4d1de12b09..be48ea51faee 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
280 return put_uint(arg, bdev_io_opt(bdev)); 280 return put_uint(arg, bdev_io_opt(bdev));
281 case BLKALIGNOFF: 281 case BLKALIGNOFF:
282 return put_int(arg, bdev_alignment_offset(bdev)); 282 return put_int(arg, bdev_alignment_offset(bdev));
283 case BLKDISCARDZEROES:
284 return put_uint(arg, bdev_discard_zeroes_data(bdev));
283 case BLKSECTGET: 285 case BLKSECTGET:
284 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 286 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
285 case BLKRASET: 287 case BLKRASET:
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index e5b10017a50b..a8b5a10eb5b0 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -35,7 +35,9 @@
35struct blk_cmd_filter { 35struct blk_cmd_filter {
36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; 36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; 37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
38} blk_default_cmd_filter; 38};
39
40static struct blk_cmd_filter blk_default_cmd_filter;
39 41
40/* Command group 3 is reserved and should never be used. */ 42/* Command group 3 is reserved and should never be used. */
41const unsigned char scsi_command_size_tbl[8] = 43const unsigned char scsi_command_size_tbl[8] =
@@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
675} 677}
676EXPORT_SYMBOL(scsi_cmd_ioctl); 678EXPORT_SYMBOL(scsi_cmd_ioctl);
677 679
678int __init blk_scsi_ioctl_init(void) 680static int __init blk_scsi_ioctl_init(void)
679{ 681{
680 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 682 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
681 return 0; 683 return 0;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
271 instead, which can be configured to be on-disk compatible with the 271 instead, which can be configured to be on-disk compatible with the
272 cryptoloop device. 272 cryptoloop device.
273 273
274source "drivers/block/drbd/Kconfig"
275
274config BLK_DEV_NBD 276config BLK_DEV_NBD
275 tristate "Network block device support" 277 tristate "Network block device support"
276 depends on NET 278 depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
36obj-$(CONFIG_BLK_DEV_HD) += hd.o 36obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
39 40
40swim_mod-objs := swim.o swim_asm.o 41swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 92b126394fa1..873e594860d3 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
179static int deregister_disk(ctlr_info_t *h, int drv_index, 179static int deregister_disk(ctlr_info_t *h, int drv_index,
180 int clear_all, int via_ioctl); 180 int clear_all, int via_ioctl);
181 181
182static void cciss_read_capacity(int ctlr, int logvol, int withirq, 182static void cciss_read_capacity(int ctlr, int logvol,
183 sector_t *total_size, unsigned int *block_size); 183 sector_t *total_size, unsigned int *block_size);
184static void cciss_read_capacity_16(int ctlr, int logvol, int withirq, 184static void cciss_read_capacity_16(int ctlr, int logvol,
185 sector_t *total_size, unsigned int *block_size); 185 sector_t *total_size, unsigned int *block_size);
186static void cciss_geometry_inquiry(int ctlr, int logvol, 186static void cciss_geometry_inquiry(int ctlr, int logvol,
187 int withirq, sector_t total_size, 187 sector_t total_size,
188 unsigned int block_size, InquiryData_struct *inq_buff, 188 unsigned int block_size, InquiryData_struct *inq_buff,
189 drive_info_struct *drv); 189 drive_info_struct *drv);
190static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, 190static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
191 __u32); 191 __u32);
192static void start_io(ctlr_info_t *h); 192static void start_io(ctlr_info_t *h);
193static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
194 __u8 page_code, unsigned char *scsi3addr, int cmd_type);
195static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, 193static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
196 __u8 page_code, unsigned char scsi3addr[], 194 __u8 page_code, unsigned char scsi3addr[],
197 int cmd_type); 195 int cmd_type);
@@ -424,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
424 if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { 422 if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
425 struct seq_file *seq = file->private_data; 423 struct seq_file *seq = file->private_data;
426 ctlr_info_t *h = seq->private; 424 ctlr_info_t *h = seq->private;
427 int rc;
428 425
429 rc = cciss_engage_scsi(h->ctlr); 426 err = cciss_engage_scsi(h->ctlr);
430 if (rc != 0) 427 if (err == 0)
431 err = -rc;
432 else
433 err = length; 428 err = length;
434 } else 429 } else
435#endif /* CONFIG_CISS_SCSI_TAPE */ 430#endif /* CONFIG_CISS_SCSI_TAPE */
@@ -1657,9 +1652,11 @@ static void cciss_softirq_done(struct request *rq)
1657{ 1652{
1658 CommandList_struct *cmd = rq->completion_data; 1653 CommandList_struct *cmd = rq->completion_data;
1659 ctlr_info_t *h = hba[cmd->ctlr]; 1654 ctlr_info_t *h = hba[cmd->ctlr];
1655 SGDescriptor_struct *curr_sg = cmd->SG;
1660 unsigned long flags; 1656 unsigned long flags;
1661 u64bit temp64; 1657 u64bit temp64;
1662 int i, ddir; 1658 int i, ddir;
1659 int sg_index = 0;
1663 1660
1664 if (cmd->Request.Type.Direction == XFER_READ) 1661 if (cmd->Request.Type.Direction == XFER_READ)
1665 ddir = PCI_DMA_FROMDEVICE; 1662 ddir = PCI_DMA_FROMDEVICE;
@@ -1669,9 +1666,22 @@ static void cciss_softirq_done(struct request *rq)
1669 /* command did not need to be retried */ 1666 /* command did not need to be retried */
1670 /* unmap the DMA mapping for all the scatter gather elements */ 1667 /* unmap the DMA mapping for all the scatter gather elements */
1671 for (i = 0; i < cmd->Header.SGList; i++) { 1668 for (i = 0; i < cmd->Header.SGList; i++) {
1672 temp64.val32.lower = cmd->SG[i].Addr.lower; 1669 if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
1673 temp64.val32.upper = cmd->SG[i].Addr.upper; 1670 temp64.val32.lower = cmd->SG[i].Addr.lower;
1674 pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); 1671 temp64.val32.upper = cmd->SG[i].Addr.upper;
1672 pci_dma_sync_single_for_cpu(h->pdev, temp64.val,
1673 cmd->SG[i].Len, ddir);
1674 pci_unmap_single(h->pdev, temp64.val,
1675 cmd->SG[i].Len, ddir);
1676 /* Point to the next block */
1677 curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain;
1678 sg_index = 0;
1679 }
1680 temp64.val32.lower = curr_sg[sg_index].Addr.lower;
1681 temp64.val32.upper = curr_sg[sg_index].Addr.upper;
1682 pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
1683 ddir);
1684 ++sg_index;
1675 } 1685 }
1676 1686
1677#ifdef CCISS_DEBUG 1687#ifdef CCISS_DEBUG
@@ -1701,7 +1711,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
1701 * via the inquiry page 0. Model, vendor, and rev are set to empty strings if 1711 * via the inquiry page 0. Model, vendor, and rev are set to empty strings if
1702 * they cannot be read. 1712 * they cannot be read.
1703 */ 1713 */
1704static void cciss_get_device_descr(int ctlr, int logvol, int withirq, 1714static void cciss_get_device_descr(int ctlr, int logvol,
1705 char *vendor, char *model, char *rev) 1715 char *vendor, char *model, char *rev)
1706{ 1716{
1707 int rc; 1717 int rc;
@@ -1717,14 +1727,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
1717 return; 1727 return;
1718 1728
1719 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 1729 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
1720 if (withirq) 1730 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0,
1721 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, 1731 scsi3addr, TYPE_CMD);
1722 sizeof(InquiryData_struct), 0,
1723 scsi3addr, TYPE_CMD);
1724 else
1725 rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
1726 sizeof(InquiryData_struct), 0,
1727 scsi3addr, TYPE_CMD);
1728 if (rc == IO_OK) { 1732 if (rc == IO_OK) {
1729 memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); 1733 memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
1730 vendor[VENDOR_LEN] = '\0'; 1734 vendor[VENDOR_LEN] = '\0';
@@ -1743,7 +1747,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
1743 * number cannot be had, for whatever reason, 16 bytes of 0xff 1747 * number cannot be had, for whatever reason, 16 bytes of 0xff
1744 * are returned instead. 1748 * are returned instead.
1745 */ 1749 */
1746static void cciss_get_serial_no(int ctlr, int logvol, int withirq, 1750static void cciss_get_serial_no(int ctlr, int logvol,
1747 unsigned char *serial_no, int buflen) 1751 unsigned char *serial_no, int buflen)
1748{ 1752{
1749#define PAGE_83_INQ_BYTES 64 1753#define PAGE_83_INQ_BYTES 64
@@ -1759,12 +1763,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
1759 return; 1763 return;
1760 memset(serial_no, 0, buflen); 1764 memset(serial_no, 0, buflen);
1761 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 1765 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
1762 if (withirq) 1766 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
1763 rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, 1767 PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
1764 PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
1765 else
1766 rc = sendcmd(CISS_INQUIRY, ctlr, buf,
1767 PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
1768 if (rc == IO_OK) 1768 if (rc == IO_OK)
1769 memcpy(serial_no, &buf[8], buflen); 1769 memcpy(serial_no, &buf[8], buflen);
1770 kfree(buf); 1770 kfree(buf);
@@ -1793,10 +1793,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
1793 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); 1793 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
1794 1794
1795 /* This is a hardware imposed limit. */ 1795 /* This is a hardware imposed limit. */
1796 blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES); 1796 blk_queue_max_hw_segments(disk->queue, h->maxsgentries);
1797 1797
1798 /* This is a limit in the driver and could be eliminated. */ 1798 /* This is a limit in the driver and could be eliminated. */
1799 blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES); 1799 blk_queue_max_phys_segments(disk->queue, h->maxsgentries);
1800 1800
1801 blk_queue_max_sectors(disk->queue, h->cciss_max_sectors); 1801 blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
1802 1802
@@ -1852,18 +1852,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
1852 1852
1853 /* testing to see if 16-byte CDBs are already being used */ 1853 /* testing to see if 16-byte CDBs are already being used */
1854 if (h->cciss_read == CCISS_READ_16) { 1854 if (h->cciss_read == CCISS_READ_16) {
1855 cciss_read_capacity_16(h->ctlr, drv_index, 1, 1855 cciss_read_capacity_16(h->ctlr, drv_index,
1856 &total_size, &block_size); 1856 &total_size, &block_size);
1857 1857
1858 } else { 1858 } else {
1859 cciss_read_capacity(ctlr, drv_index, 1, 1859 cciss_read_capacity(ctlr, drv_index, &total_size, &block_size);
1860 &total_size, &block_size);
1861
1862 /* if read_capacity returns all F's this volume is >2TB */ 1860 /* if read_capacity returns all F's this volume is >2TB */
1863 /* in size so we switch to 16-byte CDB's for all */ 1861 /* in size so we switch to 16-byte CDB's for all */
1864 /* read/write ops */ 1862 /* read/write ops */
1865 if (total_size == 0xFFFFFFFFULL) { 1863 if (total_size == 0xFFFFFFFFULL) {
1866 cciss_read_capacity_16(ctlr, drv_index, 1, 1864 cciss_read_capacity_16(ctlr, drv_index,
1867 &total_size, &block_size); 1865 &total_size, &block_size);
1868 h->cciss_read = CCISS_READ_16; 1866 h->cciss_read = CCISS_READ_16;
1869 h->cciss_write = CCISS_WRITE_16; 1867 h->cciss_write = CCISS_WRITE_16;
@@ -1873,14 +1871,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
1873 } 1871 }
1874 } 1872 }
1875 1873
1876 cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size, 1874 cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size,
1877 inq_buff, drvinfo); 1875 inq_buff, drvinfo);
1878 drvinfo->block_size = block_size; 1876 drvinfo->block_size = block_size;
1879 drvinfo->nr_blocks = total_size + 1; 1877 drvinfo->nr_blocks = total_size + 1;
1880 1878
1881 cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor, 1879 cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor,
1882 drvinfo->model, drvinfo->rev); 1880 drvinfo->model, drvinfo->rev);
1883 cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no, 1881 cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no,
1884 sizeof(drvinfo->serial_no)); 1882 sizeof(drvinfo->serial_no));
1885 /* Save the lunid in case we deregister the disk, below. */ 1883 /* Save the lunid in case we deregister the disk, below. */
1886 memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, 1884 memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
@@ -2531,6 +2529,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
2531 case 0: return IO_OK; /* no sense */ 2529 case 0: return IO_OK; /* no sense */
2532 case 1: return IO_OK; /* recovered error */ 2530 case 1: return IO_OK; /* recovered error */
2533 default: 2531 default:
2532 if (check_for_unit_attention(h, c))
2533 return IO_NEEDS_RETRY;
2534 printk(KERN_WARNING "cciss%d: cmd 0x%02x " 2534 printk(KERN_WARNING "cciss%d: cmd 0x%02x "
2535 "check condition, sense key = 0x%02x\n", 2535 "check condition, sense key = 0x%02x\n",
2536 h->ctlr, c->Request.CDB[0], 2536 h->ctlr, c->Request.CDB[0],
@@ -2672,7 +2672,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
2672} 2672}
2673 2673
2674static void cciss_geometry_inquiry(int ctlr, int logvol, 2674static void cciss_geometry_inquiry(int ctlr, int logvol,
2675 int withirq, sector_t total_size, 2675 sector_t total_size,
2676 unsigned int block_size, 2676 unsigned int block_size,
2677 InquiryData_struct *inq_buff, 2677 InquiryData_struct *inq_buff,
2678 drive_info_struct *drv) 2678 drive_info_struct *drv)
@@ -2683,14 +2683,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
2683 2683
2684 memset(inq_buff, 0, sizeof(InquiryData_struct)); 2684 memset(inq_buff, 0, sizeof(InquiryData_struct));
2685 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2685 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
2686 if (withirq) 2686 return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff,
2687 return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, 2687 sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
2688 inq_buff, sizeof(*inq_buff),
2689 0xC1, scsi3addr, TYPE_CMD);
2690 else
2691 return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
2692 sizeof(*inq_buff), 0xC1, scsi3addr,
2693 TYPE_CMD);
2694 if (return_code == IO_OK) { 2688 if (return_code == IO_OK) {
2695 if (inq_buff->data_byte[8] == 0xFF) { 2689 if (inq_buff->data_byte[8] == 0xFF) {
2696 printk(KERN_WARNING 2690 printk(KERN_WARNING
@@ -2723,7 +2717,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
2723} 2717}
2724 2718
2725static void 2719static void
2726cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, 2720cciss_read_capacity(int ctlr, int logvol, sector_t *total_size,
2727 unsigned int *block_size) 2721 unsigned int *block_size)
2728{ 2722{
2729 ReadCapdata_struct *buf; 2723 ReadCapdata_struct *buf;
@@ -2737,14 +2731,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
2737 } 2731 }
2738 2732
2739 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2733 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
2740 if (withirq) 2734 return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf,
2741 return_code = sendcmd_withirq(CCISS_READ_CAPACITY, 2735 sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
2742 ctlr, buf, sizeof(ReadCapdata_struct),
2743 0, scsi3addr, TYPE_CMD);
2744 else
2745 return_code = sendcmd(CCISS_READ_CAPACITY,
2746 ctlr, buf, sizeof(ReadCapdata_struct),
2747 0, scsi3addr, TYPE_CMD);
2748 if (return_code == IO_OK) { 2736 if (return_code == IO_OK) {
2749 *total_size = be32_to_cpu(*(__be32 *) buf->total_size); 2737 *total_size = be32_to_cpu(*(__be32 *) buf->total_size);
2750 *block_size = be32_to_cpu(*(__be32 *) buf->block_size); 2738 *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2756,8 +2744,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
2756 kfree(buf); 2744 kfree(buf);
2757} 2745}
2758 2746
2759static void 2747static void cciss_read_capacity_16(int ctlr, int logvol,
2760cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size) 2748 sector_t *total_size, unsigned int *block_size)
2761{ 2749{
2762 ReadCapdata_struct_16 *buf; 2750 ReadCapdata_struct_16 *buf;
2763 int return_code; 2751 int return_code;
@@ -2770,16 +2758,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
2770 } 2758 }
2771 2759
2772 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2760 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
2773 if (withirq) { 2761 return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
2774 return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, 2762 ctlr, buf, sizeof(ReadCapdata_struct_16),
2775 ctlr, buf, sizeof(ReadCapdata_struct_16), 2763 0, scsi3addr, TYPE_CMD);
2776 0, scsi3addr, TYPE_CMD);
2777 }
2778 else {
2779 return_code = sendcmd(CCISS_READ_CAPACITY_16,
2780 ctlr, buf, sizeof(ReadCapdata_struct_16),
2781 0, scsi3addr, TYPE_CMD);
2782 }
2783 if (return_code == IO_OK) { 2764 if (return_code == IO_OK) {
2784 *total_size = be64_to_cpu(*(__be64 *) buf->total_size); 2765 *total_size = be64_to_cpu(*(__be64 *) buf->total_size);
2785 *block_size = be32_to_cpu(*(__be32 *) buf->block_size); 2766 *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2820,13 +2801,13 @@ static int cciss_revalidate(struct gendisk *disk)
2820 return 1; 2801 return 1;
2821 } 2802 }
2822 if (h->cciss_read == CCISS_READ_10) { 2803 if (h->cciss_read == CCISS_READ_10) {
2823 cciss_read_capacity(h->ctlr, logvol, 1, 2804 cciss_read_capacity(h->ctlr, logvol,
2824 &total_size, &block_size); 2805 &total_size, &block_size);
2825 } else { 2806 } else {
2826 cciss_read_capacity_16(h->ctlr, logvol, 1, 2807 cciss_read_capacity_16(h->ctlr, logvol,
2827 &total_size, &block_size); 2808 &total_size, &block_size);
2828 } 2809 }
2829 cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, 2810 cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size,
2830 inq_buff, drv); 2811 inq_buff, drv);
2831 2812
2832 blk_queue_logical_block_size(drv->queue, drv->block_size); 2813 blk_queue_logical_block_size(drv->queue, drv->block_size);
@@ -2837,167 +2818,6 @@ static int cciss_revalidate(struct gendisk *disk)
2837} 2818}
2838 2819
2839/* 2820/*
2840 * Wait polling for a command to complete.
2841 * The memory mapped FIFO is polled for the completion.
2842 * Used only at init time, interrupts from the HBA are disabled.
2843 */
2844static unsigned long pollcomplete(int ctlr)
2845{
2846 unsigned long done;
2847 int i;
2848
2849 /* Wait (up to 20 seconds) for a command to complete */
2850
2851 for (i = 20 * HZ; i > 0; i--) {
2852 done = hba[ctlr]->access.command_completed(hba[ctlr]);
2853 if (done == FIFO_EMPTY)
2854 schedule_timeout_uninterruptible(1);
2855 else
2856 return done;
2857 }
2858 /* Invalid address to tell caller we ran out of time */
2859 return 1;
2860}
2861
2862/* Send command c to controller h and poll for it to complete.
2863 * Turns interrupts off on the board. Used at driver init time
2864 * and during SCSI error recovery.
2865 */
2866static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
2867{
2868 int i;
2869 unsigned long complete;
2870 int status = IO_ERROR;
2871 u64bit buff_dma_handle;
2872
2873resend_cmd1:
2874
2875 /* Disable interrupt on the board. */
2876 h->access.set_intr_mask(h, CCISS_INTR_OFF);
2877
2878 /* Make sure there is room in the command FIFO */
2879 /* Actually it should be completely empty at this time */
2880 /* unless we are in here doing error handling for the scsi */
2881 /* tape side of the driver. */
2882 for (i = 200000; i > 0; i--) {
2883 /* if fifo isn't full go */
2884 if (!(h->access.fifo_full(h)))
2885 break;
2886 udelay(10);
2887 printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
2888 " waiting!\n", h->ctlr);
2889 }
2890 h->access.submit_command(h, c); /* Send the cmd */
2891 do {
2892 complete = pollcomplete(h->ctlr);
2893
2894#ifdef CCISS_DEBUG
2895 printk(KERN_DEBUG "cciss: command completed\n");
2896#endif /* CCISS_DEBUG */
2897
2898 if (complete == 1) {
2899 printk(KERN_WARNING
2900 "cciss cciss%d: SendCmd Timeout out, "
2901 "No command list address returned!\n", h->ctlr);
2902 status = IO_ERROR;
2903 break;
2904 }
2905
2906 /* Make sure it's the command we're expecting. */
2907 if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
2908 printk(KERN_WARNING "cciss%d: Unexpected command "
2909 "completion.\n", h->ctlr);
2910 continue;
2911 }
2912
2913 /* It is our command. If no error, we're done. */
2914 if (!(complete & CISS_ERROR_BIT)) {
2915 status = IO_OK;
2916 break;
2917 }
2918
2919 /* There is an error... */
2920
2921 /* if data overrun or underun on Report command ignore it */
2922 if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
2923 (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
2924 (c->Request.CDB[0] == CISS_INQUIRY)) &&
2925 ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
2926 (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
2927 complete = c->busaddr;
2928 status = IO_OK;
2929 break;
2930 }
2931
2932 if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
2933 printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
2934 h->ctlr, c);
2935 if (c->retry_count < MAX_CMD_RETRIES) {
2936 printk(KERN_WARNING "cciss%d: retrying %p\n",
2937 h->ctlr, c);
2938 c->retry_count++;
2939 /* erase the old error information */
2940 memset(c->err_info, 0, sizeof(c->err_info));
2941 goto resend_cmd1;
2942 }
2943 printk(KERN_WARNING "cciss%d: retried %p too many "
2944 "times\n", h->ctlr, c);
2945 status = IO_ERROR;
2946 break;
2947 }
2948
2949 if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
2950 printk(KERN_WARNING "cciss%d: command could not be "
2951 "aborted.\n", h->ctlr);
2952 status = IO_ERROR;
2953 break;
2954 }
2955
2956 if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
2957 status = check_target_status(h, c);
2958 break;
2959 }
2960
2961 printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
2962 printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
2963 c->Request.CDB[0], c->err_info->CommandStatus);
2964 status = IO_ERROR;
2965 break;
2966
2967 } while (1);
2968
2969 /* unlock the data buffer from DMA */
2970 buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
2971 buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
2972 pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
2973 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
2974 return status;
2975}
2976
2977/*
2978 * Send a command to the controller, and wait for it to complete.
2979 * Used at init time, and during SCSI error recovery.
2980 */
2981static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
2982 __u8 page_code, unsigned char *scsi3addr, int cmd_type)
2983{
2984 CommandList_struct *c;
2985 int status;
2986
2987 c = cmd_alloc(hba[ctlr], 1);
2988 if (!c) {
2989 printk(KERN_WARNING "cciss: unable to get memory");
2990 return IO_ERROR;
2991 }
2992 status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
2993 scsi3addr, cmd_type);
2994 if (status == IO_OK)
2995 status = sendcmd_core(hba[ctlr], c);
2996 cmd_free(hba[ctlr], c, 1);
2997 return status;
2998}
2999
3000/*
3001 * Map (physical) PCI mem into (virtual) kernel space 2821 * Map (physical) PCI mem into (virtual) kernel space
3002 */ 2822 */
3003static void __iomem *remap_pci_mem(ulong base, ulong size) 2823static void __iomem *remap_pci_mem(ulong base, ulong size)
@@ -3255,9 +3075,13 @@ static void do_cciss_request(struct request_queue *q)
3255 int seg; 3075 int seg;
3256 struct request *creq; 3076 struct request *creq;
3257 u64bit temp64; 3077 u64bit temp64;
3258 struct scatterlist tmp_sg[MAXSGENTRIES]; 3078 struct scatterlist *tmp_sg;
3079 SGDescriptor_struct *curr_sg;
3259 drive_info_struct *drv; 3080 drive_info_struct *drv;
3260 int i, dir; 3081 int i, dir;
3082 int nseg = 0;
3083 int sg_index = 0;
3084 int chained = 0;
3261 3085
3262 /* We call start_io here in case there is a command waiting on the 3086 /* We call start_io here in case there is a command waiting on the
3263 * queue that has not been sent. 3087 * queue that has not been sent.
@@ -3270,13 +3094,14 @@ static void do_cciss_request(struct request_queue *q)
3270 if (!creq) 3094 if (!creq)
3271 goto startio; 3095 goto startio;
3272 3096
3273 BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); 3097 BUG_ON(creq->nr_phys_segments > h->maxsgentries);
3274 3098
3275 if ((c = cmd_alloc(h, 1)) == NULL) 3099 if ((c = cmd_alloc(h, 1)) == NULL)
3276 goto full; 3100 goto full;
3277 3101
3278 blk_start_request(creq); 3102 blk_start_request(creq);
3279 3103
3104 tmp_sg = h->scatter_list[c->cmdindex];
3280 spin_unlock_irq(q->queue_lock); 3105 spin_unlock_irq(q->queue_lock);
3281 3106
3282 c->cmd_type = CMD_RWREQ; 3107 c->cmd_type = CMD_RWREQ;
@@ -3305,7 +3130,7 @@ static void do_cciss_request(struct request_queue *q)
3305 (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); 3130 (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
3306#endif /* CCISS_DEBUG */ 3131#endif /* CCISS_DEBUG */
3307 3132
3308 sg_init_table(tmp_sg, MAXSGENTRIES); 3133 sg_init_table(tmp_sg, h->maxsgentries);
3309 seg = blk_rq_map_sg(q, creq, tmp_sg); 3134 seg = blk_rq_map_sg(q, creq, tmp_sg);
3310 3135
3311 /* get the DMA records for the setup */ 3136 /* get the DMA records for the setup */
@@ -3314,25 +3139,70 @@ static void do_cciss_request(struct request_queue *q)
3314 else 3139 else
3315 dir = PCI_DMA_TODEVICE; 3140 dir = PCI_DMA_TODEVICE;
3316 3141
3142 curr_sg = c->SG;
3143 sg_index = 0;
3144 chained = 0;
3145
3317 for (i = 0; i < seg; i++) { 3146 for (i = 0; i < seg; i++) {
3318 c->SG[i].Len = tmp_sg[i].length; 3147 if (((sg_index+1) == (h->max_cmd_sgentries)) &&
3148 !chained && ((seg - i) > 1)) {
3149 nseg = seg - i;
3150 curr_sg[sg_index].Len = (nseg) *
3151 sizeof(SGDescriptor_struct);
3152 curr_sg[sg_index].Ext = CCISS_SG_CHAIN;
3153
3154 /* Point to next chain block. */
3155 curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain;
3156 sg_index = 0;
3157 chained = 1;
3158 }
3159 curr_sg[sg_index].Len = tmp_sg[i].length;
3319 temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), 3160 temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
3320 tmp_sg[i].offset, 3161 tmp_sg[i].offset,
3321 tmp_sg[i].length, dir); 3162 tmp_sg[i].length, dir);
3322 c->SG[i].Addr.lower = temp64.val32.lower; 3163 curr_sg[sg_index].Addr.lower = temp64.val32.lower;
3323 c->SG[i].Addr.upper = temp64.val32.upper; 3164 curr_sg[sg_index].Addr.upper = temp64.val32.upper;
3324 c->SG[i].Ext = 0; // we are not chaining 3165 curr_sg[sg_index].Ext = 0; /* we are not chaining */
3166
3167 ++sg_index;
3168 }
3169
3170 if (chained) {
3171 int len;
3172 curr_sg = c->SG;
3173 sg_index = h->max_cmd_sgentries - 1;
3174 len = curr_sg[sg_index].Len;
3175 /* Setup pointer to next chain block.
3176 * Fill out last element in current chain
3177 * block with address of next chain block.
3178 */
3179 temp64.val = pci_map_single(h->pdev,
3180 h->cmd_sg_list[c->cmdindex]->sgchain,
3181 len, dir);
3182
3183 h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val;
3184 curr_sg[sg_index].Addr.lower = temp64.val32.lower;
3185 curr_sg[sg_index].Addr.upper = temp64.val32.upper;
3186
3187 pci_dma_sync_single_for_device(h->pdev,
3188 h->cmd_sg_list[c->cmdindex]->sg_chain_dma,
3189 len, dir);
3325 } 3190 }
3191
3326 /* track how many SG entries we are using */ 3192 /* track how many SG entries we are using */
3327 if (seg > h->maxSG) 3193 if (seg > h->maxSG)
3328 h->maxSG = seg; 3194 h->maxSG = seg;
3329 3195
3330#ifdef CCISS_DEBUG 3196#ifdef CCISS_DEBUG
3331 printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", 3197 printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments "
3332 blk_rq_sectors(creq), seg); 3198 "chained[%d]\n",
3199 blk_rq_sectors(creq), seg, chained);
3333#endif /* CCISS_DEBUG */ 3200#endif /* CCISS_DEBUG */
3334 3201
3335 c->Header.SGList = c->Header.SGTotal = seg; 3202 c->Header.SGList = c->Header.SGTotal = seg + chained;
3203 if (seg > h->max_cmd_sgentries)
3204 c->Header.SGList = h->max_cmd_sgentries;
3205
3336 if (likely(blk_fs_request(creq))) { 3206 if (likely(blk_fs_request(creq))) {
3337 if(h->cciss_read == CCISS_READ_10) { 3207 if(h->cciss_read == CCISS_READ_10) {
3338 c->Request.CDB[1] = 0; 3208 c->Request.CDB[1] = 0;
@@ -3513,28 +3383,33 @@ static int add_to_scan_list(struct ctlr_info *h)
3513 * @h: Pointer to the controller. 3383 * @h: Pointer to the controller.
3514 * 3384 *
3515 * Removes the controller from the rescan queue if present. Blocks if 3385 * Removes the controller from the rescan queue if present. Blocks if
3516 * the controller is currently conducting a rescan. 3386 * the controller is currently conducting a rescan. The controller
3387 * can be in one of three states:
3388 * 1. Doesn't need a scan
3389 * 2. On the scan list, but not scanning yet (we remove it)
3390 * 3. Busy scanning (and not on the list). In this case we want to wait for
3391 * the scan to complete to make sure the scanning thread for this
3392 * controller is completely idle.
3517 **/ 3393 **/
3518static void remove_from_scan_list(struct ctlr_info *h) 3394static void remove_from_scan_list(struct ctlr_info *h)
3519{ 3395{
3520 struct ctlr_info *test_h, *tmp_h; 3396 struct ctlr_info *test_h, *tmp_h;
3521 int scanning = 0;
3522 3397
3523 mutex_lock(&scan_mutex); 3398 mutex_lock(&scan_mutex);
3524 list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { 3399 list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
3525 if (test_h == h) { 3400 if (test_h == h) { /* state 2. */
3526 list_del(&h->scan_list); 3401 list_del(&h->scan_list);
3527 complete_all(&h->scan_wait); 3402 complete_all(&h->scan_wait);
3528 mutex_unlock(&scan_mutex); 3403 mutex_unlock(&scan_mutex);
3529 return; 3404 return;
3530 } 3405 }
3531 } 3406 }
3532 if (&h->busy_scanning) 3407 if (h->busy_scanning) { /* state 3. */
3533 scanning = 0; 3408 mutex_unlock(&scan_mutex);
3534 mutex_unlock(&scan_mutex);
3535
3536 if (scanning)
3537 wait_for_completion(&h->scan_wait); 3409 wait_for_completion(&h->scan_wait);
3410 } else { /* state 1, nothing to do. */
3411 mutex_unlock(&scan_mutex);
3412 }
3538} 3413}
3539 3414
3540/** 3415/**
@@ -3573,13 +3448,11 @@ static int scan_thread(void *data)
3573 h->busy_scanning = 1; 3448 h->busy_scanning = 1;
3574 mutex_unlock(&scan_mutex); 3449 mutex_unlock(&scan_mutex);
3575 3450
3576 if (h) { 3451 rebuild_lun_table(h, 0, 0);
3577 rebuild_lun_table(h, 0, 0); 3452 complete_all(&h->scan_wait);
3578 complete_all(&h->scan_wait); 3453 mutex_lock(&scan_mutex);
3579 mutex_lock(&scan_mutex); 3454 h->busy_scanning = 0;
3580 h->busy_scanning = 0; 3455 mutex_unlock(&scan_mutex);
3581 mutex_unlock(&scan_mutex);
3582 }
3583 } 3456 }
3584 } 3457 }
3585 3458
@@ -3605,8 +3478,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
3605 case REPORT_LUNS_CHANGED: 3478 case REPORT_LUNS_CHANGED:
3606 printk(KERN_WARNING "cciss%d: report LUN data " 3479 printk(KERN_WARNING "cciss%d: report LUN data "
3607 "changed\n", h->ctlr); 3480 "changed\n", h->ctlr);
3608 add_to_scan_list(h); 3481 /*
3609 wake_up_process(cciss_scan_thread); 3482 * Here, we could call add_to_scan_list and wake up the scan thread,
3483 * except that it's quite likely that we will get more than one
3484 * REPORT_LUNS_CHANGED condition in quick succession, which means
3485 * that those which occur after the first one will likely happen
3486 * *during* the scan_thread's rescan. And the rescan code is not
3487 * robust enough to restart in the middle, undoing what it has already
3488 * done, and it's not clear that it's even possible to do this, since
3489 * part of what it does is notify the block layer, which starts
3490 * doing its own i/o to read partition tables and so on, and the
3491 * driver doesn't have visibility to know what might need undoing.
3492 * In any event, if possible, it is horribly complicated to get right
3493 * so we just don't do it for now.
3494 *
3495 * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
3496 */
3610 return 1; 3497 return 1;
3611 break; 3498 break;
3612 case POWER_OR_RESET: 3499 case POWER_OR_RESET:
@@ -3888,6 +3775,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
3888 * leave a little room for ioctl calls. 3775 * leave a little room for ioctl calls.
3889 */ 3776 */
3890 c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); 3777 c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
3778 c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
3779
3780 /*
3781 * Limit native command to 32 s/g elements to save dma'able memory.
3782 * However spec says if 0, use 31
3783 */
3784
3785 c->max_cmd_sgentries = 31;
3786 if (c->maxsgentries > 512) {
3787 c->max_cmd_sgentries = 32;
3788 c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1;
3789 c->maxsgentries -= 1; /* account for chain pointer */
3790 } else {
3791 c->maxsgentries = 31; /* Default to traditional value */
3792 c->chainsize = 0; /* traditional */
3793 }
3794
3891 c->product_name = products[prod_index].product_name; 3795 c->product_name = products[prod_index].product_name;
3892 c->access = *(products[prod_index].access); 3796 c->access = *(products[prod_index].access);
3893 c->nr_cmds = c->max_commands - 4; 3797 c->nr_cmds = c->max_commands - 4;
@@ -4214,6 +4118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4214{ 4118{
4215 int i; 4119 int i;
4216 int j = 0; 4120 int j = 0;
4121 int k = 0;
4217 int rc; 4122 int rc;
4218 int dac, return_code; 4123 int dac, return_code;
4219 InquiryData_struct *inq_buff; 4124 InquiryData_struct *inq_buff;
@@ -4317,6 +4222,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4317 printk(KERN_ERR "cciss: out of memory"); 4222 printk(KERN_ERR "cciss: out of memory");
4318 goto clean4; 4223 goto clean4;
4319 } 4224 }
4225
4226 /* Need space for temp scatter list */
4227 hba[i]->scatter_list = kmalloc(hba[i]->max_commands *
4228 sizeof(struct scatterlist *),
4229 GFP_KERNEL);
4230 for (k = 0; k < hba[i]->nr_cmds; k++) {
4231 hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
4232 hba[i]->maxsgentries,
4233 GFP_KERNEL);
4234 if (hba[i]->scatter_list[k] == NULL) {
4235 printk(KERN_ERR "cciss%d: could not allocate "
4236 "s/g lists\n", i);
4237 goto clean4;
4238 }
4239 }
4240 hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) *
4241 hba[i]->nr_cmds,
4242 GFP_KERNEL);
4243 if (!hba[i]->cmd_sg_list) {
4244 printk(KERN_ERR "cciss%d: Cannot get memory for "
4245 "s/g chaining.\n", i);
4246 goto clean4;
4247 }
4248 /* Build up chain blocks for each command */
4249 if (hba[i]->chainsize > 0) {
4250 for (j = 0; j < hba[i]->nr_cmds; j++) {
4251 hba[i]->cmd_sg_list[j] =
4252 kmalloc(sizeof(struct Cmd_sg_list),
4253 GFP_KERNEL);
4254 if (!hba[i]->cmd_sg_list[j]) {
4255 printk(KERN_ERR "cciss%d: Cannot get memory "
4256 "for chain block.\n", i);
4257 goto clean4;
4258 }
4259 /* Need a block of chainsized s/g elements. */
4260 hba[i]->cmd_sg_list[j]->sgchain =
4261 kmalloc((hba[i]->chainsize *
4262 sizeof(SGDescriptor_struct)),
4263 GFP_KERNEL);
4264 if (!hba[i]->cmd_sg_list[j]->sgchain) {
4265 printk(KERN_ERR "cciss%d: Cannot get memory "
4266 "for s/g chains\n", i);
4267 goto clean4;
4268 }
4269 }
4270 }
4271
4320 spin_lock_init(&hba[i]->lock); 4272 spin_lock_init(&hba[i]->lock);
4321 4273
4322 /* Initialize the pdev driver private data. 4274 /* Initialize the pdev driver private data.
@@ -4362,7 +4314,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4362 4314
4363 cciss_procinit(i); 4315 cciss_procinit(i);
4364 4316
4365 hba[i]->cciss_max_sectors = 2048; 4317 hba[i]->cciss_max_sectors = 8192;
4366 4318
4367 rebuild_lun_table(hba[i], 1, 0); 4319 rebuild_lun_table(hba[i], 1, 0);
4368 hba[i]->busy_initializing = 0; 4320 hba[i]->busy_initializing = 0;
@@ -4370,6 +4322,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4370 4322
4371clean4: 4323clean4:
4372 kfree(hba[i]->cmd_pool_bits); 4324 kfree(hba[i]->cmd_pool_bits);
4325 /* Free up sg elements */
4326 for (k = 0; k < hba[i]->nr_cmds; k++)
4327 kfree(hba[i]->scatter_list[k]);
4328 kfree(hba[i]->scatter_list);
4329 /* Only free up extra s/g lists if controller supports them */
4330 if (hba[i]->chainsize > 0) {
4331 for (j = 0; j < hba[i]->nr_cmds; j++) {
4332 if (hba[i]->cmd_sg_list[j]) {
4333 kfree(hba[i]->cmd_sg_list[j]->sgchain);
4334 kfree(hba[i]->cmd_sg_list[j]);
4335 }
4336 }
4337 kfree(hba[i]->cmd_sg_list);
4338 }
4373 if (hba[i]->cmd_pool) 4339 if (hba[i]->cmd_pool)
4374 pci_free_consistent(hba[i]->pdev, 4340 pci_free_consistent(hba[i]->pdev,
4375 hba[i]->nr_cmds * sizeof(CommandList_struct), 4341 hba[i]->nr_cmds * sizeof(CommandList_struct),
@@ -4400,30 +4366,28 @@ clean_no_release_regions:
4400 4366
4401static void cciss_shutdown(struct pci_dev *pdev) 4367static void cciss_shutdown(struct pci_dev *pdev)
4402{ 4368{
4403 ctlr_info_t *tmp_ptr; 4369 ctlr_info_t *h;
4404 int i; 4370 char *flush_buf;
4405 char flush_buf[4];
4406 int return_code; 4371 int return_code;
4407 4372
4408 tmp_ptr = pci_get_drvdata(pdev); 4373 h = pci_get_drvdata(pdev);
4409 if (tmp_ptr == NULL) 4374 flush_buf = kzalloc(4, GFP_KERNEL);
4410 return; 4375 if (!flush_buf) {
4411 i = tmp_ptr->ctlr; 4376 printk(KERN_WARNING
4412 if (hba[i] == NULL) 4377 "cciss:%d cache not flushed, out of memory.\n",
4378 h->ctlr);
4413 return; 4379 return;
4414
4415 /* Turn board interrupts off and send the flush cache command */
4416 /* sendcmd will turn off interrupt, and send the flush...
4417 * To write all data in the battery backed cache to disks */
4418 memset(flush_buf, 0, 4);
4419 return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
4420 CTLR_LUNID, TYPE_CMD);
4421 if (return_code == IO_OK) {
4422 printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
4423 } else {
4424 printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
4425 } 4380 }
4426 free_irq(hba[i]->intr[2], hba[i]); 4381 /* write all data in the battery backed cache to disk */
4382 memset(flush_buf, 0, 4);
4383 return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf,
4384 4, 0, CTLR_LUNID, TYPE_CMD);
4385 kfree(flush_buf);
4386 if (return_code != IO_OK)
4387 printk(KERN_WARNING "cciss%d: Error flushing cache\n",
4388 h->ctlr);
4389 h->access.set_intr_mask(h, CCISS_INTR_OFF);
4390 free_irq(h->intr[2], h);
4427} 4391}
4428 4392
4429static void __devexit cciss_remove_one(struct pci_dev *pdev) 4393static void __devexit cciss_remove_one(struct pci_dev *pdev)
@@ -4485,6 +4449,20 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
4485 pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), 4449 pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
4486 hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); 4450 hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
4487 kfree(hba[i]->cmd_pool_bits); 4451 kfree(hba[i]->cmd_pool_bits);
4452 /* Free up sg elements */
4453 for (j = 0; j < hba[i]->nr_cmds; j++)
4454 kfree(hba[i]->scatter_list[j]);
4455 kfree(hba[i]->scatter_list);
4456 /* Only free up extra s/g lists if controller supports them */
4457 if (hba[i]->chainsize > 0) {
4458 for (j = 0; j < hba[i]->nr_cmds; j++) {
4459 if (hba[i]->cmd_sg_list[j]) {
4460 kfree(hba[i]->cmd_sg_list[j]->sgchain);
4461 kfree(hba[i]->cmd_sg_list[j]);
4462 }
4463 }
4464 kfree(hba[i]->cmd_sg_list);
4465 }
4488 /* 4466 /*
4489 * Deliberately omit pci_disable_device(): it does something nasty to 4467 * Deliberately omit pci_disable_device(): it does something nasty to
4490 * Smart Array controllers that pci_enable_device does not undo 4468 * Smart Array controllers that pci_enable_device does not undo
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 31524cf42c77..1d95db254069 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -55,7 +55,13 @@ typedef struct _drive_info_struct
55 char device_initialized; /* indicates whether dev is initialized */ 55 char device_initialized; /* indicates whether dev is initialized */
56} drive_info_struct; 56} drive_info_struct;
57 57
58struct ctlr_info 58struct Cmd_sg_list {
59 SGDescriptor_struct *sgchain;
60 dma_addr_t sg_chain_dma;
61 int chain_block_size;
62};
63
64struct ctlr_info
59{ 65{
60 int ctlr; 66 int ctlr;
61 char devname[8]; 67 char devname[8];
@@ -75,6 +81,16 @@ struct ctlr_info
75 int num_luns; 81 int num_luns;
76 int highest_lun; 82 int highest_lun;
77 int usage_count; /* number of opens all all minor devices */ 83 int usage_count; /* number of opens all all minor devices */
84 /* Need space for temp sg list
85 * number of scatter/gathers supported
86 * number of scatter/gathers in chained block
87 */
88 struct scatterlist **scatter_list;
89 int maxsgentries;
90 int chainsize;
91 int max_cmd_sgentries;
92 struct Cmd_sg_list **cmd_sg_list;
93
78# define DOORBELL_INT 0 94# define DOORBELL_INT 0
79# define PERF_MODE_INT 1 95# define PERF_MODE_INT 1
80# define SIMPLE_MODE_INT 2 96# define SIMPLE_MODE_INT 2
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index dbaed1ea0da3..b50a9b261b85 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -7,7 +7,8 @@
7 7
8//general boundary defintions 8//general boundary defintions
9#define SENSEINFOBYTES 32//note that this value may vary between host implementations 9#define SENSEINFOBYTES 32//note that this value may vary between host implementations
10#define MAXSGENTRIES 31 10#define MAXSGENTRIES 32
11#define CCISS_SG_CHAIN 0x80000000
11#define MAXREPLYQS 256 12#define MAXREPLYQS 256
12 13
13//Command Status value 14//Command Status value
@@ -319,6 +320,10 @@ typedef struct _CfgTable_struct {
319 BYTE ServerName[16]; 320 BYTE ServerName[16];
320 DWORD HeartBeat; 321 DWORD HeartBeat;
321 DWORD SCSI_Prefetch; 322 DWORD SCSI_Prefetch;
323 DWORD MaxSGElements;
324 DWORD MaxLogicalUnits;
325 DWORD MaxPhysicalDrives;
326 DWORD MaxPhysicalDrivesPerLogicalUnit;
322} CfgTable_struct; 327} CfgTable_struct;
323#pragma pack() 328#pragma pack()
324#endif // CCISS_CMD_H 329#endif // CCISS_CMD_H
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 3315268b4ec7..5d0e46dc3632 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
755 cp, 755 cp,
756 ei->ScsiStatus); 756 ei->ScsiStatus);
757#endif 757#endif
758 cmd->result |= (ei->ScsiStatus < 1); 758 cmd->result |= (ei->ScsiStatus << 1);
759 } 759 }
760 else { /* scsi status is zero??? How??? */ 760 else { /* scsi status is zero??? How??? */
761 761
@@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr)
1547 if (sa->registered) { 1547 if (sa->registered) {
1548 printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); 1548 printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
1549 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1549 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
1550 return ENXIO; 1550 return -ENXIO;
1551 } 1551 }
1552 sa->registered = 1; 1552 sa->registered = 1;
1553 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1553 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000000..f4acd04ebeef
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,71 @@
1#
2# DRBD device driver configuration
3#
4
5comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
6 depends on !PROC_FS || !INET || !CONNECTOR
7
8config BLK_DEV_DRBD
9 tristate "DRBD Distributed Replicated Block Device support"
10 depends on PROC_FS && INET && CONNECTOR
11 select LRU_CACHE
12 default n
13 help
14
15 NOTE: In order to authenticate connections you have to select
16 CRYPTO_HMAC and a hash function as well.
17
18 DRBD is a shared-nothing, synchronously replicated block device. It
19 is designed to serve as a building block for high availability
20 clusters and in this context, is a "drop-in" replacement for shared
21 storage. Simplistically, you could see it as a network RAID 1.
22
23 Each minor device has a role, which can be 'primary' or 'secondary'.
24 On the node with the primary device the application is supposed to
25 run and to access the device (/dev/drbdX). Every write is sent to
26 the local 'lower level block device' and, across the network, to the
27 node with the device in 'secondary' state. The secondary device
28 simply writes the data to its lower level block device.
29
30 DRBD can also be used in dual-Primary mode (device writable on both
31 nodes), which means it can exhibit shared disk semantics in a
32 shared-nothing cluster. Needless to say, on top of dual-Primary
33 DRBD utilizing a cluster file system is necessary to maintain
34 cache coherency.
35
36 For automatic failover you need a cluster manager (e.g. heartbeat).
37 See also: http://www.drbd.org/, http://www.linux-ha.org
38
39 If unsure, say N.
40
41config DRBD_FAULT_INJECTION
42 bool "DRBD fault injection"
43 depends on BLK_DEV_DRBD
44 help
45
46 Say Y here if you want to simulate IO errors, in order to test DRBD's
47 behavior.
48
49 The actual simulation of IO errors is done by writing 3 values to
50 /sys/module/drbd/parameters/
51
52 enable_faults: bitmask of...
53 1 meta data write
54 2 meta data read
55 4 resync data write
56 8 resync data read
57 16 data write
58 32 data read
59 64 read ahead
60 128 kmalloc of bitmap
61 256 allocation of EE (epoch_entries)
62
63 fault_devs: bitmask of minor numbers
64 fault_rate: frequency in percent
65
66 Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
67 echo 16 > /sys/module/drbd/parameters/enable_faults
68 echo 1 > /sys/module/drbd/parameters/fault_devs
69 echo 5 > /sys/module/drbd/parameters/fault_rate
70
71 If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000000..0d3f337ff5ff
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,5 @@
1drbd-y := drbd_bitmap.o drbd_proc.o
2drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
3drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
4
5obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000000..17956ff6a08d
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1424 @@
1/*
2 drbd_actlog.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/slab.h>
27#include <linux/drbd.h>
28#include "drbd_int.h"
29#include "drbd_wrappers.h"
30
31/* We maintain a trivial check sum in our on disk activity log.
32 * With that we can ensure correct operation even when the storage
33 * device might do a partial (last) sector write while losing power.
34 */
/*
 * One activity log transaction as it is stored on disk. All fields are
 * kept big-endian: they are filled in through cpu_to_be32() and read back
 * through be32_to_cpu().
 */
35struct __packed al_transaction {
36 u32 magic; /* DRBD_MAGIC; checked when a transaction is read back */
37 u32 tr_number; /* transaction counter, taken from mdev->al_tr_number */
38 struct __packed {
39 u32 pos; /* index of the element inside the act_log LRU cache */
40 u32 extent; } updates[1 + AL_EXTENTS_PT]; /* [0]: the changed element, [1..]: cyclic window over the cache */
41 u32 xor_sum; /* XOR over the extent numbers of all updates[] slots */
42};
43
/*
 * Work item that pairs a drbd_work with an extent number.
 * NOTE(review): no user of this struct is visible in this part of the file;
 * judging by the name it describes an on-disk bitmap update -- confirm
 * against the code that queues it.
 */
44struct update_odbm_work {
45 struct drbd_work w;
46 unsigned int enr;
47};
48
/*
 * Request handed from drbd_al_begin_io() to the worker callback
 * w_al_write_transaction(). drbd_al_begin_io() keeps it on its own stack
 * and sleeps on @event until the callback has completed it.
 */
49struct update_al_work {
50 struct drbd_work w; /* embedded work item; w.cb is set to the worker callback */
51 struct lc_element *al_ext; /* LRU element that is being re-targeted */
52 struct completion event; /* completed by the worker callback when it is done */
53 unsigned int enr; /* extent number the element is changed to */
54 /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
55 unsigned int old_enr;
56};
57
/*
 * Shared bookkeeping for a batch of bitmap write bios. Each bio points to
 * it through bi_private, and atodb_endio() updates it as bios finish.
 */
58struct drbd_atodb_wait {
59 atomic_t count; /* bios still outstanding; incremented once per prepared bio */
60 struct completion io_done; /* completed by atodb_endio() when count reaches zero */
61 struct drbd_conf *mdev; /* device the bios belong to */
62 int error; /* first error reported by any bio, 0 if none */
63};
64
65
66int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
67
/*
 * _drbd_md_sync_page_io() - synchronously transfer one chunk of meta data
 * @mdev: DRBD device; its flags hold the MD_NO_BARRIER bit used below.
 * @bdev: backing device; the bio is directed at bdev->md_bdev.
 * @page: page that supplies (write) or receives (read) the data.
 * @sector: start sector on the meta data device.
 * @rw: request flags; the WRITE bit selects the direction.
 * @size: number of bytes of @page to transfer.
 *
 * Builds a one-segment bio, issues it and sleeps until md_io.event has been
 * completed (presumably by drbd_md_io_complete(), which is installed as the
 * end_io handler but defined elsewhere).
 *
 * Returns non-zero if the bio finished up to date and without error, and
 * zero otherwise, including when @page could not be added to the bio.
 */
68static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
69 struct drbd_backing_dev *bdev,
70 struct page *page, sector_t sector,
71 int rw, int size)
72{
73 struct bio *bio;
74 struct drbd_md_io md_io;
75 int ok;
76
77 md_io.mdev = mdev;
78 init_completion(&md_io.event);
79 md_io.error = 0;
80
	/* Writes go out as barrier requests unless MD_NO_BARRIER is already
	 * set; every request is additionally flagged unplug + sync. */
81 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
82 rw |= (1 << BIO_RW_BARRIER);
83 rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
84
85 retry:
86 bio = bio_alloc(GFP_NOIO, 1);
87 bio->bi_bdev = bdev->md_bdev;
88 bio->bi_sector = sector;
89 ok = (bio_add_page(bio, page, size, 0) == size);
90 if (!ok)
91 goto out;
92 bio->bi_private = &md_io;
93 bio->bi_end_io = drbd_md_io_complete;
94 bio->bi_rw = rw;
95
	/* Fault injection: when active for this direction, the bio is ended
	 * with -EIO instead of being submitted. */
96 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
97 bio_endio(bio, -EIO);
98 else
99 submit_bio(rw, bio);
100 wait_for_completion(&md_io.event);
101 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
102
103 /* check for unsupported barrier op.
104 * would rather check on EOPNOTSUPP, but that is not reliable.
105 * don't try again for ANY return value != 0 */
	/* Any failure of a barrier request is treated as "barriers are not
	 * supported": the flag is recorded in mdev->flags and the same
	 * request is repeated once more without the barrier bit. */
106 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
107 /* Try again with no barrier */
108 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
109 set_bit(MD_NO_BARRIER, &mdev->flags);
110 rw &= ~(1 << BIO_RW_BARRIER);
111 bio_put(bio);
112 goto retry;
113 }
114 out:
	/* Drops the bio allocated above on the success and the failure path. */
115 bio_put(bio);
116 return ok;
117}
118
/*
 * drbd_md_sync_page_io() - read or write one meta data sector synchronously
 * @mdev: DRBD device; mdev->md_io_page is the data buffer for the transfer.
 * @bdev: backing device whose md_bdev is accessed (must not be NULL).
 * @sector: meta data sector, in units of MD_SECTOR_SIZE.
 * @rw: READ or WRITE.
 *
 * The caller is expected to hold mdev->md_io_mutex (asserted below).
 *
 * When the logical block size of the device equals MD_SECTOR_SIZE the
 * transfer goes straight from/to md_io_page. Otherwise md_io_tmpp is used
 * as a bounce page holding one whole logical block: a write first reads
 * that block, patches the addressed sector and writes the block back; a
 * read fetches the block and copies the addressed sector to md_io_page.
 *
 * An access outside [drbd_md_first_sector, drbd_md_last_sector] is only
 * reported with dev_alert(); the I/O is still carried out.
 *
 * Returns zero on failure and non-zero on success.
 */
119int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
120 sector_t sector, int rw)
121{
122 int logical_block_size, mask, ok;
123 int offset = 0;
124 struct page *iop = mdev->md_io_page;
125
126 D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
127
128 BUG_ON(!bdev->md_bdev);
129
130 logical_block_size = bdev_logical_block_size(bdev->md_bdev);
131 if (logical_block_size == 0)
132 logical_block_size = MD_SECTOR_SIZE;
133
134 /* in case logical_block_size != 512 [ s390 only? ] */
135 if (logical_block_size != MD_SECTOR_SIZE) {
	/* mask selects the sector's position inside its logical block;
	 * sector is rounded down to the start of that block. */
136 mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
137 D_ASSERT(mask == 1 || mask == 3 || mask == 7);
138 D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
139 offset = sector & mask;
140 sector = sector & ~mask;
141 iop = mdev->md_io_tmpp;
142
143 if (rw & WRITE) {
144 /* these are GFP_KERNEL pages, pre-allocated
145 * on device initialization */
146 void *p = page_address(mdev->md_io_page);
147 void *hp = page_address(mdev->md_io_tmpp);
148
	/* read-modify-write: fetch the whole logical block first */
149 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
150 READ, logical_block_size);
151
152 if (unlikely(!ok)) {
153 dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
154 "READ [logical_block_size!=512]) failed!\n",
155 (unsigned long long)sector);
156 return 0;
157 }
158
	/* patch the one sector being written into the bounce page */
159 memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
160 }
161 }
162
163 if (sector < drbd_md_first_sector(bdev) ||
164 sector > drbd_md_last_sector(bdev))
165 dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
166 current->comm, current->pid, __func__,
167 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
168
169 ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
170 if (unlikely(!ok)) {
171 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
172 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
173 return 0;
174 }
175
	/* bounced read: hand the addressed sector back in md_io_page */
176 if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
177 void *p = page_address(mdev->md_io_page);
178 void *hp = page_address(mdev->md_io_tmpp);
179
180 memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
181 }
182
183 return ok;
184}
185
/*
 * _al_get() - try to obtain the activity log element for extent @enr
 * @mdev: DRBD device.
 * @enr: activity log extent number.
 *
 * Runs under mdev->al_lock. Returns NULL without touching the activity log
 * if the resync cache holds the bitmap extent covering @enr and that
 * extent has BME_NO_WRITES set. Otherwise returns the result of lc_get()
 * on the activity log. The visible caller uses this as a wait_event()
 * condition, so a NULL result makes it sleep and retry.
 */
186static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
187{
188 struct lc_element *al_ext;
189 struct lc_element *tmp;
190 unsigned long al_flags = 0;
191
192 spin_lock_irq(&mdev->al_lock);
193 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
194 if (unlikely(tmp != NULL)) {
195 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
196 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
197 spin_unlock_irq(&mdev->al_lock);
198 return NULL;
199 }
200 }
201 al_ext = lc_get(mdev->act_log, enr);
	/* al_flags is only consumed by the disabled diagnostics below */
202 al_flags = mdev->act_log->flags;
203 spin_unlock_irq(&mdev->al_lock);
204
205 /*
206 if (!al_ext) {
207 if (al_flags & LC_STARVING)
208 dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
209 if (al_flags & LC_DIRTY)
210 dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
211 }
212 */
213
214 return al_ext;
215}
216
/*
 * drbd_al_begin_io() - make sure the extent of @sector is in the activity log
 * @mdev: DRBD device; the caller is expected to hold a local_cnt reference
 *        (asserted below).
 * @sector: sector (512 byte units) about to be written.
 *
 * Sleeps on mdev->al_wait until _al_get() hands out an element. If that
 * element does not yet carry the wanted extent number, an activity log
 * transaction is required: it is queued to the worker thread and this
 * function blocks until the worker has completed it. Afterwards the change
 * is committed with lc_changed() and other waiters are woken.
 */
217void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
218{
219 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
220 struct lc_element *al_ext;
221 struct update_al_work al_work;
222
223 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
224
225 wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
226
227 if (al_ext->lc_number != enr) {
228 /* drbd_al_write_transaction(mdev,al_ext,enr);
229 * recurses into generic_make_request(), which
230 * disallows recursion, bios being serialized on the
231 * current->bio_tail list now.
232 * we have to delegate updates to the activity log
233 * to the worker thread. */
	/* al_work lives on this stack frame; that is fine because we do not
	 * leave this block before the worker has completed al_work.event. */
234 init_completion(&al_work.event);
235 al_work.al_ext = al_ext;
236 al_work.enr = enr;
237 al_work.old_enr = al_ext->lc_number;
238 al_work.w.cb = w_al_write_transaction;
239 drbd_queue_work_front(&mdev->data.work, &al_work.w);
240 wait_for_completion(&al_work.event);
241
242 mdev->al_writ_cnt++;
243
244 spin_lock_irq(&mdev->al_lock);
245 lc_changed(mdev->act_log, al_ext);
246 spin_unlock_irq(&mdev->al_lock);
247 wake_up(&mdev->al_wait);
248 }
249}
250
/*
 * drbd_al_complete_io() - drop the activity log reference of a finished write
 * @mdev: DRBD device.
 * @sector: sector (512 byte units) of the completed request.
 *
 * Looks up the extent of @sector in the activity log under al_lock (irqsave
 * variant) and puts one reference. When lc_put() returns 0, waiters on
 * mdev->al_wait are woken. If the extent is not in the activity log at
 * all, an error is logged and nothing else happens.
 */
251void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
252{
253 unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
254 struct lc_element *extent;
255 unsigned long flags;
256
257 spin_lock_irqsave(&mdev->al_lock, flags);
258
259 extent = lc_find(mdev->act_log, enr);
260
261 if (!extent) {
262 spin_unlock_irqrestore(&mdev->al_lock, flags);
263 dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
264 return;
265 }
266
267 if (lc_put(mdev->act_log, extent) == 0)
268 wake_up(&mdev->al_wait);
269
270 spin_unlock_irqrestore(&mdev->al_lock, flags);
271}
272
/*
 * w_al_write_transaction() - worker callback: write one AL transaction
 * @mdev: DRBD device.
 * @w: the drbd_work embedded in a struct update_al_work.
 * @unused: not looked at.
 *
 * Assembles an al_transaction in mdev->md_io_page and writes it to the
 * current slot of the on-disk activity log. Slot 0 of the transaction
 * records the element that changed; the remaining AL_EXTENTS_PT slots
 * carry a window of the LRU cache that advances with every transaction
 * (mdev->al_tr_cycle). The requester's completion is always signalled,
 * also when no local disk reference could be taken.
 *
 * Always returns 1.
 */
273int
274w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
275{
276 struct update_al_work *aw = container_of(w, struct update_al_work, w);
277 struct lc_element *updated = aw->al_ext;
278 const unsigned int new_enr = aw->enr;
279 const unsigned int evicted = aw->old_enr;
280 struct al_transaction *buffer;
281 sector_t sector;
282 int i, n, mx;
283 unsigned int extent_nr;
284 u32 xor_sum = 0;
285
286 if (!get_ldev(mdev)) {
287 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
	/* still release the waiter in drbd_al_begin_io() */
288 complete(&((struct update_al_work *)w)->event);
289 return 1;
290 }
291 /* do we have to do a bitmap write, first?
292 * TODO reduce maximum latency:
293 * submit both bios, then wait for both,
294 * instead of doing two synchronous sector writes. */
295 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
296 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
297
298 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
299 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
300
301 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
302 buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
303
304 n = lc_index_of(mdev->act_log, updated);
305
	/* slot 0: the element that is being changed, with its new extent */
306 buffer->updates[0].pos = cpu_to_be32(n);
307 buffer->updates[0].extent = cpu_to_be32(new_enr);
308
309 xor_sum ^= new_enr;
310
	/* slots 1..mx: current content of the cache elements starting at
	 * al_tr_cycle, limited to what is left before the end of the cache */
311 mx = min_t(int, AL_EXTENTS_PT,
312 mdev->act_log->nr_elements - mdev->al_tr_cycle);
313 for (i = 0; i < mx; i++) {
314 unsigned idx = mdev->al_tr_cycle + i;
315 extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
316 buffer->updates[i+1].pos = cpu_to_be32(idx);
317 buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
318 xor_sum ^= extent_nr;
319 }
	/* pad the remaining slots with "free" markers */
320 for (; i < AL_EXTENTS_PT; i++) {
321 buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
322 buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
323 xor_sum ^= LC_FREE;
324 }
	/* advance the window; start over once the end of the cache is passed */
325 mdev->al_tr_cycle += AL_EXTENTS_PT;
326 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
327 mdev->al_tr_cycle = 0;
328
329 buffer->xor_sum = cpu_to_be32(xor_sum);
330
331 sector = mdev->ldev->md.md_offset
332 + mdev->ldev->md.al_offset + mdev->al_tr_pos;
333
334 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
335 drbd_chk_io_error(mdev, 1, TRUE);
336
	/* next on-disk slot; positions 0..div_ceil(...) inclusive are used,
	 * which matches the "i <= mx" scan in drbd_al_read_log() */
337 if (++mdev->al_tr_pos >
338 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
339 mdev->al_tr_pos = 0;
340
341 D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
342 mdev->al_tr_number++;
343
344 mutex_unlock(&mdev->md_io_mutex);
345
346 complete(&((struct update_al_work *)w)->event);
347 put_ldev(mdev);
348
349 return 1;
350}
351
352/**
353 * drbd_al_read_tr() - Read a single transaction from the on disk activity log
354 * @mdev: DRBD device.
355 * @bdev: Block device to read from.
356 * @b: pointer to an al_transaction.
357 * @index: On disk slot of the transaction to read.
358 *
 * The data is read through drbd_md_sync_page_io(), which delivers it in
 * mdev->md_io_page; @b is only used to inspect the result. The visible
 * caller passes the address of that page as @b.
 *
359 * Returns -1 on IO error, 0 on magic or checksum mismatch and 1 upon success.
360 */
361static int drbd_al_read_tr(struct drbd_conf *mdev,
362 struct drbd_backing_dev *bdev,
363 struct al_transaction *b,
364 int index)
365{
366 sector_t sector;
367 int rv, i;
368 u32 xor_sum = 0;
369
370 sector = bdev->md.md_offset + bdev->md.al_offset + index;
371
372 /* Don't process error normally,
373 * as this is done before disk is attached! */
374 if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
375 return -1;
376
377 rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
378
	/* recompute the XOR over all slots and compare with the stored sum */
379 for (i = 0; i < AL_EXTENTS_PT + 1; i++)
380 xor_sum ^= be32_to_cpu(b->updates[i].extent);
381 rv &= (xor_sum == be32_to_cpu(b->xor_sum));
382
383 return rv;
384}
385
386/**
387 * drbd_al_read_log() - Restores the activity log from its on disk representation.
388 * @mdev: DRBD device.
389 * @bdev: Block device to read from.
390 *
 * Works in two passes while holding mdev->md_io_mutex. The first pass scans
 * all on-disk slots to find the oldest ("from") and newest ("to") valid
 * transaction. The second pass replays the transactions from "from" to
 * "to" into the in-memory activity log with lc_set(). Finally al_tr_number
 * and al_tr_pos are set up so that the next transaction continues after
 * the newest one found.
 *
391 * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
 * Finding no valid transaction at all is logged and also returns 1.
392 */
393int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
394{
395 struct al_transaction *buffer;
396 int i;
397 int rv;
398 int mx;
399 int active_extents = 0;
400 int transactions = 0;
401 int found_valid = 0;
402 int from = 0;
403 int to = 0;
404 u32 from_tnr = 0;
405 u32 to_tnr = 0;
406 u32 cnr;
407
408 mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
409
410 /* lock out all other meta data io for now,
411 * and make sure the page is mapped.
412 */
413 mutex_lock(&mdev->md_io_mutex);
414 buffer = page_address(mdev->md_io_page);
415
416 /* Find the valid transaction in the log */
417 for (i = 0; i <= mx; i++) {
418 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
419 if (rv == 0)
420 continue;
421 if (rv == -1) {
422 mutex_unlock(&mdev->md_io_mutex);
423 return 0;
424 }
425 cnr = be32_to_cpu(buffer->tr_number);
426
	/* the first valid slot initializes both ends of the range */
427 if (++found_valid == 1) {
428 from = i;
429 to = i;
430 from_tnr = cnr;
431 to_tnr = cnr;
432 continue;
433 }
	/* Transaction numbers are compared through their signed
	 * difference -- presumably so that a wrap of the 32 bit counter
	 * is tolerated (NOTE(review): confirm). */
434 if ((int)cnr - (int)from_tnr < 0) {
435 D_ASSERT(from_tnr - cnr + i - from == mx+1);
436 from = i;
437 from_tnr = cnr;
438 }
439 if ((int)cnr - (int)to_tnr > 0) {
440 D_ASSERT(cnr - to_tnr == i - to);
441 to = i;
442 to_tnr = cnr;
443 }
444 }
445
446 if (!found_valid) {
447 dev_warn(DEV, "No usable activity log found.\n");
448 mutex_unlock(&mdev->md_io_mutex);
449 return 1;
450 }
451
452 /* Read the valid transactions.
453 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
454 i = from;
455 while (1) {
456 int j, pos;
457 unsigned int extent_nr;
458 unsigned int trn;
459
460 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
	/* a slot that fails validation here is skipped, not fatal */
461 ERR_IF(rv == 0) goto cancel;
462 if (rv == -1) {
463 mutex_unlock(&mdev->md_io_mutex);
464 return 0;
465 }
466
467 trn = be32_to_cpu(buffer->tr_number);
468
469 spin_lock_irq(&mdev->al_lock);
470
471 /* This loop runs backwards because in the cyclic
472 elements there might be an old version of the
473 updated element (in slot 0). So the element in slot 0
474 can overwrite old versions. */
475 for (j = AL_EXTENTS_PT; j >= 0; j--) {
476 pos = be32_to_cpu(buffer->updates[j].pos);
477 extent_nr = be32_to_cpu(buffer->updates[j].extent);
478
479 if (extent_nr == LC_FREE)
480 continue;
481
482 lc_set(mdev->act_log, extent_nr, pos);
483 active_extents++;
484 }
485 spin_unlock_irq(&mdev->al_lock);
486
487 transactions++;
488
489cancel:
	/* advance through the slots, wrapping from mx back to 0, until the
	 * newest transaction has been handled */
490 if (i == to)
491 break;
492 i++;
493 if (i > mx)
494 i = 0;
495 }
496
	/* continue numbering and slot position after the newest transaction */
497 mdev->al_tr_number = to_tnr+1;
498 mdev->al_tr_pos = to;
499 if (++mdev->al_tr_pos >
500 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
501 mdev->al_tr_pos = 0;
502
503 /* ok, we are done with it */
504 mutex_unlock(&mdev->md_io_mutex);
505
506 dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
507 transactions, active_extents);
508
509 return 1;
510}
511
/*
 * atodb_endio() - end_io handler for the bitmap write bios of a batch
 * @bio: the finished bio; bi_private points to the batch's drbd_atodb_wait.
 * @error: completion status handed in by the block layer.
 *
 * Records the first error of the batch, signals io_done once the last
 * outstanding bio has finished, and releases what
 * atodb_prepare_unless_covered() took for this bio: the page reference,
 * the bio itself and the local_cnt reference (via put_ldev()).
 */
512static void atodb_endio(struct bio *bio, int error)
513{
514 struct drbd_atodb_wait *wc = bio->bi_private;
	/* mdev is fetched up front; wc itself is not touched any more after
	 * the completion below has been signalled */
515 struct drbd_conf *mdev = wc->mdev;
516 struct page *page;
517 int uptodate = bio_flagged(bio, BIO_UPTODATE);
518
519 /* strange behavior of some lower level drivers...
520 * fail the request by clearing the uptodate flag,
521 * but do not return any error?! */
522 if (!error && !uptodate)
523 error = -EIO;
524
525 drbd_chk_io_error(mdev, error, TRUE);
526 if (error && wc->error == 0)
527 wc->error = error;
528
529 if (atomic_dec_and_test(&wc->count))
530 complete(&wc->io_done);
531
532 page = bio->bi_io_vec[0].bv_page;
533 put_page(page);
534 bio_put(bio);
535 mdev->bm_writ_cnt++;
536 put_ldev(mdev);
537}
538
/* S2W(): scale a count by the shift below -- judging by its use with
 * drbd_bm_get_lel() and drbd_bm_words(), it converts a number of bitmap
 * sectors into a number of bitmap words (NOTE(review): confirm). */
539#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
540/* activity log to on disk bitmap -- prepare bio unless that sector
541 * is already covered by previously prepared bios */
/*
 * @mdev: DRBD device; mdev->ldev must be usable (see __must_hold(local)).
 * @bios: array of prepared bios, initialized to all NULL by the caller;
 *        the new bio is stored in the first unused slot.
 * @enr: bitmap sector number, relative to the start of the on-disk bitmap.
 * @wc: batch bookkeeping; wc->count is incremented for a new bio.
 *
 * Each bio carries MD_SECTOR_SIZE bytes. Consecutive bios share a page: a
 * new page is only allocated when the previous bio's segment ended exactly
 * at PAGE_SIZE (or when this is the first bio); otherwise an extra
 * reference on the previous page is taken.
 *
 * Returns 0 if the sector is already covered or a bio was prepared,
 * -ENOMEM if the bio or page allocation failed, and -EINVAL if the page
 * could not be added to the bio.
 */
542static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
543 struct bio **bios,
544 unsigned int enr,
545 struct drbd_atodb_wait *wc) __must_hold(local)
546{
547 struct bio *bio;
548 struct page *page;
549 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
550 + mdev->ldev->md.bm_offset;
	/* PAGE_SIZE means "no room left", which forces a fresh page below
	 * when there is no previous bio to share a page with */
551 unsigned int page_offset = PAGE_SIZE;
552 int offset;
553 int i = 0;
554 int err = -ENOMEM;
555
556 /* Check if that enr is already covered by an already created bio.
557 * Caution, bios[] is not NULL terminated,
558 * but only initialized to all NULL.
559 * For completely scattered activity log,
560 * the last invocation iterates over all bios,
561 * and finds the last NULL entry.
562 */
563 while ((bio = bios[i])) {
564 if (bio->bi_sector == on_disk_sector)
565 return 0;
566 i++;
567 }
568 /* bios[i] == NULL, the next not yet used slot */
569
570 /* GFP_KERNEL, we are not in the write-out path */
571 bio = bio_alloc(GFP_KERNEL, 1);
572 if (bio == NULL)
573 return -ENOMEM;
574
	/* continue right behind the previous bio's segment, if there is one */
575 if (i > 0) {
576 const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
577 page_offset = prev_bv->bv_offset + prev_bv->bv_len;
578 page = prev_bv->bv_page;
579 }
580 if (page_offset == PAGE_SIZE) {
581 page = alloc_page(__GFP_HIGHMEM);
582 if (page == NULL)
583 goto out_bio_put;
584 page_offset = 0;
585 } else {
	/* shared page: every bio holds its own reference, which
	 * atodb_endio() drops again */
586 get_page(page);
587 }
588
	/* copy this sector's share of the bitmap into the page */
589 offset = S2W(enr);
590 drbd_bm_get_lel(mdev, offset,
591 min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
592 kmap(page) + page_offset);
593 kunmap(page);
594
595 bio->bi_private = wc;
596 bio->bi_end_io = atodb_endio;
597 bio->bi_bdev = mdev->ldev->md_bdev;
598 bio->bi_sector = on_disk_sector;
599
600 if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
601 goto out_put_page;
602
603 atomic_inc(&wc->count);
604 /* we already know that we may do this...
605 * get_ldev_if_state(mdev,D_ATTACHING);
606 * just get the extra reference, so that the local_cnt reflects
607 * the number of pending IO requests DRBD at its backing device.
608 */
609 atomic_inc(&mdev->local_cnt);
610
611 bios[i] = bio;
612
613 return 0;
614
615out_put_page:
616 err = -EINVAL;
617 put_page(page);
618out_bio_put:
619 bio_put(bio);
620 return err;
621}
622
623/**
624 * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
625 * @mdev: DRBD device.
626 *
627 * Called when we detach (unconfigure) local storage,
628 * or when we go from R_PRIMARY to R_SECONDARY role.
629 */
630void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
631{
632 int i, nr_elements;
633 unsigned int enr;
634 struct bio **bios;
635 struct drbd_atodb_wait wc;
636
637 ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
638 return; /* sorry, I don't have any act_log etc... */
639
640 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
641
642 nr_elements = mdev->act_log->nr_elements;
643
644 /* GFP_KERNEL, we are not in anyone's write-out path */
645 bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
646 if (!bios)
647 goto submit_one_by_one;
648
649 atomic_set(&wc.count, 0);
650 init_completion(&wc.io_done);
651 wc.mdev = mdev;
652 wc.error = 0;
653
654 for (i = 0; i < nr_elements; i++) {
655 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
656 if (enr == LC_FREE)
657 continue;
658 /* next statement also does atomic_inc wc.count and local_cnt */
659 if (atodb_prepare_unless_covered(mdev, bios,
660 enr/AL_EXT_PER_BM_SECT,
661 &wc))
662 goto free_bios_submit_one_by_one;
663 }
664
665 /* unnecessary optimization? */
666 lc_unlock(mdev->act_log);
667 wake_up(&mdev->al_wait);
668
669 /* all prepared, submit them */
670 for (i = 0; i < nr_elements; i++) {
671 if (bios[i] == NULL)
672 break;
673 if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
674 bios[i]->bi_rw = WRITE;
675 bio_endio(bios[i], -EIO);
676 } else {
677 submit_bio(WRITE, bios[i]);
678 }
679 }
680
681 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
682
683 /* always (try to) flush bitmap to stable storage */
684 drbd_md_flush(mdev);
685
686 /* In case we did not submit a single IO do not wait for
687 * them to complete. ( Because we would wait forever here. )
688 *
689 * In case we had IOs and they are already complete, there
690 * is not point in waiting anyways.
691 * Therefore this if () ... */
692 if (atomic_read(&wc.count))
693 wait_for_completion(&wc.io_done);
694
695 put_ldev(mdev);
696
697 kfree(bios);
698 return;
699
700 free_bios_submit_one_by_one:
701 /* free everything by calling the endio callback directly. */
702 for (i = 0; i < nr_elements && bios[i]; i++)
703 bio_endio(bios[i], 0);
704
705 kfree(bios);
706
707 submit_one_by_one:
708 dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
709
710 for (i = 0; i < mdev->act_log->nr_elements; i++) {
711 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
712 if (enr == LC_FREE)
713 continue;
714 /* Really slow: if we have al-extents 16..19 active,
715 * sector 4 will be written four times! Synchronous! */
716 drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
717 }
718
719 lc_unlock(mdev->act_log);
720 wake_up(&mdev->al_wait);
721 put_ldev(mdev);
722}
723
724/**
725 * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
726 * @mdev: DRBD device.
727 */
728void drbd_al_apply_to_bm(struct drbd_conf *mdev)
729{
730 unsigned int enr;
731 unsigned long add = 0;
732 char ppb[10];
733 int i;
734
735 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
736
737 for (i = 0; i < mdev->act_log->nr_elements; i++) {
738 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
739 if (enr == LC_FREE)
740 continue;
741 add += drbd_bm_ALe_set_all(mdev, enr);
742 }
743
744 lc_unlock(mdev->act_log);
745 wake_up(&mdev->al_wait);
746
747 dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
748 ppsize(ppb, Bit2KB(add)));
749}
750
751static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
752{
753 int rv;
754
755 spin_lock_irq(&mdev->al_lock);
756 rv = (al_ext->refcnt == 0);
757 if (likely(rv))
758 lc_del(mdev->act_log, al_ext);
759 spin_unlock_irq(&mdev->al_lock);
760
761 return rv;
762}
763
764/**
765 * drbd_al_shrink() - Removes all active extents form the activity log
766 * @mdev: DRBD device.
767 *
768 * Removes all active extents form the activity log, waiting until
769 * the reference count of each entry dropped to 0 first, of course.
770 *
771 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
772 */
773void drbd_al_shrink(struct drbd_conf *mdev)
774{
775 struct lc_element *al_ext;
776 int i;
777
778 D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
779
780 for (i = 0; i < mdev->act_log->nr_elements; i++) {
781 al_ext = lc_element_by_index(mdev->act_log, i);
782 if (al_ext->lc_number == LC_FREE)
783 continue;
784 wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
785 }
786
787 wake_up(&mdev->al_wait);
788}
789
790static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
791{
792 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
793
794 if (!get_ldev(mdev)) {
795 if (__ratelimit(&drbd_ratelimit_state))
796 dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
797 kfree(udw);
798 return 1;
799 }
800
801 drbd_bm_write_sect(mdev, udw->enr);
802 put_ldev(mdev);
803
804 kfree(udw);
805
806 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
807 switch (mdev->state.conn) {
808 case C_SYNC_SOURCE: case C_SYNC_TARGET:
809 case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
810 drbd_resync_finished(mdev);
811 default:
812 /* nothing to do */
813 break;
814 }
815 }
816 drbd_bcast_sync_progress(mdev);
817
818 return 1;
819}
820
821
822/* ATTENTION. The AL's extents are 4MB each, while the extents in the
823 * resync LRU-cache are 16MB each.
824 * The caller of this function has to hold an get_ldev() reference.
825 *
826 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
827 */
828static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
829 int count, int success)
830{
831 struct lc_element *e;
832 struct update_odbm_work *udw;
833
834 unsigned int enr;
835
836 D_ASSERT(atomic_read(&mdev->local_cnt));
837
838 /* I simply assume that a sector/size pair never crosses
839 * a 16 MB extent border. (Currently this is true...) */
840 enr = BM_SECT_TO_EXT(sector);
841
842 e = lc_get(mdev->resync, enr);
843 if (e) {
844 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
845 if (ext->lce.lc_number == enr) {
846 if (success)
847 ext->rs_left -= count;
848 else
849 ext->rs_failed += count;
850 if (ext->rs_left < ext->rs_failed) {
851 dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
852 "rs_failed=%d count=%d\n",
853 (unsigned long long)sector,
854 ext->lce.lc_number, ext->rs_left,
855 ext->rs_failed, count);
856 dump_stack();
857
858 lc_put(mdev->resync, &ext->lce);
859 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
860 return;
861 }
862 } else {
863 /* Normally this element should be in the cache,
864 * since drbd_rs_begin_io() pulled it already in.
865 *
866 * But maybe an application write finished, and we set
867 * something outside the resync lru_cache in sync.
868 */
869 int rs_left = drbd_bm_e_weight(mdev, enr);
870 if (ext->flags != 0) {
871 dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
872 " -> %d[%u;00]\n",
873 ext->lce.lc_number, ext->rs_left,
874 ext->flags, enr, rs_left);
875 ext->flags = 0;
876 }
877 if (ext->rs_failed) {
878 dev_warn(DEV, "Kicking resync_lru element enr=%u "
879 "out with rs_failed=%d\n",
880 ext->lce.lc_number, ext->rs_failed);
881 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
882 }
883 ext->rs_left = rs_left;
884 ext->rs_failed = success ? 0 : count;
885 lc_changed(mdev->resync, &ext->lce);
886 }
887 lc_put(mdev->resync, &ext->lce);
888 /* no race, we are within the al_lock! */
889
890 if (ext->rs_left == ext->rs_failed) {
891 ext->rs_failed = 0;
892
893 udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
894 if (udw) {
895 udw->enr = ext->lce.lc_number;
896 udw->w.cb = w_update_odbm;
897 drbd_queue_work_front(&mdev->data.work, &udw->w);
898 } else {
899 dev_warn(DEV, "Could not kmalloc an udw\n");
900 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
901 }
902 }
903 } else {
904 dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
905 mdev->resync_locked,
906 mdev->resync->nr_elements,
907 mdev->resync->flags);
908 }
909}
910
911/* clear the bit corresponding to the piece of storage in question:
912 * size byte of data starting from sector. Only clear a bits of the affected
913 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
914 *
915 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
916 *
917 */
918void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
919 const char *file, const unsigned int line)
920{
921 /* Is called from worker and receiver context _only_ */
922 unsigned long sbnr, ebnr, lbnr;
923 unsigned long count = 0;
924 sector_t esector, nr_sectors;
925 int wake_up = 0;
926 unsigned long flags;
927
928 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
929 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
930 (unsigned long long)sector, size);
931 return;
932 }
933 nr_sectors = drbd_get_capacity(mdev->this_bdev);
934 esector = sector + (size >> 9) - 1;
935
936 ERR_IF(sector >= nr_sectors) return;
937 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
938
939 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
940
941 /* we clear it (in sync).
942 * round up start sector, round down end sector. we make sure we only
943 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
944 if (unlikely(esector < BM_SECT_PER_BIT-1))
945 return;
946 if (unlikely(esector == (nr_sectors-1)))
947 ebnr = lbnr;
948 else
949 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
950 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
951
952 if (sbnr > ebnr)
953 return;
954
955 /*
956 * ok, (capacity & 7) != 0 sometimes, but who cares...
957 * we count rs_{total,left} in bits, not sectors.
958 */
959 spin_lock_irqsave(&mdev->al_lock, flags);
960 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
961 if (count) {
962 /* we need the lock for drbd_try_clear_on_disk_bm */
963 if (jiffies - mdev->rs_mark_time > HZ*10) {
964 /* should be rolling marks,
965 * but we estimate only anyways. */
966 if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
967 mdev->state.conn != C_PAUSED_SYNC_T &&
968 mdev->state.conn != C_PAUSED_SYNC_S) {
969 mdev->rs_mark_time = jiffies;
970 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
971 }
972 }
973 if (get_ldev(mdev)) {
974 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
975 put_ldev(mdev);
976 }
977 /* just wake_up unconditional now, various lc_chaged(),
978 * lc_put() in drbd_try_clear_on_disk_bm(). */
979 wake_up = 1;
980 }
981 spin_unlock_irqrestore(&mdev->al_lock, flags);
982 if (wake_up)
983 wake_up(&mdev->al_wait);
984}
985
986/*
987 * this is intended to set one request worth of data out of sync.
988 * affects at least 1 bit,
989 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
990 *
991 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
992 * so this can be _any_ process.
993 */
994void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
995 const char *file, const unsigned int line)
996{
997 unsigned long sbnr, ebnr, lbnr, flags;
998 sector_t esector, nr_sectors;
999 unsigned int enr, count;
1000 struct lc_element *e;
1001
1002 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1003 dev_err(DEV, "sector: %llus, size: %d\n",
1004 (unsigned long long)sector, size);
1005 return;
1006 }
1007
1008 if (!get_ldev(mdev))
1009 return; /* no disk, no metadata, no bitmap to set bits in */
1010
1011 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1012 esector = sector + (size >> 9) - 1;
1013
1014 ERR_IF(sector >= nr_sectors)
1015 goto out;
1016 ERR_IF(esector >= nr_sectors)
1017 esector = (nr_sectors-1);
1018
1019 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1020
1021 /* we set it out of sync,
1022 * we do not need to round anything here */
1023 sbnr = BM_SECT_TO_BIT(sector);
1024 ebnr = BM_SECT_TO_BIT(esector);
1025
1026 /* ok, (capacity & 7) != 0 sometimes, but who cares...
1027 * we count rs_{total,left} in bits, not sectors. */
1028 spin_lock_irqsave(&mdev->al_lock, flags);
1029 count = drbd_bm_set_bits(mdev, sbnr, ebnr);
1030
1031 enr = BM_SECT_TO_EXT(sector);
1032 e = lc_find(mdev->resync, enr);
1033 if (e)
1034 lc_entry(e, struct bm_extent, lce)->rs_left += count;
1035 spin_unlock_irqrestore(&mdev->al_lock, flags);
1036
1037out:
1038 put_ldev(mdev);
1039}
1040
1041static
1042struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
1043{
1044 struct lc_element *e;
1045 struct bm_extent *bm_ext;
1046 int wakeup = 0;
1047 unsigned long rs_flags;
1048
1049 spin_lock_irq(&mdev->al_lock);
1050 if (mdev->resync_locked > mdev->resync->nr_elements/2) {
1051 spin_unlock_irq(&mdev->al_lock);
1052 return NULL;
1053 }
1054 e = lc_get(mdev->resync, enr);
1055 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1056 if (bm_ext) {
1057 if (bm_ext->lce.lc_number != enr) {
1058 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1059 bm_ext->rs_failed = 0;
1060 lc_changed(mdev->resync, &bm_ext->lce);
1061 wakeup = 1;
1062 }
1063 if (bm_ext->lce.refcnt == 1)
1064 mdev->resync_locked++;
1065 set_bit(BME_NO_WRITES, &bm_ext->flags);
1066 }
1067 rs_flags = mdev->resync->flags;
1068 spin_unlock_irq(&mdev->al_lock);
1069 if (wakeup)
1070 wake_up(&mdev->al_wait);
1071
1072 if (!bm_ext) {
1073 if (rs_flags & LC_STARVING)
1074 dev_warn(DEV, "Have to wait for element"
1075 " (resync LRU too small?)\n");
1076 BUG_ON(rs_flags & LC_DIRTY);
1077 }
1078
1079 return bm_ext;
1080}
1081
1082static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
1083{
1084 struct lc_element *al_ext;
1085 int rv = 0;
1086
1087 spin_lock_irq(&mdev->al_lock);
1088 if (unlikely(enr == mdev->act_log->new_number))
1089 rv = 1;
1090 else {
1091 al_ext = lc_find(mdev->act_log, enr);
1092 if (al_ext) {
1093 if (al_ext->refcnt)
1094 rv = 1;
1095 }
1096 }
1097 spin_unlock_irq(&mdev->al_lock);
1098
1099 /*
1100 if (unlikely(rv)) {
1101 dev_info(DEV, "Delaying sync read until app's write is done\n");
1102 }
1103 */
1104 return rv;
1105}
1106
1107/**
1108 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
1109 * @mdev: DRBD device.
1110 * @sector: The sector number.
1111 *
1112 * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
1113 */
1114int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1115{
1116 unsigned int enr = BM_SECT_TO_EXT(sector);
1117 struct bm_extent *bm_ext;
1118 int i, sig;
1119
1120 sig = wait_event_interruptible(mdev->al_wait,
1121 (bm_ext = _bme_get(mdev, enr)));
1122 if (sig)
1123 return 0;
1124
1125 if (test_bit(BME_LOCKED, &bm_ext->flags))
1126 return 1;
1127
1128 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1129 sig = wait_event_interruptible(mdev->al_wait,
1130 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
1131 if (sig) {
1132 spin_lock_irq(&mdev->al_lock);
1133 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1134 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1135 mdev->resync_locked--;
1136 wake_up(&mdev->al_wait);
1137 }
1138 spin_unlock_irq(&mdev->al_lock);
1139 return 0;
1140 }
1141 }
1142
1143 set_bit(BME_LOCKED, &bm_ext->flags);
1144
1145 return 1;
1146}
1147
1148/**
1149 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
1150 * @mdev: DRBD device.
1151 * @sector: The sector number.
1152 *
1153 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
1154 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
1155 * if there is still application IO going on in this area.
1156 */
1157int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1158{
1159 unsigned int enr = BM_SECT_TO_EXT(sector);
1160 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
1161 struct lc_element *e;
1162 struct bm_extent *bm_ext;
1163 int i;
1164
1165 spin_lock_irq(&mdev->al_lock);
1166 if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
1167 /* in case you have very heavy scattered io, it may
1168 * stall the syncer undefined if we give up the ref count
1169 * when we try again and requeue.
1170 *
1171 * if we don't give up the refcount, but the next time
1172 * we are scheduled this extent has been "synced" by new
1173 * application writes, we'd miss the lc_put on the
1174 * extent we keep the refcount on.
1175 * so we remembered which extent we had to try again, and
1176 * if the next requested one is something else, we do
1177 * the lc_put here...
1178 * we also have to wake_up
1179 */
1180 e = lc_find(mdev->resync, mdev->resync_wenr);
1181 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1182 if (bm_ext) {
1183 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1184 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1185 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1186 mdev->resync_wenr = LC_FREE;
1187 if (lc_put(mdev->resync, &bm_ext->lce) == 0)
1188 mdev->resync_locked--;
1189 wake_up(&mdev->al_wait);
1190 } else {
1191 dev_alert(DEV, "LOGIC BUG\n");
1192 }
1193 }
1194 /* TRY. */
1195 e = lc_try_get(mdev->resync, enr);
1196 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1197 if (bm_ext) {
1198 if (test_bit(BME_LOCKED, &bm_ext->flags))
1199 goto proceed;
1200 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
1201 mdev->resync_locked++;
1202 } else {
1203 /* we did set the BME_NO_WRITES,
1204 * but then could not set BME_LOCKED,
1205 * so we tried again.
1206 * drop the extra reference. */
1207 bm_ext->lce.refcnt--;
1208 D_ASSERT(bm_ext->lce.refcnt > 0);
1209 }
1210 goto check_al;
1211 } else {
1212 /* do we rather want to try later? */
1213 if (mdev->resync_locked > mdev->resync->nr_elements-3)
1214 goto try_again;
1215 /* Do or do not. There is no try. -- Yoda */
1216 e = lc_get(mdev->resync, enr);
1217 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1218 if (!bm_ext) {
1219 const unsigned long rs_flags = mdev->resync->flags;
1220 if (rs_flags & LC_STARVING)
1221 dev_warn(DEV, "Have to wait for element"
1222 " (resync LRU too small?)\n");
1223 BUG_ON(rs_flags & LC_DIRTY);
1224 goto try_again;
1225 }
1226 if (bm_ext->lce.lc_number != enr) {
1227 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
1228 bm_ext->rs_failed = 0;
1229 lc_changed(mdev->resync, &bm_ext->lce);
1230 wake_up(&mdev->al_wait);
1231 D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
1232 }
1233 set_bit(BME_NO_WRITES, &bm_ext->flags);
1234 D_ASSERT(bm_ext->lce.refcnt == 1);
1235 mdev->resync_locked++;
1236 goto check_al;
1237 }
1238check_al:
1239 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1240 if (unlikely(al_enr+i == mdev->act_log->new_number))
1241 goto try_again;
1242 if (lc_is_used(mdev->act_log, al_enr+i))
1243 goto try_again;
1244 }
1245 set_bit(BME_LOCKED, &bm_ext->flags);
1246proceed:
1247 mdev->resync_wenr = LC_FREE;
1248 spin_unlock_irq(&mdev->al_lock);
1249 return 0;
1250
1251try_again:
1252 if (bm_ext)
1253 mdev->resync_wenr = enr;
1254 spin_unlock_irq(&mdev->al_lock);
1255 return -EAGAIN;
1256}
1257
1258void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
1259{
1260 unsigned int enr = BM_SECT_TO_EXT(sector);
1261 struct lc_element *e;
1262 struct bm_extent *bm_ext;
1263 unsigned long flags;
1264
1265 spin_lock_irqsave(&mdev->al_lock, flags);
1266 e = lc_find(mdev->resync, enr);
1267 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1268 if (!bm_ext) {
1269 spin_unlock_irqrestore(&mdev->al_lock, flags);
1270 if (__ratelimit(&drbd_ratelimit_state))
1271 dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
1272 return;
1273 }
1274
1275 if (bm_ext->lce.refcnt == 0) {
1276 spin_unlock_irqrestore(&mdev->al_lock, flags);
1277 dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
1278 "but refcnt is 0!?\n",
1279 (unsigned long long)sector, enr);
1280 return;
1281 }
1282
1283 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1284 clear_bit(BME_LOCKED, &bm_ext->flags);
1285 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1286 mdev->resync_locked--;
1287 wake_up(&mdev->al_wait);
1288 }
1289
1290 spin_unlock_irqrestore(&mdev->al_lock, flags);
1291}
1292
1293/**
1294 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
1295 * @mdev: DRBD device.
1296 */
1297void drbd_rs_cancel_all(struct drbd_conf *mdev)
1298{
1299 spin_lock_irq(&mdev->al_lock);
1300
1301 if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
1302 lc_reset(mdev->resync);
1303 put_ldev(mdev);
1304 }
1305 mdev->resync_locked = 0;
1306 mdev->resync_wenr = LC_FREE;
1307 spin_unlock_irq(&mdev->al_lock);
1308 wake_up(&mdev->al_wait);
1309}
1310
1311/**
1312 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
1313 * @mdev: DRBD device.
1314 *
1315 * Returns 0 upon success, -EAGAIN if at least one reference count was
1316 * not zero.
1317 */
1318int drbd_rs_del_all(struct drbd_conf *mdev)
1319{
1320 struct lc_element *e;
1321 struct bm_extent *bm_ext;
1322 int i;
1323
1324 spin_lock_irq(&mdev->al_lock);
1325
1326 if (get_ldev_if_state(mdev, D_FAILED)) {
1327 /* ok, ->resync is there. */
1328 for (i = 0; i < mdev->resync->nr_elements; i++) {
1329 e = lc_element_by_index(mdev->resync, i);
1330 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
1331 if (bm_ext->lce.lc_number == LC_FREE)
1332 continue;
1333 if (bm_ext->lce.lc_number == mdev->resync_wenr) {
1334 dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
1335 " got 'synced' by application io\n",
1336 mdev->resync_wenr);
1337 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1338 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
1339 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1340 mdev->resync_wenr = LC_FREE;
1341 lc_put(mdev->resync, &bm_ext->lce);
1342 }
1343 if (bm_ext->lce.refcnt != 0) {
1344 dev_info(DEV, "Retrying drbd_rs_del_all() later. "
1345 "refcnt=%d\n", bm_ext->lce.refcnt);
1346 put_ldev(mdev);
1347 spin_unlock_irq(&mdev->al_lock);
1348 return -EAGAIN;
1349 }
1350 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
1351 D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
1352 lc_del(mdev->resync, &bm_ext->lce);
1353 }
1354 D_ASSERT(mdev->resync->used == 0);
1355 put_ldev(mdev);
1356 }
1357 spin_unlock_irq(&mdev->al_lock);
1358
1359 return 0;
1360}
1361
1362/**
1363 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
1364 * @mdev: DRBD device.
1365 * @sector: The sector number.
1366 * @size: Size of failed IO operation, in byte.
1367 */
1368void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1369{
1370 /* Is called from worker and receiver context _only_ */
1371 unsigned long sbnr, ebnr, lbnr;
1372 unsigned long count;
1373 sector_t esector, nr_sectors;
1374 int wake_up = 0;
1375
1376 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1377 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1378 (unsigned long long)sector, size);
1379 return;
1380 }
1381 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1382 esector = sector + (size >> 9) - 1;
1383
1384 ERR_IF(sector >= nr_sectors) return;
1385 ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
1386
1387 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1388
1389 /*
1390 * round up start sector, round down end sector. we make sure we only
1391 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
1392 if (unlikely(esector < BM_SECT_PER_BIT-1))
1393 return;
1394 if (unlikely(esector == (nr_sectors-1)))
1395 ebnr = lbnr;
1396 else
1397 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
1398 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
1399
1400 if (sbnr > ebnr)
1401 return;
1402
1403 /*
1404 * ok, (capacity & 7) != 0 sometimes, but who cares...
1405 * we count rs_{total,left} in bits, not sectors.
1406 */
1407 spin_lock_irq(&mdev->al_lock);
1408 count = drbd_bm_count_bits(mdev, sbnr, ebnr);
1409 if (count) {
1410 mdev->rs_failed += count;
1411
1412 if (get_ldev(mdev)) {
1413 drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
1414 put_ldev(mdev);
1415 }
1416
1417 /* just wake_up unconditional now, various lc_chaged(),
1418 * lc_put() in drbd_try_clear_on_disk_bm(). */
1419 wake_up = 1;
1420 }
1421 spin_unlock_irq(&mdev->al_lock);
1422 if (wake_up)
1423 wake_up(&mdev->al_wait);
1424}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000000..b61057e77882
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1327 @@
1/*
2 drbd_bitmap.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/bitops.h>
26#include <linux/vmalloc.h>
27#include <linux/string.h>
28#include <linux/drbd.h>
29#include <asm/kmap_types.h>
30#include "drbd_int.h"
31
32/* OPAQUE outside this file!
33 * interface defined in drbd_int.h
34
35 * convention:
36 * function name drbd_bm_... => used elsewhere, "public".
37 * function name bm_... => internal to implementation, "private".
38
39 * Note that since find_first_bit returns int, at the current granularity of
40 * the bitmap (4KB per bit), this implementation "only" supports up to
41 * 1<<(32+12) == 16 TB...
42 */
43
44/*
45 * NOTE
46 * Access to the *bm_pages is protected by bm_lock.
47 * It is safe to read the other members within the lock.
48 *
49 * drbd_bm_set_bits is called from bio_endio callbacks,
50 * We may be called with irq already disabled,
51 * so we need spin_lock_irqsave().
52 * And we need the kmap_atomic.
53 */
/* Per-device bitmap state: the pages holding the bits, bookkeeping
 * counters, and the locks used while accessing or resizing it. */
54struct drbd_bitmap {
55 struct page **bm_pages; /* array of bm_number_of_pages page pointers */
56 spinlock_t bm_lock; /* initialized in drbd_bm_init() */
57 /* WARNING unsigned long bm_*:
58 * 32bit number of bit offset is just enough for 512 MB bitmap.
59 * it will blow up if we make the bitmap bigger...
60 * not that it makes much sense to have a bitmap that large,
61 * rather change the granularity to 16k or 64k or something.
62 * (that implies other problems, however...)
63 */
64 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
65 unsigned long bm_bits; /* presumably total number of bits -- TODO confirm */
66 size_t bm_words; /* presumably total number of long words -- TODO confirm */
67 size_t bm_number_of_pages; /* number of entries in bm_pages */
68 sector_t bm_dev_capacity; /* value returned by drbd_bm_capacity() */
69 struct semaphore bm_change; /* serializes resize operations */
70
71 atomic_t bm_async_io;
72 wait_queue_head_t bm_io_wait;
73
74 unsigned long bm_flags; /* bit numbers: BM_LOCKED, BM_MD_IO_ERROR, BM_P_VMALLOCED */
75
76 /* debugging aid, in case we are still racy somewhere */
77 char *bm_why; /* reason string passed to drbd_bm_lock(), NULL when unlocked */
78 struct task_struct *bm_task; /* task that called drbd_bm_lock(), NULL when unlocked */
79};
80
81/* definition of bits in bm_flags (these are bit numbers, not masks) */
82#define BM_LOCKED 0 /* set by drbd_bm_lock(), cleared by drbd_bm_unlock() */
83#define BM_MD_IO_ERROR 1 /* NOTE(review): presumably flags a meta data IO error -- confirm at users */
84#define BM_P_VMALLOCED 2 /* page pointer array was allocated with vmalloc(), see bm_realloc_pages() */
85
86static int bm_is_locked(struct drbd_bitmap *b)
87{
88 return test_bit(BM_LOCKED, &b->bm_flags);
89}
90
91#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
92static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
93{
94 struct drbd_bitmap *b = mdev->bitmap;
95 if (!__ratelimit(&drbd_ratelimit_state))
96 return;
97 dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
98 current == mdev->receiver.task ? "receiver" :
99 current == mdev->asender.task ? "asender" :
100 current == mdev->worker.task ? "worker" : current->comm,
101 func, b->bm_why ?: "?",
102 b->bm_task == mdev->receiver.task ? "receiver" :
103 b->bm_task == mdev->asender.task ? "asender" :
104 b->bm_task == mdev->worker.task ? "worker" : "?");
105}
106
107void drbd_bm_lock(struct drbd_conf *mdev, char *why)
108{
109 struct drbd_bitmap *b = mdev->bitmap;
110 int trylock_failed;
111
112 if (!b) {
113 dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
114 return;
115 }
116
117 trylock_failed = down_trylock(&b->bm_change);
118
119 if (trylock_failed) {
120 dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
121 current == mdev->receiver.task ? "receiver" :
122 current == mdev->asender.task ? "asender" :
123 current == mdev->worker.task ? "worker" : current->comm,
124 why, b->bm_why ?: "?",
125 b->bm_task == mdev->receiver.task ? "receiver" :
126 b->bm_task == mdev->asender.task ? "asender" :
127 b->bm_task == mdev->worker.task ? "worker" : "?");
128 down(&b->bm_change);
129 }
130 if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
131 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
132
133 b->bm_why = why;
134 b->bm_task = current;
135}
136
137void drbd_bm_unlock(struct drbd_conf *mdev)
138{
139 struct drbd_bitmap *b = mdev->bitmap;
140 if (!b) {
141 dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
142 return;
143 }
144
145 if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
146 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
147
148 b->bm_why = NULL;
149 b->bm_task = NULL;
150 up(&b->bm_change);
151}
152
153/* word offset to long pointer */
154static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
155{
156 struct page *page;
157 unsigned long page_nr;
158
159 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
160 page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
161 BUG_ON(page_nr >= b->bm_number_of_pages);
162 page = b->bm_pages[page_nr];
163
164 return (unsigned long *) kmap_atomic(page, km);
165}
166
167static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
168{
169 return __bm_map_paddr(b, offset, KM_IRQ1);
170}
171
172static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
173{
174 kunmap_atomic(p_addr, km);
175};
176
177static void bm_unmap(unsigned long *p_addr)
178{
179 return __bm_unmap(p_addr, KM_IRQ1);
180}
181
182/* long word offset of _bitmap_ sector */
183#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
184/* word offset from start of bitmap to word number _in_page_
185 * modulo longs per page
186#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
187 hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
188 so do it explicitly (valid as long as PAGE_SIZE/sizeof(long) is a power of two):
189 */
190#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
191
192/* Long words per page */
193#define LWPP (PAGE_SIZE/sizeof(long))
194
195/*
196 * actually most functions herein should take a struct drbd_bitmap*, not a
197 * struct drbd_conf*, but for the debug macros I like to have the mdev around
198 * to be able to report device specific.
199 */
200
201static void bm_free_pages(struct page **pages, unsigned long number)
202{
203 unsigned long i;
204 if (!pages)
205 return;
206
207 for (i = 0; i < number; i++) {
208 if (!pages[i]) {
209 printk(KERN_ALERT "drbd: bm_free_pages tried to free "
210 "a NULL pointer; i=%lu n=%lu\n",
211 i, number);
212 continue;
213 }
214 __free_page(pages[i]);
215 pages[i] = NULL;
216 }
217}
218
/* Free @ptr with vfree() if @v is non-zero, with kfree() otherwise. */
static void bm_vk_free(void *ptr, int v)
{
	if (!v) {
		kfree(ptr);
		return;
	}
	vfree(ptr);
}
226
227/*
228 * "have" and "want" are NUMBER OF PAGES.
229 */
230static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
231{
232 struct page **old_pages = b->bm_pages;
233 struct page **new_pages, *page;
234 unsigned int i, bytes, vmalloced = 0;
235 unsigned long have = b->bm_number_of_pages;
236
237 BUG_ON(have == 0 && old_pages != NULL);
238 BUG_ON(have != 0 && old_pages == NULL);
239
240 if (have == want)
241 return old_pages;
242
243 /* Trying kmalloc first, falling back to vmalloc.
244 * GFP_KERNEL is ok, as this is done when a lower level disk is
245 * "attached" to the drbd. Context is receiver thread or cqueue
246 * thread. As we have no disk yet, we are not in the IO path,
247 * not even the IO path of the peer. */
248 bytes = sizeof(struct page *)*want;
249 new_pages = kmalloc(bytes, GFP_KERNEL);
250 if (!new_pages) {
251 new_pages = vmalloc(bytes);
252 if (!new_pages)
253 return NULL;
254 vmalloced = 1;
255 }
256
257 memset(new_pages, 0, bytes);
258 if (want >= have) {
259 for (i = 0; i < have; i++)
260 new_pages[i] = old_pages[i];
261 for (; i < want; i++) {
262 page = alloc_page(GFP_HIGHUSER);
263 if (!page) {
264 bm_free_pages(new_pages + have, i - have);
265 bm_vk_free(new_pages, vmalloced);
266 return NULL;
267 }
268 new_pages[i] = page;
269 }
270 } else {
271 for (i = 0; i < want; i++)
272 new_pages[i] = old_pages[i];
273 /* NOT HERE, we are outside the spinlock!
274 bm_free_pages(old_pages + want, have - want);
275 */
276 }
277
278 if (vmalloced)
279 set_bit(BM_P_VMALLOCED, &b->bm_flags);
280 else
281 clear_bit(BM_P_VMALLOCED, &b->bm_flags);
282
283 return new_pages;
284}
285
286/*
287 * called on driver init only. TODO call when a device is created.
288 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
289 */
290int drbd_bm_init(struct drbd_conf *mdev)
291{
292 struct drbd_bitmap *b = mdev->bitmap;
293 WARN_ON(b != NULL);
294 b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
295 if (!b)
296 return -ENOMEM;
297 spin_lock_init(&b->bm_lock);
298 init_MUTEX(&b->bm_change);
299 init_waitqueue_head(&b->bm_io_wait);
300
301 mdev->bitmap = b;
302
303 return 0;
304}
305
306sector_t drbd_bm_capacity(struct drbd_conf *mdev)
307{
308 ERR_IF(!mdev->bitmap) return 0;
309 return mdev->bitmap->bm_dev_capacity;
310}
311
312/* called on driver unload. TODO: call when a device is destroyed.
313 */
314void drbd_bm_cleanup(struct drbd_conf *mdev)
315{
316 ERR_IF (!mdev->bitmap) return;
317 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
318 bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
319 kfree(mdev->bitmap);
320 mdev->bitmap = NULL;
321}
322
323/*
324 * since (b->bm_bits % BITS_PER_LONG) != 0,
325 * this masks out the remaining bits.
326 * Returns the number of bits cleared.
327 */
328static int bm_clear_surplus(struct drbd_bitmap *b)
329{
330 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
331 size_t w = b->bm_bits >> LN2_BPL;
332 int cleared = 0;
333 unsigned long *p_addr, *bm;
334
335 p_addr = bm_map_paddr(b, w);
336 bm = p_addr + MLPP(w);
337 if (w < b->bm_words) {
338 cleared = hweight_long(*bm & ~mask);
339 *bm &= mask;
340 w++; bm++;
341 }
342
343 if (w < b->bm_words) {
344 cleared += hweight_long(*bm);
345 *bm = 0;
346 }
347 bm_unmap(p_addr);
348 return cleared;
349}
350
351static void bm_set_surplus(struct drbd_bitmap *b)
352{
353 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
354 size_t w = b->bm_bits >> LN2_BPL;
355 unsigned long *p_addr, *bm;
356
357 p_addr = bm_map_paddr(b, w);
358 bm = p_addr + MLPP(w);
359 if (w < b->bm_words) {
360 *bm |= ~mask;
361 bm++; w++;
362 }
363
364 if (w < b->bm_words) {
365 *bm = ~(0UL);
366 }
367 bm_unmap(p_addr);
368}
369
370static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
371{
372 unsigned long *p_addr, *bm, offset = 0;
373 unsigned long bits = 0;
374 unsigned long i, do_now;
375
376 while (offset < b->bm_words) {
377 i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
378 p_addr = __bm_map_paddr(b, offset, KM_USER0);
379 bm = p_addr + MLPP(offset);
380 while (i--) {
381#ifndef __LITTLE_ENDIAN
382 if (swap_endian)
383 *bm = lel_to_cpu(*bm);
384#endif
385 bits += hweight_long(*bm++);
386 }
387 __bm_unmap(p_addr, KM_USER0);
388 offset += do_now;
389 cond_resched();
390 }
391
392 return bits;
393}
394
/* Count the set bits of the bitmap, without any endianness conversion. */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
	const int swap_endian = 0;

	return __bm_count_bits(b, swap_endian);
}
399
/* Count the set bits of the bitmap, asking __bm_count_bits() for the
 * endianness conversion of each word. */
static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
	const int swap_endian = 1;

	return __bm_count_bits(b, swap_endian);
}
404
405/* offset and len in long words.*/
406static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
407{
408 unsigned long *p_addr, *bm;
409 size_t do_now, end;
410
411#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
412
413 end = offset + len;
414
415 if (end > b->bm_words) {
416 printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
417 return;
418 }
419
420 while (offset < end) {
421 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
422 p_addr = bm_map_paddr(b, offset);
423 bm = p_addr + MLPP(offset);
424 if (bm+do_now > p_addr + LWPP) {
425 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
426 p_addr, bm, (int)do_now);
427 break; /* breaks to after catch_oob_access_end() only! */
428 }
429 memset(bm, c, do_now * sizeof(long));
430 bm_unmap(p_addr);
431 offset += do_now;
432 }
433}
434
435/*
436 * make sure the bitmap has enough room for the attached storage,
437 * if necessary, resize.
438 * called whenever we may have changed the device size.
439 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
440 * In case this is actually a resize, we copy the old bitmap into the new one.
441 * Otherwise, the bitmap is initialized to all bits set.
 *
 * @capacity is given in sectors; a capacity of 0 frees all bitmap pages.
 * The whole operation runs between drbd_bm_lock() and drbd_bm_unlock().
442 */
443int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
444{
445 struct drbd_bitmap *b = mdev->bitmap;
446 unsigned long bits, words, owords, obits, *p_addr, *bm;
447 unsigned long want, have, onpages; /* number of pages */
448 struct page **npages, **opages = NULL;
449 int err = 0, growing;
450 int opages_vmalloced;
451
452 ERR_IF(!b) return -ENOMEM;
453
454 drbd_bm_lock(mdev, "resize");
455
456 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
457 (unsigned long long)capacity);
458
 /* nothing to do if the size did not change */
459 if (capacity == b->bm_dev_capacity)
460 goto out;
461
 /* remember how the current page array was allocated:
  * bm_realloc_pages() below may change that flag */
462 opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
463
 /* capacity 0: reset all counters under the spinlock,
  * then free the pages and the page array outside of it */
464 if (capacity == 0) {
465 spin_lock_irq(&b->bm_lock);
466 opages = b->bm_pages;
467 onpages = b->bm_number_of_pages;
468 owords = b->bm_words;
469 b->bm_pages = NULL;
470 b->bm_number_of_pages =
471 b->bm_set =
472 b->bm_bits =
473 b->bm_words =
474 b->bm_dev_capacity = 0;
475 spin_unlock_irq(&b->bm_lock);
476 bm_free_pages(opages, onpages);
477 bm_vk_free(opages, opages_vmalloced);
478 goto out;
479 }
480 bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
481
482 /* if we would use
483 words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
484 a 32bit host could present the wrong number of words
485 to a 64bit host.
486 */
487 words = ALIGN(bits, 64) >> LN2_BPL;
488
489 if (get_ldev(mdev)) {
490 D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
491 put_ldev(mdev);
492 }
493
494 /* one extra long to catch off by one errors */
495 want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
496 have = b->bm_number_of_pages;
497 if (want == have) {
498 D_ASSERT(b->bm_pages != NULL);
499 npages = b->bm_pages;
500 } else {
 /* the fault injection path behaves like a failed allocation */
501 if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
502 npages = NULL;
503 else
504 npages = bm_realloc_pages(b, want);
505 }
506
507 if (!npages) {
508 err = -ENOMEM;
509 goto out;
510 }
511
 /* switch over to the new page array and the new sizes under the spinlock */
512 spin_lock_irq(&b->bm_lock);
513 opages = b->bm_pages;
514 owords = b->bm_words;
515 obits = b->bm_bits;
516
517 growing = bits > obits;
 /* with the old geometry still in place, set the surplus bits of the old bitmap */
518 if (opages)
519 bm_set_surplus(b);
520
521 b->bm_pages = npages;
522 b->bm_number_of_pages = want;
523 b->bm_bits = bits;
524 b->bm_words = words;
525 b->bm_dev_capacity = capacity;
526
 /* new words start out with all bits set, and are accounted for in bm_set */
527 if (growing) {
528 bm_memset(b, owords, 0xff, words-owords);
529 b->bm_set += bits - obits;
530 }
531
532 if (want < have) {
533 /* implicit: (opages != NULL) && (opages != npages) */
534 bm_free_pages(opages + want, have - want);
535 }
536
 /* store DRBD_MAGIC in the extra long right behind the last bitmap word */
537 p_addr = bm_map_paddr(b, words);
538 bm = p_addr + MLPP(words);
539 *bm = DRBD_MAGIC;
540 bm_unmap(p_addr);
541
542 (void)bm_clear_surplus(b);
543
544 spin_unlock_irq(&b->bm_lock);
 /* the old page array itself is freed only if it was replaced */
545 if (opages != npages)
546 bm_vk_free(opages, opages_vmalloced);
 /* not growing: recount the set bits from the bitmap itself */
547 if (!growing)
548 b->bm_set = bm_count_bits(b);
549 dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
550
551 out:
552 drbd_bm_unlock(mdev);
553 return err;
554}
555
556/* inherently racy:
557 * if not protected by other means, return value may be out of date when
558 * leaving this function...
559 * we still need to lock it, since it is important that this returns
560 * bm_set == 0 precisely.
561 *
562 * maybe bm_set should be atomic_t ?
563 */
564static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
565{
566 struct drbd_bitmap *b = mdev->bitmap;
567 unsigned long s;
568 unsigned long flags;
569
570 ERR_IF(!b) return 0;
571 ERR_IF(!b->bm_pages) return 0;
572
573 spin_lock_irqsave(&b->bm_lock, flags);
574 s = b->bm_set;
575 spin_unlock_irqrestore(&b->bm_lock, flags);
576
577 return s;
578}
579
580unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
581{
582 unsigned long s;
583 /* if I don't have a disk, I don't know about out-of-sync status */
584 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
585 return 0;
586 s = _drbd_bm_total_weight(mdev);
587 put_ldev(mdev);
588 return s;
589}
590
591size_t drbd_bm_words(struct drbd_conf *mdev)
592{
593 struct drbd_bitmap *b = mdev->bitmap;
594 ERR_IF(!b) return 0;
595 ERR_IF(!b->bm_pages) return 0;
596
597 return b->bm_words;
598}
599
600unsigned long drbd_bm_bits(struct drbd_conf *mdev)
601{
602 struct drbd_bitmap *b = mdev->bitmap;
603 ERR_IF(!b) return 0;
604
605 return b->bm_bits;
606}
607
608/* merge number words from buffer into the bitmap starting at offset.
609 * buffer[i] is expected to be little endian unsigned long.
610 * bitmap must be locked by drbd_bm_lock.
611 * currently only used from receive_bitmap.
612 */
613void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
614 unsigned long *buffer)
615{
616 struct drbd_bitmap *b = mdev->bitmap;
617 unsigned long *p_addr, *bm;
618 unsigned long word, bits;
619 size_t end, do_now;
620
621 end = offset + number;
622
623 ERR_IF(!b) return;
624 ERR_IF(!b->bm_pages) return;
625 if (number == 0)
626 return;
627 WARN_ON(offset >= b->bm_words);
628 WARN_ON(end > b->bm_words);
629
630 spin_lock_irq(&b->bm_lock);
631 while (offset < end) {
632 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
633 p_addr = bm_map_paddr(b, offset);
634 bm = p_addr + MLPP(offset);
635 offset += do_now;
636 while (do_now--) {
637 bits = hweight_long(*bm);
638 word = *bm | lel_to_cpu(*buffer++);
639 *bm++ = word;
640 b->bm_set += hweight_long(word) - bits;
641 }
642 bm_unmap(p_addr);
643 }
644 /* with 32bit <-> 64bit cross-platform connect
645 * this is only correct for current usage,
646 * where we _know_ that we are 64 bit aligned,
647 * and know that this function is used in this way, too...
648 */
649 if (end == b->bm_words)
650 b->bm_set -= bm_clear_surplus(b);
651
652 spin_unlock_irq(&b->bm_lock);
653}
654
655/* copy number words from the bitmap starting at offset into the buffer.
656 * buffer[i] will be little endian unsigned long.
657 */
658void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
659 unsigned long *buffer)
660{
661 struct drbd_bitmap *b = mdev->bitmap;
662 unsigned long *p_addr, *bm;
663 size_t end, do_now;
664
665 end = offset + number;
666
667 ERR_IF(!b) return;
668 ERR_IF(!b->bm_pages) return;
669
670 spin_lock_irq(&b->bm_lock);
671 if ((offset >= b->bm_words) ||
672 (end > b->bm_words) ||
673 (number <= 0))
674 dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
675 (unsigned long) offset,
676 (unsigned long) number,
677 (unsigned long) b->bm_words);
678 else {
679 while (offset < end) {
680 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
681 p_addr = bm_map_paddr(b, offset);
682 bm = p_addr + MLPP(offset);
683 offset += do_now;
684 while (do_now--)
685 *buffer++ = cpu_to_lel(*bm++);
686 bm_unmap(p_addr);
687 }
688 }
689 spin_unlock_irq(&b->bm_lock);
690}
691
692/* set all bits in the bitmap */
693void drbd_bm_set_all(struct drbd_conf *mdev)
694{
695 struct drbd_bitmap *b = mdev->bitmap;
696 ERR_IF(!b) return;
697 ERR_IF(!b->bm_pages) return;
698
699 spin_lock_irq(&b->bm_lock);
700 bm_memset(b, 0, 0xff, b->bm_words);
701 (void)bm_clear_surplus(b);
702 b->bm_set = b->bm_bits;
703 spin_unlock_irq(&b->bm_lock);
704}
705
706/* clear all bits in the bitmap */
707void drbd_bm_clear_all(struct drbd_conf *mdev)
708{
709 struct drbd_bitmap *b = mdev->bitmap;
710 ERR_IF(!b) return;
711 ERR_IF(!b->bm_pages) return;
712
713 spin_lock_irq(&b->bm_lock);
714 bm_memset(b, 0, 0, b->bm_words);
715 b->bm_set = 0;
716 spin_unlock_irq(&b->bm_lock);
717}
718
719static void bm_async_io_complete(struct bio *bio, int error)
720{
721 struct drbd_bitmap *b = bio->bi_private;
722 int uptodate = bio_flagged(bio, BIO_UPTODATE);
723
724
725 /* strange behavior of some lower level drivers...
726 * fail the request by clearing the uptodate flag,
727 * but do not return any error?!
728 * do we want to WARN() on this? */
729 if (!error && !uptodate)
730 error = -EIO;
731
732 if (error) {
733 /* doh. what now?
734 * for now, set all bits, and flag MD_IO_ERROR */
735 __set_bit(BM_MD_IO_ERROR, &b->bm_flags);
736 }
737 if (atomic_dec_and_test(&b->bm_async_io))
738 wake_up(&b->bm_io_wait);
739
740 bio_put(bio);
741}
742
743static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
744{
745 /* we are process context. we always get a bio */
746 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
747 unsigned int len;
748 sector_t on_disk_sector =
749 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
750 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
751
752 /* this might happen with very small
753 * flexible external meta data device */
754 len = min_t(unsigned int, PAGE_SIZE,
755 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
756
757 bio->bi_bdev = mdev->ldev->md_bdev;
758 bio->bi_sector = on_disk_sector;
759 bio_add_page(bio, b->bm_pages[page_nr], len, 0);
760 bio->bi_private = b;
761 bio->bi_end_io = bm_async_io_complete;
762
763 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
764 bio->bi_rw |= rw;
765 bio_endio(bio, -EIO);
766 } else {
767 submit_bio(rw, bio);
768 }
769}
770
/* bm_cpu_to_lel(): convert all bitmap words between cpu and little endian
 * byte order, in place.  On little endian builds this is a no-op. */
771# if defined(__LITTLE_ENDIAN)
772 /* nothing to do, on disk == in memory */
773# define bm_cpu_to_lel(x) ((void)0)
774# else
775void bm_cpu_to_lel(struct drbd_bitmap *b)
776{
777 /* need to cpu_to_lel all the pages ...
778 * this may be optimized by using
779 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
780 * the following is still not optimal, but better than nothing */
781 unsigned int i;
782 unsigned long *p_addr, *bm;
 /* "i" is the first page that gets converted */
783 if (b->bm_set == 0) {
784 /* no page at all; avoid swap if all is 0 */
785 i = b->bm_number_of_pages;
786 } else if (b->bm_set == b->bm_bits) {
787 /* only the last page */
788 i = b->bm_number_of_pages - 1;
789 } else {
790 /* all pages */
791 i = 0;
792 }
793 for (; i < b->bm_number_of_pages; i++) {
794 p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
 /* convert every long of the page, not just the words in use */
795 for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
796 *bm = cpu_to_lel(*bm);
797 kunmap_atomic(p_addr, KM_USER0);
798 }
799}
800# endif
801/* lel_to_cpu == cpu_to_lel */
802# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
803
804/*
805 * bm_rw: read/write the whole bitmap from/to its on disk location.
 * @rw is READ or WRITE.  One bio per bitmap page is submitted, and we wait
 * until all of them have completed.
 * Returns 0 on success, -EIO if at least one of the bios failed.
806 */
807static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
808{
809 struct drbd_bitmap *b = mdev->bitmap;
810 /* sector_t sector; */
811 int bm_words, num_pages, i;
812 unsigned long now;
813 char ppb[10];
814 int err = 0;
815
816 WARN_ON(!bm_is_locked(b));
817
818 /* no spinlock here, the drbd_bm_lock should be enough! */
819
820 bm_words = drbd_bm_words(mdev);
 /* number of pages needed to hold bm_words long words, rounded up */
821 num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
822
823 /* on disk bitmap is little endian */
824 if (rw == WRITE)
825 bm_cpu_to_lel(b);
826
 /* NOTE(review): this value of "now" is overwritten below before it is read */
827 now = jiffies;
 /* one count per bio; bm_async_io_complete() decrements it,
  * and wakes us up when it reaches zero */
828 atomic_set(&b->bm_async_io, num_pages);
829 __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
830
831 /* let the layers below us try to merge these bios... */
832 for (i = 0; i < num_pages; i++)
833 bm_page_io_async(mdev, b, i, rw);
834
835 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
836 wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
837
838 if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
839 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
840 drbd_chk_io_error(mdev, 1, TRUE);
841 err = -EIO;
842 }
843
 /* start of the time span reported for the recount in the READ case */
844 now = jiffies;
845 if (rw == WRITE) {
846 /* swap back endianness */
847 bm_lel_to_cpu(b);
848 /* flush bitmap to stable storage */
849 drbd_md_flush(mdev);
850 } else /* rw == READ */ {
851 /* just read, if necessary adjust endianness */
852 b->bm_set = bm_count_bits_swap_endian(b);
853 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
854 jiffies - now);
855 }
 /* from here on, "now" holds the number of set bits, not a time stamp */
856 now = b->bm_set;
857
858 dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
859 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
860
861 return err;
862}
863
864/**
865 * drbd_bm_read() - Read the whole bitmap from its on disk location.
866 * @mdev: DRBD device.
867 */
868int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
869{
870 return bm_rw(mdev, READ);
871}
872
873/**
874 * drbd_bm_write() - Write the whole bitmap to its on disk location.
875 * @mdev: DRBD device.
876 */
877int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
878{
879 return bm_rw(mdev, WRITE);
880}
881
882/**
883 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
884 * @mdev: DRBD device.
885 * @enr: Extent number in the resync lru (happens to be sector offset)
886 *
887 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
888 * by a single sector write. Therefore enr == sector offset from the
889 * start of the bitmap.
890 */
891int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
892{
893 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
894 + mdev->ldev->md.bm_offset;
895 int bm_words, num_words, offset;
896 int err = 0;
897
898 mutex_lock(&mdev->md_io_mutex);
899 bm_words = drbd_bm_words(mdev);
900 offset = S2W(enr); /* word offset into bitmap */
901 num_words = min(S2W(1), bm_words - offset);
902 if (num_words < S2W(1))
903 memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
904 drbd_bm_get_lel(mdev, offset, num_words,
905 page_address(mdev->md_io_page));
906 if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
907 int i;
908 err = -EIO;
909 dev_err(DEV, "IO ERROR writing bitmap sector %lu "
910 "(meta-disk sector %llus)\n",
911 enr, (unsigned long long)on_disk_sector);
912 drbd_chk_io_error(mdev, 1, TRUE);
913 for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
914 drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
915 }
916 mdev->bm_writ_cnt++;
917 mutex_unlock(&mdev->md_io_mutex);
918 return err;
919}
920
921/* NOTE
922 * find_first_bit returns int, we return unsigned long.
923 * should not make much difference anyways, but ...
924 *
925 * this returns a bit number, NOT a sector!
926 */
927#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
928static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
929 const int find_zero_bit, const enum km_type km)
930{
931 struct drbd_bitmap *b = mdev->bitmap;
932 unsigned long i = -1UL;
933 unsigned long *p_addr;
934 unsigned long bit_offset; /* bit offset of the mapped page. */
935
936 if (bm_fo > b->bm_bits) {
937 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
938 } else {
939 while (bm_fo < b->bm_bits) {
940 unsigned long offset;
941 bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
942 offset = bit_offset >> LN2_BPL; /* word offset of the page */
943 p_addr = __bm_map_paddr(b, offset, km);
944
945 if (find_zero_bit)
946 i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
947 else
948 i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
949
950 __bm_unmap(p_addr, km);
951 if (i < PAGE_SIZE*8) {
952 i = bit_offset + i;
953 if (i >= b->bm_bits)
954 break;
955 goto found;
956 }
957 bm_fo = bit_offset + PAGE_SIZE*8;
958 }
959 i = -1UL;
960 }
961 found:
962 return i;
963}
964
965static unsigned long bm_find_next(struct drbd_conf *mdev,
966 unsigned long bm_fo, const int find_zero_bit)
967{
968 struct drbd_bitmap *b = mdev->bitmap;
969 unsigned long i = -1UL;
970
971 ERR_IF(!b) return i;
972 ERR_IF(!b->bm_pages) return i;
973
974 spin_lock_irq(&b->bm_lock);
975 if (bm_is_locked(b))
976 bm_print_lock_info(mdev);
977
978 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
979
980 spin_unlock_irq(&b->bm_lock);
981 return i;
982}
983
/* Find the next set bit at or after bit number bm_fo; see bm_find_next(). */
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	const int find_zero_bit = 0;

	return bm_find_next(mdev, bm_fo, find_zero_bit);
}
988
989#if 0
990/* not yet needed for anything.
 * Compiled out: would find the next clear bit at or after bit number bm_fo,
 * through bm_find_next(). */
991unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
992{
993 return bm_find_next(mdev, bm_fo, 1);
994}
995#endif
996
997/* does not spin_lock_irqsave.
998 * you must take drbd_bm_lock() first */
999unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1000{
1001 /* WARN_ON(!bm_is_locked(mdev)); */
1002 return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1003}
1004
1005unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1006{
1007 /* WARN_ON(!bm_is_locked(mdev)); */
1008 return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1009}
1010
1011/* returns number of bits actually changed.
1012 * for val != 0, we change 0 -> 1, return code positive
1013 * for val == 0, we change 1 -> 0, return code negative
1014 * wants bitnr, not sector.
1015 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1016 * Must hold bitmap lock already. */
1017int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1018 unsigned long e, int val, const enum km_type km)
1019{
1020 struct drbd_bitmap *b = mdev->bitmap;
1021 unsigned long *p_addr = NULL;
1022 unsigned long bitnr;
1023 unsigned long last_page_nr = -1UL;
1024 int c = 0;
1025
1026 if (e >= b->bm_bits) {
1027 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1028 s, e, b->bm_bits);
1029 e = b->bm_bits ? b->bm_bits -1 : 0;
1030 }
1031 for (bitnr = s; bitnr <= e; bitnr++) {
1032 unsigned long offset = bitnr>>LN2_BPL;
1033 unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
1034 if (page_nr != last_page_nr) {
1035 if (p_addr)
1036 __bm_unmap(p_addr, km);
1037 p_addr = __bm_map_paddr(b, offset, km);
1038 last_page_nr = page_nr;
1039 }
1040 if (val)
1041 c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
1042 else
1043 c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
1044 }
1045 if (p_addr)
1046 __bm_unmap(p_addr, km);
1047 b->bm_set += c;
1048 return c;
1049}
1050
1051/* returns number of bits actually changed.
1052 * for val != 0, we change 0 -> 1, return code positive
1053 * for val == 0, we change 1 -> 0, return code negative
1054 * wants bitnr, not sector */
1055int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1056 const unsigned long e, int val)
1057{
1058 unsigned long flags;
1059 struct drbd_bitmap *b = mdev->bitmap;
1060 int c = 0;
1061
1062 ERR_IF(!b) return 1;
1063 ERR_IF(!b->bm_pages) return 0;
1064
1065 spin_lock_irqsave(&b->bm_lock, flags);
1066 if (bm_is_locked(b))
1067 bm_print_lock_info(mdev);
1068
1069 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
1070
1071 spin_unlock_irqrestore(&b->bm_lock, flags);
1072 return c;
1073}
1074
/* Set the bits s to e, inclusive.
 * returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	int changed = bm_change_bits_to(mdev, s, e, 1);

	return changed;
}
1080
/* Clear the bits s to e, inclusive.
 * returns number of bits changed 1 -> 0 */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	/* bm_change_bits_to() reports cleared bits as a negative count */
	int changed = bm_change_bits_to(mdev, s, e, 0);

	return -changed;
}
1086
1087/* sets all bits in full words,
1088 * from first_word up to, but not including, last_word */
1089static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1090 int page_nr, int first_word, int last_word)
1091{
1092 int i;
1093 int bits;
1094 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
1095 for (i = first_word; i < last_word; i++) {
1096 bits = hweight_long(paddr[i]);
1097 paddr[i] = ~0UL;
1098 b->bm_set += BITS_PER_LONG - bits;
1099 }
1100 kunmap_atomic(paddr, KM_USER0);
1101}
1102
1103/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
1104 * You must first drbd_bm_lock().
1105 * Can be called to set the whole bitmap in one go.
1106 * Sets bits from s to e _inclusive_. */
1107void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1108{
1109 /* First set_bit from the first bit (s)
1110 * up to the next long boundary (sl),
1111 * then assign full words up to the last long boundary (el),
1112 * then set_bit up to and including the last bit (e).
1113 *
1114 * Do not use memset, because we must account for changes,
1115 * so we need to loop over the words with hweight() anyways.
1116 */
1117 unsigned long sl = ALIGN(s,BITS_PER_LONG);
1118 unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1119 int first_page;
1120 int last_page;
1121 int page_nr;
1122 int first_word;
1123 int last_word;
1124
1125 if (e - s <= 3*BITS_PER_LONG) {
1126 /* don't bother; el and sl may even be wrong. */
1127 __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
1128 return;
1129 }
1130
1131 /* difference is large enough that we can trust sl and el */
1132
1133 /* bits filling the current long */
1134 if (sl)
1135 __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
1136
1137 first_page = sl >> (3 + PAGE_SHIFT);
1138 last_page = el >> (3 + PAGE_SHIFT);
1139
1140 /* MLPP: modulo longs per page */
1141 /* LWPP: long words per page */
1142 first_word = MLPP(sl >> LN2_BPL);
1143 last_word = LWPP;
1144
1145 /* first and full pages, unless first page == last page */
1146 for (page_nr = first_page; page_nr < last_page; page_nr++) {
1147 bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1148 cond_resched();
1149 first_word = 0;
1150 }
1151
1152 /* last page (respectively only page, for first page == last page) */
1153 last_word = MLPP(el >> LN2_BPL);
1154 bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
1155
1156 /* possibly trailing bits.
1157 * example: (e & 63) == 63, el will be e+1.
1158 * if that even was the very last bit,
1159 * it would trigger an assert in __bm_change_bits_to()
1160 */
1161 if (el <= e)
1162 __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
1163}
1164
1165/* returns bit state
1166 * wants bitnr, NOT sector.
1167 * inherently racy... area needs to be locked by means of {al,rs}_lru
1168 * 1 ... bit set
1169 * 0 ... bit not set
1170 * -1 ... first out of bounds access, stop testing for bits!
1171 */
1172int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1173{
1174 unsigned long flags;
1175 struct drbd_bitmap *b = mdev->bitmap;
1176 unsigned long *p_addr;
1177 int i;
1178
1179 ERR_IF(!b) return 0;
1180 ERR_IF(!b->bm_pages) return 0;
1181
1182 spin_lock_irqsave(&b->bm_lock, flags);
1183 if (bm_is_locked(b))
1184 bm_print_lock_info(mdev);
1185 if (bitnr < b->bm_bits) {
1186 unsigned long offset = bitnr>>LN2_BPL;
1187 p_addr = bm_map_paddr(b, offset);
1188 i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
1189 bm_unmap(p_addr);
1190 } else if (bitnr == b->bm_bits) {
1191 i = -1;
1192 } else { /* (bitnr > b->bm_bits) */
1193 dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1194 i = 0;
1195 }
1196
1197 spin_unlock_irqrestore(&b->bm_lock, flags);
1198 return i;
1199}
1200
1201/* returns number of bits set in the range [s, e] */
1202int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1203{
1204 unsigned long flags;
1205 struct drbd_bitmap *b = mdev->bitmap;
1206 unsigned long *p_addr = NULL, page_nr = -1;
1207 unsigned long bitnr;
1208 int c = 0;
1209 size_t w;
1210
1211 /* If this is called without a bitmap, that is a bug. But just to be
1212 * robust in case we screwed up elsewhere, in that case pretend there
1213 * was one dirty bit in the requested area, so we won't try to do a
1214 * local read there (no bitmap probably implies no disk) */
1215 ERR_IF(!b) return 1;
1216 ERR_IF(!b->bm_pages) return 1;
1217
1218 spin_lock_irqsave(&b->bm_lock, flags);
1219 if (bm_is_locked(b))
1220 bm_print_lock_info(mdev);
1221 for (bitnr = s; bitnr <= e; bitnr++) {
1222 w = bitnr >> LN2_BPL;
1223 if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
1224 page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
1225 if (p_addr)
1226 bm_unmap(p_addr);
1227 p_addr = bm_map_paddr(b, w);
1228 }
1229 ERR_IF (bitnr >= b->bm_bits) {
1230 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1231 } else {
1232 c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1233 }
1234 }
1235 if (p_addr)
1236 bm_unmap(p_addr);
1237 spin_unlock_irqrestore(&b->bm_lock, flags);
1238 return c;
1239}
1240
1241
1242/* inherently racy...
1243 * return value may be already out-of-date when this function returns.
1244 * but the general usage is that this is only use during a cstate when bits are
1245 * only cleared, not set, and typically only care for the case when the return
1246 * value is zero, or we already "locked" this "bitmap extent" by other means.
1247 *
1248 * enr is bm-extent number, since we chose to name one sector (512 bytes)
1249 * worth of the bitmap a "bitmap extent".
1250 *
1251 * TODO
1252 * I think since we use it like a reference count, we should use the real
1253 * reference count of some bitmap extent element from some lru instead...
1254 *
1255 */
1256int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1257{
1258 struct drbd_bitmap *b = mdev->bitmap;
1259 int count, s, e;
1260 unsigned long flags;
1261 unsigned long *p_addr, *bm;
1262
1263 ERR_IF(!b) return 0;
1264 ERR_IF(!b->bm_pages) return 0;
1265
1266 spin_lock_irqsave(&b->bm_lock, flags);
1267 if (bm_is_locked(b))
1268 bm_print_lock_info(mdev);
1269
1270 s = S2W(enr);
1271 e = min((size_t)S2W(enr+1), b->bm_words);
1272 count = 0;
1273 if (s < b->bm_words) {
1274 int n = e-s;
1275 p_addr = bm_map_paddr(b, s);
1276 bm = p_addr + MLPP(s);
1277 while (n--)
1278 count += hweight_long(*bm++);
1279 bm_unmap(p_addr);
1280 } else {
1281 dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1282 }
1283 spin_unlock_irqrestore(&b->bm_lock, flags);
1284 return count;
1285}
1286
1287/* set all bits covered by the AL-extent al_enr */
1288unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1289{
1290 struct drbd_bitmap *b = mdev->bitmap;
1291 unsigned long *p_addr, *bm;
1292 unsigned long weight;
1293 int count, s, e, i, do_now;
1294 ERR_IF(!b) return 0;
1295 ERR_IF(!b->bm_pages) return 0;
1296
1297 spin_lock_irq(&b->bm_lock);
1298 if (bm_is_locked(b))
1299 bm_print_lock_info(mdev);
1300 weight = b->bm_set;
1301
1302 s = al_enr * BM_WORDS_PER_AL_EXT;
1303 e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
1304 /* assert that s and e are on the same page */
1305 D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
1306 == s >> (PAGE_SHIFT - LN2_BPL + 3));
1307 count = 0;
1308 if (s < b->bm_words) {
1309 i = do_now = e-s;
1310 p_addr = bm_map_paddr(b, s);
1311 bm = p_addr + MLPP(s);
1312 while (i--) {
1313 count += hweight_long(*bm);
1314 *bm = -1UL;
1315 bm++;
1316 }
1317 bm_unmap(p_addr);
1318 b->bm_set += do_now*BITS_PER_LONG - count;
1319 if (e == b->bm_words)
1320 b->bm_set -= bm_clear_surplus(b);
1321 } else {
1322 dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
1323 }
1324 weight = b->bm_set - weight;
1325 spin_unlock_irq(&b->bm_lock);
1326 return weight;
1327}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000000..2312d782fe99
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2252 @@
1/*
2 drbd_int.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#ifndef _DRBD_INT_H
27#define _DRBD_INT_H
28
29#include <linux/compiler.h>
30#include <linux/types.h>
31#include <linux/version.h>
32#include <linux/list.h>
33#include <linux/sched.h>
34#include <linux/bitops.h>
35#include <linux/slab.h>
36#include <linux/crypto.h>
37#include <linux/ratelimit.h>
38#include <linux/tcp.h>
39#include <linux/mutex.h>
40#include <linux/major.h>
41#include <linux/blkdev.h>
42#include <linux/genhd.h>
43#include <net/tcp.h>
44#include <linux/lru_cache.h>
45
46#ifdef __CHECKER__
47# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
48# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
49# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
50# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
51#else
52# define __protected_by(x)
53# define __protected_read_by(x)
54# define __protected_write_by(x)
55# define __must_hold(x)
56#endif
57
58#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
59
60/* module parameter, defined in drbd_main.c */
61extern unsigned int minor_count;
62extern int disable_sendpage;
63extern int allow_oos;
64extern unsigned int cn_idx;
65
66#ifdef CONFIG_DRBD_FAULT_INJECTION
67extern int enable_faults;
68extern int fault_rate;
69extern int fault_devs;
70#endif
71
72extern char usermode_helper[];
73
74
75#ifndef TRUE
76#define TRUE 1
77#endif
78#ifndef FALSE
79#define FALSE 0
80#endif
81
/* I don't remember why XCPU ...
 * This is used to wake the asender,
 * and to interrupt the sending task
 * on disconnect.
 */
87#define DRBD_SIG SIGXCPU
88
89/* This is used to stop/restart our threads.
90 * Cannot use SIGTERM nor SIGKILL, since these
91 * are sent out by init on runlevel changes
92 * I choose SIGHUP for now.
93 */
94#define DRBD_SIGKILL SIGHUP
95
/* All EEs on the free list should have ID_VACANT (== 0)
 * freshly allocated EEs get !ID_VACANT (== 1)
 * so if it says "cannot dereference null pointer at address 0x00000001",
 * it is most likely one of these :( */
100
101#define ID_IN_SYNC (4711ULL)
102#define ID_OUT_OF_SYNC (4712ULL)
103
104#define ID_SYNCER (-1ULL)
105#define ID_VACANT 0
106#define is_syncer_block_id(id) ((id) == ID_SYNCER)
107
108struct drbd_conf;
109
110
111/* to shorten dev_warn(DEV, "msg"); and relatives statements */
112#define DEV (disk_to_dev(mdev->vdisk))
113
/*
 * D_ASSERT() - log a violated assumption via dev_err(); execution continues.
 *
 * DEV expands to an expression using `mdev`, so a variable of that name
 * has to be in scope at the call site.  The expression text is part of
 * the logged message.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as one
 * statement: a bare "if" here would silently capture an "else" that
 * follows "if (x) D_ASSERT(y); else ...".
 */
#define D_ASSERT(exp) do {						\
		if (!(exp))						\
			dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n",	\
				__FILE__, __LINE__);			\
	} while (0)
116
/*
 * ERR_IF() - "if" prefix that also logs when its condition is true.
 *
 * Usage: ERR_IF(cond) statement;
 * Evaluates exp once; when it is non-zero, logs function name, the
 * expression text, file and line through dev_err(DEV, ...) and then
 * executes the statement that follows.  Needs `mdev` in scope (see DEV).
 */
#define ERR_IF(exp) if (({						\
		int _b = (exp) != 0;					\
		if (_b)							\
			dev_err(DEV, "%s: (%s) in %s:%d\n",		\
				__func__, #exp, __FILE__, __LINE__);	\
		_b;							\
	}))
123
/*
 * Fault insertion types.
 * drbd_insert_fault() tests bit (1 << type) of enable_faults, so the
 * numeric values double as bit positions.
 */
enum {
	/* meta data */
	DRBD_FAULT_MD_WR = 0,		/* write */
	DRBD_FAULT_MD_RD = 1,		/* read */
	/* resync */
	DRBD_FAULT_RS_WR = 2,
	DRBD_FAULT_RS_RD = 3,
	/* data */
	DRBD_FAULT_DT_WR = 4,
	DRBD_FAULT_DT_RD = 5,
	DRBD_FAULT_DT_RA = 6,		/* read ahead */
	/* allocation */
	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
	DRBD_FAULT_AL_EE = 8,		/* alloc ee */

	DRBD_FAULT_MAX,			/* number of fault types */
};
138
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);

/*
 * drbd_insert_fault() - decide whether to inject a fault of this type
 *
 * Returns 0 right away when fault_rate is zero or the bit for @type is
 * not set in enable_faults; only then asks _drbd_insert_fault().
 * The result is 1 if a fault should be inserted, 0 otherwise.
 */
static inline int
drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	if (!fault_rate)
		return 0;
	if (!(enable_faults & (1<<type)))
		return 0;
	return _drbd_insert_fault(mdev, type) != 0;
}
#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))

#else
/* fault injection compiled out: never active */
#define FAULT_ACTIVE(_m, _t) (0)
#endif
153
/*
 * Integer division helpers.  Both macros evaluate their arguments more
 * than once (div_ceil) or once (div_floor); do not pass expressions
 * with side effects.
 */
/* quotient rounded _UP_ to the next integer */
#define div_ceil(A, B) (((A) / (B)) + (((A) % (B)) != 0))
/* plain (truncating) integer division */
#define div_floor(A, B) ((A) / (B))
158
159/* drbd_meta-data.c (still in drbd_main.c) */
160/* 4th incarnation of the disk layout. */
161#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
162
163extern struct drbd_conf **minor_table;
164extern struct ratelimit_state drbd_ratelimit_state;
165
/* Command ids as they appear on the wire (p_header.command). */
enum drbd_packets {
	/* receiver (data socket) */
	P_DATA		      = 0x00,
	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
	P_BARRIER	      = 0x03,
	P_BITMAP	      = 0x04,
	P_BECOME_SYNC_TARGET  = 0x05,
	P_BECOME_SYNC_SOURCE  = 0x06,
	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
	P_SYNC_PARAM	      = 0x0a,
	P_PROTOCOL	      = 0x0b,
	P_UUIDS		      = 0x0c,
	P_SIZES		      = 0x0d,
	P_STATE		      = 0x0e,
	P_SYNC_UUID	      = 0x0f,
	P_AUTH_CHALLENGE      = 0x10,
	P_AUTH_RESPONSE	      = 0x11,
	P_STATE_CHG_REQ	      = 0x12,

	/* asender (meta socket) */
	P_PING		      = 0x13,
	P_PING_ACK	      = 0x14,
	P_RECV_ACK	      = 0x15, /* Used in protocol B */
	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
	P_BARRIER_ACK	      = 0x1c,
	P_STATE_CHG_REPLY     = 0x1d,

	/* "new" commands, no longer fitting into the ordering scheme above */

	P_OV_REQUEST	      = 0x1e, /* data socket */
	P_OV_REPLY	      = 0x1f,
	P_OV_RESULT	      = 0x20, /* meta socket */
	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */

	P_MAX_CMD	      = 0x25,
	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
	P_MAX_OPT_CMD	      = 0x101,

	/* special command ids for handshake */

	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */

	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
};

/*
 * cmdname() - human readable name of a packet command, for log messages
 *
 * Always returns a valid string: the three handshake ids are handled
 * explicitly, every command below P_MAX_CMD has a table entry, and
 * anything else (including ids at or above P_MAX_CMD) yields "Unknown".
 *
 * The table used to lack P_CSUM_RS_REQUEST, P_RS_IS_IN_SYNC and
 * P_COMPRESSED_BITMAP, so NULL was returned for those valid commands.
 */
static inline const char *cmdname(enum drbd_packets cmd)
{
	/* THINK may need to become several global tables
	 * when we want to support more than
	 * one PRO_VERSION */
	static const char * const cmdnames[] = {
		[P_DATA]	        = "Data",
		[P_DATA_REPLY]	        = "DataReply",
		[P_RS_DATA_REPLY]       = "RSDataReply",
		[P_BARRIER]	        = "Barrier",
		[P_BITMAP]	        = "ReportBitMap",
		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
		[P_UNPLUG_REMOTE]       = "UnplugRemote",
		[P_DATA_REQUEST]        = "DataRequest",
		[P_RS_DATA_REQUEST]     = "RSDataRequest",
		[P_SYNC_PARAM]	        = "SyncParam",
		[P_SYNC_PARAM89]        = "SyncParam89",
		[P_PROTOCOL]	        = "ReportProtocol",
		[P_UUIDS]	        = "ReportUUIDs",
		[P_SIZES]	        = "ReportSizes",
		[P_STATE]	        = "ReportState",
		[P_SYNC_UUID]	        = "ReportSyncUUID",
		[P_AUTH_CHALLENGE]      = "AuthChallenge",
		[P_AUTH_RESPONSE]       = "AuthResponse",
		[P_PING]	        = "Ping",
		[P_PING_ACK]	        = "PingAck",
		[P_RECV_ACK]	        = "RecvAck",
		[P_WRITE_ACK]	        = "WriteAck",
		[P_RS_WRITE_ACK]        = "RSWriteAck",
		[P_DISCARD_ACK]	        = "DiscardAck",
		[P_NEG_ACK]	        = "NegAck",
		[P_NEG_DREPLY]	        = "NegDReply",
		[P_NEG_RS_DREPLY]       = "NegRSDReply",
		[P_BARRIER_ACK]	        = "BarrierAck",
		[P_STATE_CHG_REQ]       = "StateChgRequest",
		[P_STATE_CHG_REPLY]     = "StateChgReply",
		[P_OV_REQUEST]	        = "OVRequest",
		[P_OV_REPLY]	        = "OVReply",
		[P_OV_RESULT]	        = "OVResult",
		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
		[P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
		[P_COMPRESSED_BITMAP]   = "CBitmap",
		[P_MAX_CMD]	        = NULL,
	};
	const char *name;

	if (cmd == P_HAND_SHAKE_M)
		return "HandShakeM";
	if (cmd == P_HAND_SHAKE_S)
		return "HandShakeS";
	if (cmd == P_HAND_SHAKE)
		return "HandShake";
	/* unsigned compare also rejects a (bogus) negative value */
	if ((unsigned int)cmd >= P_MAX_CMD)
		return "Unknown";
	/* guard against holes should the enum grow without the table */
	name = cmdnames[cmd];
	return name ? name : "Unknown";
}
277
/*
 * Context kept while sending/receiving the bitmap,
 * possibly in some encoding scheme.
 */
struct bm_xfer_ctx {
	/* "const": total number of bits and of long words of the bitmap,
	 * cached here so the accessor functions need not be called
	 * over and over again. */
	unsigned long bm_bits;
	unsigned long bm_words;

	/* current position within the bitmap during the transfer */
	unsigned long bit_offset;
	unsigned long word_offset;

	/* statistics; index: (h->command == P_BITMAP) */
	unsigned packets[2];
	unsigned bytes[2];
};
295
296extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
297 const char *direction, struct bm_xfer_ctx *c);
298
299static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
300{
301 /* word_offset counts "native long words" (32 or 64 bit),
302 * aligned at 64 bit.
303 * Encoded packet may end at an unaligned bit offset.
304 * In case a fallback clear text packet is transmitted in
305 * between, we adjust this offset back to the last 64bit
306 * aligned "native long word", which makes coding and decoding
307 * the plain text bitmap much more convenient. */
308#if BITS_PER_LONG == 64
309 c->word_offset = c->bit_offset >> 6;
310#elif BITS_PER_LONG == 32
311 c->word_offset = c->bit_offset >> 5;
312 c->word_offset &= ~(1UL);
313#else
314# error "unsupported BITS_PER_LONG"
315#endif
316}
317
318#ifndef __packed
319#define __packed __attribute__((packed))
320#endif
321
322/* This is the layout for a packet on the wire.
323 * The byteorder is the network byte order.
324 * (except block_id and barrier fields.
325 * these are pointers to local structs
326 * and have no relevance for the partner,
327 * which just echoes them as received.)
328 *
329 * NOTE that the payload starts at a long aligned offset,
330 * regardless of 32 or 64 bit arch!
331 */
332struct p_header {
333 u32 magic;
334 u16 command;
335 u16 length; /* bytes of data after this header */
336 u8 payload[0];
337} __packed;
338/* 8 bytes. packet FIXED for the next century! */
339
340/*
341 * short commands, packets without payload, plain p_header:
342 * P_PING
343 * P_PING_ACK
344 * P_BECOME_SYNC_TARGET
345 * P_BECOME_SYNC_SOURCE
346 * P_UNPLUG_REMOTE
347 */
348
349/*
350 * commands with out-of-struct payload:
351 * P_BITMAP (no additional fields)
352 * P_DATA, P_DATA_REPLY (see p_data)
353 * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
354 */
355
356/* these defines must not be changed without changing the protocol version */
357#define DP_HARDBARRIER 1
358#define DP_RW_SYNC 2
359#define DP_MAY_SET_IN_SYNC 4
360
361struct p_data {
362 struct p_header head;
363 u64 sector; /* 64 bits sector number */
364 u64 block_id; /* to identify the request in protocol B&C */
365 u32 seq_num;
366 u32 dp_flags;
367} __packed;
368
369/*
370 * commands which share a struct:
371 * p_block_ack:
372 * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
373 * P_DISCARD_ACK (proto C, two-primaries conflict detection)
374 * p_block_req:
375 * P_DATA_REQUEST, P_RS_DATA_REQUEST
376 */
377struct p_block_ack {
378 struct p_header head;
379 u64 sector;
380 u64 block_id;
381 u32 blksize;
382 u32 seq_num;
383} __packed;
384
385
386struct p_block_req {
387 struct p_header head;
388 u64 sector;
389 u64 block_id;
390 u32 blksize;
391 u32 pad; /* to multiple of 8 Byte */
392} __packed;
393
394/*
395 * commands with their own struct for additional fields:
396 * P_HAND_SHAKE
397 * P_BARRIER
398 * P_BARRIER_ACK
399 * P_SYNC_PARAM
400 * ReportParams
401 */
402
403struct p_handshake {
404 struct p_header head; /* 8 bytes */
405 u32 protocol_min;
406 u32 feature_flags;
407 u32 protocol_max;
408
409 /* should be more than enough for future enhancements
410 * for now, feature_flags and the reserverd array shall be zero.
411 */
412
413 u32 _pad;
414 u64 reserverd[7];
415} __packed;
416/* 80 bytes, FIXED for the next century */
417
418struct p_barrier {
419 struct p_header head;
420 u32 barrier; /* barrier number _handle_ only */
421 u32 pad; /* to multiple of 8 Byte */
422} __packed;
423
424struct p_barrier_ack {
425 struct p_header head;
426 u32 barrier;
427 u32 set_size;
428} __packed;
429
430struct p_rs_param {
431 struct p_header head;
432 u32 rate;
433
434 /* Since protocol version 88 and higher. */
435 char verify_alg[0];
436} __packed;
437
438struct p_rs_param_89 {
439 struct p_header head;
440 u32 rate;
441 /* protocol version 89: */
442 char verify_alg[SHARED_SECRET_MAX];
443 char csums_alg[SHARED_SECRET_MAX];
444} __packed;
445
446struct p_protocol {
447 struct p_header head;
448 u32 protocol;
449 u32 after_sb_0p;
450 u32 after_sb_1p;
451 u32 after_sb_2p;
452 u32 want_lose;
453 u32 two_primaries;
454
455 /* Since protocol version 87 and higher. */
456 char integrity_alg[0];
457
458} __packed;
459
460struct p_uuids {
461 struct p_header head;
462 u64 uuid[UI_EXTENDED_SIZE];
463} __packed;
464
465struct p_rs_uuid {
466 struct p_header head;
467 u64 uuid;
468} __packed;
469
470struct p_sizes {
471 struct p_header head;
472 u64 d_size; /* size of disk */
473 u64 u_size; /* user requested size */
474 u64 c_size; /* current exported size */
475 u32 max_segment_size; /* Maximal size of a BIO */
476 u32 queue_order_type;
477} __packed;
478
479struct p_state {
480 struct p_header head;
481 u32 state;
482} __packed;
483
484struct p_req_state {
485 struct p_header head;
486 u32 mask;
487 u32 val;
488} __packed;
489
490struct p_req_state_reply {
491 struct p_header head;
492 u32 retcode;
493} __packed;
494
495struct p_drbd06_param {
496 u64 size;
497 u32 state;
498 u32 blksize;
499 u32 protocol;
500 u32 version;
501 u32 gen_cnt[5];
502 u32 bit_map_gen[5];
503} __packed;
504
505struct p_discard {
506 struct p_header head;
507 u64 block_id;
508 u32 seq_num;
509 u32 pad;
510} __packed;
511
/*
 * Valid values for the code part of p_compressed_bm.encoding.
 * Bump proto version when changing this.
 */
enum drbd_bitmap_code {
	/* RLE_VLI_Bytes = 0,
	 * and other bit variants had been defined during
	 * algorithm evaluation; only this one remains. */
	RLE_VLI_Bits = 2,
};
520
521struct p_compressed_bm {
522 struct p_header head;
523 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
524 * (encoding & 0x80): polarity (set/unset) of first runlength
525 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
526 * used to pad up to head.length bytes
527 */
528 u8 encoding;
529
530 u8 code[0];
531} __packed;
532
533/* DCBP: Drbd Compressed Bitmap Packet ... */
534static inline enum drbd_bitmap_code
535DCBP_get_code(struct p_compressed_bm *p)
536{
537 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
538}
539
540static inline void
541DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
542{
543 BUG_ON(code & ~0xf);
544 p->encoding = (p->encoding & ~0xf) | code;
545}
546
547static inline int
548DCBP_get_start(struct p_compressed_bm *p)
549{
550 return (p->encoding & 0x80) != 0;
551}
552
553static inline void
554DCBP_set_start(struct p_compressed_bm *p, int set)
555{
556 p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
557}
558
559static inline int
560DCBP_get_pad_bits(struct p_compressed_bm *p)
561{
562 return (p->encoding >> 4) & 0x7;
563}
564
565static inline void
566DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
567{
568 BUG_ON(n & ~0x7);
569 p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
570}
571
/* one bitmap packet, including the p_header,
 * should fit within one _architecture independent_ page.
 * so we need to use the fixed size 4KiB page size
 * most architectures have used for a long time.
 */
577#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
578#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
579#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
580#if (PAGE_SIZE < 4096)
581/* drbd_send_bitmap / receive_bitmap would break horribly */
582#error "PAGE_SIZE too small"
583#endif
584
585union p_polymorph {
586 struct p_header header;
587 struct p_handshake handshake;
588 struct p_data data;
589 struct p_block_ack block_ack;
590 struct p_barrier barrier;
591 struct p_barrier_ack barrier_ack;
592 struct p_rs_param_89 rs_param_89;
593 struct p_protocol protocol;
594 struct p_sizes sizes;
595 struct p_uuids uuids;
596 struct p_state state;
597 struct p_req_state req_state;
598 struct p_req_state_reply req_state_reply;
599 struct p_block_req block_req;
600} __packed;
601
602/**********************************************************************/
/* Life cycle state of a struct drbd_thread (see its t_state member). */
enum drbd_thread_state {
	None,		/* == 0 */
	Running,
	Exiting,
	Restarting,
};
609
610struct drbd_thread {
611 spinlock_t t_lock;
612 struct task_struct *task;
613 struct completion stop;
614 enum drbd_thread_state t_state;
615 int (*function) (struct drbd_thread *);
616 struct drbd_conf *mdev;
617 int reset_cpu_mask;
618};
619
620static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
621{
622 /* THINK testing the t_state seems to be uncritical in all cases
623 * (but thread_{start,stop}), so we can read it *without* the lock.
624 * --lge */
625
626 smp_rmb();
627 return thi->t_state;
628}
629
630
631/*
632 * Having this as the first member of a struct provides sort of "inheritance".
633 * "derived" structs can be "drbd_queue_work()"ed.
634 * The callback should know and cast back to the descendant struct.
635 * drbd_request and drbd_epoch_entry are descendants of drbd_work.
636 */
637struct drbd_work;
638typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
639struct drbd_work {
640 struct list_head list;
641 drbd_work_cb cb;
642};
643
644struct drbd_tl_epoch;
645struct drbd_request {
646 struct drbd_work w;
647 struct drbd_conf *mdev;
648
649 /* if local IO is not allowed, will be NULL.
650 * if local IO _is_ allowed, holds the locally submitted bio clone,
651 * or, after local IO completion, the ERR_PTR(error).
652 * see drbd_endio_pri(). */
653 struct bio *private_bio;
654
655 struct hlist_node colision;
656 sector_t sector;
657 unsigned int size;
658 unsigned int epoch; /* barrier_nr */
659
660 /* barrier_nr: used to check on "completion" whether this req was in
661 * the current epoch, and we therefore have to close it,
662 * starting a new epoch...
663 */
664
665 /* up to here, the struct layout is identical to drbd_epoch_entry;
666 * we might be able to use that to our advantage... */
667
668 struct list_head tl_requests; /* ring list in the transfer log */
669 struct bio *master_bio; /* master bio pointer */
670 unsigned long rq_state; /* see comments above _req_mod() */
671 int seq_num;
672 unsigned long start_time;
673};
674
675struct drbd_tl_epoch {
676 struct drbd_work w;
677 struct list_head requests; /* requests before */
678 struct drbd_tl_epoch *next; /* pointer to the next barrier */
679 unsigned int br_number; /* the barriers identifier. */
680 int n_req; /* number of requests attached before this barrier */
681};
682
683struct drbd_request;
684
/* Epoch entries (struct drbd_epoch_entry) are kept on one of these
 * struct drbd_conf lists:
   active_ee .. data packet being written
   sync_ee   .. syncer block being written
   done_ee   .. block written, need to send P_WRITE_ACK
   read_ee   .. [RS]P_DATA_REQUEST being read
   net_ee    .. zero-copy network send in progress
*/
691
692struct drbd_epoch {
693 struct list_head list;
694 unsigned int barrier_nr;
695 atomic_t epoch_size; /* increased on every request added. */
696 atomic_t active; /* increased on every req. added, and dec on every finished. */
697 unsigned long flags;
698};
699
/* bit numbers used in drbd_epoch.flags */
enum {
	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,	/* == 0 */
	DE_BARRIER_IN_NEXT_EPOCH_DONE,
	DE_CONTAINS_A_BARRIER,
	DE_HAVE_BARRIER_NUMBER,
	DE_IS_FINISHING,
};
708
/* Epoch events; EV_CLEANUP is a flag that lies outside the event numbers. */
enum epoch_event {
	EV_PUT,			/* == 0 */
	EV_GOT_BARRIER_NR,
	EV_BARRIER_DONE,
	EV_BECAME_LAST,
	EV_CLEANUP = 32,	/* used as flag */
};
716
717struct drbd_epoch_entry {
718 struct drbd_work w;
719 struct drbd_conf *mdev;
720 struct bio *private_bio;
721 struct hlist_node colision;
722 sector_t sector;
723 unsigned int size;
724 struct drbd_epoch *epoch;
725
726 /* up to here, the struct layout is identical to drbd_request;
727 * we might be able to use that to our advantage... */
728
729 unsigned int flags;
730 u64 block_id;
731};
732
733struct drbd_wq_barrier {
734 struct drbd_work w;
735 struct completion done;
736};
737
/* A digest buffer together with its size. */
struct digest_info {
	int	digest_size;
	void	*digest;
};
742
/* ee flag bits: bit numbers first, the matching masks below */
enum {
	__EE_CALL_AL_COMPLETE_IO,	/* == 0 */
	__EE_CONFLICT_PENDING,
	__EE_MAY_SET_IN_SYNC,
	__EE_IS_BARRIER,
};
#define EE_CALL_AL_COMPLETE_IO	(1 << __EE_CALL_AL_COMPLETE_IO)
#define EE_CONFLICT_PENDING	(1 << __EE_CONFLICT_PENDING)
#define EE_MAY_SET_IN_SYNC	(1 << __EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER		(1 << __EE_IS_BARRIER)
754
/* global flag bits (bit numbers, starting at 0) */
enum {
	CREATE_BARRIER,		/* next P_DATA is preceded by a P_BARRIER */
	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
	SEND_PING,		/* whether asender should send a ping asap */

	STOP_SYNC_TIMER,	/* tell timer to cancel itself */
	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
	MD_DIRTY,		/* current uuids and flags not yet on disk */
	DISCARD_CONCURRENT,	/* Set on one node, cleared on the peer! */
	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
	CLUSTER_ST_CHANGE,	/* Cluster wide state change going on... */
	CL_ST_CHG_SUCCESS,
	CL_ST_CHG_FAIL,
	CRASHED_PRIMARY,	/* This node was a crashed primary.
				 * Gets cleared when the state.conn
				 * goes into C_CONNECTED state. */
	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
	CONSIDER_RESYNC,

	MD_NO_BARRIER,		/* meta data device does not support barriers,
				   so don't even try */
	SUSPEND_IO,		/* suspend application io */
	BITMAP_IO,		/* suspend application io;
				   once no more io in flight, start bitmap io */
	BITMAP_IO_QUEUED,	/* Started bitmap IO */
	RESYNC_AFTER_NEG,	/* Resync after online grow after the attach&negotiate finished. */
	NET_CONGESTED,		/* The data socket is congested */

	CONFIG_PENDING,		/* serialization of (re)configuration requests.
				 * if set, also prevents the device from dying */
	DEVICE_DYING,		/* device became unconfigured,
				 * but worker thread is still handling the cleanup.
				 * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed,
				 * while this is set. */
	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
				 * the peer, if it changed there as well. */
};
795
796struct drbd_bitmap; /* opaque for drbd_conf */
797
798/* TODO sort members for performance
799 * MAYBE group them further */
800
801/* THINK maybe we actually want to use the default "event/%s" worker threads
802 * or similar in linux 2.6, which uses per cpu data and threads.
803 *
804 * To be general, this might need a spin_lock member.
805 * For now, please use the mdev->req_lock to protect list_head,
806 * see drbd_queue_work below.
807 */
808struct drbd_work_queue {
809 struct list_head q;
810 struct semaphore s; /* producers up it, worker down()s it */
811 spinlock_t q_lock; /* to protect the list. */
812};
813
814struct drbd_socket {
815 struct drbd_work_queue work;
816 struct mutex mutex;
817 struct socket *socket;
818 /* this way we get our
819 * send/receive buffers off the stack */
820 union p_polymorph sbuf;
821 union p_polymorph rbuf;
822};
823
824struct drbd_md {
825 u64 md_offset; /* sector offset to 'super' block */
826
827 u64 la_size_sect; /* last agreed size, unit sectors */
828 u64 uuid[UI_SIZE];
829 u64 device_uuid;
830 u32 flags;
831 u32 md_size_sect;
832
833 s32 al_offset; /* signed relative sector offset to al area */
834 s32 bm_offset; /* signed relative sector offset to bitmap */
835
836 /* u32 al_nr_extents; important for restoring the AL
837 * is stored into sync_conf.al_extents, which in turn
838 * gets applied to act_log->nr_elements
839 */
840};
841
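/* for sync_conf and other types...
 * With the definitions below, each NL_PACKET(name, number, fields)
 * entry of linux/drbd_nl.h expands to a plain "struct name { fields };". */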
843#define NL_PACKET(name, number, fields) struct name { fields };
844#define NL_INTEGER(pn,pr,member) int member;
845#define NL_INT64(pn,pr,member) __u64 member;
846#define NL_BIT(pn,pr,member) unsigned member:1;
847#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
848#include "linux/drbd_nl.h"
849
850struct drbd_backing_dev {
851 struct block_device *backing_bdev;
852 struct block_device *md_bdev;
853 struct file *lo_file;
854 struct file *md_file;
855 struct drbd_md md;
856 struct disk_conf dc; /* The user provided config... */
857 sector_t known_size; /* last known size of that backing device */
858};
859
860struct drbd_md_io {
861 struct drbd_conf *mdev;
862 struct completion event;
863 int error;
864};
865
866struct bm_io_work {
867 struct drbd_work w;
868 char *why;
869 int (*io_fn)(struct drbd_conf *mdev);
870 void (*done)(struct drbd_conf *mdev, int rv);
871};
872
/* Write ordering methods (see drbd_conf.write_ordering). */
enum write_ordering_e {
	WO_none,	/* == 0 */
	WO_drain_io,
	WO_bdev_flush,
	WO_bio_barrier,
};
879
880struct drbd_conf {
881 /* things that are stored as / read from meta data on disk */
882 unsigned long flags;
883
884 /* configured by drbdsetup */
885 struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
886 struct syncer_conf sync_conf;
887 struct drbd_backing_dev *ldev __protected_by(local);
888
889 sector_t p_size; /* partner's disk size */
890 struct request_queue *rq_queue;
891 struct block_device *this_bdev;
892 struct gendisk *vdisk;
893
894 struct drbd_socket data; /* data/barrier/cstate/parameter packets */
895 struct drbd_socket meta; /* ping/ack (metadata) packets */
896 int agreed_pro_version; /* actually used protocol version */
897 unsigned long last_received; /* in jiffies, either socket */
898 unsigned int ko_count;
899 struct drbd_work resync_work,
900 unplug_work,
901 md_sync_work;
902 struct timer_list resync_timer;
903 struct timer_list md_sync_timer;
904
905 /* Used after attach while negotiating new disk state. */
906 union drbd_state new_state_tmp;
907
908 union drbd_state state;
909 wait_queue_head_t misc_wait;
910 wait_queue_head_t state_wait; /* upon each state change. */
911 unsigned int send_cnt;
912 unsigned int recv_cnt;
913 unsigned int read_cnt;
914 unsigned int writ_cnt;
915 unsigned int al_writ_cnt;
916 unsigned int bm_writ_cnt;
917 atomic_t ap_bio_cnt; /* Requests we need to complete */
918 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
919 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
920 atomic_t unacked_cnt; /* Need to send replys for */
921 atomic_t local_cnt; /* Waiting for local completion */
922 atomic_t net_cnt; /* Users of net_conf */
923 spinlock_t req_lock;
924 struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
925 struct drbd_tl_epoch *newest_tle;
926 struct drbd_tl_epoch *oldest_tle;
927 struct list_head out_of_sequence_requests;
928 struct hlist_head *tl_hash;
929 unsigned int tl_hash_s;
930
931 /* blocks to sync in this run [unit BM_BLOCK_SIZE] */
932 unsigned long rs_total;
933 /* number of sync IOs that failed in this run */
934 unsigned long rs_failed;
935 /* Syncer's start time [unit jiffies] */
936 unsigned long rs_start;
937 /* cumulated time in PausedSyncX state [unit jiffies] */
938 unsigned long rs_paused;
939 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
940 unsigned long rs_mark_left;
941 /* marks's time [unit jiffies] */
942 unsigned long rs_mark_time;
943 /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
944 unsigned long rs_same_csum;
945
946 /* where does the admin want us to start? (sector) */
947 sector_t ov_start_sector;
948 /* where are we now? (sector) */
949 sector_t ov_position;
950 /* Start sector of out of sync range (to merge printk reporting). */
951 sector_t ov_last_oos_start;
952 /* size of out-of-sync range in sectors. */
953 sector_t ov_last_oos_size;
954 unsigned long ov_left; /* in bits */
955 struct crypto_hash *csums_tfm;
956 struct crypto_hash *verify_tfm;
957
958 struct drbd_thread receiver;
959 struct drbd_thread worker;
960 struct drbd_thread asender;
961 struct drbd_bitmap *bitmap;
962 unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
963
964 /* Used to track operations of resync... */
965 struct lru_cache *resync;
966 /* Number of locked elements in resync LRU */
967 unsigned int resync_locked;
968 /* resync extent number waiting for application requests */
969 unsigned int resync_wenr;
970
971 int open_cnt;
972 u64 *p_uuid;
973 struct drbd_epoch *current_epoch;
974 spinlock_t epoch_lock;
975 unsigned int epochs;
976 enum write_ordering_e write_ordering;
977 struct list_head active_ee; /* IO in progress */
978 struct list_head sync_ee; /* IO in progress */
979 struct list_head done_ee; /* send ack */
980 struct list_head read_ee; /* IO in progress */
981 struct list_head net_ee; /* zero-copy network send in progress */
982 struct hlist_head *ee_hash; /* is proteced by req_lock! */
983 unsigned int ee_hash_s;
984
985 /* this one is protected by ee_lock, single thread */
986 struct drbd_epoch_entry *last_write_w_barrier;
987
988 int next_barrier_nr;
989 struct hlist_head *app_reads_hash; /* is proteced by req_lock */
990 struct list_head resync_reads;
991 atomic_t pp_in_use;
992 wait_queue_head_t ee_wait;
993 struct page *md_io_page; /* one page buffer for md_io */
994 struct page *md_io_tmpp; /* for logical_block_size != 512 */
995 struct mutex md_io_mutex; /* protects the md_io_buffer */
996 spinlock_t al_lock;
997 wait_queue_head_t al_wait;
998 struct lru_cache *act_log; /* activity log */
999 unsigned int al_tr_number;
1000 int al_tr_cycle;
1001 int al_tr_pos; /* position of the next transaction in the journal */
1002 struct crypto_hash *cram_hmac_tfm;
1003 struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
1004 struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
1005 void *int_dig_out;
1006 void *int_dig_in;
1007 void *int_dig_vv;
1008 wait_queue_head_t seq_wait;
1009 atomic_t packet_seq;
1010 unsigned int peer_seq;
1011 spinlock_t peer_seq_lock;
1012 unsigned int minor;
1013 unsigned long comm_bm_set; /* communicated number of set bits. */
1014 cpumask_var_t cpu_mask;
1015 struct bm_io_work bm_io_work;
1016 u64 ed_uuid; /* UUID of the exposed data */
1017 struct mutex state_mutex;
1018 char congestion_reason; /* Why we where congested... */
1019};
1020
1021static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
1022{
1023 struct drbd_conf *mdev;
1024
1025 mdev = minor < minor_count ? minor_table[minor] : NULL;
1026
1027 return mdev;
1028}
1029
1030static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
1031{
1032 return mdev->minor;
1033}
1034
1035/* returns 1 if it was successfull,
1036 * returns 0 if there was no data socket.
1037 * so wherever you are going to use the data.socket, e.g. do
1038 * if (!drbd_get_data_sock(mdev))
1039 * return 0;
1040 * CODE();
1041 * drbd_put_data_sock(mdev);
1042 */
1043static inline int drbd_get_data_sock(struct drbd_conf *mdev)
1044{
1045 mutex_lock(&mdev->data.mutex);
1046 /* drbd_disconnect() could have called drbd_free_sock()
1047 * while we were waiting in down()... */
1048 if (unlikely(mdev->data.socket == NULL)) {
1049 mutex_unlock(&mdev->data.mutex);
1050 return 0;
1051 }
1052 return 1;
1053}
1054
1055static inline void drbd_put_data_sock(struct drbd_conf *mdev)
1056{
1057 mutex_unlock(&mdev->data.mutex);
1058}
1059
1060/*
1061 * function declarations
1062 *************************/
1063
1064/* drbd_main.c */
1065
/* Bit flags modifying how a state change is carried out. */
enum chg_state_flags {
	CS_HARD          = 1 << 0,
	CS_VERBOSE       = 1 << 1,
	CS_WAIT_COMPLETE = 1 << 2,
	CS_SERIALIZE     = 1 << 3,
	/* both of the above at once */
	CS_ORDERED       = CS_WAIT_COMPLETE | CS_SERIALIZE,
};
1073
1074extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1075extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
1076 union drbd_state mask, union drbd_state val);
1077extern void drbd_force_state(struct drbd_conf *, union drbd_state,
1078 union drbd_state);
1079extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
1080 union drbd_state, enum chg_state_flags);
1081extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
1082 enum chg_state_flags, struct completion *done);
1083extern void print_st_err(struct drbd_conf *, union drbd_state,
1084 union drbd_state, int);
1085extern int drbd_thread_start(struct drbd_thread *thi);
1086extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
1087#ifdef CONFIG_SMP
1088extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
1089extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
1090#else
1091#define drbd_thread_current_set_cpu(A) ({})
1092#define drbd_calc_cpu_mask(A) ({})
1093#endif
1094extern void drbd_free_resources(struct drbd_conf *mdev);
1095extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
1096 unsigned int set_size);
1097extern void tl_clear(struct drbd_conf *mdev);
1098extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
1099extern void drbd_free_sock(struct drbd_conf *mdev);
1100extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1101 void *buf, size_t size, unsigned msg_flags);
1102extern int drbd_send_protocol(struct drbd_conf *mdev);
1103extern int drbd_send_uuids(struct drbd_conf *mdev);
1104extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1105extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
1106extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
1107extern int _drbd_send_state(struct drbd_conf *mdev);
1108extern int drbd_send_state(struct drbd_conf *mdev);
1109extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1110 enum drbd_packets cmd, struct p_header *h,
1111 size_t size, unsigned msg_flags);
1112#define USE_DATA_SOCKET 1
1113#define USE_META_SOCKET 0
1114extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1115 enum drbd_packets cmd, struct p_header *h,
1116 size_t size);
1117extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
1118 char *data, size_t size);
1119extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
1120extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
1121 u32 set_size);
1122extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
1123 struct drbd_epoch_entry *e);
1124extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
1125 struct p_block_req *rp);
1126extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1127 struct p_data *dp);
1128extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1129 sector_t sector, int blksize, u64 block_id);
1130extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
1131 struct drbd_epoch_entry *e);
1132extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
1133extern int _drbd_send_barrier(struct drbd_conf *mdev,
1134 struct drbd_tl_epoch *barrier);
1135extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1136 sector_t sector, int size, u64 block_id);
1137extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
1138 sector_t sector,int size,
1139 void *digest, int digest_size,
1140 enum drbd_packets cmd);
1141extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
1142
1143extern int drbd_send_bitmap(struct drbd_conf *mdev);
1144extern int _drbd_send_bitmap(struct drbd_conf *mdev);
1145extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
1146extern void drbd_free_bc(struct drbd_backing_dev *ldev);
1147extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1148
1149/* drbd_meta-data.c (still in drbd_main.c) */
1150extern void drbd_md_sync(struct drbd_conf *mdev);
1151extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1152/* maybe define them below as inline? */
1153extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1154extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1155extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1156extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
1157extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
1158extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
1159extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
1160extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
1161extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
1162extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1163 int (*io_fn)(struct drbd_conf *),
1164 void (*done)(struct drbd_conf *, int),
1165 char *why);
1166extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1167extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1168extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1169
1170
1171/* Meta data layout
1172 We reserve a 128MB Block (4k aligned)
1173 * either at the end of the backing device
1174 * or on a separate meta data device. */
1175
1176#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
1177/* The following numbers are sectors */
1178#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
1179#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
1180/* Allows up to about 3.8TB */
1181#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
1182
1183/* Since the smallest IO unit is usually 512 bytes */
1184#define MD_SECTOR_SHIFT 9
1185#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
1186
1187/* activity log */
1188#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
1189#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
1190#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
1191
1192#if BITS_PER_LONG == 32
1193#define LN2_BPL 5
1194#define cpu_to_lel(A) cpu_to_le32(A)
1195#define lel_to_cpu(A) le32_to_cpu(A)
1196#elif BITS_PER_LONG == 64
1197#define LN2_BPL 6
1198#define cpu_to_lel(A) cpu_to_le64(A)
1199#define lel_to_cpu(A) le64_to_cpu(A)
1200#else
1201#error "LN2 of BITS_PER_LONG unknown!"
1202#endif
1203
1204/* resync bitmap */
1205/* 16MB sized 'bitmap extent' to track syncer usage */
1206struct bm_extent {
1207 int rs_left; /* number of bits set (out of sync) in this extent. */
1208 int rs_failed; /* number of failed resync requests in this extent. */
1209 unsigned long flags;
1210 struct lc_element lce;
1211};
1212
1213#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
1214#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
1215
1216/* drbd_bitmap.c */
1217/*
1218 * We need to store one bit for a block.
1219 * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
1220 * Bit 0 ==> local node thinks this block is binary identical on both nodes
1221 * Bit 1 ==> local node thinks this block needs to be synced.
1222 */
1223
1224#define BM_BLOCK_SHIFT 12 /* 4k per bit */
1225#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
1226/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
1227 * per sector of on disk bitmap */
1228#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
1229#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
1230
1231#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
1232#error "HAVE YOU FIXED drbdmeta AS WELL??"
1233#endif
1234
1235/* thus many _storage_ sectors are described by one bit */
1236#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
1237#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
1238#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
1239
1240/* bit to represented kilo byte conversion */
1241#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
1242
1243/* in which _bitmap_ extent (resp. sector) the bit for a certain
1244 * _storage_ sector is located in */
1245#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
1246
1247/* how much _storage_ sectors we have per bitmap sector */
1248#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
1249#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
1250
1251/* in one sector of the bitmap, we have this many activity_log extents. */
1252#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1253#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
1254
1255#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1256#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
1257
1258/* the extent in "PER_EXTENT" below is an activity log extent
1259 * we need that many (long words/bytes) to store the bitmap
1260 * of one AL_EXTENT_SIZE chunk of storage.
1261 * we can store the bitmap for that many AL_EXTENTS within
1262 * one sector of the _on_disk_ bitmap:
1263 * bit 0 bit 37 bit 38 bit (512*8)-1
1264 * ...|........|........|.. // ..|........|
1265 * sect. 0 `296 `304 ^(512*8*8)-1
1266 *
1267#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
1268#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
1269#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
1270 */
1271
1272#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
1273#define DRBD_MAX_SECTORS_BM \
1274 ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
1275#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
1276#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
1277#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
1278#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
1279#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
1280#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
1281#else
1282#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
1283/* 16 TB in units of sectors */
1284#if BITS_PER_LONG == 32
1285/* adjust by one page worth of bitmap,
1286 * so we won't wrap around in drbd_bm_find_next_bit.
1287 * you should use 64bit OS for that much storage, anyways. */
1288#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
1289#else
1290#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
1291#endif
1292#endif
1293
1294/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
1295 * With a value of 6 all IO in one 32K block make it to the same slot of the
1296 * hash table. */
1297#define HT_SHIFT 6
1298#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
1299
1300/* Number of elements in the app_reads_hash */
1301#define APP_R_HSIZE 15
1302
1303extern int drbd_bm_init(struct drbd_conf *mdev);
1304extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
1305extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1306extern void drbd_bm_set_all(struct drbd_conf *mdev);
1307extern void drbd_bm_clear_all(struct drbd_conf *mdev);
1308extern int drbd_bm_set_bits(
1309 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1310extern int drbd_bm_clear_bits(
1311 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1312/* bm_set_bits variant for use while holding drbd_bm_lock */
1313extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
1314 const unsigned long s, const unsigned long e);
1315extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
1316extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1317extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
1318extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1319extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1320extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1321 unsigned long al_enr);
1322extern size_t drbd_bm_words(struct drbd_conf *mdev);
1323extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
1324extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
1325extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1326/* bm_find_next variants for use while you hold drbd_bm_lock() */
1327extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1328extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
1329extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
1330extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1331/* for receive_bitmap */
1332extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
1333 size_t number, unsigned long *buffer);
1334/* for _drbd_send_bitmap and drbd_bm_write_sect */
1335extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
1336 size_t number, unsigned long *buffer);
1337
1338extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
1339extern void drbd_bm_unlock(struct drbd_conf *mdev);
1340
1341extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1342/* drbd_main.c */
1343
1344extern struct kmem_cache *drbd_request_cache;
1345extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
1346extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
1347extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
1348extern mempool_t *drbd_request_mempool;
1349extern mempool_t *drbd_ee_mempool;
1350
1351extern struct page *drbd_pp_pool; /* drbd's page pool */
1352extern spinlock_t drbd_pp_lock;
1353extern int drbd_pp_vacant;
1354extern wait_queue_head_t drbd_pp_wait;
1355
1356extern rwlock_t global_state_lock;
1357
1358extern struct drbd_conf *drbd_new_device(unsigned int minor);
1359extern void drbd_free_mdev(struct drbd_conf *mdev);
1360
1361extern int proc_details;
1362
1363/* drbd_req */
1364extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
1365extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
1366extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
1367extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1368
1369
1370/* drbd_nl.c */
1371extern void drbd_suspend_io(struct drbd_conf *mdev);
1372extern void drbd_resume_io(struct drbd_conf *mdev);
1373extern char *ppsize(char *buf, unsigned long long size);
1374extern sector_t drbd_new_dev_size(struct drbd_conf *,
1375 struct drbd_backing_dev *);
1376enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1377extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
1378extern void resync_after_online_grow(struct drbd_conf *);
1379extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1380extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
1381 int force);
1382enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1383extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
1384
1385/* drbd_worker.c */
1386extern int drbd_worker(struct drbd_thread *thi);
1387extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
1388extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
1389extern void resume_next_sg(struct drbd_conf *mdev);
1390extern void suspend_other_sg(struct drbd_conf *mdev);
1391extern int drbd_resync_finished(struct drbd_conf *mdev);
1392/* maybe rather drbd_main.c ? */
1393extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1394 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1395extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1396
1397static inline void ov_oos_print(struct drbd_conf *mdev)
1398{
1399 if (mdev->ov_last_oos_size) {
1400 dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
1401 (unsigned long long)mdev->ov_last_oos_start,
1402 (unsigned long)mdev->ov_last_oos_size);
1403 }
1404 mdev->ov_last_oos_size=0;
1405}
1406
1407
1408extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
1409/* worker callbacks */
1410extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
1411extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
1412extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
1413extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
1414extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
1415extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
1416extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
1417extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
1418extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
1419extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
1420extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
1421extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
1422extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
1423extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
1424extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
1425extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
1426extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
1427extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1428
1429extern void resync_timer_fn(unsigned long data);
1430
1431/* drbd_receiver.c */
1432extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
1433extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1434 u64 id,
1435 sector_t sector,
1436 unsigned int data_size,
1437 gfp_t gfp_mask) __must_hold(local);
1438extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
1439extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1440 struct list_head *head);
1441extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1442 struct list_head *head);
1443extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
1444extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
1445extern void drbd_flush_workqueue(struct drbd_conf *mdev);
1446
1447/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
1448 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
1449static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
1450 char __user *optval, int optlen)
1451{
1452 int err;
1453 if (level == SOL_SOCKET)
1454 err = sock_setsockopt(sock, level, optname, optval, optlen);
1455 else
1456 err = sock->ops->setsockopt(sock, level, optname, optval,
1457 optlen);
1458 return err;
1459}
1460
1461static inline void drbd_tcp_cork(struct socket *sock)
1462{
1463 int __user val = 1;
1464 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1465 (char __user *)&val, sizeof(val));
1466}
1467
1468static inline void drbd_tcp_uncork(struct socket *sock)
1469{
1470 int __user val = 0;
1471 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
1472 (char __user *)&val, sizeof(val));
1473}
1474
1475static inline void drbd_tcp_nodelay(struct socket *sock)
1476{
1477 int __user val = 1;
1478 (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
1479 (char __user *)&val, sizeof(val));
1480}
1481
1482static inline void drbd_tcp_quickack(struct socket *sock)
1483{
1484 int __user val = 1;
1485 (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
1486 (char __user *)&val, sizeof(val));
1487}
1488
1489void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
1490
1491/* drbd_proc.c */
1492extern struct proc_dir_entry *drbd_proc;
1493extern struct file_operations drbd_proc_fops;
1494extern const char *drbd_conn_str(enum drbd_conns s);
1495extern const char *drbd_role_str(enum drbd_role s);
1496
1497/* drbd_actlog.c */
1498extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
1499extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
1500extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
1501extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1502extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
1503extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
1504extern int drbd_rs_del_all(struct drbd_conf *mdev);
1505extern void drbd_rs_failed_io(struct drbd_conf *mdev,
1506 sector_t sector, int size);
1507extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
1508extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
1509 int size, const char *file, const unsigned int line);
1510#define drbd_set_in_sync(mdev, sector, size) \
1511 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
1512extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1513 int size, const char *file, const unsigned int line);
1514#define drbd_set_out_of_sync(mdev, sector, size) \
1515 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1516extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
1517extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
1518extern void drbd_al_shrink(struct drbd_conf *mdev);
1519
1520
1521/* drbd_nl.c */
1522
1523void drbd_nl_cleanup(void);
1524int __init drbd_nl_init(void);
1525void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
1526void drbd_bcast_sync_progress(struct drbd_conf *mdev);
1527void drbd_bcast_ee(struct drbd_conf *mdev,
1528 const char *reason, const int dgs,
1529 const char* seen_hash, const char* calc_hash,
1530 const struct drbd_epoch_entry* e);
1531
1532
1533/**
1534 * DOC: DRBD State macros
1535 *
1536 * These macros are used to express state changes in easily readable form.
1537 *
1538 * The NS macros expand to a mask and a value, that can be bit ored onto the
1539 * current state as soon as the spinlock (req_lock) was taken.
1540 *
1541 * The _NS macros are used for state functions that get called with the
1542 * spinlock. These macros expand directly to the new state value.
1543 *
1544 * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
1545 * to express state changes that affect more than one aspect of the state.
1546 *
1547 * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
1548 * Means that the network connection was established and that the peer
1549 * is in secondary role.
1550 */
1551#define role_MASK R_MASK
1552#define peer_MASK R_MASK
1553#define disk_MASK D_MASK
1554#define pdsk_MASK D_MASK
1555#define conn_MASK C_MASK
1556#define susp_MASK 1
1557#define user_isp_MASK 1
1558#define aftr_isp_MASK 1
1559
1560#define NS(T, S) \
1561 ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
1562 ({ union drbd_state val; val.i = 0; val.T = (S); val; })
1563#define NS2(T1, S1, T2, S2) \
1564 ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
1565 mask.T2 = T2##_MASK; mask; }), \
1566 ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
1567 val.T2 = (S2); val; })
1568#define NS3(T1, S1, T2, S2, T3, S3) \
1569 ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
1570 mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
1571 ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
1572 val.T2 = (S2); val.T3 = (S3); val; })
1573
/*
 * _NS(), _NS2(), _NS3(): expand to TWO function arguments, the device D
 * followed by a union drbd_state that is a copy of D's current state with
 * one, two or three fields (T*) replaced by the given values (S*).
 *
 * D is parenthesized where it is dereferenced so that an expression
 * argument (e.g. "devs + i") reads the state of the intended device.
 * Note that D is evaluated twice; do not pass an argument with side effects.
 */
#define _NS(D, T, S) \
	D, ({ union drbd_state __ns; __ns.i = (D)->state.i; __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
	D, ({ union drbd_state __ns; __ns.i = (D)->state.i; __ns.T1 = (S1); \
	__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
	D, ({ union drbd_state __ns; __ns.i = (D)->state.i; __ns.T1 = (S1); \
	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
1582
1583/*
1584 * inline helper functions
1585 *************************/
1586
1587static inline void drbd_state_lock(struct drbd_conf *mdev)
1588{
1589 wait_event(mdev->misc_wait,
1590 !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
1591}
1592
1593static inline void drbd_state_unlock(struct drbd_conf *mdev)
1594{
1595 clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
1596 wake_up(&mdev->misc_wait);
1597}
1598
1599static inline int _drbd_set_state(struct drbd_conf *mdev,
1600 union drbd_state ns, enum chg_state_flags flags,
1601 struct completion *done)
1602{
1603 int rv;
1604
1605 read_lock(&global_state_lock);
1606 rv = __drbd_set_state(mdev, ns, flags, done);
1607 read_unlock(&global_state_lock);
1608
1609 return rv;
1610}
1611
1612/**
1613 * drbd_request_state() - Reqest a state change
1614 * @mdev: DRBD device.
1615 * @mask: mask of state bits to change.
1616 * @val: value of new state bits.
1617 *
1618 * This is the most graceful way of requesting a state change. It is verbose
1619 * quite verbose in case the state change is not possible, and all those
1620 * state changes are globally serialized.
1621 */
1622static inline int drbd_request_state(struct drbd_conf *mdev,
1623 union drbd_state mask,
1624 union drbd_state val)
1625{
1626 return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
1627}
1628
1629#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
1630static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
1631{
1632 switch (mdev->ldev->dc.on_io_error) {
1633 case EP_PASS_ON:
1634 if (!forcedetach) {
1635 if (printk_ratelimit())
1636 dev_err(DEV, "Local IO failed in %s."
1637 "Passing error on...\n", where);
1638 break;
1639 }
1640 /* NOTE fall through to detach case if forcedetach set */
1641 case EP_DETACH:
1642 case EP_CALL_HELPER:
1643 if (mdev->state.disk > D_FAILED) {
1644 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1645 dev_err(DEV, "Local IO failed in %s."
1646 "Detaching...\n", where);
1647 }
1648 break;
1649 }
1650}
1651
1652/**
1653 * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
1654 * @mdev: DRBD device.
1655 * @error: Error code passed to the IO completion callback
1656 * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
1657 *
1658 * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
1659 */
1660#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
1661static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
1662 int error, int forcedetach, const char *where)
1663{
1664 if (error) {
1665 unsigned long flags;
1666 spin_lock_irqsave(&mdev->req_lock, flags);
1667 __drbd_chk_io_error_(mdev, forcedetach, where);
1668 spin_unlock_irqrestore(&mdev->req_lock, flags);
1669 }
1670}
1671
1672
1673/**
1674 * drbd_md_first_sector() - Returns the first sector number of the meta data area
1675 * @bdev: Meta data block device.
1676 *
1677 * BTW, for internal meta data, this happens to be the maximum capacity
1678 * we could agree upon with our peer node.
1679 */
1680static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1681{
1682 switch (bdev->dc.meta_dev_idx) {
1683 case DRBD_MD_INDEX_INTERNAL:
1684 case DRBD_MD_INDEX_FLEX_INT:
1685 return bdev->md.md_offset + bdev->md.bm_offset;
1686 case DRBD_MD_INDEX_FLEX_EXT:
1687 default:
1688 return bdev->md.md_offset;
1689 }
1690}
1691
1692/**
1693 * drbd_md_last_sector() - Return the last sector number of the meta data area
1694 * @bdev: Meta data block device.
1695 */
1696static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1697{
1698 switch (bdev->dc.meta_dev_idx) {
1699 case DRBD_MD_INDEX_INTERNAL:
1700 case DRBD_MD_INDEX_FLEX_INT:
1701 return bdev->md.md_offset + MD_AL_OFFSET - 1;
1702 case DRBD_MD_INDEX_FLEX_EXT:
1703 default:
1704 return bdev->md.md_offset + bdev->md.md_size_sect;
1705 }
1706}
1707
1708/* Returns the number of 512 byte sectors of the device */
1709static inline sector_t drbd_get_capacity(struct block_device *bdev)
1710{
1711 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
1712 return bdev ? bdev->bd_inode->i_size >> 9 : 0;
1713}
1714
1715/**
1716 * drbd_get_max_capacity() - Returns the capacity we announce to out peer
1717 * @bdev: Meta data block device.
1718 *
1719 * returns the capacity we announce to out peer. we clip ourselves at the
1720 * various MAX_SECTORS, because if we don't, current implementation will
1721 * oops sooner or later
1722 */
1723static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1724{
1725 sector_t s;
1726 switch (bdev->dc.meta_dev_idx) {
1727 case DRBD_MD_INDEX_INTERNAL:
1728 case DRBD_MD_INDEX_FLEX_INT:
1729 s = drbd_get_capacity(bdev->backing_bdev)
1730 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1731 drbd_md_first_sector(bdev))
1732 : 0;
1733 break;
1734 case DRBD_MD_INDEX_FLEX_EXT:
1735 s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1736 drbd_get_capacity(bdev->backing_bdev));
1737 /* clip at maximum size the meta device can support */
1738 s = min_t(sector_t, s,
1739 BM_EXT_TO_SECT(bdev->md.md_size_sect
1740 - bdev->md.bm_offset));
1741 break;
1742 default:
1743 s = min_t(sector_t, DRBD_MAX_SECTORS,
1744 drbd_get_capacity(bdev->backing_bdev));
1745 }
1746 return s;
1747}
1748
1749/**
1750 * drbd_md_ss__() - Return the sector number of our meta data super block
1751 * @mdev: DRBD device.
1752 * @bdev: Meta data block device.
1753 */
1754static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
1755 struct drbd_backing_dev *bdev)
1756{
1757 switch (bdev->dc.meta_dev_idx) {
1758 default: /* external, some index */
1759 return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
1760 case DRBD_MD_INDEX_INTERNAL:
1761 /* with drbd08, internal meta data is always "flexible" */
1762 case DRBD_MD_INDEX_FLEX_INT:
1763 /* sizeof(struct md_on_disk_07) == 4k
1764 * position: last 4k aligned block of 4k size */
1765 if (!bdev->backing_bdev) {
1766 if (__ratelimit(&drbd_ratelimit_state)) {
1767 dev_err(DEV, "bdev->backing_bdev==NULL\n");
1768 dump_stack();
1769 }
1770 return 0;
1771 }
1772 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
1773 - MD_AL_OFFSET;
1774 case DRBD_MD_INDEX_FLEX_EXT:
1775 return 0;
1776 }
1777}
1778
1779static inline void
1780_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1781{
1782 list_add_tail(&w->list, &q->q);
1783 up(&q->s);
1784}
1785
1786static inline void
1787drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
1788{
1789 unsigned long flags;
1790 spin_lock_irqsave(&q->q_lock, flags);
1791 list_add(&w->list, &q->q);
1792 up(&q->s); /* within the spinlock,
1793 see comment near end of drbd_worker() */
1794 spin_unlock_irqrestore(&q->q_lock, flags);
1795}
1796
1797static inline void
1798drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1799{
1800 unsigned long flags;
1801 spin_lock_irqsave(&q->q_lock, flags);
1802 list_add_tail(&w->list, &q->q);
1803 up(&q->s); /* within the spinlock,
1804 see comment near end of drbd_worker() */
1805 spin_unlock_irqrestore(&q->q_lock, flags);
1806}
1807
1808static inline void wake_asender(struct drbd_conf *mdev)
1809{
1810 if (test_bit(SIGNAL_ASENDER, &mdev->flags))
1811 force_sig(DRBD_SIG, mdev->asender.task);
1812}
1813
1814static inline void request_ping(struct drbd_conf *mdev)
1815{
1816 set_bit(SEND_PING, &mdev->flags);
1817 wake_asender(mdev);
1818}
1819
1820static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
1821 enum drbd_packets cmd)
1822{
1823 struct p_header h;
1824 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
1825}
1826
1827static inline int drbd_send_ping(struct drbd_conf *mdev)
1828{
1829 struct p_header h;
1830 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
1831}
1832
1833static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1834{
1835 struct p_header h;
1836 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
1837}
1838
1839static inline void drbd_thread_stop(struct drbd_thread *thi)
1840{
1841 _drbd_thread_stop(thi, FALSE, TRUE);
1842}
1843
1844static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
1845{
1846 _drbd_thread_stop(thi, FALSE, FALSE);
1847}
1848
1849static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
1850{
1851 _drbd_thread_stop(thi, TRUE, FALSE);
1852}
1853
1854/* counts how many answer packets packets we expect from our peer,
1855 * for either explicit application requests,
1856 * or implicit barrier packets as necessary.
1857 * increased:
1858 * w_send_barrier
1859 * _req_mod(req, queue_for_net_write or queue_for_net_read);
1860 * it is much easier and equally valid to count what we queue for the
1861 * worker, even before it actually was queued or send.
1862 * (drbd_make_request_common; recovery path on read io-error)
1863 * decreased:
1864 * got_BarrierAck (respective tl_clear, tl_clear_barrier)
1865 * _req_mod(req, data_received)
1866 * [from receive_DataReply]
1867 * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
1868 * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
1869 * for some reason it is NOT decreased in got_NegAck,
1870 * but in the resulting cleanup code from report_params.
1871 * we should try to remember the reason for that...
1872 * _req_mod(req, send_failed or send_canceled)
1873 * _req_mod(req, connection_lost_while_pending)
1874 * [from tl_clear_barrier]
1875 */
1876static inline void inc_ap_pending(struct drbd_conf *mdev)
1877{
1878 atomic_inc(&mdev->ap_pending_cnt);
1879}
1880
/*
 * Log an error if the atomic counter mdev->which is below zero.
 *
 * Refers to a variable literally named "mdev" (and to DEV) at the point of
 * use.  The body is wrapped in do { } while (0) so the macro behaves like a
 * single statement: a bare "if" here would silently capture an "else" that
 * follows the macro invocation.
 */
#define ERR_IF_CNT_IS_NEGATIVE(which)					\
	do {								\
		if (atomic_read(&mdev->which) < 0)			\
			dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
				__func__ , __LINE__ ,			\
				atomic_read(&mdev->which));		\
	} while (0)
1886
/*
 * Drop one count from ap_pending_cnt; wake up misc_wait when it reaches
 * zero, and complain if it went negative.
 *
 * The argument is parenthesized so that any pointer expression may be
 * passed.  It is still evaluated more than once, and the final sanity check
 * (ERR_IF_CNT_IS_NEGATIVE) looks at a variable literally named "mdev" in
 * the caller, so callers should keep passing plain "mdev".
 */
#define dec_ap_pending(mdev) do {				\
	typecheck(struct drbd_conf *, (mdev));			\
	if (atomic_dec_and_test(&(mdev)->ap_pending_cnt))	\
		wake_up(&(mdev)->misc_wait);			\
	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
1892
1893/* counts how many resync-related answers we still expect from the peer
1894 * increase decrease
1895 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
1896 * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER)
1897 * (or P_NEG_ACK with ID_SYNCER)
1898 */
1899static inline void inc_rs_pending(struct drbd_conf *mdev)
1900{
1901 atomic_inc(&mdev->rs_pending_cnt);
1902}
1903
/*
 * Drop one count from rs_pending_cnt and complain if it went negative.
 *
 * The argument is parenthesized so that any pointer expression may be
 * passed.  The sanity check (ERR_IF_CNT_IS_NEGATIVE) still looks at a
 * variable literally named "mdev" in the caller.
 */
#define dec_rs_pending(mdev) do {				\
	typecheck(struct drbd_conf *, (mdev));			\
	atomic_dec(&(mdev)->rs_pending_cnt);			\
	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
1908
1909/* counts how many answers we still need to send to the peer.
1910 * increased on
1911 * receive_Data unless protocol A;
1912 * we need to send a P_RECV_ACK (proto B)
1913 * or P_WRITE_ACK (proto C)
1914 * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
1915 * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
1916 * receive_Barrier_* we need to send a P_BARRIER_ACK
1917 */
1918static inline void inc_unacked(struct drbd_conf *mdev)
1919{
1920 atomic_inc(&mdev->unacked_cnt);
1921}
1922
/*
 * Drop one count from unacked_cnt and complain if it went negative.
 *
 * The argument is parenthesized so that any pointer expression may be
 * passed.  The sanity check (ERR_IF_CNT_IS_NEGATIVE) still looks at a
 * variable literally named "mdev" in the caller.
 */
#define dec_unacked(mdev) do {					\
	typecheck(struct drbd_conf *, (mdev));			\
	atomic_dec(&(mdev)->unacked_cnt);			\
	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
1927
/*
 * Subtract @n from unacked_cnt and complain if it went negative.
 *
 * Both arguments are parenthesized so that arbitrary expressions may be
 * passed.  The sanity check (ERR_IF_CNT_IS_NEGATIVE) still looks at a
 * variable literally named "mdev" in the caller.
 */
#define sub_unacked(mdev, n) do {				\
	typecheck(struct drbd_conf *, (mdev));			\
	atomic_sub((n), &(mdev)->unacked_cnt);			\
	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
1932
1933
1934static inline void put_net_conf(struct drbd_conf *mdev)
1935{
1936 if (atomic_dec_and_test(&mdev->net_cnt))
1937 wake_up(&mdev->misc_wait);
1938}
1939
1940/**
1941 * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
1942 * @mdev: DRBD device.
1943 *
1944 * You have to call put_net_conf() when finished working with mdev->net_conf.
1945 */
1946static inline int get_net_conf(struct drbd_conf *mdev)
1947{
1948 int have_net_conf;
1949
1950 atomic_inc(&mdev->net_cnt);
1951 have_net_conf = mdev->state.conn >= C_UNCONNECTED;
1952 if (!have_net_conf)
1953 put_net_conf(mdev);
1954 return have_net_conf;
1955}
1956
/**
 * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
 * @M:		DRBD device.
 *
 * get_ldev_if_state() is the general form and takes the minimum disk state
 * @MINS that is required; get_ldev() requires at least D_INCONSISTENT.
 * Both are wrapped in __cond_lock() for the "local" context.
 *
 * You have to call put_ldev() when finished working with mdev->ldev.
 */
#define get_ldev_if_state(M, MINS) \
	__cond_lock(local, _get_ldev_if_state(M, MINS))
#define get_ldev(M) \
	get_ldev_if_state(M, D_INCONSISTENT)
1965
1966static inline void put_ldev(struct drbd_conf *mdev)
1967{
1968 __release(local);
1969 if (atomic_dec_and_test(&mdev->local_cnt))
1970 wake_up(&mdev->misc_wait);
1971 D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
1972}
1973
1974#ifndef __CHECKER__
1975static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
1976{
1977 int io_allowed;
1978
1979 atomic_inc(&mdev->local_cnt);
1980 io_allowed = (mdev->state.disk >= mins);
1981 if (!io_allowed)
1982 put_ldev(mdev);
1983 return io_allowed;
1984}
1985#else
1986extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
1987#endif
1988
1989/* you must have an "get_ldev" reference */
1990static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
1991 unsigned long *bits_left, unsigned int *per_mil_done)
1992{
1993 /*
1994 * this is to break it at compile time when we change that
1995 * (we may feel 4TB maximum storage per drbd is not enough)
1996 */
1997 typecheck(unsigned long, mdev->rs_total);
1998
1999 /* note: both rs_total and rs_left are in bits, i.e. in
2000 * units of BM_BLOCK_SIZE.
2001 * for the percentage, we don't care. */
2002
2003 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2004 /* >> 10 to prevent overflow,
2005 * +1 to prevent division by zero */
2006 if (*bits_left > mdev->rs_total) {
2007 /* doh. maybe a logic bug somewhere.
2008 * may also be just a race condition
2009 * between this and a disconnect during sync.
2010 * for now, just prevent in-kernel buffer overflow.
2011 */
2012 smp_rmb();
2013 dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
2014 drbd_conn_str(mdev->state.conn),
2015 *bits_left, mdev->rs_total, mdev->rs_failed);
2016 *per_mil_done = 0;
2017 } else {
2018 /* make sure the calculation happens in long context */
2019 unsigned long tmp = 1000UL -
2020 (*bits_left >> 10)*1000UL
2021 / ((mdev->rs_total >> 10) + 1UL);
2022 *per_mil_done = tmp;
2023 }
2024}
2025
2026
2027/* this throttles on-the-fly application requests
2028 * according to max_buffers settings;
2029 * maybe re-implement using semaphores? */
2030static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
2031{
2032 int mxb = 1000000; /* arbitrary limit on open requests */
2033 if (get_net_conf(mdev)) {
2034 mxb = mdev->net_conf->max_buffers;
2035 put_net_conf(mdev);
2036 }
2037 return mxb;
2038}
2039
2040static inline int drbd_state_is_stable(union drbd_state s)
2041{
2042
2043 /* DO NOT add a default clause, we want the compiler to warn us
2044 * for any newly introduced state we may have forgotten to add here */
2045
2046 switch ((enum drbd_conns)s.conn) {
2047 /* new io only accepted when there is no connection, ... */
2048 case C_STANDALONE:
2049 case C_WF_CONNECTION:
2050 /* ... or there is a well established connection. */
2051 case C_CONNECTED:
2052 case C_SYNC_SOURCE:
2053 case C_SYNC_TARGET:
2054 case C_VERIFY_S:
2055 case C_VERIFY_T:
2056 case C_PAUSED_SYNC_S:
2057 case C_PAUSED_SYNC_T:
2058 /* maybe stable, look at the disk state */
2059 break;
2060
2061 /* no new io accepted during tansitional states
2062 * like handshake or teardown */
2063 case C_DISCONNECTING:
2064 case C_UNCONNECTED:
2065 case C_TIMEOUT:
2066 case C_BROKEN_PIPE:
2067 case C_NETWORK_FAILURE:
2068 case C_PROTOCOL_ERROR:
2069 case C_TEAR_DOWN:
2070 case C_WF_REPORT_PARAMS:
2071 case C_STARTING_SYNC_S:
2072 case C_STARTING_SYNC_T:
2073 case C_WF_BITMAP_S:
2074 case C_WF_BITMAP_T:
2075 case C_WF_SYNC_UUID:
2076 case C_MASK:
2077 /* not "stable" */
2078 return 0;
2079 }
2080
2081 switch ((enum drbd_disk_state)s.disk) {
2082 case D_DISKLESS:
2083 case D_INCONSISTENT:
2084 case D_OUTDATED:
2085 case D_CONSISTENT:
2086 case D_UP_TO_DATE:
2087 /* disk state is stable as well. */
2088 break;
2089
2090 /* no new io accepted during tansitional states */
2091 case D_ATTACHING:
2092 case D_FAILED:
2093 case D_NEGOTIATING:
2094 case D_UNKNOWN:
2095 case D_MASK:
2096 /* not "stable" */
2097 return 0;
2098 }
2099
2100 return 1;
2101}
2102
2103static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2104{
2105 int mxb = drbd_get_max_buffers(mdev);
2106
2107 if (mdev->state.susp)
2108 return 0;
2109 if (test_bit(SUSPEND_IO, &mdev->flags))
2110 return 0;
2111
2112 /* to avoid potential deadlock or bitmap corruption,
2113 * in various places, we only allow new application io
2114 * to start during "stable" states. */
2115
2116 /* no new io accepted when attaching or detaching the disk */
2117 if (!drbd_state_is_stable(mdev->state))
2118 return 0;
2119
2120 /* since some older kernels don't have atomic_add_unless,
2121 * and we are within the spinlock anyways, we have this workaround. */
2122 if (atomic_read(&mdev->ap_bio_cnt) > mxb)
2123 return 0;
2124 if (test_bit(BITMAP_IO, &mdev->flags))
2125 return 0;
2126 return 1;
2127}
2128
2129/* I'd like to use wait_event_lock_irq,
2130 * but I'm not sure when it got introduced,
2131 * and not sure when it has 3 or 4 arguments */
2132static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2133{
2134 /* compare with after_state_ch,
2135 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
2136 DEFINE_WAIT(wait);
2137
2138 /* we wait here
2139 * as long as the device is suspended
2140 * until the bitmap is no longer on the fly during connection
2141 * handshake as long as we would exeed the max_buffer limit.
2142 *
2143 * to avoid races with the reconnect code,
2144 * we need to atomic_inc within the spinlock. */
2145
2146 spin_lock_irq(&mdev->req_lock);
2147 while (!__inc_ap_bio_cond(mdev)) {
2148 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
2149 spin_unlock_irq(&mdev->req_lock);
2150 schedule();
2151 finish_wait(&mdev->misc_wait, &wait);
2152 spin_lock_irq(&mdev->req_lock);
2153 }
2154 atomic_add(one_or_two, &mdev->ap_bio_cnt);
2155 spin_unlock_irq(&mdev->req_lock);
2156}
2157
2158static inline void dec_ap_bio(struct drbd_conf *mdev)
2159{
2160 int mxb = drbd_get_max_buffers(mdev);
2161 int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
2162
2163 D_ASSERT(ap_bio >= 0);
2164 /* this currently does wake_up for every dec_ap_bio!
2165 * maybe rather introduce some type of hysteresis?
2166 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
2167 if (ap_bio < mxb)
2168 wake_up(&mdev->misc_wait);
2169 if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
2170 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
2171 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
2172 }
2173}
2174
2175static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
2176{
2177 mdev->ed_uuid = val;
2178}
2179
/*
 * Compare two 32 bit sequence numbers that may have wrapped around.
 * Returns a value < 0, == 0 or > 0 if @a is before, equal to or after @b.
 *
 * The difference is computed in unsigned arithmetic, which is defined to
 * wrap, and only then interpreted as signed.  Subtracting two values that
 * were each converted to s32 first could overflow a signed integer, which
 * is undefined behaviour in C.
 */
static inline int seq_cmp(u32 a, u32 b)
{
	/* we assume wrap around at 32bit.
	 * for wrap around at 24bit (old atomic_t),
	 * we'd have to
	 *  a <<= 8; b <<= 8;
	 */
	return (s32)(a - b);
}
#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
/* CAUTION: please no side effects in arguments!
 * (seq_max evaluates each of them twice) */
#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
2195
2196static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
2197{
2198 unsigned int m;
2199 spin_lock(&mdev->peer_seq_lock);
2200 m = seq_max(mdev->peer_seq, new_seq);
2201 mdev->peer_seq = m;
2202 spin_unlock(&mdev->peer_seq_lock);
2203 if (m == new_seq)
2204 wake_up(&mdev->seq_wait);
2205}
2206
2207static inline void drbd_update_congested(struct drbd_conf *mdev)
2208{
2209 struct sock *sk = mdev->data.socket->sk;
2210 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
2211 set_bit(NET_CONGESTED, &mdev->flags);
2212}
2213
/* fallback in case the block layer headers do not provide it */
#ifndef QUEUE_ORDERED_NONE
#define QUEUE_ORDERED_NONE 0
#endif

/*
 * Always returns QUEUE_ORDERED_NONE; @mdev is not looked at.
 *
 * sorry, we currently have no working implementation
 * of distributed TCQ stuff
 */
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
	return QUEUE_ORDERED_NONE;
}
2223
2224static inline void drbd_blk_run_queue(struct request_queue *q)
2225{
2226 if (q && q->unplug_fn)
2227 q->unplug_fn(q);
2228}
2229
2230static inline void drbd_kick_lo(struct drbd_conf *mdev)
2231{
2232 if (get_ldev(mdev)) {
2233 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
2234 put_ldev(mdev);
2235 }
2236}
2237
2238static inline void drbd_md_flush(struct drbd_conf *mdev)
2239{
2240 int r;
2241
2242 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2243 return;
2244
2245 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
2246 if (r) {
2247 set_bit(MD_NO_BARRIER, &mdev->flags);
2248 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2249 }
2250}
2251
2252#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000000..157d1e4343c2
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3699 @@
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/version.h>
31#include <linux/drbd.h>
32#include <asm/uaccess.h>
33#include <asm/types.h>
34#include <net/sock.h>
35#include <linux/ctype.h>
36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/file.h>
39#include <linux/proc_fs.h>
40#include <linux/init.h>
41#include <linux/mm.h>
42#include <linux/memcontrol.h>
43#include <linux/mm_inline.h>
44#include <linux/slab.h>
45#include <linux/random.h>
46#include <linux/reboot.h>
47#include <linux/notifier.h>
48#include <linux/kthread.h>
49
50#define __KERNEL_SYSCALLS__
51#include <linux/unistd.h>
52#include <linux/vmalloc.h>
53
54#include <linux/drbd_limits.h>
55#include "drbd_int.h"
56#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
57
58#include "drbd_vli.h"
59
60struct after_state_chg_work {
61 struct drbd_work w;
62 union drbd_state os;
63 union drbd_state ns;
64 enum chg_state_flags flags;
65 struct completion *done;
66};
67
68int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81
82MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
83 "Lars Ellenberg <lars@linbit.com>");
84MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
85MODULE_VERSION(REL_VERSION);
86MODULE_LICENSE("GPL");
87MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
88MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
89
90#include <linux/moduleparam.h>
91/* allow_open_on_secondary */
92MODULE_PARM_DESC(allow_oos, "DONT USE!");
93/* thanks to these macros, if compiled into the kernel (not-module),
94 * this becomes the boot parameter drbd.minor_count */
95module_param(minor_count, uint, 0444);
96module_param(disable_sendpage, bool, 0644);
97module_param(allow_oos, bool, 0);
98module_param(cn_idx, uint, 0444);
99module_param(proc_details, int, 0644);
100
101#ifdef CONFIG_DRBD_FAULT_INJECTION
102int enable_faults;
103int fault_rate;
104static int fault_count;
105int fault_devs;
106/* bitmap of enabled faults */
107module_param(enable_faults, int, 0664);
108/* fault rate % value - applies to all enabled faults */
109module_param(fault_rate, int, 0664);
110/* count of faults inserted */
111module_param(fault_count, int, 0664);
112/* bitmap of devices to insert faults on */
113module_param(fault_devs, int, 0644);
114#endif
115
116/* module parameter, defined */
117unsigned int minor_count = 32;
118int disable_sendpage;
119int allow_oos;
120unsigned int cn_idx = CN_IDX_DRBD;
121int proc_details; /* Detail level in proc drbd*/
122
123/* Module parameter for setting the user mode helper program
124 * to run. Default is /sbin/drbdadm */
125char usermode_helper[80] = "/sbin/drbdadm";
126
127module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
128
129/* in 2.6.x, our device mapping and config info contains our virtual gendisks
130 * as member "struct gendisk *vdisk;"
131 */
132struct drbd_conf **minor_table;
133
134struct kmem_cache *drbd_request_cache;
135struct kmem_cache *drbd_ee_cache; /* epoch entries */
136struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
137struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
138mempool_t *drbd_request_mempool;
139mempool_t *drbd_ee_mempool;
140
141/* I do not use a standard mempool, because:
142 1) I want to hand out the pre-allocated objects first.
143 2) I want to be able to interrupt sleeping allocation with a signal.
144 Note: This is a single linked list, the next pointer is the private
145 member of struct page.
146 */
147struct page *drbd_pp_pool;
148spinlock_t drbd_pp_lock;
149int drbd_pp_vacant;
150wait_queue_head_t drbd_pp_wait;
151
152DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
153
154static struct block_device_operations drbd_ops = {
155 .owner = THIS_MODULE,
156 .open = drbd_open,
157 .release = drbd_release,
158};
159
/*
 * Number of elements of array @A.  Only valid for real arrays, not for
 * pointers.  The argument is parenthesized so that it may be an expression.
 */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
161
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function sparse works.

   Takes a reference on the local disk and returns 1 if the disk state is
   at least @mins; otherwise drops the reference again (waking misc_wait if
   the count reached zero) and returns 0.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);
	if (mdev->state.disk >= mins)
		return 1;

	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);
	return 0;
}

#endif
180
181/**
182 * DOC: The transfer log
183 *
184 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
185 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
186 * of the list. There is always at least one &struct drbd_tl_epoch object.
187 *
188 * Each &struct drbd_tl_epoch has a circular double linked list of requests
189 * attached.
190 */
191static int tl_init(struct drbd_conf *mdev)
192{
193 struct drbd_tl_epoch *b;
194
195 /* during device minor initialization, we may well use GFP_KERNEL */
196 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
197 if (!b)
198 return 0;
199 INIT_LIST_HEAD(&b->requests);
200 INIT_LIST_HEAD(&b->w.list);
201 b->next = NULL;
202 b->br_number = 4711;
203 b->n_req = 0;
204 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
205
206 mdev->oldest_tle = b;
207 mdev->newest_tle = b;
208 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
209
210 mdev->tl_hash = NULL;
211 mdev->tl_hash_s = 0;
212
213 return 1;
214}
215
216static void tl_cleanup(struct drbd_conf *mdev)
217{
218 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
219 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
220 kfree(mdev->oldest_tle);
221 mdev->oldest_tle = NULL;
222 kfree(mdev->unused_spare_tle);
223 mdev->unused_spare_tle = NULL;
224 kfree(mdev->tl_hash);
225 mdev->tl_hash = NULL;
226 mdev->tl_hash_s = 0;
227}
228
229/**
230 * _tl_add_barrier() - Adds a barrier to the transfer log
231 * @mdev: DRBD device.
232 * @new: Barrier to be added before the current head of the TL.
233 *
234 * The caller must hold the req_lock.
235 */
236void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
237{
238 struct drbd_tl_epoch *newest_before;
239
240 INIT_LIST_HEAD(&new->requests);
241 INIT_LIST_HEAD(&new->w.list);
242 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
243 new->next = NULL;
244 new->n_req = 0;
245
246 newest_before = mdev->newest_tle;
247 /* never send a barrier number == 0, because that is special-cased
248 * when using TCQ for our write ordering code */
249 new->br_number = (newest_before->br_number+1) ?: 1;
250 if (mdev->newest_tle != new) {
251 mdev->newest_tle->next = new;
252 mdev->newest_tle = new;
253 }
254}
255
256/**
257 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
258 * @mdev: DRBD device.
259 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
260 * @set_size: Expected number of requests before that barrier.
261 *
262 * In case the passed barrier_nr or set_size does not match the oldest
263 * &struct drbd_tl_epoch objects this function will cause a termination
264 * of the connection.
265 */
266void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
267 unsigned int set_size)
268{
269 struct drbd_tl_epoch *b, *nob; /* next old barrier */
270 struct list_head *le, *tle;
271 struct drbd_request *r;
272
273 spin_lock_irq(&mdev->req_lock);
274
275 b = mdev->oldest_tle;
276
277 /* first some paranoia code */
278 if (b == NULL) {
279 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
280 barrier_nr);
281 goto bail;
282 }
283 if (b->br_number != barrier_nr) {
284 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
285 barrier_nr, b->br_number);
286 goto bail;
287 }
288 if (b->n_req != set_size) {
289 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
290 barrier_nr, set_size, b->n_req);
291 goto bail;
292 }
293
294 /* Clean up list of requests processed during current epoch */
295 list_for_each_safe(le, tle, &b->requests) {
296 r = list_entry(le, struct drbd_request, tl_requests);
297 _req_mod(r, barrier_acked);
298 }
299 /* There could be requests on the list waiting for completion
300 of the write to the local disk. To avoid corruptions of
301 slab's data structures we have to remove the lists head.
302
303 Also there could have been a barrier ack out of sequence, overtaking
304 the write acks - which would be a bug and violating write ordering.
305 To not deadlock in case we lose connection while such requests are
306 still pending, we need some way to find them for the
307 _req_mode(connection_lost_while_pending).
308
309 These have been list_move'd to the out_of_sequence_requests list in
310 _req_mod(, barrier_acked) above.
311 */
312 list_del_init(&b->requests);
313
314 nob = b->next;
315 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
316 _tl_add_barrier(mdev, b);
317 if (nob)
318 mdev->oldest_tle = nob;
319 /* if nob == NULL b was the only barrier, and becomes the new
320 barrier. Therefore mdev->oldest_tle points already to b */
321 } else {
322 D_ASSERT(nob != NULL);
323 mdev->oldest_tle = nob;
324 kfree(b);
325 }
326
327 spin_unlock_irq(&mdev->req_lock);
328 dec_ap_pending(mdev);
329
330 return;
331
332bail:
333 spin_unlock_irq(&mdev->req_lock);
334 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
335}
336
337
338/**
339 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
340 * @mdev: DRBD device.
341 *
342 * This is called after the connection to the peer was lost. The storage covered
343 * by the requests on the transfer gets marked as our of sync. Called from the
344 * receiver thread and the worker thread.
345 */
346void tl_clear(struct drbd_conf *mdev)
347{
348 struct drbd_tl_epoch *b, *tmp;
349 struct list_head *le, *tle;
350 struct drbd_request *r;
351 int new_initial_bnr = net_random();
352
353 spin_lock_irq(&mdev->req_lock);
354
355 b = mdev->oldest_tle;
356 while (b) {
357 list_for_each_safe(le, tle, &b->requests) {
358 r = list_entry(le, struct drbd_request, tl_requests);
359 /* It would be nice to complete outside of spinlock.
360 * But this is easier for now. */
361 _req_mod(r, connection_lost_while_pending);
362 }
363 tmp = b->next;
364
365 /* there could still be requests on that ring list,
366 * in case local io is still pending */
367 list_del(&b->requests);
368
369 /* dec_ap_pending corresponding to queue_barrier.
370 * the newest barrier may not have been queued yet,
371 * in which case w.cb is still NULL. */
372 if (b->w.cb != NULL)
373 dec_ap_pending(mdev);
374
375 if (b == mdev->newest_tle) {
376 /* recycle, but reinit! */
377 D_ASSERT(tmp == NULL);
378 INIT_LIST_HEAD(&b->requests);
379 INIT_LIST_HEAD(&b->w.list);
380 b->w.cb = NULL;
381 b->br_number = new_initial_bnr;
382 b->n_req = 0;
383
384 mdev->oldest_tle = b;
385 break;
386 }
387 kfree(b);
388 b = tmp;
389 }
390
391 /* we expect this list to be empty. */
392 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
393
394 /* but just in case, clean it up anyways! */
395 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
396 r = list_entry(le, struct drbd_request, tl_requests);
397 /* It would be nice to complete outside of spinlock.
398 * But this is easier for now. */
399 _req_mod(r, connection_lost_while_pending);
400 }
401
402 /* ensure bit indicating barrier is required is clear */
403 clear_bit(CREATE_BARRIER, &mdev->flags);
404
405 spin_unlock_irq(&mdev->req_lock);
406}
407
408/**
409 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
410 * @mdev: DRBD device.
411 * @os: old (current) state.
412 * @ns: new (wanted) state.
413 */
414static int cl_wide_st_chg(struct drbd_conf *mdev,
415 union drbd_state os, union drbd_state ns)
416{
417 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
418 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
419 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
420 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
421 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
422 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
423 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
424}
425
426int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
427 union drbd_state mask, union drbd_state val)
428{
429 unsigned long flags;
430 union drbd_state os, ns;
431 int rv;
432
433 spin_lock_irqsave(&mdev->req_lock, flags);
434 os = mdev->state;
435 ns.i = (os.i & ~mask.i) | val.i;
436 rv = _drbd_set_state(mdev, ns, f, NULL);
437 ns = mdev->state;
438 spin_unlock_irqrestore(&mdev->req_lock, flags);
439
440 return rv;
441}
442
443/**
444 * drbd_force_state() - Impose a change which happens outside our control on our state
445 * @mdev: DRBD device.
446 * @mask: mask of state bits to change.
447 * @val: value of new state bits.
448 */
449void drbd_force_state(struct drbd_conf *mdev,
450 union drbd_state mask, union drbd_state val)
451{
452 drbd_change_state(mdev, CS_HARD, mask, val);
453}
454
455static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
456static int is_valid_state_transition(struct drbd_conf *,
457 union drbd_state, union drbd_state);
458static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
459 union drbd_state ns, int *warn_sync_abort);
460int drbd_send_state_req(struct drbd_conf *,
461 union drbd_state, union drbd_state);
462
463static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
464 union drbd_state mask, union drbd_state val)
465{
466 union drbd_state os, ns;
467 unsigned long flags;
468 int rv;
469
470 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
471 return SS_CW_SUCCESS;
472
473 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
474 return SS_CW_FAILED_BY_PEER;
475
476 rv = 0;
477 spin_lock_irqsave(&mdev->req_lock, flags);
478 os = mdev->state;
479 ns.i = (os.i & ~mask.i) | val.i;
480 ns = sanitize_state(mdev, os, ns, NULL);
481
482 if (!cl_wide_st_chg(mdev, os, ns))
483 rv = SS_CW_NO_NEED;
484 if (!rv) {
485 rv = is_valid_state(mdev, ns);
486 if (rv == SS_SUCCESS) {
487 rv = is_valid_state_transition(mdev, ns, os);
488 if (rv == SS_SUCCESS)
489 rv = 0; /* cont waiting, otherwise fail. */
490 }
491 }
492 spin_unlock_irqrestore(&mdev->req_lock, flags);
493
494 return rv;
495}
496
497/**
498 * drbd_req_state() - Perform an eventually cluster wide state change
499 * @mdev: DRBD device.
500 * @mask: mask of state bits to change.
501 * @val: value of new state bits.
502 * @f: flags
503 *
504 * Should not be called directly, use drbd_request_state() or
505 * _drbd_request_state().
506 */
507static int drbd_req_state(struct drbd_conf *mdev,
508 union drbd_state mask, union drbd_state val,
509 enum chg_state_flags f)
510{
511 struct completion done;
512 unsigned long flags;
513 union drbd_state os, ns;
514 int rv;
515
516 init_completion(&done);
517
518 if (f & CS_SERIALIZE)
519 mutex_lock(&mdev->state_mutex);
520
521 spin_lock_irqsave(&mdev->req_lock, flags);
522 os = mdev->state;
523 ns.i = (os.i & ~mask.i) | val.i;
524 ns = sanitize_state(mdev, os, ns, NULL);
525
526 if (cl_wide_st_chg(mdev, os, ns)) {
527 rv = is_valid_state(mdev, ns);
528 if (rv == SS_SUCCESS)
529 rv = is_valid_state_transition(mdev, ns, os);
530 spin_unlock_irqrestore(&mdev->req_lock, flags);
531
532 if (rv < SS_SUCCESS) {
533 if (f & CS_VERBOSE)
534 print_st_err(mdev, os, ns, rv);
535 goto abort;
536 }
537
538 drbd_state_lock(mdev);
539 if (!drbd_send_state_req(mdev, mask, val)) {
540 drbd_state_unlock(mdev);
541 rv = SS_CW_FAILED_BY_PEER;
542 if (f & CS_VERBOSE)
543 print_st_err(mdev, os, ns, rv);
544 goto abort;
545 }
546
547 wait_event(mdev->state_wait,
548 (rv = _req_st_cond(mdev, mask, val)));
549
550 if (rv < SS_SUCCESS) {
551 drbd_state_unlock(mdev);
552 if (f & CS_VERBOSE)
553 print_st_err(mdev, os, ns, rv);
554 goto abort;
555 }
556 spin_lock_irqsave(&mdev->req_lock, flags);
557 os = mdev->state;
558 ns.i = (os.i & ~mask.i) | val.i;
559 rv = _drbd_set_state(mdev, ns, f, &done);
560 drbd_state_unlock(mdev);
561 } else {
562 rv = _drbd_set_state(mdev, ns, f, &done);
563 }
564
565 spin_unlock_irqrestore(&mdev->req_lock, flags);
566
567 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
568 D_ASSERT(current != mdev->worker.task);
569 wait_for_completion(&done);
570 }
571
572abort:
573 if (f & CS_SERIALIZE)
574 mutex_unlock(&mdev->state_mutex);
575
576 return rv;
577}
578
579/**
580 * _drbd_request_state() - Request a state change (with flags)
581 * @mdev: DRBD device.
582 * @mask: mask of state bits to change.
583 * @val: value of new state bits.
584 * @f: flags
585 *
586 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
587 * flag, or when logging of failed state change requests is not desired.
588 */
589int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
590 union drbd_state val, enum chg_state_flags f)
591{
592 int rv;
593
594 wait_event(mdev->state_wait,
595 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
596
597 return rv;
598}
599
600static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
601{
602 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
603 name,
604 drbd_conn_str(ns.conn),
605 drbd_role_str(ns.role),
606 drbd_role_str(ns.peer),
607 drbd_disk_str(ns.disk),
608 drbd_disk_str(ns.pdsk),
609 ns.susp ? 's' : 'r',
610 ns.aftr_isp ? 'a' : '-',
611 ns.peer_isp ? 'p' : '-',
612 ns.user_isp ? 'u' : '-'
613 );
614}
615
616void print_st_err(struct drbd_conf *mdev,
617 union drbd_state os, union drbd_state ns, int err)
618{
619 if (err == SS_IN_TRANSIENT_STATE)
620 return;
621 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
622 print_st(mdev, " state", os);
623 print_st(mdev, "wanted", ns);
624}
625
626
/* The peer role and peer disk fields share the role/disk string tables. */
#define drbd_peer_str	drbd_role_str
#define drbd_pdsk_str	drbd_disk_str

/* Single-bit state fields are rendered as "1" or "0". */
#define drbd_susp_str(A)	((A) ? "1" : "0")
#define drbd_aftr_isp_str(A)	((A) ? "1" : "0")
#define drbd_peer_isp_str(A)	((A) ? "1" : "0")
#define drbd_user_isp_str(A)	((A) ? "1" : "0")

/*
 * PSC(field) - "print state change" helper.
 * If @field differs between the variables os and ns, append
 * "field( old -> new ) " at the cursor pbp and advance the cursor.
 * Relies on os, ns and pbp being visible where the macro is expanded;
 * no bounds check is done on the destination buffer.
 */
#define PSC(A)								\
	({								\
		if (ns.A != os.A) {					\
			pbp += sprintf(pbp, #A "( %s -> %s ) ",		\
				       drbd_##A##_str(os.A),		\
				       drbd_##A##_str(ns.A));		\
		}							\
	})
641
642/**
643 * is_valid_state() - Returns an SS_ error code if ns is not valid
644 * @mdev: DRBD device.
645 * @ns: State to consider.
646 */
647static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
648{
649 /* See drbd_state_sw_errors in drbd_strings.c */
650
651 enum drbd_fencing_p fp;
652 int rv = SS_SUCCESS;
653
654 fp = FP_DONT_CARE;
655 if (get_ldev(mdev)) {
656 fp = mdev->ldev->dc.fencing;
657 put_ldev(mdev);
658 }
659
660 if (get_net_conf(mdev)) {
661 if (!mdev->net_conf->two_primaries &&
662 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
663 rv = SS_TWO_PRIMARIES;
664 put_net_conf(mdev);
665 }
666
667 if (rv <= 0)
668 /* already found a reason to abort */;
669 else if (ns.role == R_SECONDARY && mdev->open_cnt)
670 rv = SS_DEVICE_IN_USE;
671
672 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
673 rv = SS_NO_UP_TO_DATE_DISK;
674
675 else if (fp >= FP_RESOURCE &&
676 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
677 rv = SS_PRIMARY_NOP;
678
679 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
680 rv = SS_NO_UP_TO_DATE_DISK;
681
682 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
683 rv = SS_NO_LOCAL_DISK;
684
685 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
686 rv = SS_NO_REMOTE_DISK;
687
688 else if ((ns.conn == C_CONNECTED ||
689 ns.conn == C_WF_BITMAP_S ||
690 ns.conn == C_SYNC_SOURCE ||
691 ns.conn == C_PAUSED_SYNC_S) &&
692 ns.disk == D_OUTDATED)
693 rv = SS_CONNECTED_OUTDATES;
694
695 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
696 (mdev->sync_conf.verify_alg[0] == 0))
697 rv = SS_NO_VERIFY_ALG;
698
699 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
700 mdev->agreed_pro_version < 88)
701 rv = SS_NOT_SUPPORTED;
702
703 return rv;
704}
705
706/**
707 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
708 * @mdev: DRBD device.
709 * @ns: new state.
710 * @os: old state.
711 */
712static int is_valid_state_transition(struct drbd_conf *mdev,
713 union drbd_state ns, union drbd_state os)
714{
715 int rv = SS_SUCCESS;
716
717 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
718 os.conn > C_CONNECTED)
719 rv = SS_RESYNC_RUNNING;
720
721 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
722 rv = SS_ALREADY_STANDALONE;
723
724 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
725 rv = SS_IS_DISKLESS;
726
727 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
728 rv = SS_NO_NET_CONFIG;
729
730 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
731 rv = SS_LOWER_THAN_OUTDATED;
732
733 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
734 rv = SS_IN_TRANSIENT_STATE;
735
736 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
737 rv = SS_IN_TRANSIENT_STATE;
738
739 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
740 rv = SS_NEED_CONNECTION;
741
742 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
743 ns.conn != os.conn && os.conn > C_CONNECTED)
744 rv = SS_RESYNC_RUNNING;
745
746 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
747 os.conn < C_CONNECTED)
748 rv = SS_NEED_CONNECTION;
749
750 return rv;
751}
752
753/**
754 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
755 * @mdev: DRBD device.
756 * @os: old state.
757 * @ns: new state.
758 * @warn_sync_abort:
759 *
760 * When we loose connection, we have to set the state of the peers disk (pdsk)
761 * to D_UNKNOWN. This rule and many more along those lines are in this function.
762 */
763static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
764 union drbd_state ns, int *warn_sync_abort)
765{
766 enum drbd_fencing_p fp;
767
768 fp = FP_DONT_CARE;
769 if (get_ldev(mdev)) {
770 fp = mdev->ldev->dc.fencing;
771 put_ldev(mdev);
772 }
773
774 /* Disallow Network errors to configure a device's network part */
775 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
776 os.conn <= C_DISCONNECTING)
777 ns.conn = os.conn;
778
779 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
780 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
781 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
782 ns.conn = os.conn;
783
784 /* After C_DISCONNECTING only C_STANDALONE may follow */
785 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
786 ns.conn = os.conn;
787
788 if (ns.conn < C_CONNECTED) {
789 ns.peer_isp = 0;
790 ns.peer = R_UNKNOWN;
791 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
792 ns.pdsk = D_UNKNOWN;
793 }
794
795 /* Clear the aftr_isp when becoming unconfigured */
796 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
797 ns.aftr_isp = 0;
798
799 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
800 ns.pdsk = D_UNKNOWN;
801
802 /* Abort resync if a disk fails/detaches */
803 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
804 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
805 if (warn_sync_abort)
806 *warn_sync_abort = 1;
807 ns.conn = C_CONNECTED;
808 }
809
810 if (ns.conn >= C_CONNECTED &&
811 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
812 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
813 switch (ns.conn) {
814 case C_WF_BITMAP_T:
815 case C_PAUSED_SYNC_T:
816 ns.disk = D_OUTDATED;
817 break;
818 case C_CONNECTED:
819 case C_WF_BITMAP_S:
820 case C_SYNC_SOURCE:
821 case C_PAUSED_SYNC_S:
822 ns.disk = D_UP_TO_DATE;
823 break;
824 case C_SYNC_TARGET:
825 ns.disk = D_INCONSISTENT;
826 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
827 break;
828 }
829 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
830 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
831 }
832
833 if (ns.conn >= C_CONNECTED &&
834 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
835 switch (ns.conn) {
836 case C_CONNECTED:
837 case C_WF_BITMAP_T:
838 case C_PAUSED_SYNC_T:
839 case C_SYNC_TARGET:
840 ns.pdsk = D_UP_TO_DATE;
841 break;
842 case C_WF_BITMAP_S:
843 case C_PAUSED_SYNC_S:
844 ns.pdsk = D_OUTDATED;
845 break;
846 case C_SYNC_SOURCE:
847 ns.pdsk = D_INCONSISTENT;
848 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
849 break;
850 }
851 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
852 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
853 }
854
855 /* Connection breaks down before we finished "Negotiating" */
856 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
857 get_ldev_if_state(mdev, D_NEGOTIATING)) {
858 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
859 ns.disk = mdev->new_state_tmp.disk;
860 ns.pdsk = mdev->new_state_tmp.pdsk;
861 } else {
862 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
863 ns.disk = D_DISKLESS;
864 ns.pdsk = D_UNKNOWN;
865 }
866 put_ldev(mdev);
867 }
868
869 if (fp == FP_STONITH &&
870 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
871 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
872 ns.susp = 1;
873
874 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
875 if (ns.conn == C_SYNC_SOURCE)
876 ns.conn = C_PAUSED_SYNC_S;
877 if (ns.conn == C_SYNC_TARGET)
878 ns.conn = C_PAUSED_SYNC_T;
879 } else {
880 if (ns.conn == C_PAUSED_SYNC_S)
881 ns.conn = C_SYNC_SOURCE;
882 if (ns.conn == C_PAUSED_SYNC_T)
883 ns.conn = C_SYNC_TARGET;
884 }
885
886 return ns;
887}
888
889/* helper for __drbd_set_state */
890static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
891{
892 if (cs == C_VERIFY_T) {
893 /* starting online verify from an arbitrary position
894 * does not fit well into the existing protocol.
895 * on C_VERIFY_T, we initialize ov_left and friends
896 * implicitly in receive_DataRequest once the
897 * first P_OV_REQUEST is received */
898 mdev->ov_start_sector = ~(sector_t)0;
899 } else {
900 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
901 if (bit >= mdev->rs_total)
902 mdev->ov_start_sector =
903 BM_BIT_TO_SECT(mdev->rs_total - 1);
904 mdev->ov_position = mdev->ov_start_sector;
905 }
906}
907
908/**
909 * __drbd_set_state() - Set a new DRBD state
910 * @mdev: DRBD device.
911 * @ns: new state.
912 * @flags: Flags
913 * @done: Optional completion, that will get completed after the after_state_ch() finished
914 *
915 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
916 */
917int __drbd_set_state(struct drbd_conf *mdev,
918 union drbd_state ns, enum chg_state_flags flags,
919 struct completion *done)
920{
921 union drbd_state os;
922 int rv = SS_SUCCESS;
923 int warn_sync_abort = 0;
924 struct after_state_chg_work *ascw;
925
926 os = mdev->state;
927
928 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
929
930 if (ns.i == os.i)
931 return SS_NOTHING_TO_DO;
932
933 if (!(flags & CS_HARD)) {
934 /* pre-state-change checks ; only look at ns */
935 /* See drbd_state_sw_errors in drbd_strings.c */
936
937 rv = is_valid_state(mdev, ns);
938 if (rv < SS_SUCCESS) {
939 /* If the old state was illegal as well, then let
940 this happen...*/
941
942 if (is_valid_state(mdev, os) == rv) {
943 dev_err(DEV, "Considering state change from bad state. "
944 "Error would be: '%s'\n",
945 drbd_set_st_err_str(rv));
946 print_st(mdev, "old", os);
947 print_st(mdev, "new", ns);
948 rv = is_valid_state_transition(mdev, ns, os);
949 }
950 } else
951 rv = is_valid_state_transition(mdev, ns, os);
952 }
953
954 if (rv < SS_SUCCESS) {
955 if (flags & CS_VERBOSE)
956 print_st_err(mdev, os, ns, rv);
957 return rv;
958 }
959
960 if (warn_sync_abort)
961 dev_warn(DEV, "Resync aborted.\n");
962
963 {
964 char *pbp, pb[300];
965 pbp = pb;
966 *pbp = 0;
967 PSC(role);
968 PSC(peer);
969 PSC(conn);
970 PSC(disk);
971 PSC(pdsk);
972 PSC(susp);
973 PSC(aftr_isp);
974 PSC(peer_isp);
975 PSC(user_isp);
976 dev_info(DEV, "%s\n", pb);
977 }
978
979 /* solve the race between becoming unconfigured,
980 * worker doing the cleanup, and
981 * admin reconfiguring us:
982 * on (re)configure, first set CONFIG_PENDING,
983 * then wait for a potentially exiting worker,
984 * start the worker, and schedule one no_op.
985 * then proceed with configuration.
986 */
987 if (ns.disk == D_DISKLESS &&
988 ns.conn == C_STANDALONE &&
989 ns.role == R_SECONDARY &&
990 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
991 set_bit(DEVICE_DYING, &mdev->flags);
992
993 mdev->state.i = ns.i;
994 wake_up(&mdev->misc_wait);
995 wake_up(&mdev->state_wait);
996
997 /* post-state-change actions */
998 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
999 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1000 mod_timer(&mdev->resync_timer, jiffies);
1001 }
1002
1003 /* aborted verify run. log the last position */
1004 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1005 ns.conn < C_CONNECTED) {
1006 mdev->ov_start_sector =
1007 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1008 dev_info(DEV, "Online Verify reached sector %llu\n",
1009 (unsigned long long)mdev->ov_start_sector);
1010 }
1011
1012 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1013 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1014 dev_info(DEV, "Syncer continues.\n");
1015 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1016 if (ns.conn == C_SYNC_TARGET) {
1017 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1018 mod_timer(&mdev->resync_timer, jiffies);
1019 /* This if (!test_bit) is only needed for the case
1020 that a device that has ceased to used its timer,
1021 i.e. it is already in drbd_resync_finished() gets
1022 paused and resumed. */
1023 }
1024 }
1025
1026 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1027 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1028 dev_info(DEV, "Resync suspended\n");
1029 mdev->rs_mark_time = jiffies;
1030 if (ns.conn == C_PAUSED_SYNC_T)
1031 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1032 }
1033
1034 if (os.conn == C_CONNECTED &&
1035 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1036 mdev->ov_position = 0;
1037 mdev->rs_total =
1038 mdev->rs_mark_left = drbd_bm_bits(mdev);
1039 if (mdev->agreed_pro_version >= 90)
1040 set_ov_position(mdev, ns.conn);
1041 else
1042 mdev->ov_start_sector = 0;
1043 mdev->ov_left = mdev->rs_total
1044 - BM_SECT_TO_BIT(mdev->ov_position);
1045 mdev->rs_start =
1046 mdev->rs_mark_time = jiffies;
1047 mdev->ov_last_oos_size = 0;
1048 mdev->ov_last_oos_start = 0;
1049
1050 if (ns.conn == C_VERIFY_S) {
1051 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1052 (unsigned long long)mdev->ov_position);
1053 mod_timer(&mdev->resync_timer, jiffies);
1054 }
1055 }
1056
1057 if (get_ldev(mdev)) {
1058 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1059 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1060 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1061
1062 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1063 mdf |= MDF_CRASHED_PRIMARY;
1064 if (mdev->state.role == R_PRIMARY ||
1065 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1066 mdf |= MDF_PRIMARY_IND;
1067 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1068 mdf |= MDF_CONNECTED_IND;
1069 if (mdev->state.disk > D_INCONSISTENT)
1070 mdf |= MDF_CONSISTENT;
1071 if (mdev->state.disk > D_OUTDATED)
1072 mdf |= MDF_WAS_UP_TO_DATE;
1073 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1074 mdf |= MDF_PEER_OUT_DATED;
1075 if (mdf != mdev->ldev->md.flags) {
1076 mdev->ldev->md.flags = mdf;
1077 drbd_md_mark_dirty(mdev);
1078 }
1079 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1080 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1081 put_ldev(mdev);
1082 }
1083
1084 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1085 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1086 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1087 set_bit(CONSIDER_RESYNC, &mdev->flags);
1088
1089 /* Receiver should clean up itself */
1090 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1091 drbd_thread_stop_nowait(&mdev->receiver);
1092
1093 /* Now the receiver finished cleaning up itself, it should die */
1094 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1095 drbd_thread_stop_nowait(&mdev->receiver);
1096
1097 /* Upon network failure, we need to restart the receiver. */
1098 if (os.conn > C_TEAR_DOWN &&
1099 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1100 drbd_thread_restart_nowait(&mdev->receiver);
1101
1102 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1103 if (ascw) {
1104 ascw->os = os;
1105 ascw->ns = ns;
1106 ascw->flags = flags;
1107 ascw->w.cb = w_after_state_ch;
1108 ascw->done = done;
1109 drbd_queue_work(&mdev->data.work, &ascw->w);
1110 } else {
1111 dev_warn(DEV, "Could not kmalloc an ascw\n");
1112 }
1113
1114 return rv;
1115}
1116
1117static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1118{
1119 struct after_state_chg_work *ascw =
1120 container_of(w, struct after_state_chg_work, w);
1121 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1122 if (ascw->flags & CS_WAIT_COMPLETE) {
1123 D_ASSERT(ascw->done != NULL);
1124 complete(ascw->done);
1125 }
1126 kfree(ascw);
1127
1128 return 1;
1129}
1130
1131static void abw_start_sync(struct drbd_conf *mdev, int rv)
1132{
1133 if (rv) {
1134 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1135 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1136 return;
1137 }
1138
1139 switch (mdev->state.conn) {
1140 case C_STARTING_SYNC_T:
1141 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1142 break;
1143 case C_STARTING_SYNC_S:
1144 drbd_start_resync(mdev, C_SYNC_SOURCE);
1145 break;
1146 }
1147}
1148
1149/**
1150 * after_state_ch() - Perform after state change actions that may sleep
1151 * @mdev: DRBD device.
1152 * @os: old state.
1153 * @ns: new state.
1154 * @flags: Flags
1155 */
1156static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1157 union drbd_state ns, enum chg_state_flags flags)
1158{
1159 enum drbd_fencing_p fp;
1160
1161 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1162 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1163 if (mdev->p_uuid)
1164 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1165 }
1166
1167 fp = FP_DONT_CARE;
1168 if (get_ldev(mdev)) {
1169 fp = mdev->ldev->dc.fencing;
1170 put_ldev(mdev);
1171 }
1172
1173 /* Inform userspace about the change... */
1174 drbd_bcast_state(mdev, ns);
1175
1176 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1177 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1178 drbd_khelper(mdev, "pri-on-incon-degr");
1179
1180 /* Here we have the actions that are performed after a
1181 state change. This function might sleep */
1182
1183 if (fp == FP_STONITH && ns.susp) {
1184 /* case1: The outdate peer handler is successful:
1185 * case2: The connection was established again: */
1186 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1187 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1188 tl_clear(mdev);
1189 spin_lock_irq(&mdev->req_lock);
1190 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1191 spin_unlock_irq(&mdev->req_lock);
1192 }
1193 }
1194 /* Do not change the order of the if above and the two below... */
1195 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1196 drbd_send_uuids(mdev);
1197 drbd_send_state(mdev);
1198 }
1199 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1200 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1201
1202 /* Lost contact to peer's copy of the data */
1203 if ((os.pdsk >= D_INCONSISTENT &&
1204 os.pdsk != D_UNKNOWN &&
1205 os.pdsk != D_OUTDATED)
1206 && (ns.pdsk < D_INCONSISTENT ||
1207 ns.pdsk == D_UNKNOWN ||
1208 ns.pdsk == D_OUTDATED)) {
1209 kfree(mdev->p_uuid);
1210 mdev->p_uuid = NULL;
1211 if (get_ldev(mdev)) {
1212 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1213 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1214 drbd_uuid_new_current(mdev);
1215 drbd_send_uuids(mdev);
1216 }
1217 put_ldev(mdev);
1218 }
1219 }
1220
1221 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1222 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1223 drbd_uuid_new_current(mdev);
1224
1225 /* D_DISKLESS Peer becomes secondary */
1226 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1227 drbd_al_to_on_disk_bm(mdev);
1228 put_ldev(mdev);
1229 }
1230
1231 /* Last part of the attaching process ... */
1232 if (ns.conn >= C_CONNECTED &&
1233 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1234 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1235 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1236 drbd_send_sizes(mdev, 0); /* to start sync... */
1237 drbd_send_uuids(mdev);
1238 drbd_send_state(mdev);
1239 }
1240
1241 /* We want to pause/continue resync, tell peer. */
1242 if (ns.conn >= C_CONNECTED &&
1243 ((os.aftr_isp != ns.aftr_isp) ||
1244 (os.user_isp != ns.user_isp)))
1245 drbd_send_state(mdev);
1246
1247 /* In case one of the isp bits got set, suspend other devices. */
1248 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1249 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1250 suspend_other_sg(mdev);
1251
1252 /* Make sure the peer gets informed about eventual state
1253 changes (ISP bits) while we were in WFReportParams. */
1254 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1255 drbd_send_state(mdev);
1256
1257 /* We are in the progress to start a full sync... */
1258 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1259 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1260 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1261
1262 /* We are invalidating our self... */
1263 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1264 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1265 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1266
1267 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1268 enum drbd_io_error_p eh;
1269
1270 eh = EP_PASS_ON;
1271 if (get_ldev_if_state(mdev, D_FAILED)) {
1272 eh = mdev->ldev->dc.on_io_error;
1273 put_ldev(mdev);
1274 }
1275
1276 drbd_rs_cancel_all(mdev);
1277 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1278 and it is D_DISKLESS here, local_cnt can only go down, it can
1279 not increase... It will reach zero */
1280 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1281 mdev->rs_total = 0;
1282 mdev->rs_failed = 0;
1283 atomic_set(&mdev->rs_pending_cnt, 0);
1284
1285 spin_lock_irq(&mdev->req_lock);
1286 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1287 spin_unlock_irq(&mdev->req_lock);
1288
1289 if (eh == EP_CALL_HELPER)
1290 drbd_khelper(mdev, "local-io-error");
1291 }
1292
1293 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1294
1295 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1296 if (drbd_send_state(mdev))
1297 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1298 else
1299 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1300 }
1301
1302 lc_destroy(mdev->resync);
1303 mdev->resync = NULL;
1304 lc_destroy(mdev->act_log);
1305 mdev->act_log = NULL;
1306 __no_warn(local,
1307 drbd_free_bc(mdev->ldev);
1308 mdev->ldev = NULL;);
1309
1310 if (mdev->md_io_tmpp)
1311 __free_page(mdev->md_io_tmpp);
1312 }
1313
1314 /* Disks got bigger while they were detached */
1315 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1316 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1317 if (ns.conn == C_CONNECTED)
1318 resync_after_online_grow(mdev);
1319 }
1320
1321 /* A resync finished or aborted, wake paused devices... */
1322 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1323 (os.peer_isp && !ns.peer_isp) ||
1324 (os.user_isp && !ns.user_isp))
1325 resume_next_sg(mdev);
1326
1327 /* Upon network connection, we need to start the receiver */
1328 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1329 drbd_thread_start(&mdev->receiver);
1330
1331 /* Terminate worker thread if we are unconfigured - it will be
1332 restarted as needed... */
1333 if (ns.disk == D_DISKLESS &&
1334 ns.conn == C_STANDALONE &&
1335 ns.role == R_SECONDARY) {
1336 if (os.aftr_isp != ns.aftr_isp)
1337 resume_next_sg(mdev);
1338 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1339 if (test_bit(DEVICE_DYING, &mdev->flags))
1340 drbd_thread_stop_nowait(&mdev->worker);
1341 }
1342
1343 drbd_md_sync(mdev);
1344}
1345
1346
1347static int drbd_thread_setup(void *arg)
1348{
1349 struct drbd_thread *thi = (struct drbd_thread *) arg;
1350 struct drbd_conf *mdev = thi->mdev;
1351 unsigned long flags;
1352 int retval;
1353
1354restart:
1355 retval = thi->function(thi);
1356
1357 spin_lock_irqsave(&thi->t_lock, flags);
1358
1359 /* if the receiver has been "Exiting", the last thing it did
1360 * was set the conn state to "StandAlone",
1361 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1362 * and receiver thread will be "started".
1363 * drbd_thread_start needs to set "Restarting" in that case.
1364 * t_state check and assignment needs to be within the same spinlock,
1365 * so either thread_start sees Exiting, and can remap to Restarting,
1366 * or thread_start see None, and can proceed as normal.
1367 */
1368
1369 if (thi->t_state == Restarting) {
1370 dev_info(DEV, "Restarting %s\n", current->comm);
1371 thi->t_state = Running;
1372 spin_unlock_irqrestore(&thi->t_lock, flags);
1373 goto restart;
1374 }
1375
1376 thi->task = NULL;
1377 thi->t_state = None;
1378 smp_mb();
1379 complete(&thi->stop);
1380 spin_unlock_irqrestore(&thi->t_lock, flags);
1381
1382 dev_info(DEV, "Terminating %s\n", current->comm);
1383
1384 /* Release mod reference taken when thread was started */
1385 module_put(THIS_MODULE);
1386 return retval;
1387}
1388
1389static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1390 int (*func) (struct drbd_thread *))
1391{
1392 spin_lock_init(&thi->t_lock);
1393 thi->task = NULL;
1394 thi->t_state = None;
1395 thi->function = func;
1396 thi->mdev = mdev;
1397}
1398
1399int drbd_thread_start(struct drbd_thread *thi)
1400{
1401 struct drbd_conf *mdev = thi->mdev;
1402 struct task_struct *nt;
1403 unsigned long flags;
1404
1405 const char *me =
1406 thi == &mdev->receiver ? "receiver" :
1407 thi == &mdev->asender ? "asender" :
1408 thi == &mdev->worker ? "worker" : "NONSENSE";
1409
1410 /* is used from state engine doing drbd_thread_stop_nowait,
1411 * while holding the req lock irqsave */
1412 spin_lock_irqsave(&thi->t_lock, flags);
1413
1414 switch (thi->t_state) {
1415 case None:
1416 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1417 me, current->comm, current->pid);
1418
1419 /* Get ref on module for thread - this is released when thread exits */
1420 if (!try_module_get(THIS_MODULE)) {
1421 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1422 spin_unlock_irqrestore(&thi->t_lock, flags);
1423 return FALSE;
1424 }
1425
1426 init_completion(&thi->stop);
1427 D_ASSERT(thi->task == NULL);
1428 thi->reset_cpu_mask = 1;
1429 thi->t_state = Running;
1430 spin_unlock_irqrestore(&thi->t_lock, flags);
1431 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1432
1433 nt = kthread_create(drbd_thread_setup, (void *) thi,
1434 "drbd%d_%s", mdev_to_minor(mdev), me);
1435
1436 if (IS_ERR(nt)) {
1437 dev_err(DEV, "Couldn't start thread\n");
1438
1439 module_put(THIS_MODULE);
1440 return FALSE;
1441 }
1442 spin_lock_irqsave(&thi->t_lock, flags);
1443 thi->task = nt;
1444 thi->t_state = Running;
1445 spin_unlock_irqrestore(&thi->t_lock, flags);
1446 wake_up_process(nt);
1447 break;
1448 case Exiting:
1449 thi->t_state = Restarting;
1450 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1451 me, current->comm, current->pid);
1452 /* fall through */
1453 case Running:
1454 case Restarting:
1455 default:
1456 spin_unlock_irqrestore(&thi->t_lock, flags);
1457 break;
1458 }
1459
1460 return TRUE;
1461}
1462
1463
1464void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1465{
1466 unsigned long flags;
1467
1468 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1469
1470 /* may be called from state engine, holding the req lock irqsave */
1471 spin_lock_irqsave(&thi->t_lock, flags);
1472
1473 if (thi->t_state == None) {
1474 spin_unlock_irqrestore(&thi->t_lock, flags);
1475 if (restart)
1476 drbd_thread_start(thi);
1477 return;
1478 }
1479
1480 if (thi->t_state != ns) {
1481 if (thi->task == NULL) {
1482 spin_unlock_irqrestore(&thi->t_lock, flags);
1483 return;
1484 }
1485
1486 thi->t_state = ns;
1487 smp_mb();
1488 init_completion(&thi->stop);
1489 if (thi->task != current)
1490 force_sig(DRBD_SIGKILL, thi->task);
1491
1492 }
1493
1494 spin_unlock_irqrestore(&thi->t_lock, flags);
1495
1496 if (wait)
1497 wait_for_completion(&thi->stop);
1498}
1499
1500#ifdef CONFIG_SMP
1501/**
1502 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1503 * @mdev: DRBD device.
1504 *
1505 * Forces all threads of a device onto the same CPU. This is beneficial for
1506 * DRBD's performance. May be overwritten by user's configuration.
1507 */
1508void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1509{
1510 int ord, cpu;
1511
1512 /* user override. */
1513 if (cpumask_weight(mdev->cpu_mask))
1514 return;
1515
1516 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1517 for_each_online_cpu(cpu) {
1518 if (ord-- == 0) {
1519 cpumask_set_cpu(cpu, mdev->cpu_mask);
1520 return;
1521 }
1522 }
1523 /* should not be reached */
1524 cpumask_setall(mdev->cpu_mask);
1525}
1526
1527/**
1528 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1529 * @mdev: DRBD device.
1530 *
1531 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1532 * prematurely.
1533 */
1534void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1535{
1536 struct task_struct *p = current;
1537 struct drbd_thread *thi =
1538 p == mdev->asender.task ? &mdev->asender :
1539 p == mdev->receiver.task ? &mdev->receiver :
1540 p == mdev->worker.task ? &mdev->worker :
1541 NULL;
1542 ERR_IF(thi == NULL)
1543 return;
1544 if (!thi->reset_cpu_mask)
1545 return;
1546 thi->reset_cpu_mask = 0;
1547 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1548}
1549#endif
1550
1551/* the appropriate socket mutex must be held already */
1552int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1553 enum drbd_packets cmd, struct p_header *h,
1554 size_t size, unsigned msg_flags)
1555{
1556 int sent, ok;
1557
1558 ERR_IF(!h) return FALSE;
1559 ERR_IF(!size) return FALSE;
1560
1561 h->magic = BE_DRBD_MAGIC;
1562 h->command = cpu_to_be16(cmd);
1563 h->length = cpu_to_be16(size-sizeof(struct p_header));
1564
1565 sent = drbd_send(mdev, sock, h, size, msg_flags);
1566
1567 ok = (sent == size);
1568 if (!ok)
1569 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1570 cmdname(cmd), (int)size, sent);
1571 return ok;
1572}
1573
1574/* don't pass the socket. we may only look at it
1575 * when we hold the appropriate socket mutex.
1576 */
1577int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1578 enum drbd_packets cmd, struct p_header *h, size_t size)
1579{
1580 int ok = 0;
1581 struct socket *sock;
1582
1583 if (use_data_socket) {
1584 mutex_lock(&mdev->data.mutex);
1585 sock = mdev->data.socket;
1586 } else {
1587 mutex_lock(&mdev->meta.mutex);
1588 sock = mdev->meta.socket;
1589 }
1590
1591 /* drbd_disconnect() could have called drbd_free_sock()
1592 * while we were waiting in down()... */
1593 if (likely(sock != NULL))
1594 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1595
1596 if (use_data_socket)
1597 mutex_unlock(&mdev->data.mutex);
1598 else
1599 mutex_unlock(&mdev->meta.mutex);
1600 return ok;
1601}
1602
1603int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1604 size_t size)
1605{
1606 struct p_header h;
1607 int ok;
1608
1609 h.magic = BE_DRBD_MAGIC;
1610 h.command = cpu_to_be16(cmd);
1611 h.length = cpu_to_be16(size);
1612
1613 if (!drbd_get_data_sock(mdev))
1614 return 0;
1615
1616 ok = (sizeof(h) ==
1617 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1618 ok = ok && (size ==
1619 drbd_send(mdev, mdev->data.socket, data, size, 0));
1620
1621 drbd_put_data_sock(mdev);
1622
1623 return ok;
1624}
1625
1626int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1627{
1628 struct p_rs_param_89 *p;
1629 struct socket *sock;
1630 int size, rv;
1631 const int apv = mdev->agreed_pro_version;
1632
1633 size = apv <= 87 ? sizeof(struct p_rs_param)
1634 : apv == 88 ? sizeof(struct p_rs_param)
1635 + strlen(mdev->sync_conf.verify_alg) + 1
1636 : /* 89 */ sizeof(struct p_rs_param_89);
1637
1638 /* used from admin command context and receiver/worker context.
1639 * to avoid kmalloc, grab the socket right here,
1640 * then use the pre-allocated sbuf there */
1641 mutex_lock(&mdev->data.mutex);
1642 sock = mdev->data.socket;
1643
1644 if (likely(sock != NULL)) {
1645 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1646
1647 p = &mdev->data.sbuf.rs_param_89;
1648
1649 /* initialize verify_alg and csums_alg */
1650 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1651
1652 p->rate = cpu_to_be32(sc->rate);
1653
1654 if (apv >= 88)
1655 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1656 if (apv >= 89)
1657 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1658
1659 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1660 } else
1661 rv = 0; /* not ok */
1662
1663 mutex_unlock(&mdev->data.mutex);
1664
1665 return rv;
1666}
1667
1668int drbd_send_protocol(struct drbd_conf *mdev)
1669{
1670 struct p_protocol *p;
1671 int size, rv;
1672
1673 size = sizeof(struct p_protocol);
1674
1675 if (mdev->agreed_pro_version >= 87)
1676 size += strlen(mdev->net_conf->integrity_alg) + 1;
1677
1678 /* we must not recurse into our own queue,
1679 * as that is blocked during handshake */
1680 p = kmalloc(size, GFP_NOIO);
1681 if (p == NULL)
1682 return 0;
1683
1684 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1685 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1686 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1687 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1688 p->want_lose = cpu_to_be32(mdev->net_conf->want_lose);
1689 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1690
1691 if (mdev->agreed_pro_version >= 87)
1692 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1693
1694 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1695 (struct p_header *)p, size);
1696 kfree(p);
1697 return rv;
1698}
1699
1700int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1701{
1702 struct p_uuids p;
1703 int i;
1704
1705 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1706 return 1;
1707
1708 for (i = UI_CURRENT; i < UI_SIZE; i++)
1709 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1710
1711 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1712 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1713 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1714 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1715 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1716 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1717
1718 put_ldev(mdev);
1719
1720 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1721 (struct p_header *)&p, sizeof(p));
1722}
1723
1724int drbd_send_uuids(struct drbd_conf *mdev)
1725{
1726 return _drbd_send_uuids(mdev, 0);
1727}
1728
1729int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1730{
1731 return _drbd_send_uuids(mdev, 8);
1732}
1733
1734
1735int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1736{
1737 struct p_rs_uuid p;
1738
1739 p.uuid = cpu_to_be64(val);
1740
1741 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1742 (struct p_header *)&p, sizeof(p));
1743}
1744
1745int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1746{
1747 struct p_sizes p;
1748 sector_t d_size, u_size;
1749 int q_order_type;
1750 int ok;
1751
1752 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1753 D_ASSERT(mdev->ldev->backing_bdev);
1754 d_size = drbd_get_max_capacity(mdev->ldev);
1755 u_size = mdev->ldev->dc.disk_size;
1756 q_order_type = drbd_queue_order_type(mdev);
1757 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1758 put_ldev(mdev);
1759 } else {
1760 d_size = 0;
1761 u_size = 0;
1762 q_order_type = QUEUE_ORDERED_NONE;
1763 }
1764
1765 p.d_size = cpu_to_be64(d_size);
1766 p.u_size = cpu_to_be64(u_size);
1767 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1768 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1769 p.queue_order_type = cpu_to_be32(q_order_type);
1770
1771 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1772 (struct p_header *)&p, sizeof(p));
1773 return ok;
1774}
1775
1776/**
1777 * drbd_send_state() - Sends the drbd state to the peer
1778 * @mdev: DRBD device.
1779 */
1780int drbd_send_state(struct drbd_conf *mdev)
1781{
1782 struct socket *sock;
1783 struct p_state p;
1784 int ok = 0;
1785
1786 /* Grab state lock so we wont send state if we're in the middle
1787 * of a cluster wide state change on another thread */
1788 drbd_state_lock(mdev);
1789
1790 mutex_lock(&mdev->data.mutex);
1791
1792 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1793 sock = mdev->data.socket;
1794
1795 if (likely(sock != NULL)) {
1796 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1797 (struct p_header *)&p, sizeof(p), 0);
1798 }
1799
1800 mutex_unlock(&mdev->data.mutex);
1801
1802 drbd_state_unlock(mdev);
1803 return ok;
1804}
1805
1806int drbd_send_state_req(struct drbd_conf *mdev,
1807 union drbd_state mask, union drbd_state val)
1808{
1809 struct p_req_state p;
1810
1811 p.mask = cpu_to_be32(mask.i);
1812 p.val = cpu_to_be32(val.i);
1813
1814 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1815 (struct p_header *)&p, sizeof(p));
1816}
1817
1818int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1819{
1820 struct p_req_state_reply p;
1821
1822 p.retcode = cpu_to_be32(retcode);
1823
1824 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1825 (struct p_header *)&p, sizeof(p));
1826}
1827
1828int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1829 struct p_compressed_bm *p,
1830 struct bm_xfer_ctx *c)
1831{
1832 struct bitstream bs;
1833 unsigned long plain_bits;
1834 unsigned long tmp;
1835 unsigned long rl;
1836 unsigned len;
1837 unsigned toggle;
1838 int bits;
1839
1840 /* may we use this feature? */
1841 if ((mdev->sync_conf.use_rle == 0) ||
1842 (mdev->agreed_pro_version < 90))
1843 return 0;
1844
1845 if (c->bit_offset >= c->bm_bits)
1846 return 0; /* nothing to do. */
1847
1848 /* use at most thus many bytes */
1849 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1850 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1851 /* plain bits covered in this code string */
1852 plain_bits = 0;
1853
1854 /* p->encoding & 0x80 stores whether the first run length is set.
1855 * bit offset is implicit.
1856 * start with toggle == 2 to be able to tell the first iteration */
1857 toggle = 2;
1858
1859 /* see how much plain bits we can stuff into one packet
1860 * using RLE and VLI. */
1861 do {
1862 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1863 : _drbd_bm_find_next(mdev, c->bit_offset);
1864 if (tmp == -1UL)
1865 tmp = c->bm_bits;
1866 rl = tmp - c->bit_offset;
1867
1868 if (toggle == 2) { /* first iteration */
1869 if (rl == 0) {
1870 /* the first checked bit was set,
1871 * store start value, */
1872 DCBP_set_start(p, 1);
1873 /* but skip encoding of zero run length */
1874 toggle = !toggle;
1875 continue;
1876 }
1877 DCBP_set_start(p, 0);
1878 }
1879
1880 /* paranoia: catch zero runlength.
1881 * can only happen if bitmap is modified while we scan it. */
1882 if (rl == 0) {
1883 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1884 "t:%u bo:%lu\n", toggle, c->bit_offset);
1885 return -1;
1886 }
1887
1888 bits = vli_encode_bits(&bs, rl);
1889 if (bits == -ENOBUFS) /* buffer full */
1890 break;
1891 if (bits <= 0) {
1892 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1893 return 0;
1894 }
1895
1896 toggle = !toggle;
1897 plain_bits += rl;
1898 c->bit_offset = tmp;
1899 } while (c->bit_offset < c->bm_bits);
1900
1901 len = bs.cur.b - p->code + !!bs.cur.bit;
1902
1903 if (plain_bits < (len << 3)) {
1904 /* incompressible with this method.
1905 * we need to rewind both word and bit position. */
1906 c->bit_offset -= plain_bits;
1907 bm_xfer_ctx_bit_to_word_offset(c);
1908 c->bit_offset = c->word_offset * BITS_PER_LONG;
1909 return 0;
1910 }
1911
1912 /* RLE + VLI was able to compress it just fine.
1913 * update c->word_offset. */
1914 bm_xfer_ctx_bit_to_word_offset(c);
1915
1916 /* store pad_bits */
1917 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1918
1919 return len;
1920}
1921
1922enum { OK, FAILED, DONE }
1923send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1924 struct p_header *h, struct bm_xfer_ctx *c)
1925{
1926 struct p_compressed_bm *p = (void*)h;
1927 unsigned long num_words;
1928 int len;
1929 int ok;
1930
1931 len = fill_bitmap_rle_bits(mdev, p, c);
1932
1933 if (len < 0)
1934 return FAILED;
1935
1936 if (len) {
1937 DCBP_set_code(p, RLE_VLI_Bits);
1938 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1939 sizeof(*p) + len, 0);
1940
1941 c->packets[0]++;
1942 c->bytes[0] += sizeof(*p) + len;
1943
1944 if (c->bit_offset >= c->bm_bits)
1945 len = 0; /* DONE */
1946 } else {
1947 /* was not compressible.
1948 * send a buffer full of plain text bits instead. */
1949 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1950 len = num_words * sizeof(long);
1951 if (len)
1952 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1953 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1954 h, sizeof(struct p_header) + len, 0);
1955 c->word_offset += num_words;
1956 c->bit_offset = c->word_offset * BITS_PER_LONG;
1957
1958 c->packets[1]++;
1959 c->bytes[1] += sizeof(struct p_header) + len;
1960
1961 if (c->bit_offset > c->bm_bits)
1962 c->bit_offset = c->bm_bits;
1963 }
1964 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1965
1966 if (ok == DONE)
1967 INFO_bm_xfer_stats(mdev, "send", c);
1968 return ok;
1969}
1970
1971/* See the comment at receive_bitmap() */
1972int _drbd_send_bitmap(struct drbd_conf *mdev)
1973{
1974 struct bm_xfer_ctx c;
1975 struct p_header *p;
1976 int ret;
1977
1978 ERR_IF(!mdev->bitmap) return FALSE;
1979
1980 /* maybe we should use some per thread scratch page,
1981 * and allocate that during initial device creation? */
1982 p = (struct p_header *) __get_free_page(GFP_NOIO);
1983 if (!p) {
1984 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
1985 return FALSE;
1986 }
1987
1988 if (get_ldev(mdev)) {
1989 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1990 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
1991 drbd_bm_set_all(mdev);
1992 if (drbd_bm_write(mdev)) {
1993 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
1994 * but otherwise process as per normal - need to tell other
1995 * side that a full resync is required! */
1996 dev_err(DEV, "Failed to write bitmap to disk!\n");
1997 } else {
1998 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
1999 drbd_md_sync(mdev);
2000 }
2001 }
2002 put_ldev(mdev);
2003 }
2004
2005 c = (struct bm_xfer_ctx) {
2006 .bm_bits = drbd_bm_bits(mdev),
2007 .bm_words = drbd_bm_words(mdev),
2008 };
2009
2010 do {
2011 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2012 } while (ret == OK);
2013
2014 free_page((unsigned long) p);
2015 return (ret == DONE);
2016}
2017
/*
 * Send the bitmap with the data socket held via drbd_get_data_sock().
 *
 * Returns -1 if the data socket cannot be obtained, 0 if the bitmap was
 * sent, and 1 if _drbd_send_bitmap() failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed;

	if (!drbd_get_data_sock(mdev))
		return -1;

	failed = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return failed;
}
2028
2029int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2030{
2031 int ok;
2032 struct p_barrier_ack p;
2033
2034 p.barrier = barrier_nr;
2035 p.set_size = cpu_to_be32(set_size);
2036
2037 if (mdev->state.conn < C_CONNECTED)
2038 return FALSE;
2039 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2040 (struct p_header *)&p, sizeof(p));
2041 return ok;
2042}
2043
2044/**
2045 * _drbd_send_ack() - Sends an ack packet
2046 * @mdev: DRBD device.
2047 * @cmd: Packet command code.
2048 * @sector: sector, needs to be in big endian byte order
2049 * @blksize: size in byte, needs to be in big endian byte order
2050 * @block_id: Id, big endian byte order
2051 */
2052static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2053 u64 sector,
2054 u32 blksize,
2055 u64 block_id)
2056{
2057 int ok;
2058 struct p_block_ack p;
2059
2060 p.sector = sector;
2061 p.block_id = block_id;
2062 p.blksize = blksize;
2063 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2064
2065 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2066 return FALSE;
2067 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2068 (struct p_header *)&p, sizeof(p));
2069 return ok;
2070}
2071
2072int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2073 struct p_data *dp)
2074{
2075 const int header_size = sizeof(struct p_data)
2076 - sizeof(struct p_header);
2077 int data_size = ((struct p_header *)dp)->length - header_size;
2078
2079 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2080 dp->block_id);
2081}
2082
2083int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2084 struct p_block_req *rp)
2085{
2086 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2087}
2088
2089/**
2090 * drbd_send_ack() - Sends an ack packet
2091 * @mdev: DRBD device.
2092 * @cmd: Packet command code.
2093 * @e: Epoch entry.
2094 */
2095int drbd_send_ack(struct drbd_conf *mdev,
2096 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2097{
2098 return _drbd_send_ack(mdev, cmd,
2099 cpu_to_be64(e->sector),
2100 cpu_to_be32(e->size),
2101 e->block_id);
2102}
2103
2104/* This function misuses the block_id field to signal if the blocks
2105 * are is sync or not. */
2106int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2107 sector_t sector, int blksize, u64 block_id)
2108{
2109 return _drbd_send_ack(mdev, cmd,
2110 cpu_to_be64(sector),
2111 cpu_to_be32(blksize),
2112 cpu_to_be64(block_id));
2113}
2114
2115int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2116 sector_t sector, int size, u64 block_id)
2117{
2118 int ok;
2119 struct p_block_req p;
2120
2121 p.sector = cpu_to_be64(sector);
2122 p.block_id = block_id;
2123 p.blksize = cpu_to_be32(size);
2124
2125 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2126 (struct p_header *)&p, sizeof(p));
2127 return ok;
2128}
2129
2130int drbd_send_drequest_csum(struct drbd_conf *mdev,
2131 sector_t sector, int size,
2132 void *digest, int digest_size,
2133 enum drbd_packets cmd)
2134{
2135 int ok;
2136 struct p_block_req p;
2137
2138 p.sector = cpu_to_be64(sector);
2139 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2140 p.blksize = cpu_to_be32(size);
2141
2142 p.head.magic = BE_DRBD_MAGIC;
2143 p.head.command = cpu_to_be16(cmd);
2144 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2145
2146 mutex_lock(&mdev->data.mutex);
2147
2148 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2149 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2150
2151 mutex_unlock(&mdev->data.mutex);
2152
2153 return ok;
2154}
2155
2156int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2157{
2158 int ok;
2159 struct p_block_req p;
2160
2161 p.sector = cpu_to_be64(sector);
2162 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2163 p.blksize = cpu_to_be32(size);
2164
2165 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2166 (struct p_header *)&p, sizeof(p));
2167 return ok;
2168}
2169
2170/* called on sndtimeo
2171 * returns FALSE if we should retry,
2172 * TRUE if we think connection is dead
2173 */
2174static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2175{
2176 int drop_it;
2177 /* long elapsed = (long)(jiffies - mdev->last_received); */
2178
2179 drop_it = mdev->meta.socket == sock
2180 || !mdev->asender.task
2181 || get_t_state(&mdev->asender) != Running
2182 || mdev->state.conn < C_CONNECTED;
2183
2184 if (drop_it)
2185 return TRUE;
2186
2187 drop_it = !--mdev->ko_count;
2188 if (!drop_it) {
2189 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2190 current->comm, current->pid, mdev->ko_count);
2191 request_ping(mdev);
2192 }
2193
2194 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2195}
2196
2197/* The idea of sendpage seems to be to put some kind of reference
2198 * to the page into the skb, and to hand it over to the NIC. In
2199 * this process get_page() gets called.
2200 *
2201 * As soon as the page was really sent over the network put_page()
2202 * gets called by some part of the network layer. [ NIC driver? ]
2203 *
2204 * [ get_page() / put_page() increment/decrement the count. If count
2205 * reaches 0 the page will be freed. ]
2206 *
2207 * This works nicely with pages from FSs.
2208 * But this means that in protocol A we might signal IO completion too early!
2209 *
2210 * In order not to corrupt data during a resync we must make sure
 * that we do not reuse our own buffer pages (EEs) too early, therefore
2212 * we have the net_ee list.
2213 *
2214 * XFS seems to have problems, still, it submits pages with page_count == 0!
2215 * As a workaround, we disable sendpage on pages
2216 * with page_count == 0 or PageSlab.
2217 */
2218static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2219 int offset, size_t size)
2220{
2221 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2222 kunmap(page);
2223 if (sent == size)
2224 mdev->send_cnt += size>>9;
2225 return sent == size;
2226}
2227
2228static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2229 int offset, size_t size)
2230{
2231 mm_segment_t oldfs = get_fs();
2232 int sent, ok;
2233 int len = size;
2234
2235 /* e.g. XFS meta- & log-data is in slab pages, which have a
2236 * page_count of 0 and/or have PageSlab() set.
2237 * we cannot use send_page for those, as that does get_page();
2238 * put_page(); and would cause either a VM_BUG directly, or
2239 * __page_cache_release a page that would actually still be referenced
2240 * by someone, leading to some obscure delayed Oops somewhere else. */
2241 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2242 return _drbd_no_send_page(mdev, page, offset, size);
2243
2244 drbd_update_congested(mdev);
2245 set_fs(KERNEL_DS);
2246 do {
2247 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2248 offset, len,
2249 MSG_NOSIGNAL);
2250 if (sent == -EAGAIN) {
2251 if (we_should_drop_the_connection(mdev,
2252 mdev->data.socket))
2253 break;
2254 else
2255 continue;
2256 }
2257 if (sent <= 0) {
2258 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2259 __func__, (int)size, len, sent);
2260 break;
2261 }
2262 len -= sent;
2263 offset += sent;
2264 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2265 set_fs(oldfs);
2266 clear_bit(NET_CONGESTED, &mdev->flags);
2267
2268 ok = (len == 0);
2269 if (likely(ok))
2270 mdev->send_cnt += size>>9;
2271 return ok;
2272}
2273
2274static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2275{
2276 struct bio_vec *bvec;
2277 int i;
2278 __bio_for_each_segment(bvec, bio, i, 0) {
2279 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2280 bvec->bv_offset, bvec->bv_len))
2281 return 0;
2282 }
2283 return 1;
2284}
2285
2286static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2287{
2288 struct bio_vec *bvec;
2289 int i;
2290 __bio_for_each_segment(bvec, bio, i, 0) {
2291 if (!_drbd_send_page(mdev, bvec->bv_page,
2292 bvec->bv_offset, bvec->bv_len))
2293 return 0;
2294 }
2295
2296 return 1;
2297}
2298
2299/* Used to send write requests
2300 * R_PRIMARY -> Peer (P_DATA)
2301 */
2302int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2303{
2304 int ok = 1;
2305 struct p_data p;
2306 unsigned int dp_flags = 0;
2307 void *dgb;
2308 int dgs;
2309
2310 if (!drbd_get_data_sock(mdev))
2311 return 0;
2312
2313 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2314 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2315
2316 p.head.magic = BE_DRBD_MAGIC;
2317 p.head.command = cpu_to_be16(P_DATA);
2318 p.head.length =
2319 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2320
2321 p.sector = cpu_to_be64(req->sector);
2322 p.block_id = (unsigned long)req;
2323 p.seq_num = cpu_to_be32(req->seq_num =
2324 atomic_add_return(1, &mdev->packet_seq));
2325 dp_flags = 0;
2326
2327 /* NOTE: no need to check if barriers supported here as we would
2328 * not pass the test in make_request_common in that case
2329 */
2330 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2331 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2332 /* dp_flags |= DP_HARDBARRIER; */
2333 }
2334 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2335 dp_flags |= DP_RW_SYNC;
2336 /* for now handle SYNCIO and UNPLUG
2337 * as if they still were one and the same flag */
2338 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2339 dp_flags |= DP_RW_SYNC;
2340 if (mdev->state.conn >= C_SYNC_SOURCE &&
2341 mdev->state.conn <= C_PAUSED_SYNC_T)
2342 dp_flags |= DP_MAY_SET_IN_SYNC;
2343
2344 p.dp_flags = cpu_to_be32(dp_flags);
2345 set_bit(UNPLUG_REMOTE, &mdev->flags);
2346 ok = (sizeof(p) ==
2347 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2348 if (ok && dgs) {
2349 dgb = mdev->int_dig_out;
2350 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2351 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2352 }
2353 if (ok) {
2354 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2355 ok = _drbd_send_bio(mdev, req->master_bio);
2356 else
2357 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2358 }
2359
2360 drbd_put_data_sock(mdev);
2361 return ok;
2362}
2363
2364/* answer packet, used to send data back for read requests:
2365 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2366 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2367 */
2368int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2369 struct drbd_epoch_entry *e)
2370{
2371 int ok;
2372 struct p_data p;
2373 void *dgb;
2374 int dgs;
2375
2376 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2377 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2378
2379 p.head.magic = BE_DRBD_MAGIC;
2380 p.head.command = cpu_to_be16(cmd);
2381 p.head.length =
2382 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2383
2384 p.sector = cpu_to_be64(e->sector);
2385 p.block_id = e->block_id;
2386 /* p.seq_num = 0; No sequence numbers here.. */
2387
2388 /* Only called by our kernel thread.
2389 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2390 * in response to admin command or module unload.
2391 */
2392 if (!drbd_get_data_sock(mdev))
2393 return 0;
2394
2395 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2396 sizeof(p), MSG_MORE);
2397 if (ok && dgs) {
2398 dgb = mdev->int_dig_out;
2399 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2400 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2401 }
2402 if (ok)
2403 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2404
2405 drbd_put_data_sock(mdev);
2406 return ok;
2407}
2408
2409/*
2410 drbd_send distinguishes two cases:
2411
2412 Packets sent via the data socket "sock"
2413 and packets sent via the meta data socket "msock"
2414
2415 sock msock
2416 -----------------+-------------------------+------------------------------
2417 timeout conf.timeout / 2 conf.timeout / 2
2418 timeout action send a ping via msock Abort communication
2419 and close all sockets
2420*/
2421
2422/*
2423 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2424 */
2425int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2426 void *buf, size_t size, unsigned msg_flags)
2427{
2428 struct kvec iov;
2429 struct msghdr msg;
2430 int rv, sent = 0;
2431
2432 if (!sock)
2433 return -1000;
2434
2435 /* THINK if (signal_pending) return ... ? */
2436
2437 iov.iov_base = buf;
2438 iov.iov_len = size;
2439
2440 msg.msg_name = NULL;
2441 msg.msg_namelen = 0;
2442 msg.msg_control = NULL;
2443 msg.msg_controllen = 0;
2444 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2445
2446 if (sock == mdev->data.socket) {
2447 mdev->ko_count = mdev->net_conf->ko_count;
2448 drbd_update_congested(mdev);
2449 }
2450 do {
2451 /* STRANGE
2452 * tcp_sendmsg does _not_ use its size parameter at all ?
2453 *
2454 * -EAGAIN on timeout, -EINTR on signal.
2455 */
2456/* THINK
2457 * do we need to block DRBD_SIG if sock == &meta.socket ??
2458 * otherwise wake_asender() might interrupt some send_*Ack !
2459 */
2460 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2461 if (rv == -EAGAIN) {
2462 if (we_should_drop_the_connection(mdev, sock))
2463 break;
2464 else
2465 continue;
2466 }
2467 D_ASSERT(rv != 0);
2468 if (rv == -EINTR) {
2469 flush_signals(current);
2470 rv = 0;
2471 }
2472 if (rv < 0)
2473 break;
2474 sent += rv;
2475 iov.iov_base += rv;
2476 iov.iov_len -= rv;
2477 } while (sent < size);
2478
2479 if (sock == mdev->data.socket)
2480 clear_bit(NET_CONGESTED, &mdev->flags);
2481
2482 if (rv <= 0) {
2483 if (rv != -EAGAIN) {
2484 dev_err(DEV, "%s_sendmsg returned %d\n",
2485 sock == mdev->meta.socket ? "msock" : "sock",
2486 rv);
2487 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2488 } else
2489 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2490 }
2491
2492 return sent;
2493}
2494
2495static int drbd_open(struct block_device *bdev, fmode_t mode)
2496{
2497 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2498 unsigned long flags;
2499 int rv = 0;
2500
2501 spin_lock_irqsave(&mdev->req_lock, flags);
2502 /* to have a stable mdev->state.role
2503 * and no race with updating open_cnt */
2504
2505 if (mdev->state.role != R_PRIMARY) {
2506 if (mode & FMODE_WRITE)
2507 rv = -EROFS;
2508 else if (!allow_oos)
2509 rv = -EMEDIUMTYPE;
2510 }
2511
2512 if (!rv)
2513 mdev->open_cnt++;
2514 spin_unlock_irqrestore(&mdev->req_lock, flags);
2515
2516 return rv;
2517}
2518
2519static int drbd_release(struct gendisk *gd, fmode_t mode)
2520{
2521 struct drbd_conf *mdev = gd->private_data;
2522 mdev->open_cnt--;
2523 return 0;
2524}
2525
2526static void drbd_unplug_fn(struct request_queue *q)
2527{
2528 struct drbd_conf *mdev = q->queuedata;
2529
2530 /* unplug FIRST */
2531 spin_lock_irq(q->queue_lock);
2532 blk_remove_plug(q);
2533 spin_unlock_irq(q->queue_lock);
2534
2535 /* only if connected */
2536 spin_lock_irq(&mdev->req_lock);
2537 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2538 D_ASSERT(mdev->state.role == R_PRIMARY);
2539 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2540 /* add to the data.work queue,
2541 * unless already queued.
2542 * XXX this might be a good addition to drbd_queue_work
2543 * anyways, to detect "double queuing" ... */
2544 if (list_empty(&mdev->unplug_work.list))
2545 drbd_queue_work(&mdev->data.work,
2546 &mdev->unplug_work);
2547 }
2548 }
2549 spin_unlock_irq(&mdev->req_lock);
2550
2551 if (mdev->state.disk >= D_INCONSISTENT)
2552 drbd_kick_lo(mdev);
2553}
2554
2555static void drbd_set_defaults(struct drbd_conf *mdev)
2556{
2557 mdev->sync_conf.after = DRBD_AFTER_DEF;
2558 mdev->sync_conf.rate = DRBD_RATE_DEF;
2559 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2560 mdev->state = (union drbd_state) {
2561 { .role = R_SECONDARY,
2562 .peer = R_UNKNOWN,
2563 .conn = C_STANDALONE,
2564 .disk = D_DISKLESS,
2565 .pdsk = D_UNKNOWN,
2566 .susp = 0
2567 } };
2568}
2569
2570void drbd_init_set_defaults(struct drbd_conf *mdev)
2571{
2572 /* the memset(,0,) did most of this.
2573 * note: only assignments, no allocation in here */
2574
2575 drbd_set_defaults(mdev);
2576
2577 /* for now, we do NOT yet support it,
2578 * even though we start some framework
2579 * to eventually support barriers */
2580 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2581
2582 atomic_set(&mdev->ap_bio_cnt, 0);
2583 atomic_set(&mdev->ap_pending_cnt, 0);
2584 atomic_set(&mdev->rs_pending_cnt, 0);
2585 atomic_set(&mdev->unacked_cnt, 0);
2586 atomic_set(&mdev->local_cnt, 0);
2587 atomic_set(&mdev->net_cnt, 0);
2588 atomic_set(&mdev->packet_seq, 0);
2589 atomic_set(&mdev->pp_in_use, 0);
2590
2591 mutex_init(&mdev->md_io_mutex);
2592 mutex_init(&mdev->data.mutex);
2593 mutex_init(&mdev->meta.mutex);
2594 sema_init(&mdev->data.work.s, 0);
2595 sema_init(&mdev->meta.work.s, 0);
2596 mutex_init(&mdev->state_mutex);
2597
2598 spin_lock_init(&mdev->data.work.q_lock);
2599 spin_lock_init(&mdev->meta.work.q_lock);
2600
2601 spin_lock_init(&mdev->al_lock);
2602 spin_lock_init(&mdev->req_lock);
2603 spin_lock_init(&mdev->peer_seq_lock);
2604 spin_lock_init(&mdev->epoch_lock);
2605
2606 INIT_LIST_HEAD(&mdev->active_ee);
2607 INIT_LIST_HEAD(&mdev->sync_ee);
2608 INIT_LIST_HEAD(&mdev->done_ee);
2609 INIT_LIST_HEAD(&mdev->read_ee);
2610 INIT_LIST_HEAD(&mdev->net_ee);
2611 INIT_LIST_HEAD(&mdev->resync_reads);
2612 INIT_LIST_HEAD(&mdev->data.work.q);
2613 INIT_LIST_HEAD(&mdev->meta.work.q);
2614 INIT_LIST_HEAD(&mdev->resync_work.list);
2615 INIT_LIST_HEAD(&mdev->unplug_work.list);
2616 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2617 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2618 mdev->resync_work.cb = w_resync_inactive;
2619 mdev->unplug_work.cb = w_send_write_hint;
2620 mdev->md_sync_work.cb = w_md_sync;
2621 mdev->bm_io_work.w.cb = w_bitmap_io;
2622 init_timer(&mdev->resync_timer);
2623 init_timer(&mdev->md_sync_timer);
2624 mdev->resync_timer.function = resync_timer_fn;
2625 mdev->resync_timer.data = (unsigned long) mdev;
2626 mdev->md_sync_timer.function = md_sync_timer_fn;
2627 mdev->md_sync_timer.data = (unsigned long) mdev;
2628
2629 init_waitqueue_head(&mdev->misc_wait);
2630 init_waitqueue_head(&mdev->state_wait);
2631 init_waitqueue_head(&mdev->ee_wait);
2632 init_waitqueue_head(&mdev->al_wait);
2633 init_waitqueue_head(&mdev->seq_wait);
2634
2635 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2636 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2637 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2638
2639 mdev->agreed_pro_version = PRO_VERSION_MAX;
2640 mdev->write_ordering = WO_bio_barrier;
2641 mdev->resync_wenr = LC_FREE;
2642}
2643
2644void drbd_mdev_cleanup(struct drbd_conf *mdev)
2645{
2646 if (mdev->receiver.t_state != None)
2647 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2648 mdev->receiver.t_state);
2649
2650 /* no need to lock it, I'm the only thread alive */
2651 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2652 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2653 mdev->al_writ_cnt =
2654 mdev->bm_writ_cnt =
2655 mdev->read_cnt =
2656 mdev->recv_cnt =
2657 mdev->send_cnt =
2658 mdev->writ_cnt =
2659 mdev->p_size =
2660 mdev->rs_start =
2661 mdev->rs_total =
2662 mdev->rs_failed =
2663 mdev->rs_mark_left =
2664 mdev->rs_mark_time = 0;
2665 D_ASSERT(mdev->net_conf == NULL);
2666
2667 drbd_set_my_capacity(mdev, 0);
2668 if (mdev->bitmap) {
2669 /* maybe never allocated. */
2670 drbd_bm_resize(mdev, 0);
2671 drbd_bm_cleanup(mdev);
2672 }
2673
2674 drbd_free_resources(mdev);
2675
2676 /*
2677 * currently we drbd_init_ee only on module load, so
2678 * we may do drbd_release_ee only on module unload!
2679 */
2680 D_ASSERT(list_empty(&mdev->active_ee));
2681 D_ASSERT(list_empty(&mdev->sync_ee));
2682 D_ASSERT(list_empty(&mdev->done_ee));
2683 D_ASSERT(list_empty(&mdev->read_ee));
2684 D_ASSERT(list_empty(&mdev->net_ee));
2685 D_ASSERT(list_empty(&mdev->resync_reads));
2686 D_ASSERT(list_empty(&mdev->data.work.q));
2687 D_ASSERT(list_empty(&mdev->meta.work.q));
2688 D_ASSERT(list_empty(&mdev->resync_work.list));
2689 D_ASSERT(list_empty(&mdev->unplug_work.list));
2690
2691}
2692
2693
2694static void drbd_destroy_mempools(void)
2695{
2696 struct page *page;
2697
2698 while (drbd_pp_pool) {
2699 page = drbd_pp_pool;
2700 drbd_pp_pool = (struct page *)page_private(page);
2701 __free_page(page);
2702 drbd_pp_vacant--;
2703 }
2704
2705 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2706
2707 if (drbd_ee_mempool)
2708 mempool_destroy(drbd_ee_mempool);
2709 if (drbd_request_mempool)
2710 mempool_destroy(drbd_request_mempool);
2711 if (drbd_ee_cache)
2712 kmem_cache_destroy(drbd_ee_cache);
2713 if (drbd_request_cache)
2714 kmem_cache_destroy(drbd_request_cache);
2715 if (drbd_bm_ext_cache)
2716 kmem_cache_destroy(drbd_bm_ext_cache);
2717 if (drbd_al_ext_cache)
2718 kmem_cache_destroy(drbd_al_ext_cache);
2719
2720 drbd_ee_mempool = NULL;
2721 drbd_request_mempool = NULL;
2722 drbd_ee_cache = NULL;
2723 drbd_request_cache = NULL;
2724 drbd_bm_ext_cache = NULL;
2725 drbd_al_ext_cache = NULL;
2726
2727 return;
2728}
2729
2730static int drbd_create_mempools(void)
2731{
2732 struct page *page;
2733 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2734 int i;
2735
2736 /* prepare our caches and mempools */
2737 drbd_request_mempool = NULL;
2738 drbd_ee_cache = NULL;
2739 drbd_request_cache = NULL;
2740 drbd_bm_ext_cache = NULL;
2741 drbd_al_ext_cache = NULL;
2742 drbd_pp_pool = NULL;
2743
2744 /* caches */
2745 drbd_request_cache = kmem_cache_create(
2746 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2747 if (drbd_request_cache == NULL)
2748 goto Enomem;
2749
2750 drbd_ee_cache = kmem_cache_create(
2751 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2752 if (drbd_ee_cache == NULL)
2753 goto Enomem;
2754
2755 drbd_bm_ext_cache = kmem_cache_create(
2756 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2757 if (drbd_bm_ext_cache == NULL)
2758 goto Enomem;
2759
2760 drbd_al_ext_cache = kmem_cache_create(
2761 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2762 if (drbd_al_ext_cache == NULL)
2763 goto Enomem;
2764
2765 /* mempools */
2766 drbd_request_mempool = mempool_create(number,
2767 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2768 if (drbd_request_mempool == NULL)
2769 goto Enomem;
2770
2771 drbd_ee_mempool = mempool_create(number,
2772 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2773 if (drbd_request_mempool == NULL)
2774 goto Enomem;
2775
2776 /* drbd's page pool */
2777 spin_lock_init(&drbd_pp_lock);
2778
2779 for (i = 0; i < number; i++) {
2780 page = alloc_page(GFP_HIGHUSER);
2781 if (!page)
2782 goto Enomem;
2783 set_page_private(page, (unsigned long)drbd_pp_pool);
2784 drbd_pp_pool = page;
2785 }
2786 drbd_pp_vacant = number;
2787
2788 return 0;
2789
2790Enomem:
2791 drbd_destroy_mempools(); /* in case we allocated some */
2792 return -ENOMEM;
2793}
2794
2795static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2796 void *unused)
2797{
2798 /* just so we have it. you never know what interesting things we
2799 * might want to do here some day...
2800 */
2801
2802 return NOTIFY_DONE;
2803}
2804
2805static struct notifier_block drbd_notifier = {
2806 .notifier_call = drbd_notify_sys,
2807};
2808
2809static void drbd_release_ee_lists(struct drbd_conf *mdev)
2810{
2811 int rr;
2812
2813 rr = drbd_release_ee(mdev, &mdev->active_ee);
2814 if (rr)
2815 dev_err(DEV, "%d EEs in active list found!\n", rr);
2816
2817 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2818 if (rr)
2819 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2820
2821 rr = drbd_release_ee(mdev, &mdev->read_ee);
2822 if (rr)
2823 dev_err(DEV, "%d EEs in read list found!\n", rr);
2824
2825 rr = drbd_release_ee(mdev, &mdev->done_ee);
2826 if (rr)
2827 dev_err(DEV, "%d EEs in done list found!\n", rr);
2828
2829 rr = drbd_release_ee(mdev, &mdev->net_ee);
2830 if (rr)
2831 dev_err(DEV, "%d EEs in net list found!\n", rr);
2832}
2833
2834/* caution. no locking.
2835 * currently only used from module cleanup code. */
2836static void drbd_delete_device(unsigned int minor)
2837{
2838 struct drbd_conf *mdev = minor_to_mdev(minor);
2839
2840 if (!mdev)
2841 return;
2842
2843 /* paranoia asserts */
2844 if (mdev->open_cnt != 0)
2845 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2846 __FILE__ , __LINE__);
2847
2848 ERR_IF (!list_empty(&mdev->data.work.q)) {
2849 struct list_head *lp;
2850 list_for_each(lp, &mdev->data.work.q) {
2851 dev_err(DEV, "lp = %p\n", lp);
2852 }
2853 };
2854 /* end paranoia asserts */
2855
2856 del_gendisk(mdev->vdisk);
2857
2858 /* cleanup stuff that may have been allocated during
2859 * device (re-)configuration or state changes */
2860
2861 if (mdev->this_bdev)
2862 bdput(mdev->this_bdev);
2863
2864 drbd_free_resources(mdev);
2865
2866 drbd_release_ee_lists(mdev);
2867
2868 /* should be free'd on disconnect? */
2869 kfree(mdev->ee_hash);
2870 /*
2871 mdev->ee_hash_s = 0;
2872 mdev->ee_hash = NULL;
2873 */
2874
2875 lc_destroy(mdev->act_log);
2876 lc_destroy(mdev->resync);
2877
2878 kfree(mdev->p_uuid);
2879 /* mdev->p_uuid = NULL; */
2880
2881 kfree(mdev->int_dig_out);
2882 kfree(mdev->int_dig_in);
2883 kfree(mdev->int_dig_vv);
2884
2885 /* cleanup the rest that has been
2886 * allocated from drbd_new_device
2887 * and actually free the mdev itself */
2888 drbd_free_mdev(mdev);
2889}
2890
2891static void drbd_cleanup(void)
2892{
2893 unsigned int i;
2894
2895 unregister_reboot_notifier(&drbd_notifier);
2896
2897 drbd_nl_cleanup();
2898
2899 if (minor_table) {
2900 if (drbd_proc)
2901 remove_proc_entry("drbd", NULL);
2902 i = minor_count;
2903 while (i--)
2904 drbd_delete_device(i);
2905 drbd_destroy_mempools();
2906 }
2907
2908 kfree(minor_table);
2909
2910 unregister_blkdev(DRBD_MAJOR, "drbd");
2911
2912 printk(KERN_INFO "drbd: module cleanup done.\n");
2913}
2914
2915/**
2916 * drbd_congested() - Callback for pdflush
2917 * @congested_data: User data
2918 * @bdi_bits: Bits pdflush is currently interested in
2919 *
2920 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2921 */
2922static int drbd_congested(void *congested_data, int bdi_bits)
2923{
2924 struct drbd_conf *mdev = congested_data;
2925 struct request_queue *q;
2926 char reason = '-';
2927 int r = 0;
2928
2929 if (!__inc_ap_bio_cond(mdev)) {
2930 /* DRBD has frozen IO */
2931 r = bdi_bits;
2932 reason = 'd';
2933 goto out;
2934 }
2935
2936 if (get_ldev(mdev)) {
2937 q = bdev_get_queue(mdev->ldev->backing_bdev);
2938 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2939 put_ldev(mdev);
2940 if (r)
2941 reason = 'b';
2942 }
2943
2944 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2945 r |= (1 << BDI_async_congested);
2946 reason = reason == 'b' ? 'a' : 'n';
2947 }
2948
2949out:
2950 mdev->congestion_reason = reason;
2951 return r;
2952}
2953
2954struct drbd_conf *drbd_new_device(unsigned int minor)
2955{
2956 struct drbd_conf *mdev;
2957 struct gendisk *disk;
2958 struct request_queue *q;
2959
2960 /* GFP_KERNEL, we are outside of all write-out paths */
2961 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2962 if (!mdev)
2963 return NULL;
2964 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2965 goto out_no_cpumask;
2966
2967 mdev->minor = minor;
2968
2969 drbd_init_set_defaults(mdev);
2970
2971 q = blk_alloc_queue(GFP_KERNEL);
2972 if (!q)
2973 goto out_no_q;
2974 mdev->rq_queue = q;
2975 q->queuedata = mdev;
2976 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
2977
2978 disk = alloc_disk(1);
2979 if (!disk)
2980 goto out_no_disk;
2981 mdev->vdisk = disk;
2982
2983 set_disk_ro(disk, TRUE);
2984
2985 disk->queue = q;
2986 disk->major = DRBD_MAJOR;
2987 disk->first_minor = minor;
2988 disk->fops = &drbd_ops;
2989 sprintf(disk->disk_name, "drbd%d", minor);
2990 disk->private_data = mdev;
2991
2992 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2993 /* we have no partitions. we contain only ourselves. */
2994 mdev->this_bdev->bd_contains = mdev->this_bdev;
2995
2996 q->backing_dev_info.congested_fn = drbd_congested;
2997 q->backing_dev_info.congested_data = mdev;
2998
2999 blk_queue_make_request(q, drbd_make_request_26);
3000 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3001 blk_queue_merge_bvec(q, drbd_merge_bvec);
3002 q->queue_lock = &mdev->req_lock; /* needed since we use */
3003 /* plugging on a queue, that actually has no requests! */
3004 q->unplug_fn = drbd_unplug_fn;
3005
3006 mdev->md_io_page = alloc_page(GFP_KERNEL);
3007 if (!mdev->md_io_page)
3008 goto out_no_io_page;
3009
3010 if (drbd_bm_init(mdev))
3011 goto out_no_bitmap;
3012 /* no need to lock access, we are still initializing this minor device. */
3013 if (!tl_init(mdev))
3014 goto out_no_tl;
3015
3016 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3017 if (!mdev->app_reads_hash)
3018 goto out_no_app_reads;
3019
3020 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3021 if (!mdev->current_epoch)
3022 goto out_no_epoch;
3023
3024 INIT_LIST_HEAD(&mdev->current_epoch->list);
3025 mdev->epochs = 1;
3026
3027 return mdev;
3028
3029/* out_whatever_else:
3030 kfree(mdev->current_epoch); */
3031out_no_epoch:
3032 kfree(mdev->app_reads_hash);
3033out_no_app_reads:
3034 tl_cleanup(mdev);
3035out_no_tl:
3036 drbd_bm_cleanup(mdev);
3037out_no_bitmap:
3038 __free_page(mdev->md_io_page);
3039out_no_io_page:
3040 put_disk(disk);
3041out_no_disk:
3042 blk_cleanup_queue(q);
3043out_no_q:
3044 free_cpumask_var(mdev->cpu_mask);
3045out_no_cpumask:
3046 kfree(mdev);
3047 return NULL;
3048}
3049
3050/* counterpart of drbd_new_device.
3051 * last part of drbd_delete_device. */
3052void drbd_free_mdev(struct drbd_conf *mdev)
3053{
3054 kfree(mdev->current_epoch);
3055 kfree(mdev->app_reads_hash);
3056 tl_cleanup(mdev);
3057 if (mdev->bitmap) /* should no longer be there. */
3058 drbd_bm_cleanup(mdev);
3059 __free_page(mdev->md_io_page);
3060 put_disk(mdev->vdisk);
3061 blk_cleanup_queue(mdev->rq_queue);
3062 free_cpumask_var(mdev->cpu_mask);
3063 kfree(mdev);
3064}
3065
3066
3067int __init drbd_init(void)
3068{
3069 int err;
3070
3071 if (sizeof(struct p_handshake) != 80) {
3072 printk(KERN_ERR
3073 "drbd: never change the size or layout "
3074 "of the HandShake packet.\n");
3075 return -EINVAL;
3076 }
3077
3078 if (1 > minor_count || minor_count > 255) {
3079 printk(KERN_ERR
3080 "drbd: invalid minor_count (%d)\n", minor_count);
3081#ifdef MODULE
3082 return -EINVAL;
3083#else
3084 minor_count = 8;
3085#endif
3086 }
3087
3088 err = drbd_nl_init();
3089 if (err)
3090 return err;
3091
3092 err = register_blkdev(DRBD_MAJOR, "drbd");
3093 if (err) {
3094 printk(KERN_ERR
3095 "drbd: unable to register block device major %d\n",
3096 DRBD_MAJOR);
3097 return err;
3098 }
3099
3100 register_reboot_notifier(&drbd_notifier);
3101
3102 /*
3103 * allocate all necessary structs
3104 */
3105 err = -ENOMEM;
3106
3107 init_waitqueue_head(&drbd_pp_wait);
3108
3109 drbd_proc = NULL; /* play safe for drbd_cleanup */
3110 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3111 GFP_KERNEL);
3112 if (!minor_table)
3113 goto Enomem;
3114
3115 err = drbd_create_mempools();
3116 if (err)
3117 goto Enomem;
3118
3119 drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
3120 if (!drbd_proc) {
3121 printk(KERN_ERR "drbd: unable to register proc file\n");
3122 goto Enomem;
3123 }
3124
3125 rwlock_init(&global_state_lock);
3126
3127 printk(KERN_INFO "drbd: initialized. "
3128 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3129 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3130 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3131 printk(KERN_INFO "drbd: registered as block device major %d\n",
3132 DRBD_MAJOR);
3133 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3134
3135 return 0; /* Success! */
3136
3137Enomem:
3138 drbd_cleanup();
3139 if (err == -ENOMEM)
3140 /* currently always the case */
3141 printk(KERN_ERR "drbd: ran out of memory\n");
3142 else
3143 printk(KERN_ERR "drbd: initialization failure\n");
3144 return err;
3145}
3146
3147void drbd_free_bc(struct drbd_backing_dev *ldev)
3148{
3149 if (ldev == NULL)
3150 return;
3151
3152 bd_release(ldev->backing_bdev);
3153 bd_release(ldev->md_bdev);
3154
3155 fput(ldev->lo_file);
3156 fput(ldev->md_file);
3157
3158 kfree(ldev);
3159}
3160
3161void drbd_free_sock(struct drbd_conf *mdev)
3162{
3163 if (mdev->data.socket) {
3164 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3165 sock_release(mdev->data.socket);
3166 mdev->data.socket = NULL;
3167 }
3168 if (mdev->meta.socket) {
3169 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3170 sock_release(mdev->meta.socket);
3171 mdev->meta.socket = NULL;
3172 }
3173}
3174
3175
3176void drbd_free_resources(struct drbd_conf *mdev)
3177{
3178 crypto_free_hash(mdev->csums_tfm);
3179 mdev->csums_tfm = NULL;
3180 crypto_free_hash(mdev->verify_tfm);
3181 mdev->verify_tfm = NULL;
3182 crypto_free_hash(mdev->cram_hmac_tfm);
3183 mdev->cram_hmac_tfm = NULL;
3184 crypto_free_hash(mdev->integrity_w_tfm);
3185 mdev->integrity_w_tfm = NULL;
3186 crypto_free_hash(mdev->integrity_r_tfm);
3187 mdev->integrity_r_tfm = NULL;
3188
3189 drbd_free_sock(mdev);
3190
3191 __no_warn(local,
3192 drbd_free_bc(mdev->ldev);
3193 mdev->ldev = NULL;);
3194}
3195
3196/* meta data management */
3197
3198struct meta_data_on_disk {
3199 u64 la_size; /* last agreed size. */
3200 u64 uuid[UI_SIZE]; /* UUIDs. */
3201 u64 device_uuid;
3202 u64 reserved_u64_1;
3203 u32 flags; /* MDF */
3204 u32 magic;
3205 u32 md_size_sect;
3206 u32 al_offset; /* offset to this block */
3207 u32 al_nr_extents; /* important for restoring the AL */
3208 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3209 u32 bm_offset; /* offset to the bitmap, from here */
3210 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3211 u32 reserved_u32[4];
3212
3213} __packed;
3214
3215/**
3216 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3217 * @mdev: DRBD device.
3218 */
3219void drbd_md_sync(struct drbd_conf *mdev)
3220{
3221 struct meta_data_on_disk *buffer;
3222 sector_t sector;
3223 int i;
3224
3225 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3226 return;
3227 del_timer(&mdev->md_sync_timer);
3228
3229 /* We use here D_FAILED and not D_ATTACHING because we try to write
3230 * metadata even if we detach due to a disk failure! */
3231 if (!get_ldev_if_state(mdev, D_FAILED))
3232 return;
3233
3234 mutex_lock(&mdev->md_io_mutex);
3235 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3236 memset(buffer, 0, 512);
3237
3238 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3239 for (i = UI_CURRENT; i < UI_SIZE; i++)
3240 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3241 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3242 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3243
3244 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3245 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3246 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3247 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3248 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3249
3250 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3251
3252 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3253 sector = mdev->ldev->md.md_offset;
3254
3255 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3256 clear_bit(MD_DIRTY, &mdev->flags);
3257 } else {
3258 /* this was a try anyways ... */
3259 dev_err(DEV, "meta data update failed!\n");
3260
3261 drbd_chk_io_error(mdev, 1, TRUE);
3262 }
3263
3264 /* Update mdev->ldev->md.la_size_sect,
3265 * since we updated it on metadata. */
3266 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3267
3268 mutex_unlock(&mdev->md_io_mutex);
3269 put_ldev(mdev);
3270}
3271
3272/**
3273 * drbd_md_read() - Reads in the meta data super block
3274 * @mdev: DRBD device.
3275 * @bdev: Device from which the meta data should be read in.
3276 *
3277 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3278 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3279 */
3280int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3281{
3282 struct meta_data_on_disk *buffer;
3283 int i, rv = NO_ERROR;
3284
3285 if (!get_ldev_if_state(mdev, D_ATTACHING))
3286 return ERR_IO_MD_DISK;
3287
3288 mutex_lock(&mdev->md_io_mutex);
3289 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3290
3291 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3292 /* NOTE: cant do normal error processing here as this is
3293 called BEFORE disk is attached */
3294 dev_err(DEV, "Error while reading metadata.\n");
3295 rv = ERR_IO_MD_DISK;
3296 goto err;
3297 }
3298
3299 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3300 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3301 rv = ERR_MD_INVALID;
3302 goto err;
3303 }
3304 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3305 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3306 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3307 rv = ERR_MD_INVALID;
3308 goto err;
3309 }
3310 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3311 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3312 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3313 rv = ERR_MD_INVALID;
3314 goto err;
3315 }
3316 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3317 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3318 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3319 rv = ERR_MD_INVALID;
3320 goto err;
3321 }
3322
3323 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3324 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3325 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3326 rv = ERR_MD_INVALID;
3327 goto err;
3328 }
3329
3330 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3331 for (i = UI_CURRENT; i < UI_SIZE; i++)
3332 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3333 bdev->md.flags = be32_to_cpu(buffer->flags);
3334 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3335 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3336
3337 if (mdev->sync_conf.al_extents < 7)
3338 mdev->sync_conf.al_extents = 127;
3339
3340 err:
3341 mutex_unlock(&mdev->md_io_mutex);
3342 put_ldev(mdev);
3343
3344 return rv;
3345}
3346
3347/**
3348 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3349 * @mdev: DRBD device.
3350 *
3351 * Call this function if you change anything that should be written to
3352 * the meta-data super block. This function sets MD_DIRTY, and starts a
3353 * timer that ensures that within five seconds you have to call drbd_md_sync().
3354 */
3355void drbd_md_mark_dirty(struct drbd_conf *mdev)
3356{
3357 set_bit(MD_DIRTY, &mdev->flags);
3358 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3359}
3360
3361
3362static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3363{
3364 int i;
3365
3366 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3367 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3368}
3369
3370void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3371{
3372 if (idx == UI_CURRENT) {
3373 if (mdev->state.role == R_PRIMARY)
3374 val |= 1;
3375 else
3376 val &= ~((u64)1);
3377
3378 drbd_set_ed_uuid(mdev, val);
3379 }
3380
3381 mdev->ldev->md.uuid[idx] = val;
3382 drbd_md_mark_dirty(mdev);
3383}
3384
3385
3386void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3387{
3388 if (mdev->ldev->md.uuid[idx]) {
3389 drbd_uuid_move_history(mdev);
3390 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3391 }
3392 _drbd_uuid_set(mdev, idx, val);
3393}
3394
3395/**
3396 * drbd_uuid_new_current() - Creates a new current UUID
3397 * @mdev: DRBD device.
3398 *
3399 * Creates a new current UUID, and rotates the old current UUID into
3400 * the bitmap slot. Causes an incremental resync upon next connect.
3401 */
3402void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3403{
3404 u64 val;
3405
3406 dev_info(DEV, "Creating new current UUID\n");
3407 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3408 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3409
3410 get_random_bytes(&val, sizeof(u64));
3411 _drbd_uuid_set(mdev, UI_CURRENT, val);
3412}
3413
3414void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3415{
3416 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3417 return;
3418
3419 if (val == 0) {
3420 drbd_uuid_move_history(mdev);
3421 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3422 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3423 } else {
3424 if (mdev->ldev->md.uuid[UI_BITMAP])
3425 dev_warn(DEV, "bm UUID already set");
3426
3427 mdev->ldev->md.uuid[UI_BITMAP] = val;
3428 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3429
3430 }
3431 drbd_md_mark_dirty(mdev);
3432}
3433
3434/**
3435 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3436 * @mdev: DRBD device.
3437 *
3438 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3439 */
3440int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3441{
3442 int rv = -EIO;
3443
3444 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3445 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3446 drbd_md_sync(mdev);
3447 drbd_bm_set_all(mdev);
3448
3449 rv = drbd_bm_write(mdev);
3450
3451 if (!rv) {
3452 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3453 drbd_md_sync(mdev);
3454 }
3455
3456 put_ldev(mdev);
3457 }
3458
3459 return rv;
3460}
3461
3462/**
3463 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3464 * @mdev: DRBD device.
3465 *
3466 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3467 */
3468int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3469{
3470 int rv = -EIO;
3471
3472 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3473 drbd_bm_clear_all(mdev);
3474 rv = drbd_bm_write(mdev);
3475 put_ldev(mdev);
3476 }
3477
3478 return rv;
3479}
3480
3481static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3482{
3483 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3484 int rv;
3485
3486 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3487
3488 drbd_bm_lock(mdev, work->why);
3489 rv = work->io_fn(mdev);
3490 drbd_bm_unlock(mdev);
3491
3492 clear_bit(BITMAP_IO, &mdev->flags);
3493 wake_up(&mdev->misc_wait);
3494
3495 if (work->done)
3496 work->done(mdev, rv);
3497
3498 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3499 work->why = NULL;
3500
3501 return 1;
3502}
3503
3504/**
3505 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3506 * @mdev: DRBD device.
3507 * @io_fn: IO callback to be called when bitmap IO is possible
3508 * @done: callback to be called after the bitmap IO was performed
3509 * @why: Descriptive text of the reason for doing the IO
3510 *
3511 * While IO on the bitmap happens we freeze application IO thus we ensure
3512 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3513 * called from worker context. It MUST NOT be used while a previous such
3514 * work is still pending!
3515 */
3516void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3517 int (*io_fn)(struct drbd_conf *),
3518 void (*done)(struct drbd_conf *, int),
3519 char *why)
3520{
3521 D_ASSERT(current == mdev->worker.task);
3522
3523 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3524 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3525 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3526 if (mdev->bm_io_work.why)
3527 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3528 why, mdev->bm_io_work.why);
3529
3530 mdev->bm_io_work.io_fn = io_fn;
3531 mdev->bm_io_work.done = done;
3532 mdev->bm_io_work.why = why;
3533
3534 set_bit(BITMAP_IO, &mdev->flags);
3535 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3536 if (list_empty(&mdev->bm_io_work.w.list)) {
3537 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3538 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3539 } else
3540 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3541 }
3542}
3543
3544/**
3545 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3546 * @mdev: DRBD device.
3547 * @io_fn: IO callback to be called when bitmap IO is possible
3548 * @why: Descriptive text of the reason for doing the IO
3549 *
3550 * freezes application IO while that the actual IO operations runs. This
3551 * functions MAY NOT be called from worker context.
3552 */
3553int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3554{
3555 int rv;
3556
3557 D_ASSERT(current != mdev->worker.task);
3558
3559 drbd_suspend_io(mdev);
3560
3561 drbd_bm_lock(mdev, why);
3562 rv = io_fn(mdev);
3563 drbd_bm_unlock(mdev);
3564
3565 drbd_resume_io(mdev);
3566
3567 return rv;
3568}
3569
3570void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3571{
3572 if ((mdev->ldev->md.flags & flag) != flag) {
3573 drbd_md_mark_dirty(mdev);
3574 mdev->ldev->md.flags |= flag;
3575 }
3576}
3577
3578void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3579{
3580 if ((mdev->ldev->md.flags & flag) != 0) {
3581 drbd_md_mark_dirty(mdev);
3582 mdev->ldev->md.flags &= ~flag;
3583 }
3584}
3585int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3586{
3587 return (bdev->md.flags & flag) != 0;
3588}
3589
3590static void md_sync_timer_fn(unsigned long data)
3591{
3592 struct drbd_conf *mdev = (struct drbd_conf *) data;
3593
3594 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3595}
3596
3597static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3598{
3599 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3600 drbd_md_sync(mdev);
3601
3602 return 1;
3603}
3604
3605#ifdef CONFIG_DRBD_FAULT_INJECTION
3606/* Fault insertion support including random number generator shamelessly
3607 * stolen from kernel/rcutorture.c */
/* State of the fault injection pseudo random number generator. */
struct fault_random_state {
	unsigned long state;	/* current generator state */
	unsigned long count;	/* calls left until the next reseed */
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	/*
	 * count is unsigned, so a "--count < 0" test can never be true and
	 * the generator would never be reseeded.  Test for zero before
	 * decrementing instead; the wrapped value is overwritten right away.
	 */
	if (rsp->count-- == 0) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
3634
3635static char *
3636_drbd_fault_str(unsigned int type) {
3637 static char *_faults[] = {
3638 [DRBD_FAULT_MD_WR] = "Meta-data write",
3639 [DRBD_FAULT_MD_RD] = "Meta-data read",
3640 [DRBD_FAULT_RS_WR] = "Resync write",
3641 [DRBD_FAULT_RS_RD] = "Resync read",
3642 [DRBD_FAULT_DT_WR] = "Data write",
3643 [DRBD_FAULT_DT_RD] = "Data read",
3644 [DRBD_FAULT_DT_RA] = "Data read ahead",
3645 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3646 [DRBD_FAULT_AL_EE] = "EE allocation"
3647 };
3648
3649 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3650}
3651
3652unsigned int
3653_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3654{
3655 static struct fault_random_state rrs = {0, 0};
3656
3657 unsigned int ret = (
3658 (fault_devs == 0 ||
3659 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3660 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3661
3662 if (ret) {
3663 fault_count++;
3664
3665 if (printk_ratelimit())
3666 dev_warn(DEV, "***Simulating %s failure\n",
3667 _drbd_fault_str(type));
3668 }
3669
3670 return ret;
3671}
3672#endif
3673
/*
 * Returns a string identifying this build.  The string is generated on the
 * first call and cached in a static buffer: "srcversion: ..." when built
 * with CONFIG_MODULES and THIS_MODULE is set, "built-in" otherwise.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	/* a leading NUL marks the tag as "not generated yet" */
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded, so an over-long srcversion can not overflow the buffer */
			snprintf(buildtag, sizeof(buildtag), "srcversion: %-24s",
				 THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3692
3693module_init(drbd_init)
3694module_exit(drbd_cleanup)
3695
3696EXPORT_SYMBOL(drbd_conn_str);
3697EXPORT_SYMBOL(drbd_role_str);
3698EXPORT_SYMBOL(drbd_disk_str);
3699EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000000..436a090b532b
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2364 @@
1/*
2 drbd_nl.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/in.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/slab.h>
32#include <linux/connector.h>
33#include <linux/blkpg.h>
34#include <linux/cpumask.h>
35#include "drbd_int.h"
36#include "drbd_wrappers.h"
37#include <asm/unaligned.h>
38#include <linux/drbd_tag_magic.h>
39#include <linux/drbd_limits.h>
40
41static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
42static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
43static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
44
45/* see get_sb_bdev and bd_claim */
46static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
47
48/* Generate the tag_list to struct functions */
49#define NL_PACKET(name, number, fields) \
50static int name ## _from_tags(struct drbd_conf *mdev, \
51 unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
52static int name ## _from_tags(struct drbd_conf *mdev, \
53 unsigned short *tags, struct name *arg) \
54{ \
55 int tag; \
56 int dlen; \
57 \
58 while ((tag = get_unaligned(tags++)) != TT_END) { \
59 dlen = get_unaligned(tags++); \
60 switch (tag_number(tag)) { \
61 fields \
62 default: \
63 if (tag & T_MANDATORY) { \
64 dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
65 return 0; \
66 } \
67 } \
68 tags = (unsigned short *)((char *)tags + dlen); \
69 } \
70 return 1; \
71}
72#define NL_INTEGER(pn, pr, member) \
73 case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
74 arg->member = get_unaligned((int *)(tags)); \
75 break;
76#define NL_INT64(pn, pr, member) \
77 case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
78 arg->member = get_unaligned((u64 *)(tags)); \
79 break;
80#define NL_BIT(pn, pr, member) \
81 case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
82 arg->member = *(char *)(tags) ? 1 : 0; \
83 break;
84#define NL_STRING(pn, pr, member, len) \
85 case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
86 if (dlen > len) { \
87 dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
88 #member, dlen, (unsigned int)len); \
89 return 0; \
90 } \
91 arg->member ## _len = dlen; \
92 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
93 break;
94#include "linux/drbd_nl.h"
95
96/* Generate the struct to tag_list functions */
97#define NL_PACKET(name, number, fields) \
98static unsigned short* \
99name ## _to_tags(struct drbd_conf *mdev, \
100 struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
101static unsigned short* \
102name ## _to_tags(struct drbd_conf *mdev, \
103 struct name *arg, unsigned short *tags) \
104{ \
105 fields \
106 return tags; \
107}
108
109#define NL_INTEGER(pn, pr, member) \
110 put_unaligned(pn | pr | TT_INTEGER, tags++); \
111 put_unaligned(sizeof(int), tags++); \
112 put_unaligned(arg->member, (int *)tags); \
113 tags = (unsigned short *)((char *)tags+sizeof(int));
114#define NL_INT64(pn, pr, member) \
115 put_unaligned(pn | pr | TT_INT64, tags++); \
116 put_unaligned(sizeof(u64), tags++); \
117 put_unaligned(arg->member, (u64 *)tags); \
118 tags = (unsigned short *)((char *)tags+sizeof(u64));
119#define NL_BIT(pn, pr, member) \
120 put_unaligned(pn | pr | TT_BIT, tags++); \
121 put_unaligned(sizeof(char), tags++); \
122 *(char *)tags = arg->member; \
123 tags = (unsigned short *)((char *)tags+sizeof(char));
124#define NL_STRING(pn, pr, member, len) \
125 put_unaligned(pn | pr | TT_STRING, tags++); \
126 put_unaligned(arg->member ## _len, tags++); \
127 memcpy(tags, arg->member, arg->member ## _len); \
128 tags = (unsigned short *)((char *)tags + arg->member ## _len);
129#include "linux/drbd_nl.h"
130
131void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
132void drbd_nl_send_reply(struct cn_msg *, int);
133
134int drbd_khelper(struct drbd_conf *mdev, char *cmd)
135{
136 char *envp[] = { "HOME=/",
137 "TERM=linux",
138 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
139 NULL, /* Will be set to address family */
140 NULL, /* Will be set to address */
141 NULL };
142
143 char mb[12], af[20], ad[60], *afs;
144 char *argv[] = {usermode_helper, cmd, mb, NULL };
145 int ret;
146
147 snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
148
149 if (get_net_conf(mdev)) {
150 switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
151 case AF_INET6:
152 afs = "ipv6";
153 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
154 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
155 break;
156 case AF_INET:
157 afs = "ipv4";
158 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
159 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
160 break;
161 default:
162 afs = "ssocks";
163 snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
164 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
165 }
166 snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
167 envp[3]=af;
168 envp[4]=ad;
169 put_net_conf(mdev);
170 }
171
172 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
173
174 drbd_bcast_ev_helper(mdev, cmd);
175 ret = call_usermodehelper(usermode_helper, argv, envp, 1);
176 if (ret)
177 dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
178 usermode_helper, cmd, mb,
179 (ret >> 8) & 0xff, ret);
180 else
181 dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
182 usermode_helper, cmd, mb,
183 (ret >> 8) & 0xff, ret);
184
185 if (ret < 0) /* Ignore any ERRNOs we got. */
186 ret = 0;
187
188 return ret;
189}
190
191enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
192{
193 char *ex_to_string;
194 int r;
195 enum drbd_disk_state nps;
196 enum drbd_fencing_p fp;
197
198 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
199
200 if (get_ldev_if_state(mdev, D_CONSISTENT)) {
201 fp = mdev->ldev->dc.fencing;
202 put_ldev(mdev);
203 } else {
204 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
205 return mdev->state.pdsk;
206 }
207
208 if (fp == FP_STONITH)
209 _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
210
211 r = drbd_khelper(mdev, "fence-peer");
212
213 switch ((r>>8) & 0xff) {
214 case 3: /* peer is inconsistent */
215 ex_to_string = "peer is inconsistent or worse";
216 nps = D_INCONSISTENT;
217 break;
218 case 4: /* peer got outdated, or was already outdated */
219 ex_to_string = "peer was fenced";
220 nps = D_OUTDATED;
221 break;
222 case 5: /* peer was down */
223 if (mdev->state.disk == D_UP_TO_DATE) {
224 /* we will(have) create(d) a new UUID anyways... */
225 ex_to_string = "peer is unreachable, assumed to be dead";
226 nps = D_OUTDATED;
227 } else {
228 ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
229 nps = mdev->state.pdsk;
230 }
231 break;
232 case 6: /* Peer is primary, voluntarily outdate myself.
233 * This is useful when an unconnected R_SECONDARY is asked to
234 * become R_PRIMARY, but finds the other peer being active. */
235 ex_to_string = "peer is active";
236 dev_warn(DEV, "Peer is primary, outdating myself.\n");
237 nps = D_UNKNOWN;
238 _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
239 break;
240 case 7:
241 if (fp != FP_STONITH)
242 dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
243 ex_to_string = "peer was stonithed";
244 nps = D_OUTDATED;
245 break;
246 default:
247 /* The script is broken ... */
248 nps = D_UNKNOWN;
249 dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
250 return nps;
251 }
252
253 dev_info(DEV, "fence-peer helper returned %d (%s)\n",
254 (r>>8) & 0xff, ex_to_string);
255 return nps;
256}
257
258
259int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
260{
261 const int max_tries = 4;
262 int r = 0;
263 int try = 0;
264 int forced = 0;
265 union drbd_state mask, val;
266 enum drbd_disk_state nps;
267
268 if (new_role == R_PRIMARY)
269 request_ping(mdev); /* Detect a dead peer ASAP */
270
271 mutex_lock(&mdev->state_mutex);
272
273 mask.i = 0; mask.role = R_MASK;
274 val.i = 0; val.role = new_role;
275
276 while (try++ < max_tries) {
277 r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
278
279 /* in case we first succeeded to outdate,
280 * but now suddenly could establish a connection */
281 if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
282 val.pdsk = 0;
283 mask.pdsk = 0;
284 continue;
285 }
286
287 if (r == SS_NO_UP_TO_DATE_DISK && force &&
288 (mdev->state.disk == D_INCONSISTENT ||
289 mdev->state.disk == D_OUTDATED)) {
290 mask.disk = D_MASK;
291 val.disk = D_UP_TO_DATE;
292 forced = 1;
293 continue;
294 }
295
296 if (r == SS_NO_UP_TO_DATE_DISK &&
297 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
298 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
299 nps = drbd_try_outdate_peer(mdev);
300
301 if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
302 val.disk = D_UP_TO_DATE;
303 mask.disk = D_MASK;
304 }
305
306 val.pdsk = nps;
307 mask.pdsk = D_MASK;
308
309 continue;
310 }
311
312 if (r == SS_NOTHING_TO_DO)
313 goto fail;
314 if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
315 nps = drbd_try_outdate_peer(mdev);
316
317 if (force && nps > D_OUTDATED) {
318 dev_warn(DEV, "Forced into split brain situation!\n");
319 nps = D_OUTDATED;
320 }
321
322 mask.pdsk = D_MASK;
323 val.pdsk = nps;
324
325 continue;
326 }
327 if (r == SS_TWO_PRIMARIES) {
328 /* Maybe the peer is detected as dead very soon...
329 retry at most once more in this case. */
330 __set_current_state(TASK_INTERRUPTIBLE);
331 schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
332 if (try < max_tries)
333 try = max_tries - 1;
334 continue;
335 }
336 if (r < SS_SUCCESS) {
337 r = _drbd_request_state(mdev, mask, val,
338 CS_VERBOSE + CS_WAIT_COMPLETE);
339 if (r < SS_SUCCESS)
340 goto fail;
341 }
342 break;
343 }
344
345 if (r < SS_SUCCESS)
346 goto fail;
347
348 if (forced)
349 dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
350
351 /* Wait until nothing is on the fly :) */
352 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
353
354 if (new_role == R_SECONDARY) {
355 set_disk_ro(mdev->vdisk, TRUE);
356 if (get_ldev(mdev)) {
357 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
358 put_ldev(mdev);
359 }
360 } else {
361 if (get_net_conf(mdev)) {
362 mdev->net_conf->want_lose = 0;
363 put_net_conf(mdev);
364 }
365 set_disk_ro(mdev->vdisk, FALSE);
366 if (get_ldev(mdev)) {
367 if (((mdev->state.conn < C_CONNECTED ||
368 mdev->state.pdsk <= D_FAILED)
369 && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
370 drbd_uuid_new_current(mdev);
371
372 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
373 put_ldev(mdev);
374 }
375 }
376
377 if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
378 drbd_al_to_on_disk_bm(mdev);
379 put_ldev(mdev);
380 }
381
382 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
383 /* if this was forced, we should consider sync */
384 if (forced)
385 drbd_send_uuids(mdev);
386 drbd_send_state(mdev);
387 }
388
389 drbd_md_sync(mdev);
390
391 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
392 fail:
393 mutex_unlock(&mdev->state_mutex);
394 return r;
395}
396
397
398static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
399 struct drbd_nl_cfg_reply *reply)
400{
401 struct primary primary_args;
402
403 memset(&primary_args, 0, sizeof(struct primary));
404 if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
405 reply->ret_code = ERR_MANDATORY_TAG;
406 return 0;
407 }
408
409 reply->ret_code =
410 drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
411
412 return 0;
413}
414
415static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
416 struct drbd_nl_cfg_reply *reply)
417{
418 reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
419
420 return 0;
421}
422
423/* initializes the md.*_offset members, so we are able to find
424 * the on disk meta data */
425static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
426 struct drbd_backing_dev *bdev)
427{
428 sector_t md_size_sect = 0;
429 switch (bdev->dc.meta_dev_idx) {
430 default:
431 /* v07 style fixed size indexed meta data */
432 bdev->md.md_size_sect = MD_RESERVED_SECT;
433 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
434 bdev->md.al_offset = MD_AL_OFFSET;
435 bdev->md.bm_offset = MD_BM_OFFSET;
436 break;
437 case DRBD_MD_INDEX_FLEX_EXT:
438 /* just occupy the full device; unit: sectors */
439 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
440 bdev->md.md_offset = 0;
441 bdev->md.al_offset = MD_AL_OFFSET;
442 bdev->md.bm_offset = MD_BM_OFFSET;
443 break;
444 case DRBD_MD_INDEX_INTERNAL:
445 case DRBD_MD_INDEX_FLEX_INT:
446 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
447 /* al size is still fixed */
448 bdev->md.al_offset = -MD_AL_MAX_SIZE;
449 /* we need (slightly less than) ~ this much bitmap sectors: */
450 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
451 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
452 md_size_sect = BM_SECT_TO_EXT(md_size_sect);
453 md_size_sect = ALIGN(md_size_sect, 8);
454
455 /* plus the "drbd meta data super block",
456 * and the activity log; */
457 md_size_sect += MD_BM_OFFSET;
458
459 bdev->md.md_size_sect = md_size_sect;
460 /* bitmap offset is adjusted by 'super' block size */
461 bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
462 break;
463 }
464}
465
/*
 * ppsize() - pretty print a size given in KB
 * @buf:	destination buffer, at least 9 bytes.
 * @size:	size in units of 1024 bytes.
 *
 * Scales @size down by factors of 1024 (with rounding) until it is below
 * 10000 or the largest known unit is reached, then formats it as
 * "<number> <unit>B" with unit one of K, M, G, T, P, E.
 *
 * Needs 9 bytes at max, including the trailing NUL:
 * the largest possible input yields "16384 EB".
 *
 * Returns @buf.
 */
char *ppsize(char *buf, unsigned long long size)
{
	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	int base = 0;

	/* stop at the last unit, so units[base] never reads past the table */
	while (size >= 10000 && base < (int)sizeof(units) - 1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	sprintf(buf, "%lu %cB", (unsigned long)size, units[base]);

	return buf;
}
480
481/* there is still a theoretical deadlock when called from receiver
482 * on an D_INCONSISTENT R_PRIMARY:
483 * remote READ does inc_ap_bio, receiver would need to receive answer
484 * packet from remote to dec_ap_bio again.
485 * receiver receive_sizes(), comes here,
486 * waits for ap_bio_cnt == 0. -> deadlock.
487 * but this cannot happen, actually, because:
488 * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
489 * (not connected, or bad/no disk on peer):
490 * see drbd_fail_request_early, ap_bio_cnt is zero.
491 * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
492 * peer may not initiate a resize.
493 */
494void drbd_suspend_io(struct drbd_conf *mdev)
495{
496 set_bit(SUSPEND_IO, &mdev->flags);
497 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
498}
499
500void drbd_resume_io(struct drbd_conf *mdev)
501{
502 clear_bit(SUSPEND_IO, &mdev->flags);
503 wake_up(&mdev->misc_wait);
504}
505
506/**
507 * drbd_determine_dev_size() - Sets the right device size obeying all constraints
508 * @mdev: DRBD device.
509 *
510 * Returns 0 on success, negative return values indicate errors.
511 * You should call drbd_md_sync() after calling this function.
512 */
513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
514{
515 sector_t prev_first_sect, prev_size; /* previous meta location */
516 sector_t la_size;
517 sector_t size;
518 char ppb[10];
519
520 int md_moved, la_size_changed;
521 enum determine_dev_size rv = unchanged;
522
523 /* race:
524 * application request passes inc_ap_bio,
525 * but then cannot get an AL-reference.
526 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
527 *
528 * to avoid that:
529 * Suspend IO right here.
530 * still lock the act_log to not trigger ASSERTs there.
531 */
532 drbd_suspend_io(mdev);
533
534 /* no wait necessary anymore, actually we could assert that */
535 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
536
537 prev_first_sect = drbd_md_first_sector(mdev->ldev);
538 prev_size = mdev->ldev->md.md_size_sect;
539 la_size = mdev->ldev->md.la_size_sect;
540
541 /* TODO: should only be some assert here, not (re)init... */
542 drbd_md_set_sector_offsets(mdev, mdev->ldev);
543
544 size = drbd_new_dev_size(mdev, mdev->ldev);
545
546 if (drbd_get_capacity(mdev->this_bdev) != size ||
547 drbd_bm_capacity(mdev) != size) {
548 int err;
549 err = drbd_bm_resize(mdev, size);
550 if (unlikely(err)) {
551 /* currently there is only one error: ENOMEM! */
552 size = drbd_bm_capacity(mdev)>>1;
553 if (size == 0) {
554 dev_err(DEV, "OUT OF MEMORY! "
555 "Could not allocate bitmap!\n");
556 } else {
557 dev_err(DEV, "BM resizing failed. "
558 "Leaving size unchanged at size = %lu KB\n",
559 (unsigned long)size);
560 }
561 rv = dev_size_error;
562 }
563 /* racy, see comments above. */
564 drbd_set_my_capacity(mdev, size);
565 mdev->ldev->md.la_size_sect = size;
566 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
567 (unsigned long long)size>>1);
568 }
569 if (rv == dev_size_error)
570 goto out;
571
572 la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
573
574 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
575 || prev_size != mdev->ldev->md.md_size_sect;
576
577 if (la_size_changed || md_moved) {
578 drbd_al_shrink(mdev); /* All extents inactive. */
579 dev_info(DEV, "Writing the whole bitmap, %s\n",
580 la_size_changed && md_moved ? "size changed and md moved" :
581 la_size_changed ? "size changed" : "md moved");
582 rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
583 drbd_md_mark_dirty(mdev);
584 }
585
586 if (size > la_size)
587 rv = grew;
588 if (size < la_size)
589 rv = shrunk;
590out:
591 lc_unlock(mdev->act_log);
592 wake_up(&mdev->al_wait);
593 drbd_resume_io(mdev);
594
595 return rv;
596}
597
598sector_t
599drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
600{
601 sector_t p_size = mdev->p_size; /* partner's disk size. */
602 sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
603 sector_t m_size; /* my size */
604 sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
605 sector_t size = 0;
606
607 m_size = drbd_get_max_capacity(bdev);
608
609 if (p_size && m_size) {
610 size = min_t(sector_t, p_size, m_size);
611 } else {
612 if (la_size) {
613 size = la_size;
614 if (m_size && m_size < size)
615 size = m_size;
616 if (p_size && p_size < size)
617 size = p_size;
618 } else {
619 if (m_size)
620 size = m_size;
621 if (p_size)
622 size = p_size;
623 }
624 }
625
626 if (size == 0)
627 dev_err(DEV, "Both nodes diskless!\n");
628
629 if (u_size) {
630 if (u_size > size)
631 dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
632 (unsigned long)u_size>>1, (unsigned long)size>>1);
633 else
634 size = u_size;
635 }
636
637 return size;
638}
639
640/**
641 * drbd_check_al_size() - Ensures that the AL is of the right size
642 * @mdev: DRBD device.
643 *
644 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
645 * failed, and 0 on success. You should call drbd_md_sync() after you called
646 * this function.
647 */
648static int drbd_check_al_size(struct drbd_conf *mdev)
649{
650 struct lru_cache *n, *t;
651 struct lc_element *e;
652 unsigned int in_use;
653 int i;
654
655 ERR_IF(mdev->sync_conf.al_extents < 7)
656 mdev->sync_conf.al_extents = 127;
657
658 if (mdev->act_log &&
659 mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
660 return 0;
661
662 in_use = 0;
663 t = mdev->act_log;
664 n = lc_create("act_log", drbd_al_ext_cache,
665 mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
666
667 if (n == NULL) {
668 dev_err(DEV, "Cannot allocate act_log lru!\n");
669 return -ENOMEM;
670 }
671 spin_lock_irq(&mdev->al_lock);
672 if (t) {
673 for (i = 0; i < t->nr_elements; i++) {
674 e = lc_element_by_index(t, i);
675 if (e->refcnt)
676 dev_err(DEV, "refcnt(%d)==%d\n",
677 e->lc_number, e->refcnt);
678 in_use += e->refcnt;
679 }
680 }
681 if (!in_use)
682 mdev->act_log = n;
683 spin_unlock_irq(&mdev->al_lock);
684 if (in_use) {
685 dev_err(DEV, "Activity log still in use!\n");
686 lc_destroy(n);
687 return -EBUSY;
688 } else {
689 if (t)
690 lc_destroy(t);
691 }
692 drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
693 return 0;
694}
695
696void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
697{
698 struct request_queue * const q = mdev->rq_queue;
699 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
700 int max_segments = mdev->ldev->dc.max_bio_bvecs;
701
702 if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
703 max_seg_s = PAGE_SIZE;
704
705 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
706
707 blk_queue_max_sectors(q, max_seg_s >> 9);
708 blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
709 blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
710 blk_queue_max_segment_size(q, max_seg_s);
711 blk_queue_logical_block_size(q, 512);
712 blk_queue_segment_boundary(q, PAGE_SIZE-1);
713 blk_stack_limits(&q->limits, &b->limits, 0);
714
715 if (b->merge_bvec_fn)
716 dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
717 b->merge_bvec_fn);
718 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
719
720 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
721 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
722 q->backing_dev_info.ra_pages,
723 b->backing_dev_info.ra_pages);
724 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
725 }
726}
727
728/* serialize deconfig (worker exiting, doing cleanup)
729 * and reconfig (drbdsetup disk, drbdsetup net)
730 *
731 * wait for a potentially exiting worker, then restart it,
732 * or start a new one.
733 */
734static void drbd_reconfig_start(struct drbd_conf *mdev)
735{
736 wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
737 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
738 drbd_thread_start(&mdev->worker);
739}
740
741/* if still unconfigured, stops worker again.
742 * if configured now, clears CONFIG_PENDING.
743 * wakes potential waiters */
744static void drbd_reconfig_done(struct drbd_conf *mdev)
745{
746 spin_lock_irq(&mdev->req_lock);
747 if (mdev->state.disk == D_DISKLESS &&
748 mdev->state.conn == C_STANDALONE &&
749 mdev->state.role == R_SECONDARY) {
750 set_bit(DEVICE_DYING, &mdev->flags);
751 drbd_thread_stop_nowait(&mdev->worker);
752 } else
753 clear_bit(CONFIG_PENDING, &mdev->flags);
754 spin_unlock_irq(&mdev->req_lock);
755 wake_up(&mdev->state_wait);
756}
757
758/* does always return 0;
759 * interesting return code is in reply->ret_code */
760static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
761 struct drbd_nl_cfg_reply *reply)
762{
763 enum drbd_ret_codes retcode;
764 enum determine_dev_size dd;
765 sector_t max_possible_sectors;
766 sector_t min_md_device_sectors;
767 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
768 struct inode *inode, *inode2;
769 struct lru_cache *resync_lru = NULL;
770 union drbd_state ns, os;
771 int rv;
772 int cp_discovered = 0;
773 int logical_block_size;
774
775 drbd_reconfig_start(mdev);
776
777 /* if you want to reconfigure, please tear down first */
778 if (mdev->state.disk > D_DISKLESS) {
779 retcode = ERR_DISK_CONFIGURED;
780 goto fail;
781 }
782
783 /* allocation not in the IO path, cqueue thread context */
784 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
785 if (!nbc) {
786 retcode = ERR_NOMEM;
787 goto fail;
788 }
789
790 nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
791 nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
792 nbc->dc.fencing = DRBD_FENCING_DEF;
793 nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
794
795 if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
796 retcode = ERR_MANDATORY_TAG;
797 goto fail;
798 }
799
800 if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
801 retcode = ERR_MD_IDX_INVALID;
802 goto fail;
803 }
804
805 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
806 if (IS_ERR(nbc->lo_file)) {
807 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
808 PTR_ERR(nbc->lo_file));
809 nbc->lo_file = NULL;
810 retcode = ERR_OPEN_DISK;
811 goto fail;
812 }
813
814 inode = nbc->lo_file->f_dentry->d_inode;
815
816 if (!S_ISBLK(inode->i_mode)) {
817 retcode = ERR_DISK_NOT_BDEV;
818 goto fail;
819 }
820
821 nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
822 if (IS_ERR(nbc->md_file)) {
823 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
824 PTR_ERR(nbc->md_file));
825 nbc->md_file = NULL;
826 retcode = ERR_OPEN_MD_DISK;
827 goto fail;
828 }
829
830 inode2 = nbc->md_file->f_dentry->d_inode;
831
832 if (!S_ISBLK(inode2->i_mode)) {
833 retcode = ERR_MD_NOT_BDEV;
834 goto fail;
835 }
836
837 nbc->backing_bdev = inode->i_bdev;
838 if (bd_claim(nbc->backing_bdev, mdev)) {
839 printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
840 nbc->backing_bdev, mdev,
841 nbc->backing_bdev->bd_holder,
842 nbc->backing_bdev->bd_contains->bd_holder,
843 nbc->backing_bdev->bd_holders);
844 retcode = ERR_BDCLAIM_DISK;
845 goto fail;
846 }
847
848 resync_lru = lc_create("resync", drbd_bm_ext_cache,
849 61, sizeof(struct bm_extent),
850 offsetof(struct bm_extent, lce));
851 if (!resync_lru) {
852 retcode = ERR_NOMEM;
853 goto release_bdev_fail;
854 }
855
856 /* meta_dev_idx >= 0: external fixed size,
857 * possibly multiple drbd sharing one meta device.
858 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
859 * not yet used by some other drbd minor!
860 * (if you use drbd.conf + drbdadm,
861 * that should check it for you already; but if you don't, or someone
862 * fooled it, we need to double check here) */
863 nbc->md_bdev = inode2->i_bdev;
864 if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
865 : (void *) drbd_m_holder)) {
866 retcode = ERR_BDCLAIM_MD_DISK;
867 goto release_bdev_fail;
868 }
869
870 if ((nbc->backing_bdev == nbc->md_bdev) !=
871 (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
872 nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
873 retcode = ERR_MD_IDX_INVALID;
874 goto release_bdev2_fail;
875 }
876
877 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
878 drbd_md_set_sector_offsets(mdev, nbc);
879
880 if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
881 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
882 (unsigned long long) drbd_get_max_capacity(nbc),
883 (unsigned long long) nbc->dc.disk_size);
884 retcode = ERR_DISK_TO_SMALL;
885 goto release_bdev2_fail;
886 }
887
888 if (nbc->dc.meta_dev_idx < 0) {
889 max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
890 /* at least one MB, otherwise it does not make sense */
891 min_md_device_sectors = (2<<10);
892 } else {
893 max_possible_sectors = DRBD_MAX_SECTORS;
894 min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
895 }
896
897 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
898 retcode = ERR_MD_DISK_TO_SMALL;
899 dev_warn(DEV, "refusing attach: md-device too small, "
900 "at least %llu sectors needed for this meta-disk type\n",
901 (unsigned long long) min_md_device_sectors);
902 goto release_bdev2_fail;
903 }
904
905 /* Make sure the new disk is big enough
906 * (we may currently be R_PRIMARY with no local disk...) */
907 if (drbd_get_max_capacity(nbc) <
908 drbd_get_capacity(mdev->this_bdev)) {
909 retcode = ERR_DISK_TO_SMALL;
910 goto release_bdev2_fail;
911 }
912
913 nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
914
915 if (nbc->known_size > max_possible_sectors) {
916 dev_warn(DEV, "==> truncating very big lower level device "
917 "to currently maximum possible %llu sectors <==\n",
918 (unsigned long long) max_possible_sectors);
919 if (nbc->dc.meta_dev_idx >= 0)
920 dev_warn(DEV, "==>> using internal or flexible "
921 "meta data may help <<==\n");
922 }
923
924 drbd_suspend_io(mdev);
925 /* also wait for the last barrier ack. */
926 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
927 /* and for any other previously queued work */
928 drbd_flush_workqueue(mdev);
929
930 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
931 drbd_resume_io(mdev);
932 if (retcode < SS_SUCCESS)
933 goto release_bdev2_fail;
934
935 if (!get_ldev_if_state(mdev, D_ATTACHING))
936 goto force_diskless;
937
938 drbd_md_set_sector_offsets(mdev, nbc);
939
940 if (!mdev->bitmap) {
941 if (drbd_bm_init(mdev)) {
942 retcode = ERR_NOMEM;
943 goto force_diskless_dec;
944 }
945 }
946
947 retcode = drbd_md_read(mdev, nbc);
948 if (retcode != NO_ERROR)
949 goto force_diskless_dec;
950
951 if (mdev->state.conn < C_CONNECTED &&
952 mdev->state.role == R_PRIMARY &&
953 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
954 dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
955 (unsigned long long)mdev->ed_uuid);
956 retcode = ERR_DATA_NOT_CURRENT;
957 goto force_diskless_dec;
958 }
959
960 /* Since we are diskless, fix the activity log first... */
961 if (drbd_check_al_size(mdev)) {
962 retcode = ERR_NOMEM;
963 goto force_diskless_dec;
964 }
965
966 /* Prevent shrinking of consistent devices ! */
967 if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
968 drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
969 dev_warn(DEV, "refusing to truncate a consistent device\n");
970 retcode = ERR_DISK_TO_SMALL;
971 goto force_diskless_dec;
972 }
973
974 if (!drbd_al_read_log(mdev, nbc)) {
975 retcode = ERR_IO_MD_DISK;
976 goto force_diskless_dec;
977 }
978
979 /* allocate a second IO page if logical_block_size != 512 */
980 logical_block_size = bdev_logical_block_size(nbc->md_bdev);
981 if (logical_block_size == 0)
982 logical_block_size = MD_SECTOR_SIZE;
983
984 if (logical_block_size != MD_SECTOR_SIZE) {
985 if (!mdev->md_io_tmpp) {
986 struct page *page = alloc_page(GFP_NOIO);
987 if (!page)
988 goto force_diskless_dec;
989
990 dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
991 logical_block_size, MD_SECTOR_SIZE);
992 dev_warn(DEV, "Workaround engaged (has performance impact).\n");
993
994 mdev->md_io_tmpp = page;
995 }
996 }
997
998 /* Reset the "barriers don't work" bits here, then force meta data to
999 * be written, to ensure we determine if barriers are supported. */
1000 if (nbc->dc.no_md_flush)
1001 set_bit(MD_NO_BARRIER, &mdev->flags);
1002 else
1003 clear_bit(MD_NO_BARRIER, &mdev->flags);
1004
1005 /* Point of no return reached.
1006 * Devices and memory are no longer released by error cleanup below.
1007 * now mdev takes over responsibility, and the state engine should
1008 * clean it up somewhere. */
1009 D_ASSERT(mdev->ldev == NULL);
1010 mdev->ldev = nbc;
1011 mdev->resync = resync_lru;
1012 nbc = NULL;
1013 resync_lru = NULL;
1014
1015 mdev->write_ordering = WO_bio_barrier;
1016 drbd_bump_write_ordering(mdev, WO_bio_barrier);
1017
1018 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1019 set_bit(CRASHED_PRIMARY, &mdev->flags);
1020 else
1021 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1022
1023 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
1024 set_bit(CRASHED_PRIMARY, &mdev->flags);
1025 cp_discovered = 1;
1026 }
1027
1028 mdev->send_cnt = 0;
1029 mdev->recv_cnt = 0;
1030 mdev->read_cnt = 0;
1031 mdev->writ_cnt = 0;
1032
1033 drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
1034
1035 /* If I am currently not R_PRIMARY,
1036 * but meta data primary indicator is set,
1037 * I just now recover from a hard crash,
1038 * and have been R_PRIMARY before that crash.
1039 *
1040 * Now, if I had no connection before that crash
1041 * (have been degraded R_PRIMARY), chances are that
1042 * I won't find my peer now either.
1043 *
1044 * In that case, and _only_ in that case,
1045 * we use the degr-wfc-timeout instead of the default,
1046 * so we can automatically recover from a crash of a
1047 * degraded but active "cluster" after a certain timeout.
1048 */
1049 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
1050 if (mdev->state.role != R_PRIMARY &&
1051 drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1052 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1053 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1054
1055 dd = drbd_determin_dev_size(mdev);
1056 if (dd == dev_size_error) {
1057 retcode = ERR_NOMEM_BITMAP;
1058 goto force_diskless_dec;
1059 } else if (dd == grew)
1060 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1061
1062 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1063 dev_info(DEV, "Assuming that all blocks are out of sync "
1064 "(aka FullSync)\n");
1065 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
1066 retcode = ERR_IO_MD_DISK;
1067 goto force_diskless_dec;
1068 }
1069 } else {
1070 if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
1071 retcode = ERR_IO_MD_DISK;
1072 goto force_diskless_dec;
1073 }
1074 }
1075
1076 if (cp_discovered) {
1077 drbd_al_apply_to_bm(mdev);
1078 drbd_al_to_on_disk_bm(mdev);
1079 }
1080
1081 spin_lock_irq(&mdev->req_lock);
1082 os = mdev->state;
1083 ns.i = os.i;
1084 /* If MDF_CONSISTENT is not set go into inconsistent state,
1085 otherwise investigate MDF_WasUpToDate...
1086 If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1087 otherwise into D_CONSISTENT state.
1088 */
1089 if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
1090 if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
1091 ns.disk = D_CONSISTENT;
1092 else
1093 ns.disk = D_OUTDATED;
1094 } else {
1095 ns.disk = D_INCONSISTENT;
1096 }
1097
1098 if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
1099 ns.pdsk = D_OUTDATED;
1100
1101 if ( ns.disk == D_CONSISTENT &&
1102 (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
1103 ns.disk = D_UP_TO_DATE;
1104
1105 /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1106 MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1107 this point, because drbd_request_state() modifies these
1108 flags. */
1109
1110 /* In case we are C_CONNECTED postpone any decision on the new disk
1111 state after the negotiation phase. */
1112 if (mdev->state.conn == C_CONNECTED) {
1113 mdev->new_state_tmp.i = ns.i;
1114 ns.i = os.i;
1115 ns.disk = D_NEGOTIATING;
1116 }
1117
1118 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1119 ns = mdev->state;
1120 spin_unlock_irq(&mdev->req_lock);
1121
1122 if (rv < SS_SUCCESS)
1123 goto force_diskless_dec;
1124
1125 if (mdev->state.role == R_PRIMARY)
1126 mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
1127 else
1128 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1129
1130 drbd_md_mark_dirty(mdev);
1131 drbd_md_sync(mdev);
1132
1133 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1134 put_ldev(mdev);
1135 reply->ret_code = retcode;
1136 drbd_reconfig_done(mdev);
1137 return 0;
1138
1139 force_diskless_dec:
1140 put_ldev(mdev);
1141 force_diskless:
1142 drbd_force_state(mdev, NS(disk, D_DISKLESS));
1143 drbd_md_sync(mdev);
1144 release_bdev2_fail:
1145 if (nbc)
1146 bd_release(nbc->md_bdev);
1147 release_bdev_fail:
1148 if (nbc)
1149 bd_release(nbc->backing_bdev);
1150 fail:
1151 if (nbc) {
1152 if (nbc->lo_file)
1153 fput(nbc->lo_file);
1154 if (nbc->md_file)
1155 fput(nbc->md_file);
1156 kfree(nbc);
1157 }
1158 lc_destroy(resync_lru);
1159
1160 reply->ret_code = retcode;
1161 drbd_reconfig_done(mdev);
1162 return 0;
1163}
1164
1165static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1166 struct drbd_nl_cfg_reply *reply)
1167{
1168 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1169 return 0;
1170}
1171
1172static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1173 struct drbd_nl_cfg_reply *reply)
1174{
1175 int i, ns;
1176 enum drbd_ret_codes retcode;
1177 struct net_conf *new_conf = NULL;
1178 struct crypto_hash *tfm = NULL;
1179 struct crypto_hash *integrity_w_tfm = NULL;
1180 struct crypto_hash *integrity_r_tfm = NULL;
1181 struct hlist_head *new_tl_hash = NULL;
1182 struct hlist_head *new_ee_hash = NULL;
1183 struct drbd_conf *odev;
1184 char hmac_name[CRYPTO_MAX_ALG_NAME];
1185 void *int_dig_out = NULL;
1186 void *int_dig_in = NULL;
1187 void *int_dig_vv = NULL;
1188 struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
1189
1190 drbd_reconfig_start(mdev);
1191
1192 if (mdev->state.conn > C_STANDALONE) {
1193 retcode = ERR_NET_CONFIGURED;
1194 goto fail;
1195 }
1196
1197 /* allocation not in the IO path, cqueue thread context */
1198 new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
1199 if (!new_conf) {
1200 retcode = ERR_NOMEM;
1201 goto fail;
1202 }
1203
1204 memset(new_conf, 0, sizeof(struct net_conf));
1205 new_conf->timeout = DRBD_TIMEOUT_DEF;
1206 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
1207 new_conf->ping_int = DRBD_PING_INT_DEF;
1208 new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
1209 new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
1210 new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
1211 new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
1212 new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
1213 new_conf->ko_count = DRBD_KO_COUNT_DEF;
1214 new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
1215 new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
1216 new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
1217 new_conf->want_lose = 0;
1218 new_conf->two_primaries = 0;
1219 new_conf->wire_protocol = DRBD_PROT_C;
1220 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
1221 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
1222
1223 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
1224 retcode = ERR_MANDATORY_TAG;
1225 goto fail;
1226 }
1227
1228 if (new_conf->two_primaries
1229 && (new_conf->wire_protocol != DRBD_PROT_C)) {
1230 retcode = ERR_NOT_PROTO_C;
1231 goto fail;
1232 };
1233
1234 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1235 retcode = ERR_DISCARD;
1236 goto fail;
1237 }
1238
1239 retcode = NO_ERROR;
1240
1241 new_my_addr = (struct sockaddr *)&new_conf->my_addr;
1242 new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
1243 for (i = 0; i < minor_count; i++) {
1244 odev = minor_to_mdev(i);
1245 if (!odev || odev == mdev)
1246 continue;
1247 if (get_net_conf(odev)) {
1248 taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
1249 if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
1250 !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
1251 retcode = ERR_LOCAL_ADDR;
1252
1253 taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
1254 if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
1255 !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
1256 retcode = ERR_PEER_ADDR;
1257
1258 put_net_conf(odev);
1259 if (retcode != NO_ERROR)
1260 goto fail;
1261 }
1262 }
1263
1264 if (new_conf->cram_hmac_alg[0] != 0) {
1265 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
1266 new_conf->cram_hmac_alg);
1267 tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
1268 if (IS_ERR(tfm)) {
1269 tfm = NULL;
1270 retcode = ERR_AUTH_ALG;
1271 goto fail;
1272 }
1273
1274 if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
1275 != CRYPTO_ALG_TYPE_HASH) {
1276 retcode = ERR_AUTH_ALG_ND;
1277 goto fail;
1278 }
1279 }
1280
1281 if (new_conf->integrity_alg[0]) {
1282 integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1283 if (IS_ERR(integrity_w_tfm)) {
1284 integrity_w_tfm = NULL;
1285 retcode=ERR_INTEGRITY_ALG;
1286 goto fail;
1287 }
1288
1289 if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
1290 retcode=ERR_INTEGRITY_ALG_ND;
1291 goto fail;
1292 }
1293
1294 integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
1295 if (IS_ERR(integrity_r_tfm)) {
1296 integrity_r_tfm = NULL;
1297 retcode=ERR_INTEGRITY_ALG;
1298 goto fail;
1299 }
1300 }
1301
1302 ns = new_conf->max_epoch_size/8;
1303 if (mdev->tl_hash_s != ns) {
1304 new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1305 if (!new_tl_hash) {
1306 retcode = ERR_NOMEM;
1307 goto fail;
1308 }
1309 }
1310
1311 ns = new_conf->max_buffers/8;
1312 if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
1313 new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
1314 if (!new_ee_hash) {
1315 retcode = ERR_NOMEM;
1316 goto fail;
1317 }
1318 }
1319
1320 ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
1321
1322 if (integrity_w_tfm) {
1323 i = crypto_hash_digestsize(integrity_w_tfm);
1324 int_dig_out = kmalloc(i, GFP_KERNEL);
1325 if (!int_dig_out) {
1326 retcode = ERR_NOMEM;
1327 goto fail;
1328 }
1329 int_dig_in = kmalloc(i, GFP_KERNEL);
1330 if (!int_dig_in) {
1331 retcode = ERR_NOMEM;
1332 goto fail;
1333 }
1334 int_dig_vv = kmalloc(i, GFP_KERNEL);
1335 if (!int_dig_vv) {
1336 retcode = ERR_NOMEM;
1337 goto fail;
1338 }
1339 }
1340
1341 if (!mdev->bitmap) {
1342 if(drbd_bm_init(mdev)) {
1343 retcode = ERR_NOMEM;
1344 goto fail;
1345 }
1346 }
1347
1348 spin_lock_irq(&mdev->req_lock);
1349 if (mdev->net_conf != NULL) {
1350 retcode = ERR_NET_CONFIGURED;
1351 spin_unlock_irq(&mdev->req_lock);
1352 goto fail;
1353 }
1354 mdev->net_conf = new_conf;
1355
1356 mdev->send_cnt = 0;
1357 mdev->recv_cnt = 0;
1358
1359 if (new_tl_hash) {
1360 kfree(mdev->tl_hash);
1361 mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
1362 mdev->tl_hash = new_tl_hash;
1363 }
1364
1365 if (new_ee_hash) {
1366 kfree(mdev->ee_hash);
1367 mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
1368 mdev->ee_hash = new_ee_hash;
1369 }
1370
1371 crypto_free_hash(mdev->cram_hmac_tfm);
1372 mdev->cram_hmac_tfm = tfm;
1373
1374 crypto_free_hash(mdev->integrity_w_tfm);
1375 mdev->integrity_w_tfm = integrity_w_tfm;
1376
1377 crypto_free_hash(mdev->integrity_r_tfm);
1378 mdev->integrity_r_tfm = integrity_r_tfm;
1379
1380 kfree(mdev->int_dig_out);
1381 kfree(mdev->int_dig_in);
1382 kfree(mdev->int_dig_vv);
1383 mdev->int_dig_out=int_dig_out;
1384 mdev->int_dig_in=int_dig_in;
1385 mdev->int_dig_vv=int_dig_vv;
1386 spin_unlock_irq(&mdev->req_lock);
1387
1388 retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
1389
1390 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1391 reply->ret_code = retcode;
1392 drbd_reconfig_done(mdev);
1393 return 0;
1394
1395fail:
1396 kfree(int_dig_out);
1397 kfree(int_dig_in);
1398 kfree(int_dig_vv);
1399 crypto_free_hash(tfm);
1400 crypto_free_hash(integrity_w_tfm);
1401 crypto_free_hash(integrity_r_tfm);
1402 kfree(new_tl_hash);
1403 kfree(new_ee_hash);
1404 kfree(new_conf);
1405
1406 reply->ret_code = retcode;
1407 drbd_reconfig_done(mdev);
1408 return 0;
1409}
1410
1411static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1412 struct drbd_nl_cfg_reply *reply)
1413{
1414 int retcode;
1415
1416 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1417
1418 if (retcode == SS_NOTHING_TO_DO)
1419 goto done;
1420 else if (retcode == SS_ALREADY_STANDALONE)
1421 goto done;
1422 else if (retcode == SS_PRIMARY_NOP) {
1423 /* Our statche checking code wants to see the peer outdated. */
1424 retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1425 pdsk, D_OUTDATED));
1426 } else if (retcode == SS_CW_FAILED_BY_PEER) {
1427 /* The peer probably wants to see us outdated. */
1428 retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
1429 disk, D_OUTDATED),
1430 CS_ORDERED);
1431 if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
1432 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1433 retcode = SS_SUCCESS;
1434 }
1435 }
1436
1437 if (retcode < SS_SUCCESS)
1438 goto fail;
1439
1440 if (wait_event_interruptible(mdev->state_wait,
1441 mdev->state.conn != C_DISCONNECTING)) {
1442 /* Do not test for mdev->state.conn == C_STANDALONE, since
1443 someone else might connect us in the mean time! */
1444 retcode = ERR_INTR;
1445 goto fail;
1446 }
1447
1448 done:
1449 retcode = NO_ERROR;
1450 fail:
1451 drbd_md_sync(mdev);
1452 reply->ret_code = retcode;
1453 return 0;
1454}
1455
1456void resync_after_online_grow(struct drbd_conf *mdev)
1457{
1458 int iass; /* I am sync source */
1459
1460 dev_info(DEV, "Resync of new storage after online grow\n");
1461 if (mdev->state.role != mdev->state.peer)
1462 iass = (mdev->state.role == R_PRIMARY);
1463 else
1464 iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1465
1466 if (iass)
1467 drbd_start_resync(mdev, C_SYNC_SOURCE);
1468 else
1469 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
1470}
1471
1472static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1473 struct drbd_nl_cfg_reply *reply)
1474{
1475 struct resize rs;
1476 int retcode = NO_ERROR;
1477 int ldsc = 0; /* local disk size changed */
1478 enum determine_dev_size dd;
1479
1480 memset(&rs, 0, sizeof(struct resize));
1481 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
1482 retcode = ERR_MANDATORY_TAG;
1483 goto fail;
1484 }
1485
1486 if (mdev->state.conn > C_CONNECTED) {
1487 retcode = ERR_RESIZE_RESYNC;
1488 goto fail;
1489 }
1490
1491 if (mdev->state.role == R_SECONDARY &&
1492 mdev->state.peer == R_SECONDARY) {
1493 retcode = ERR_NO_PRIMARY;
1494 goto fail;
1495 }
1496
1497 if (!get_ldev(mdev)) {
1498 retcode = ERR_NO_DISK;
1499 goto fail;
1500 }
1501
1502 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
1503 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1504 ldsc = 1;
1505 }
1506
1507 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1508 dd = drbd_determin_dev_size(mdev);
1509 drbd_md_sync(mdev);
1510 put_ldev(mdev);
1511 if (dd == dev_size_error) {
1512 retcode = ERR_NOMEM_BITMAP;
1513 goto fail;
1514 }
1515
1516 if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
1517 if (dd == grew)
1518 set_bit(RESIZE_PENDING, &mdev->flags);
1519
1520 drbd_send_uuids(mdev);
1521 drbd_send_sizes(mdev, 1);
1522 }
1523
1524 fail:
1525 reply->ret_code = retcode;
1526 return 0;
1527}
1528
1529static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1530 struct drbd_nl_cfg_reply *reply)
1531{
1532 int retcode = NO_ERROR;
1533 int err;
1534 int ovr; /* online verify running */
1535 int rsr; /* re-sync running */
1536 struct crypto_hash *verify_tfm = NULL;
1537 struct crypto_hash *csums_tfm = NULL;
1538 struct syncer_conf sc;
1539 cpumask_var_t new_cpu_mask;
1540
1541 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1542 retcode = ERR_NOMEM;
1543 goto fail;
1544 }
1545
1546 if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
1547 memset(&sc, 0, sizeof(struct syncer_conf));
1548 sc.rate = DRBD_RATE_DEF;
1549 sc.after = DRBD_AFTER_DEF;
1550 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1551 } else
1552 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1553
1554 if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
1555 retcode = ERR_MANDATORY_TAG;
1556 goto fail;
1557 }
1558
1559 /* re-sync running */
1560 rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
1561 mdev->state.conn == C_SYNC_TARGET ||
1562 mdev->state.conn == C_PAUSED_SYNC_S ||
1563 mdev->state.conn == C_PAUSED_SYNC_T );
1564
1565 if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
1566 retcode = ERR_CSUMS_RESYNC_RUNNING;
1567 goto fail;
1568 }
1569
1570 if (!rsr && sc.csums_alg[0]) {
1571 csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
1572 if (IS_ERR(csums_tfm)) {
1573 csums_tfm = NULL;
1574 retcode = ERR_CSUMS_ALG;
1575 goto fail;
1576 }
1577
1578 if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
1579 retcode = ERR_CSUMS_ALG_ND;
1580 goto fail;
1581 }
1582 }
1583
1584 /* online verify running */
1585 ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
1586
1587 if (ovr) {
1588 if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
1589 retcode = ERR_VERIFY_RUNNING;
1590 goto fail;
1591 }
1592 }
1593
1594 if (!ovr && sc.verify_alg[0]) {
1595 verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
1596 if (IS_ERR(verify_tfm)) {
1597 verify_tfm = NULL;
1598 retcode = ERR_VERIFY_ALG;
1599 goto fail;
1600 }
1601
1602 if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
1603 retcode = ERR_VERIFY_ALG_ND;
1604 goto fail;
1605 }
1606 }
1607
1608 /* silently ignore cpu mask on UP kernel */
1609 if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
1610 err = __bitmap_parse(sc.cpu_mask, 32, 0,
1611 cpumask_bits(new_cpu_mask), nr_cpu_ids);
1612 if (err) {
1613 dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
1614 retcode = ERR_CPU_MASK_PARSE;
1615 goto fail;
1616 }
1617 }
1618
1619 ERR_IF (sc.rate < 1) sc.rate = 1;
1620 ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
1621#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
1622 if (sc.al_extents > AL_MAX) {
1623 dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
1624 sc.al_extents = AL_MAX;
1625 }
1626#undef AL_MAX
1627
1628 /* most sanity checks done, try to assign the new sync-after
1629 * dependency. need to hold the global lock in there,
1630 * to avoid a race in the dependency loop check. */
1631 retcode = drbd_alter_sa(mdev, sc.after);
1632 if (retcode != NO_ERROR)
1633 goto fail;
1634
1635 /* ok, assign the rest of it as well.
1636 * lock against receive_SyncParam() */
1637 spin_lock(&mdev->peer_seq_lock);
1638 mdev->sync_conf = sc;
1639
1640 if (!rsr) {
1641 crypto_free_hash(mdev->csums_tfm);
1642 mdev->csums_tfm = csums_tfm;
1643 csums_tfm = NULL;
1644 }
1645
1646 if (!ovr) {
1647 crypto_free_hash(mdev->verify_tfm);
1648 mdev->verify_tfm = verify_tfm;
1649 verify_tfm = NULL;
1650 }
1651 spin_unlock(&mdev->peer_seq_lock);
1652
1653 if (get_ldev(mdev)) {
1654 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
1655 drbd_al_shrink(mdev);
1656 err = drbd_check_al_size(mdev);
1657 lc_unlock(mdev->act_log);
1658 wake_up(&mdev->al_wait);
1659
1660 put_ldev(mdev);
1661 drbd_md_sync(mdev);
1662
1663 if (err) {
1664 retcode = ERR_NOMEM;
1665 goto fail;
1666 }
1667 }
1668
1669 if (mdev->state.conn >= C_CONNECTED)
1670 drbd_send_sync_param(mdev, &sc);
1671
1672 if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
1673 cpumask_copy(mdev->cpu_mask, new_cpu_mask);
1674 drbd_calc_cpu_mask(mdev);
1675 mdev->receiver.reset_cpu_mask = 1;
1676 mdev->asender.reset_cpu_mask = 1;
1677 mdev->worker.reset_cpu_mask = 1;
1678 }
1679
1680 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1681fail:
1682 free_cpumask_var(new_cpu_mask);
1683 crypto_free_hash(csums_tfm);
1684 crypto_free_hash(verify_tfm);
1685 reply->ret_code = retcode;
1686 return 0;
1687}
1688
1689static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1690 struct drbd_nl_cfg_reply *reply)
1691{
1692 int retcode;
1693
1694 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1695
1696 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
1697 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1698
1699 while (retcode == SS_NEED_CONNECTION) {
1700 spin_lock_irq(&mdev->req_lock);
1701 if (mdev->state.conn < C_CONNECTED)
1702 retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
1703 spin_unlock_irq(&mdev->req_lock);
1704
1705 if (retcode != SS_NEED_CONNECTION)
1706 break;
1707
1708 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1709 }
1710
1711 reply->ret_code = retcode;
1712 return 0;
1713}
1714
1715static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1716 struct drbd_nl_cfg_reply *reply)
1717{
1718
1719 reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
1720
1721 return 0;
1722}
1723
1724static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1725 struct drbd_nl_cfg_reply *reply)
1726{
1727 int retcode = NO_ERROR;
1728
1729 if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
1730 retcode = ERR_PAUSE_IS_SET;
1731
1732 reply->ret_code = retcode;
1733 return 0;
1734}
1735
1736static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1737 struct drbd_nl_cfg_reply *reply)
1738{
1739 int retcode = NO_ERROR;
1740
1741 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
1742 retcode = ERR_PAUSE_IS_CLEAR;
1743
1744 reply->ret_code = retcode;
1745 return 0;
1746}
1747
1748static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1749 struct drbd_nl_cfg_reply *reply)
1750{
1751 reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
1752
1753 return 0;
1754}
1755
1756static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1757 struct drbd_nl_cfg_reply *reply)
1758{
1759 reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
1760 return 0;
1761}
1762
1763static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1764 struct drbd_nl_cfg_reply *reply)
1765{
1766 reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
1767 return 0;
1768}
1769
1770static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1771 struct drbd_nl_cfg_reply *reply)
1772{
1773 unsigned short *tl;
1774
1775 tl = reply->tag_list;
1776
1777 if (get_ldev(mdev)) {
1778 tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
1779 put_ldev(mdev);
1780 }
1781
1782 if (get_net_conf(mdev)) {
1783 tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
1784 put_net_conf(mdev);
1785 }
1786 tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
1787
1788 put_unaligned(TT_END, tl++); /* Close the tag list */
1789
1790 return (int)((char *)tl - (char *)reply->tag_list);
1791}
1792
1793static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1794 struct drbd_nl_cfg_reply *reply)
1795{
1796 unsigned short *tl = reply->tag_list;
1797 union drbd_state s = mdev->state;
1798 unsigned long rs_left;
1799 unsigned int res;
1800
1801 tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
1802
1803 /* no local ref, no bitmap, no syncer progress. */
1804 if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
1805 if (get_ldev(mdev)) {
1806 drbd_get_syncer_progress(mdev, &rs_left, &res);
1807 tl = tl_add_int(tl, T_sync_progress, &res);
1808 put_ldev(mdev);
1809 }
1810 }
1811 put_unaligned(TT_END, tl++); /* Close the tag list */
1812
1813 return (int)((char *)tl - (char *)reply->tag_list);
1814}
1815
1816static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1817 struct drbd_nl_cfg_reply *reply)
1818{
1819 unsigned short *tl;
1820
1821 tl = reply->tag_list;
1822
1823 if (get_ldev(mdev)) {
1824 tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
1825 tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
1826 put_ldev(mdev);
1827 }
1828 put_unaligned(TT_END, tl++); /* Close the tag list */
1829
1830 return (int)((char *)tl - (char *)reply->tag_list);
1831}
1832
1833/**
1834 * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
1835 * @mdev: DRBD device.
1836 * @nlp: Netlink/connector packet from drbdsetup
1837 * @reply: Reply packet for drbdsetup
1838 */
1839static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1840 struct drbd_nl_cfg_reply *reply)
1841{
1842 unsigned short *tl;
1843 char rv;
1844
1845 tl = reply->tag_list;
1846
1847 rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
1848 test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
1849
1850 tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
1851 put_unaligned(TT_END, tl++); /* Close the tag list */
1852
1853 return (int)((char *)tl - (char *)reply->tag_list);
1854}
1855
1856static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1857 struct drbd_nl_cfg_reply *reply)
1858{
1859 /* default to resume from last known position, if possible */
1860 struct start_ov args =
1861 { .start_sector = mdev->ov_start_sector };
1862
1863 if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
1864 reply->ret_code = ERR_MANDATORY_TAG;
1865 return 0;
1866 }
1867 /* w_make_ov_request expects position to be aligned */
1868 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
1869 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
1870 return 0;
1871}
1872
1873
1874static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1875 struct drbd_nl_cfg_reply *reply)
1876{
1877 int retcode = NO_ERROR;
1878 int skip_initial_sync = 0;
1879 int err;
1880
1881 struct new_c_uuid args;
1882
1883 memset(&args, 0, sizeof(struct new_c_uuid));
1884 if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
1885 reply->ret_code = ERR_MANDATORY_TAG;
1886 return 0;
1887 }
1888
1889 mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
1890
1891 if (!get_ldev(mdev)) {
1892 retcode = ERR_NO_DISK;
1893 goto out;
1894 }
1895
1896 /* this is "skip initial sync", assume to be clean */
1897 if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
1898 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
1899 dev_info(DEV, "Preparing to skip initial sync\n");
1900 skip_initial_sync = 1;
1901 } else if (mdev->state.conn != C_STANDALONE) {
1902 retcode = ERR_CONNECTED;
1903 goto out_dec;
1904 }
1905
1906 drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
1907 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
1908
1909 if (args.clear_bm) {
1910 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
1911 if (err) {
1912 dev_err(DEV, "Writing bitmap failed with %d\n",err);
1913 retcode = ERR_IO_MD_DISK;
1914 }
1915 if (skip_initial_sync) {
1916 drbd_send_uuids_skip_initial_sync(mdev);
1917 _drbd_uuid_set(mdev, UI_BITMAP, 0);
1918 spin_lock_irq(&mdev->req_lock);
1919 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
1920 CS_VERBOSE, NULL);
1921 spin_unlock_irq(&mdev->req_lock);
1922 }
1923 }
1924
1925 drbd_md_sync(mdev);
1926out_dec:
1927 put_ldev(mdev);
1928out:
1929 mutex_unlock(&mdev->state_mutex);
1930
1931 reply->ret_code = retcode;
1932 return 0;
1933}
1934
1935static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
1936{
1937 struct drbd_conf *mdev;
1938
1939 if (nlp->drbd_minor >= minor_count)
1940 return NULL;
1941
1942 mdev = minor_to_mdev(nlp->drbd_minor);
1943
1944 if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
1945 struct gendisk *disk = NULL;
1946 mdev = drbd_new_device(nlp->drbd_minor);
1947
1948 spin_lock_irq(&drbd_pp_lock);
1949 if (minor_table[nlp->drbd_minor] == NULL) {
1950 minor_table[nlp->drbd_minor] = mdev;
1951 disk = mdev->vdisk;
1952 mdev = NULL;
1953 } /* else: we lost the race */
1954 spin_unlock_irq(&drbd_pp_lock);
1955
1956 if (disk) /* we won the race above */
1957 /* in case we ever add a drbd_delete_device(),
1958 * don't forget the del_gendisk! */
1959 add_disk(disk);
1960 else /* we lost the race above */
1961 drbd_free_mdev(mdev);
1962
1963 mdev = minor_to_mdev(nlp->drbd_minor);
1964 }
1965
1966 return mdev;
1967}
1968
/* One entry of the packet dispatch table used by drbd_connector_callback(). */
struct cn_handler_struct {
	/* Handler for the packet type.  Its return value is added to
	 * sizeof(struct drbd_nl_cfg_reply) to form the reply length. */
	int (*function)(struct drbd_conf *mdev,
			struct drbd_nl_cfg_req *nlp,
			struct drbd_nl_cfg_reply *reply);
	/* Extra bytes to allocate for the reply beyond the fixed part. */
	int reply_body_size;
};
1975
1976static struct cn_handler_struct cnd_table[] = {
1977 [ P_primary ] = { &drbd_nl_primary, 0 },
1978 [ P_secondary ] = { &drbd_nl_secondary, 0 },
1979 [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
1980 [ P_detach ] = { &drbd_nl_detach, 0 },
1981 [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
1982 [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
1983 [ P_resize ] = { &drbd_nl_resize, 0 },
1984 [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
1985 [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
1986 [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
1987 [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
1988 [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
1989 [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
1990 [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
1991 [ P_outdate ] = { &drbd_nl_outdate, 0 },
1992 [ P_get_config ] = { &drbd_nl_get_config,
1993 sizeof(struct syncer_conf_tag_len_struct) +
1994 sizeof(struct disk_conf_tag_len_struct) +
1995 sizeof(struct net_conf_tag_len_struct) },
1996 [ P_get_state ] = { &drbd_nl_get_state,
1997 sizeof(struct get_state_tag_len_struct) +
1998 sizeof(struct sync_progress_tag_len_struct) },
1999 [ P_get_uuids ] = { &drbd_nl_get_uuids,
2000 sizeof(struct get_uuids_tag_len_struct) },
2001 [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
2002 sizeof(struct get_timeout_flag_tag_len_struct)},
2003 [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
2004 [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
2005};
2006
2007static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
2008{
2009 struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
2010 struct cn_handler_struct *cm;
2011 struct cn_msg *cn_reply;
2012 struct drbd_nl_cfg_reply *reply;
2013 struct drbd_conf *mdev;
2014 int retcode, rr;
2015 int reply_size = sizeof(struct cn_msg)
2016 + sizeof(struct drbd_nl_cfg_reply)
2017 + sizeof(short int);
2018
2019 if (!try_module_get(THIS_MODULE)) {
2020 printk(KERN_ERR "drbd: try_module_get() failed!\n");
2021 return;
2022 }
2023
2024 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
2025 retcode = ERR_PERM;
2026 goto fail;
2027 }
2028
2029 mdev = ensure_mdev(nlp);
2030 if (!mdev) {
2031 retcode = ERR_MINOR_INVALID;
2032 goto fail;
2033 }
2034
2035 if (nlp->packet_type >= P_nl_after_last_packet) {
2036 retcode = ERR_PACKET_NR;
2037 goto fail;
2038 }
2039
2040 cm = cnd_table + nlp->packet_type;
2041
2042 /* This may happen if packet number is 0: */
2043 if (cm->function == NULL) {
2044 retcode = ERR_PACKET_NR;
2045 goto fail;
2046 }
2047
2048 reply_size += cm->reply_body_size;
2049
2050 /* allocation not in the IO path, cqueue thread context */
2051 cn_reply = kmalloc(reply_size, GFP_KERNEL);
2052 if (!cn_reply) {
2053 retcode = ERR_NOMEM;
2054 goto fail;
2055 }
2056 reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
2057
2058 reply->packet_type =
2059 cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
2060 reply->minor = nlp->drbd_minor;
2061 reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
2062 /* reply->tag_list; might be modified by cm->function. */
2063
2064 rr = cm->function(mdev, nlp, reply);
2065
2066 cn_reply->id = req->id;
2067 cn_reply->seq = req->seq;
2068 cn_reply->ack = req->ack + 1;
2069 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
2070 cn_reply->flags = 0;
2071
2072 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
2073 if (rr && rr != -ESRCH)
2074 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2075
2076 kfree(cn_reply);
2077 module_put(THIS_MODULE);
2078 return;
2079 fail:
2080 drbd_nl_send_reply(req, retcode);
2081 module_put(THIS_MODULE);
2082}
2083
2084static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
2085
2086static unsigned short *
2087__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
2088 unsigned short len, int nul_terminated)
2089{
2090 unsigned short l = tag_descriptions[tag_number(tag)].max_len;
2091 len = (len < l) ? len : l;
2092 put_unaligned(tag, tl++);
2093 put_unaligned(len, tl++);
2094 memcpy(tl, data, len);
2095 tl = (unsigned short*)((char*)tl + len);
2096 if (nul_terminated)
2097 *((char*)tl - 1) = 0;
2098 return tl;
2099}
2100
2101static unsigned short *
2102tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
2103{
2104 return __tl_add_blob(tl, tag, data, len, 0);
2105}
2106
2107static unsigned short *
2108tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
2109{
2110 return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
2111}
2112
2113static unsigned short *
2114tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
2115{
2116 put_unaligned(tag, tl++);
2117 switch(tag_type(tag)) {
2118 case TT_INTEGER:
2119 put_unaligned(sizeof(int), tl++);
2120 put_unaligned(*(int *)val, (int *)tl);
2121 tl = (unsigned short*)((char*)tl+sizeof(int));
2122 break;
2123 case TT_INT64:
2124 put_unaligned(sizeof(u64), tl++);
2125 put_unaligned(*(u64 *)val, (u64 *)tl);
2126 tl = (unsigned short*)((char*)tl+sizeof(u64));
2127 break;
2128 default:
2129 /* someone did something stupid. */
2130 ;
2131 }
2132 return tl;
2133}
2134
2135void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
2136{
2137 char buffer[sizeof(struct cn_msg)+
2138 sizeof(struct drbd_nl_cfg_reply)+
2139 sizeof(struct get_state_tag_len_struct)+
2140 sizeof(short int)];
2141 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2142 struct drbd_nl_cfg_reply *reply =
2143 (struct drbd_nl_cfg_reply *)cn_reply->data;
2144 unsigned short *tl = reply->tag_list;
2145
2146 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2147
2148 tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
2149
2150 put_unaligned(TT_END, tl++); /* Close the tag list */
2151
2152 cn_reply->id.idx = CN_IDX_DRBD;
2153 cn_reply->id.val = CN_VAL_DRBD;
2154
2155 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2156 cn_reply->ack = 0; /* not used here. */
2157 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2158 (int)((char *)tl - (char *)reply->tag_list);
2159 cn_reply->flags = 0;
2160
2161 reply->packet_type = P_get_state;
2162 reply->minor = mdev_to_minor(mdev);
2163 reply->ret_code = NO_ERROR;
2164
2165 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2166}
2167
2168void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
2169{
2170 char buffer[sizeof(struct cn_msg)+
2171 sizeof(struct drbd_nl_cfg_reply)+
2172 sizeof(struct call_helper_tag_len_struct)+
2173 sizeof(short int)];
2174 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2175 struct drbd_nl_cfg_reply *reply =
2176 (struct drbd_nl_cfg_reply *)cn_reply->data;
2177 unsigned short *tl = reply->tag_list;
2178
2179 /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
2180
2181 tl = tl_add_str(tl, T_helper, helper_name);
2182 put_unaligned(TT_END, tl++); /* Close the tag list */
2183
2184 cn_reply->id.idx = CN_IDX_DRBD;
2185 cn_reply->id.val = CN_VAL_DRBD;
2186
2187 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2188 cn_reply->ack = 0; /* not used here. */
2189 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2190 (int)((char *)tl - (char *)reply->tag_list);
2191 cn_reply->flags = 0;
2192
2193 reply->packet_type = P_call_helper;
2194 reply->minor = mdev_to_minor(mdev);
2195 reply->ret_code = NO_ERROR;
2196
2197 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2198}
2199
2200void drbd_bcast_ee(struct drbd_conf *mdev,
2201 const char *reason, const int dgs,
2202 const char* seen_hash, const char* calc_hash,
2203 const struct drbd_epoch_entry* e)
2204{
2205 struct cn_msg *cn_reply;
2206 struct drbd_nl_cfg_reply *reply;
2207 struct bio_vec *bvec;
2208 unsigned short *tl;
2209 int i;
2210
2211 if (!e)
2212 return;
2213 if (!reason || !reason[0])
2214 return;
2215
2216 /* apparently we have to memcpy twice, first to prepare the data for the
2217 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
2218 * netlink skb. */
2219 /* receiver thread context, which is not in the writeout path (of this node),
2220 * but may be in the writeout path of the _other_ node.
2221 * GFP_NOIO to avoid potential "distributed deadlock". */
2222 cn_reply = kmalloc(
2223 sizeof(struct cn_msg)+
2224 sizeof(struct drbd_nl_cfg_reply)+
2225 sizeof(struct dump_ee_tag_len_struct)+
2226 sizeof(short int),
2227 GFP_NOIO);
2228
2229 if (!cn_reply) {
2230 dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
2231 (unsigned long long)e->sector, e->size);
2232 return;
2233 }
2234
2235 reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
2236 tl = reply->tag_list;
2237
2238 tl = tl_add_str(tl, T_dump_ee_reason, reason);
2239 tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
2240 tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
2241 tl = tl_add_int(tl, T_ee_sector, &e->sector);
2242 tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
2243
2244 put_unaligned(T_ee_data, tl++);
2245 put_unaligned(e->size, tl++);
2246
2247 __bio_for_each_segment(bvec, e->private_bio, i, 0) {
2248 void *d = kmap(bvec->bv_page);
2249 memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
2250 kunmap(bvec->bv_page);
2251 tl=(unsigned short*)((char*)tl + bvec->bv_len);
2252 }
2253 put_unaligned(TT_END, tl++); /* Close the tag list */
2254
2255 cn_reply->id.idx = CN_IDX_DRBD;
2256 cn_reply->id.val = CN_VAL_DRBD;
2257
2258 cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
2259 cn_reply->ack = 0; // not used here.
2260 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2261 (int)((char*)tl - (char*)reply->tag_list);
2262 cn_reply->flags = 0;
2263
2264 reply->packet_type = P_dump_ee;
2265 reply->minor = mdev_to_minor(mdev);
2266 reply->ret_code = NO_ERROR;
2267
2268 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2269 kfree(cn_reply);
2270}
2271
2272void drbd_bcast_sync_progress(struct drbd_conf *mdev)
2273{
2274 char buffer[sizeof(struct cn_msg)+
2275 sizeof(struct drbd_nl_cfg_reply)+
2276 sizeof(struct sync_progress_tag_len_struct)+
2277 sizeof(short int)];
2278 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2279 struct drbd_nl_cfg_reply *reply =
2280 (struct drbd_nl_cfg_reply *)cn_reply->data;
2281 unsigned short *tl = reply->tag_list;
2282 unsigned long rs_left;
2283 unsigned int res;
2284
2285 /* no local ref, no bitmap, no syncer progress, no broadcast. */
2286 if (!get_ldev(mdev))
2287 return;
2288 drbd_get_syncer_progress(mdev, &rs_left, &res);
2289 put_ldev(mdev);
2290
2291 tl = tl_add_int(tl, T_sync_progress, &res);
2292 put_unaligned(TT_END, tl++); /* Close the tag list */
2293
2294 cn_reply->id.idx = CN_IDX_DRBD;
2295 cn_reply->id.val = CN_VAL_DRBD;
2296
2297 cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
2298 cn_reply->ack = 0; /* not used here. */
2299 cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
2300 (int)((char *)tl - (char *)reply->tag_list);
2301 cn_reply->flags = 0;
2302
2303 reply->packet_type = P_sync_progress;
2304 reply->minor = mdev_to_minor(mdev);
2305 reply->ret_code = NO_ERROR;
2306
2307 cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2308}
2309
2310int __init drbd_nl_init(void)
2311{
2312 static struct cb_id cn_id_drbd;
2313 int err, try=10;
2314
2315 cn_id_drbd.val = CN_VAL_DRBD;
2316 do {
2317 cn_id_drbd.idx = cn_idx;
2318 err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
2319 if (!err)
2320 break;
2321 cn_idx = (cn_idx + CN_IDX_STEP);
2322 } while (try--);
2323
2324 if (err) {
2325 printk(KERN_ERR "drbd: cn_drbd failed to register\n");
2326 return err;
2327 }
2328
2329 return 0;
2330}
2331
2332void drbd_nl_cleanup(void)
2333{
2334 static struct cb_id cn_id_drbd;
2335
2336 cn_id_drbd.idx = cn_idx;
2337 cn_id_drbd.val = CN_VAL_DRBD;
2338
2339 cn_del_callback(&cn_id_drbd);
2340}
2341
2342void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2343{
2344 char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
2345 struct cn_msg *cn_reply = (struct cn_msg *) buffer;
2346 struct drbd_nl_cfg_reply *reply =
2347 (struct drbd_nl_cfg_reply *)cn_reply->data;
2348 int rr;
2349
2350 cn_reply->id = req->id;
2351
2352 cn_reply->seq = req->seq;
2353 cn_reply->ack = req->ack + 1;
2354 cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2355 cn_reply->flags = 0;
2356
2357 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2358 reply->ret_code = ret_code;
2359
2360 rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
2361 if (rr && rr != -ESRCH)
2362 printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
2363}
2364
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000000..bdd0b4943b10
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,265 @@
1/*
2 drbd_proc.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/slab.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/drbd.h>
35#include "drbd_int.h"
36
37static int drbd_proc_open(struct inode *inode, struct file *file);
38
39
40struct proc_dir_entry *drbd_proc;
/* File operations of the DRBD proc entry: opening goes through
 * drbd_proc_open(), which sets up a single_open() seq_file; read, llseek
 * and release are the stock seq_file helpers. */
41struct file_operations drbd_proc_fops = {
42 .owner = THIS_MODULE,
43 .open = drbd_proc_open,
44 .read = seq_read,
45 .llseek = seq_lseek,
46 .release = single_release,
47};
48
49
50/*lge
51 * progress bars shamelessly adapted from driver/md/md.c
52 * output looks like
53 * [=====>..............] 33.5% (23456/123456)
54 * finish: 2:20:20 speed: 6,345 (6,456) K/sec
55 */
56static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
57{
58 unsigned long db, dt, dbdt, rt, rs_left;
59 unsigned int res;
60 int i, x, y;
61
62 drbd_get_syncer_progress(mdev, &rs_left, &res);
63
64 x = res/50;
65 y = 20-x;
66 seq_printf(seq, "\t[");
67 for (i = 1; i < x; i++)
68 seq_printf(seq, "=");
69 seq_printf(seq, ">");
70 for (i = 0; i < y; i++)
71 seq_printf(seq, ".");
72 seq_printf(seq, "] ");
73
74 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
75 /* if more than 1 GB display in MB */
76 if (mdev->rs_total > 0x100000L)
77 seq_printf(seq, "(%lu/%lu)M\n\t",
78 (unsigned long) Bit2KB(rs_left >> 10),
79 (unsigned long) Bit2KB(mdev->rs_total >> 10));
80 else
81 seq_printf(seq, "(%lu/%lu)K\n\t",
82 (unsigned long) Bit2KB(rs_left),
83 (unsigned long) Bit2KB(mdev->rs_total));
84
85 /* see drivers/md/md.c
86 * We do not want to overflow, so the order of operands and
87 * the * 100 / 100 trick are important. We do a +1 to be
88 * safe against division by zero. We only estimate anyway.
89 *
90 * dt: time from mark until now
91 * db: blocks written from mark until now
92 * rt: remaining time
93 */
94 dt = (jiffies - mdev->rs_mark_time) / HZ;
95
96 if (dt > 20) {
97 /* if we made no update to rs_mark_time for too long,
98 * we are stalled. show that. */
99 seq_printf(seq, "stalled\n");
100 return;
101 }
102
103 if (!dt)
104 dt++;
105 db = mdev->rs_mark_left - rs_left;
106 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
107
108 seq_printf(seq, "finish: %lu:%02lu:%02lu",
109 rt / 3600, (rt % 3600) / 60, rt % 60);
110
111 /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
112 dbdt = Bit2KB(db/dt);
113 if (dbdt > 1000)
114 seq_printf(seq, " speed: %ld,%03ld",
115 dbdt/1000, dbdt % 1000);
116 else
117 seq_printf(seq, " speed: %ld", dbdt);
118
119 /* mean speed since syncer started
120 * we do account for PausedSync periods */
121 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
122 if (dt <= 0)
123 dt = 1;
124 db = mdev->rs_total - rs_left;
125 dbdt = Bit2KB(db/dt);
126 if (dbdt > 1000)
127 seq_printf(seq, " (%ld,%03ld)",
128 dbdt/1000, dbdt % 1000);
129 else
130 seq_printf(seq, " (%ld)", dbdt);
131
132 seq_printf(seq, " K/sec\n");
133}
134
135static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
136{
137 struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
138
139 seq_printf(seq, "%5d %s %s\n", bme->rs_left,
140 bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
141 bme->flags & BME_LOCKED ? "LOCKED" : "------"
142 );
143}
144
145static int drbd_seq_show(struct seq_file *seq, void *v)
146{
147 int i, hole = 0;
148 const char *sn;
149 struct drbd_conf *mdev;
150
151 static char write_ordering_chars[] = {
152 [WO_none] = 'n',
153 [WO_drain_io] = 'd',
154 [WO_bdev_flush] = 'f',
155 [WO_bio_barrier] = 'b',
156 };
157
158 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
159 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
160
161 /*
162 cs .. connection state
163 ro .. node role (local/remote)
164 ds .. disk state (local/remote)
165 protocol
166 various flags
167 ns .. network send
168 nr .. network receive
169 dw .. disk write
170 dr .. disk read
171 al .. activity log write count
172 bm .. bitmap update write count
173 pe .. pending (waiting for ack or data reply)
174 ua .. unack'd (still need to send ack or data reply)
175 ap .. application requests accepted, but not yet completed
176 ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
177 wo .. write ordering mode currently in use
178 oos .. known out-of-sync kB
179 */
180
181 for (i = 0; i < minor_count; i++) {
182 mdev = minor_to_mdev(i);
183 if (!mdev) {
184 hole = 1;
185 continue;
186 }
187 if (hole) {
188 hole = 0;
189 seq_printf(seq, "\n");
190 }
191
192 sn = drbd_conn_str(mdev->state.conn);
193
194 if (mdev->state.conn == C_STANDALONE &&
195 mdev->state.disk == D_DISKLESS &&
196 mdev->state.role == R_SECONDARY) {
197 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
198 } else {
199 seq_printf(seq,
200 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
201 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
202 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
203 i, sn,
204 drbd_role_str(mdev->state.role),
205 drbd_role_str(mdev->state.peer),
206 drbd_disk_str(mdev->state.disk),
207 drbd_disk_str(mdev->state.pdsk),
208 (mdev->net_conf == NULL ? ' ' :
209 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
210 mdev->state.susp ? 's' : 'r',
211 mdev->state.aftr_isp ? 'a' : '-',
212 mdev->state.peer_isp ? 'p' : '-',
213 mdev->state.user_isp ? 'u' : '-',
214 mdev->congestion_reason ?: '-',
215 mdev->send_cnt/2,
216 mdev->recv_cnt/2,
217 mdev->writ_cnt/2,
218 mdev->read_cnt/2,
219 mdev->al_writ_cnt,
220 mdev->bm_writ_cnt,
221 atomic_read(&mdev->local_cnt),
222 atomic_read(&mdev->ap_pending_cnt) +
223 atomic_read(&mdev->rs_pending_cnt),
224 atomic_read(&mdev->unacked_cnt),
225 atomic_read(&mdev->ap_bio_cnt),
226 mdev->epochs,
227 write_ordering_chars[mdev->write_ordering]
228 );
229 seq_printf(seq, " oos:%lu\n",
230 Bit2KB(drbd_bm_total_weight(mdev)));
231 }
232 if (mdev->state.conn == C_SYNC_SOURCE ||
233 mdev->state.conn == C_SYNC_TARGET)
234 drbd_syncer_progress(mdev, seq);
235
236 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
237 seq_printf(seq, "\t%3d%% %lu/%lu\n",
238 (int)((mdev->rs_total-mdev->ov_left) /
239 (mdev->rs_total/100+1)),
240 mdev->rs_total - mdev->ov_left,
241 mdev->rs_total);
242
243 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
244 lc_seq_printf_stats(seq, mdev->resync);
245 lc_seq_printf_stats(seq, mdev->act_log);
246 put_ldev(mdev);
247 }
248
249 if (proc_details >= 2) {
250 if (mdev->resync) {
251 lc_seq_dump_details(seq, mdev->resync, "rs_left",
252 resync_dump_detail);
253 }
254 }
255 }
256
257 return 0;
258}
259
260static int drbd_proc_open(struct inode *inode, struct file *file)
261{
262 return single_open(file, drbd_seq_show, PDE(inode)->data);
263}
264
265/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000000..c548f24f54a1
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4426 @@
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
31#include <linux/version.h>
32#include <linux/drbd.h>
33#include <linux/fs.h>
34#include <linux/file.h>
35#include <linux/in.h>
36#include <linux/mm.h>
37#include <linux/memcontrol.h>
38#include <linux/mm_inline.h>
39#include <linux/slab.h>
40#include <linux/smp_lock.h>
41#include <linux/pkt_sched.h>
42#define __KERNEL_SYSCALLS__
43#include <linux/unistd.h>
44#include <linux/vmalloc.h>
45#include <linux/random.h>
46#include <linux/mm.h>
47#include <linux/string.h>
48#include <linux/scatterlist.h>
49#include "drbd_int.h"
50#include "drbd_req.h"
51
52#include "drbd_vli.h"
53
/* A drbd_work item bundled with a pointer to a drbd_epoch.
 * NOTE(review): presumably the epoch the queued flush belongs to; the users
 * are not part of this section of the file - confirm there. */
54struct flush_work {
55 struct drbd_work w;
56 struct drbd_epoch *epoch;
57};
58
/* Result type of drbd_may_finish_epoch() (declared below).
 * NOTE(review): going by the names, the values tell the caller whether the
 * epoch is still in use, was freed, or was reused - confirm against
 * drbd_may_finish_epoch(). */
59enum finish_epoch {
60 FE_STILL_LIVE,
61 FE_DESTROYED,
62 FE_RECYCLED,
63};
64
65static int drbd_do_handshake(struct drbd_conf *mdev);
66static int drbd_do_auth(struct drbd_conf *mdev);
67
68static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
69static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
70
71static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
72{
73 struct drbd_epoch *prev;
74 spin_lock(&mdev->epoch_lock);
75 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
76 if (prev == epoch || prev == mdev->current_epoch)
77 prev = NULL;
78 spin_unlock(&mdev->epoch_lock);
79 return prev;
80}
81
82#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
83
84static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
85{
86 struct page *page = NULL;
87
88 /* Yes, testing drbd_pp_vacant outside the lock is racy.
89 * So what. It saves a spin_lock. */
90 if (drbd_pp_vacant > 0) {
91 spin_lock(&drbd_pp_lock);
92 page = drbd_pp_pool;
93 if (page) {
94 drbd_pp_pool = (struct page *)page_private(page);
95 set_page_private(page, 0); /* just to be polite */
96 drbd_pp_vacant--;
97 }
98 spin_unlock(&drbd_pp_lock);
99 }
100 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
101 * "criss-cross" setup, that might cause write-out on some other DRBD,
102 * which in turn might block on the other node at this very place. */
103 if (!page)
104 page = alloc_page(GFP_TRY);
105 if (page)
106 atomic_inc(&mdev->pp_in_use);
107 return page;
108}
109
110/* kick lower level device, if we have more than (arbitrary number)
111 * reference counts on it, which typically are locally submitted io
112 * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
113static void maybe_kick_lo(struct drbd_conf *mdev)
114{
115 if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
116 drbd_kick_lo(mdev);
117}
118
119static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
120{
121 struct drbd_epoch_entry *e;
122 struct list_head *le, *tle;
123
124 /* The EEs are always appended to the end of the list. Since
125 they are sent in order over the wire, they have to finish
126 in order. As soon as we see the first not finished we can
127 stop to examine the list... */
128
129 list_for_each_safe(le, tle, &mdev->net_ee) {
130 e = list_entry(le, struct drbd_epoch_entry, w.list);
131 if (drbd_bio_has_active_page(e->private_bio))
132 break;
133 list_move(le, to_be_freed);
134 }
135}
136
137static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
138{
139 LIST_HEAD(reclaimed);
140 struct drbd_epoch_entry *e, *t;
141
142 maybe_kick_lo(mdev);
143 spin_lock_irq(&mdev->req_lock);
144 reclaim_net_ee(mdev, &reclaimed);
145 spin_unlock_irq(&mdev->req_lock);
146
147 list_for_each_entry_safe(e, t, &reclaimed, w.list)
148 drbd_free_ee(mdev, e);
149}
150
151/**
152 * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
153 * @mdev: DRBD device.
154 * @retry: whether or not to retry allocation forever (or until signalled)
155 *
156 * Tries to allocate a page, first from our own page pool, then from the
157 * kernel, unless this allocation would exceed the max_buffers setting.
158 * If @retry is non-zero, retry until DRBD frees a page somewhere else.
159 */
160static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
161{
162 struct page *page = NULL;
163 DEFINE_WAIT(wait);
164
165 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
166 page = drbd_pp_first_page_or_try_alloc(mdev);
167 if (page)
168 return page;
169 }
170
171 for (;;) {
172 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
173
174 drbd_kick_lo_and_reclaim_net(mdev);
175
176 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
177 page = drbd_pp_first_page_or_try_alloc(mdev);
178 if (page)
179 break;
180 }
181
182 if (!retry)
183 break;
184
185 if (signal_pending(current)) {
186 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
187 break;
188 }
189
190 schedule();
191 }
192 finish_wait(&drbd_pp_wait, &wait);
193
194 return page;
195}
196
197/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
198 * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
199static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
200{
201 int free_it;
202
203 spin_lock(&drbd_pp_lock);
204 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
205 free_it = 1;
206 } else {
207 set_page_private(page, (unsigned long)drbd_pp_pool);
208 drbd_pp_pool = page;
209 drbd_pp_vacant++;
210 free_it = 0;
211 }
212 spin_unlock(&drbd_pp_lock);
213
214 atomic_dec(&mdev->pp_in_use);
215
216 if (free_it)
217 __free_page(page);
218
219 wake_up(&drbd_pp_wait);
220}
221
222static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
223{
224 struct page *p_to_be_freed = NULL;
225 struct page *page;
226 struct bio_vec *bvec;
227 int i;
228
229 spin_lock(&drbd_pp_lock);
230 __bio_for_each_segment(bvec, bio, i, 0) {
231 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
232 set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
233 p_to_be_freed = bvec->bv_page;
234 } else {
235 set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
236 drbd_pp_pool = bvec->bv_page;
237 drbd_pp_vacant++;
238 }
239 }
240 spin_unlock(&drbd_pp_lock);
241 atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
242
243 while (p_to_be_freed) {
244 page = p_to_be_freed;
245 p_to_be_freed = (struct page *)page_private(page);
246 set_page_private(page, 0); /* just to be polite */
247 put_page(page);
248 }
249
250 wake_up(&drbd_pp_wait);
251}
252
253/*
254You need to hold the req_lock:
255 _drbd_wait_ee_list_empty()
256
257You must not have the req_lock:
258 drbd_free_ee()
259 drbd_alloc_ee()
260 drbd_init_ee()
261 drbd_release_ee()
262 drbd_ee_fix_bhs()
263 drbd_process_done_ee()
264 drbd_clear_done_ee()
265 drbd_wait_ee_list_empty()
266*/
267
268struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
269 u64 id,
270 sector_t sector,
271 unsigned int data_size,
272 gfp_t gfp_mask) __must_hold(local)
273{
274 struct request_queue *q;
275 struct drbd_epoch_entry *e;
276 struct page *page;
277 struct bio *bio;
278 unsigned int ds;
279
280 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
281 return NULL;
282
283 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
284 if (!e) {
285 if (!(gfp_mask & __GFP_NOWARN))
286 dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
287 return NULL;
288 }
289
290 bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
291 if (!bio) {
292 if (!(gfp_mask & __GFP_NOWARN))
293 dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
294 goto fail1;
295 }
296
297 bio->bi_bdev = mdev->ldev->backing_bdev;
298 bio->bi_sector = sector;
299
300 ds = data_size;
301 while (ds) {
302 page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
303 if (!page) {
304 if (!(gfp_mask & __GFP_NOWARN))
305 dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
306 goto fail2;
307 }
308 if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
309 drbd_pp_free(mdev, page);
310 dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
311 "data_size=%u,ds=%u) failed\n",
312 (unsigned long long)sector, data_size, ds);
313
314 q = bdev_get_queue(bio->bi_bdev);
315 if (q->merge_bvec_fn) {
316 struct bvec_merge_data bvm = {
317 .bi_bdev = bio->bi_bdev,
318 .bi_sector = bio->bi_sector,
319 .bi_size = bio->bi_size,
320 .bi_rw = bio->bi_rw,
321 };
322 int l = q->merge_bvec_fn(q, &bvm,
323 &bio->bi_io_vec[bio->bi_vcnt]);
324 dev_err(DEV, "merge_bvec_fn() = %d\n", l);
325 }
326
327 /* dump more of the bio. */
328 dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
329 dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
330 dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
331 dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
332
333 goto fail2;
334 break;
335 }
336 ds -= min_t(int, ds, PAGE_SIZE);
337 }
338
339 D_ASSERT(data_size == bio->bi_size);
340
341 bio->bi_private = e;
342 e->mdev = mdev;
343 e->sector = sector;
344 e->size = bio->bi_size;
345
346 e->private_bio = bio;
347 e->block_id = id;
348 INIT_HLIST_NODE(&e->colision);
349 e->epoch = NULL;
350 e->flags = 0;
351
352 return e;
353
354 fail2:
355 drbd_pp_free_bio_pages(mdev, bio);
356 bio_put(bio);
357 fail1:
358 mempool_free(e, drbd_ee_mempool);
359
360 return NULL;
361}
362
363void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
364{
365 struct bio *bio = e->private_bio;
366 drbd_pp_free_bio_pages(mdev, bio);
367 bio_put(bio);
368 D_ASSERT(hlist_unhashed(&e->colision));
369 mempool_free(e, drbd_ee_mempool);
370}
371
372int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
373{
374 LIST_HEAD(work_list);
375 struct drbd_epoch_entry *e, *t;
376 int count = 0;
377
378 spin_lock_irq(&mdev->req_lock);
379 list_splice_init(list, &work_list);
380 spin_unlock_irq(&mdev->req_lock);
381
382 list_for_each_entry_safe(e, t, &work_list, w.list) {
383 drbd_free_ee(mdev, e);
384 count++;
385 }
386 return count;
387}
388
389
390/*
391 * This function is called from _asender only_
392 * but see also comments in _req_mod(,barrier_acked)
393 * and receive_Barrier.
394 *
395 * Move entries from net_ee to done_ee, if ready.
396 * Grab done_ee, call all callbacks, free the entries.
397 * The callbacks typically send out ACKs.
398 */
399static int drbd_process_done_ee(struct drbd_conf *mdev)
400{
401 LIST_HEAD(work_list);
402 LIST_HEAD(reclaimed);
403 struct drbd_epoch_entry *e, *t;
404 int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
405
406 spin_lock_irq(&mdev->req_lock);
407 reclaim_net_ee(mdev, &reclaimed);
408 list_splice_init(&mdev->done_ee, &work_list);
409 spin_unlock_irq(&mdev->req_lock);
410
411 list_for_each_entry_safe(e, t, &reclaimed, w.list)
412 drbd_free_ee(mdev, e);
413
414 /* possible callbacks here:
415 * e_end_block, and e_end_resync_block, e_send_discard_ack.
416 * all ignore the last argument.
417 */
418 list_for_each_entry_safe(e, t, &work_list, w.list) {
419 /* list_del not necessary, next/prev members not touched */
420 ok = e->w.cb(mdev, &e->w, !ok) && ok;
421 drbd_free_ee(mdev, e);
422 }
423 wake_up(&mdev->ee_wait);
424
425 return ok;
426}
427
428void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
429{
430 DEFINE_WAIT(wait);
431
432 /* avoids spin_lock/unlock
433 * and calling prepare_to_wait in the fast path */
434 while (!list_empty(head)) {
435 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
436 spin_unlock_irq(&mdev->req_lock);
437 drbd_kick_lo(mdev);
438 schedule();
439 finish_wait(&mdev->ee_wait, &wait);
440 spin_lock_irq(&mdev->req_lock);
441 }
442}
443
444void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
445{
446 spin_lock_irq(&mdev->req_lock);
447 _drbd_wait_ee_list_empty(mdev, head);
448 spin_unlock_irq(&mdev->req_lock);
449}
450
451/* see also kernel_accept; which is only present since 2.6.18.
452 * also we want to log which part of it failed, exactly */
453static int drbd_accept(struct drbd_conf *mdev, const char **what,
454 struct socket *sock, struct socket **newsock)
455{
456 struct sock *sk = sock->sk;
457 int err = 0;
458
459 *what = "listen";
460 err = sock->ops->listen(sock, 5);
461 if (err < 0)
462 goto out;
463
464 *what = "sock_create_lite";
465 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
466 newsock);
467 if (err < 0)
468 goto out;
469
470 *what = "accept";
471 err = sock->ops->accept(sock, *newsock, 0);
472 if (err < 0) {
473 sock_release(*newsock);
474 *newsock = NULL;
475 goto out;
476 }
477 (*newsock)->ops = sock->ops;
478
479out:
480 return err;
481}
482
483static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
484 void *buf, size_t size, int flags)
485{
486 mm_segment_t oldfs;
487 struct kvec iov = {
488 .iov_base = buf,
489 .iov_len = size,
490 };
491 struct msghdr msg = {
492 .msg_iovlen = 1,
493 .msg_iov = (struct iovec *)&iov,
494 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
495 };
496 int rv;
497
498 oldfs = get_fs();
499 set_fs(KERNEL_DS);
500 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
501 set_fs(oldfs);
502
503 return rv;
504}
505
506static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
507{
508 mm_segment_t oldfs;
509 struct kvec iov = {
510 .iov_base = buf,
511 .iov_len = size,
512 };
513 struct msghdr msg = {
514 .msg_iovlen = 1,
515 .msg_iov = (struct iovec *)&iov,
516 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
517 };
518 int rv;
519
520 oldfs = get_fs();
521 set_fs(KERNEL_DS);
522
523 for (;;) {
524 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
525 if (rv == size)
526 break;
527
528 /* Note:
529 * ECONNRESET other side closed the connection
530 * ERESTARTSYS (on sock) we got a signal
531 */
532
533 if (rv < 0) {
534 if (rv == -ECONNRESET)
535 dev_info(DEV, "sock was reset by peer\n");
536 else if (rv != -ERESTARTSYS)
537 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
538 break;
539 } else if (rv == 0) {
540 dev_info(DEV, "sock was shut down by peer\n");
541 break;
542 } else {
543 /* signal came in, or peer/link went down,
544 * after we read a partial message
545 */
546 /* D_ASSERT(signal_pending(current)); */
547 break;
548 }
549 };
550
551 set_fs(oldfs);
552
553 if (rv != size)
554 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
555
556 return rv;
557}
558
/*
 * drbd_try_connect() - actively try to open one TCP connection to the peer
 * @mdev: DRBD device.
 *
 * Creates a socket, binds it to the configured local address (with the port
 * forced to 0) and connects to the configured peer address.
 *
 * Returns the connected socket, or NULL on failure.  Failures of socket
 * creation or bind that are not in the "transient" errno list below force
 * the connection state to C_DISCONNECTING; a failed connect() never does.
 * Errors outside the transient list are logged.
 */
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	struct sockaddr_in6 bind_addr;
	struct sockaddr *my_sa;
	struct socket *sock;
	const char *step;
	int may_disconnect = 1;
	int err;

	if (!get_net_conf(mdev))
		return NULL;

	my_sa = (struct sockaddr *)mdev->net_conf->my_addr;

	step = "sock_create_kern";
	err = sock_create_kern(my_sa->sa_family, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
	sock->sk->sk_rcvtimeo = sock->sk->sk_sndtimeo;

	/* Bind explicitly to the configured IP so it is used as source
	 * address of the outgoing connection (multihomed hosts, lo:
	 * interfaces).  The port is zeroed so that linux picks a free
	 * one dynamically. */
	memcpy(&bind_addr, mdev->net_conf->my_addr,
	       min_t(int, mdev->net_conf->my_addr_len, sizeof(bind_addr)));
	if (my_sa->sa_family == AF_INET6)
		bind_addr.sin6_port = 0;
	else /* AF_INET & AF_SCI */
		((struct sockaddr_in *)&bind_addr)->sin_port = 0;

	step = "bind before connect";
	err = sock->ops->bind(sock, (struct sockaddr *)&bind_addr,
			      mdev->net_conf->my_addr_len);
	if (err < 0)
		goto out;

	/* From here on a failure only means the peer is not yet available:
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	may_disconnect = 0;
	step = "connect";
	err = sock->ops->connect(sock,
				 (struct sockaddr *)mdev->net_conf->peer_addr,
				 mdev->net_conf->peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}

		switch (-err) {
		/* timeout, busy, signal pending */
		case ETIMEDOUT:
		case EAGAIN:
		case EINPROGRESS:
		case EINTR:
		case ERESTARTSYS:
		/* peer not (yet) available, network problem */
		case ECONNREFUSED:
		case ENETUNREACH:
		case EHOSTDOWN:
		case EHOSTUNREACH:
			may_disconnect = 0;
			break;
		default:
			dev_err(DEV, "%s failed, err = %d\n", step, err);
		}

		if (may_disconnect)
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}

	put_net_conf(mdev);
	return sock;
}
634
635static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
636{
637 int timeo, err;
638 struct socket *s_estab = NULL, *s_listen;
639 const char *what;
640
641 if (!get_net_conf(mdev))
642 return NULL;
643
644 what = "sock_create_kern";
645 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
646 SOCK_STREAM, IPPROTO_TCP, &s_listen);
647 if (err) {
648 s_listen = NULL;
649 goto out;
650 }
651
652 timeo = mdev->net_conf->try_connect_int * HZ;
653 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
654
655 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
656 s_listen->sk->sk_rcvtimeo = timeo;
657 s_listen->sk->sk_sndtimeo = timeo;
658
659 what = "bind before listen";
660 err = s_listen->ops->bind(s_listen,
661 (struct sockaddr *) mdev->net_conf->my_addr,
662 mdev->net_conf->my_addr_len);
663 if (err < 0)
664 goto out;
665
666 err = drbd_accept(mdev, &what, s_listen, &s_estab);
667
668out:
669 if (s_listen)
670 sock_release(s_listen);
671 if (err < 0) {
672 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
673 dev_err(DEV, "%s failed, err = %d\n", what, err);
674 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
675 }
676 }
677 put_net_conf(mdev);
678
679 return s_estab;
680}
681
682static int drbd_send_fp(struct drbd_conf *mdev,
683 struct socket *sock, enum drbd_packets cmd)
684{
685 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
686
687 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
688}
689
690static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
691{
692 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
693 int rr;
694
695 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
696
697 if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
698 return be16_to_cpu(h->command);
699
700 return 0xffff;
701}
702
703/**
704 * drbd_socket_okay() - Free the socket if its connection is not okay
705 * @mdev: DRBD device.
706 * @sock: pointer to the pointer to the socket.
707 */
708static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
709{
710 int rr;
711 char tb[4];
712
713 if (!*sock)
714 return FALSE;
715
716 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
717
718 if (rr > 0 || rr == -EAGAIN) {
719 return TRUE;
720 } else {
721 sock_release(*sock);
722 *sock = NULL;
723 return FALSE;
724 }
725}
726
727/*
728 * return values:
729 * 1 yes, we have a valid connection
730 * 0 oops, did not work out, please try again
731 * -1 peer talks different language,
732 * no point in trying again, please go standalone.
733 * -2 We do not have a network config...
734 */
735static int drbd_connect(struct drbd_conf *mdev)
736{
737 struct socket *s, *sock, *msock;
738 int try, h, ok;
739
740 D_ASSERT(!mdev->data.socket);
741
742 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
743 dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
744
745 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
746 return -2;
747
748 clear_bit(DISCARD_CONCURRENT, &mdev->flags);
749
750 sock = NULL;
751 msock = NULL;
752
753 do {
754 for (try = 0;;) {
755 /* 3 tries, this should take less than a second! */
756 s = drbd_try_connect(mdev);
757 if (s || ++try >= 3)
758 break;
759 /* give the other side time to call bind() & listen() */
760 __set_current_state(TASK_INTERRUPTIBLE);
761 schedule_timeout(HZ / 10);
762 }
763
764 if (s) {
765 if (!sock) {
766 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
767 sock = s;
768 s = NULL;
769 } else if (!msock) {
770 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
771 msock = s;
772 s = NULL;
773 } else {
774 dev_err(DEV, "Logic error in drbd_connect()\n");
775 goto out_release_sockets;
776 }
777 }
778
779 if (sock && msock) {
780 __set_current_state(TASK_INTERRUPTIBLE);
781 schedule_timeout(HZ / 10);
782 ok = drbd_socket_okay(mdev, &sock);
783 ok = drbd_socket_okay(mdev, &msock) && ok;
784 if (ok)
785 break;
786 }
787
788retry:
789 s = drbd_wait_for_connect(mdev);
790 if (s) {
791 try = drbd_recv_fp(mdev, s);
792 drbd_socket_okay(mdev, &sock);
793 drbd_socket_okay(mdev, &msock);
794 switch (try) {
795 case P_HAND_SHAKE_S:
796 if (sock) {
797 dev_warn(DEV, "initial packet S crossed\n");
798 sock_release(sock);
799 }
800 sock = s;
801 break;
802 case P_HAND_SHAKE_M:
803 if (msock) {
804 dev_warn(DEV, "initial packet M crossed\n");
805 sock_release(msock);
806 }
807 msock = s;
808 set_bit(DISCARD_CONCURRENT, &mdev->flags);
809 break;
810 default:
811 dev_warn(DEV, "Error receiving initial packet\n");
812 sock_release(s);
813 if (random32() & 1)
814 goto retry;
815 }
816 }
817
818 if (mdev->state.conn <= C_DISCONNECTING)
819 goto out_release_sockets;
820 if (signal_pending(current)) {
821 flush_signals(current);
822 smp_rmb();
823 if (get_t_state(&mdev->receiver) == Exiting)
824 goto out_release_sockets;
825 }
826
827 if (sock && msock) {
828 ok = drbd_socket_okay(mdev, &sock);
829 ok = drbd_socket_okay(mdev, &msock) && ok;
830 if (ok)
831 break;
832 }
833 } while (1);
834
835 msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
836 sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
837
838 sock->sk->sk_allocation = GFP_NOIO;
839 msock->sk->sk_allocation = GFP_NOIO;
840
841 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
842 msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
843
844 if (mdev->net_conf->sndbuf_size) {
845 sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
846 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
847 }
848
849 if (mdev->net_conf->rcvbuf_size) {
850 sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
851 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
852 }
853
854 /* NOT YET ...
855 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
856 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
857 * first set it to the P_HAND_SHAKE timeout,
858 * which we set to 4x the configured ping_timeout. */
859 sock->sk->sk_sndtimeo =
860 sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
861
862 msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
863 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
864
865 /* we don't want delays.
866 * we use TCP_CORK where apropriate, though */
867 drbd_tcp_nodelay(sock);
868 drbd_tcp_nodelay(msock);
869
870 mdev->data.socket = sock;
871 mdev->meta.socket = msock;
872 mdev->last_received = jiffies;
873
874 D_ASSERT(mdev->asender.task == NULL);
875
876 h = drbd_do_handshake(mdev);
877 if (h <= 0)
878 return h;
879
880 if (mdev->cram_hmac_tfm) {
881 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
882 if (!drbd_do_auth(mdev)) {
883 dev_err(DEV, "Authentication of peer failed\n");
884 return -1;
885 }
886 }
887
888 if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
889 return 0;
890
891 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
892 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
893
894 atomic_set(&mdev->packet_seq, 0);
895 mdev->peer_seq = 0;
896
897 drbd_thread_start(&mdev->asender);
898
899 drbd_send_protocol(mdev);
900 drbd_send_sync_param(mdev, &mdev->sync_conf);
901 drbd_send_sizes(mdev, 0);
902 drbd_send_uuids(mdev);
903 drbd_send_state(mdev);
904 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
905 clear_bit(RESIZE_PENDING, &mdev->flags);
906
907 return 1;
908
909out_release_sockets:
910 if (sock)
911 sock_release(sock);
912 if (msock)
913 sock_release(msock);
914 return -1;
915}
916
917static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
918{
919 int r;
920
921 r = drbd_recv(mdev, h, sizeof(*h));
922
923 if (unlikely(r != sizeof(*h))) {
924 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
925 return FALSE;
926 };
927 h->command = be16_to_cpu(h->command);
928 h->length = be16_to_cpu(h->length);
929 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
930 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
931 (long)be32_to_cpu(h->magic),
932 h->command, h->length);
933 return FALSE;
934 }
935 mdev->last_received = jiffies;
936
937 return TRUE;
938}
939
940static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
941{
942 int rv;
943
944 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
945 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
946 if (rv) {
947 dev_err(DEV, "local disk flush failed with status %d\n", rv);
948 /* would rather check on EOPNOTSUPP, but that is not reliable.
949 * don't try again for ANY return value != 0
950 * if (rv == -EOPNOTSUPP) */
951 drbd_bump_write_ordering(mdev, WO_drain_io);
952 }
953 put_ldev(mdev);
954 }
955
956 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
957}
958
959static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
960{
961 struct flush_work *fw = (struct flush_work *)w;
962 struct drbd_epoch *epoch = fw->epoch;
963
964 kfree(w);
965
966 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
967 drbd_flush_after_epoch(mdev, epoch);
968
969 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
970 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
971
972 return 1;
973}
974
975/**
976 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
977 * @mdev: DRBD device.
978 * @epoch: Epoch object.
979 * @ev: Epoch event.
980 */
981static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
982 struct drbd_epoch *epoch,
983 enum epoch_event ev)
984{
985 int finish, epoch_size;
986 struct drbd_epoch *next_epoch;
987 int schedule_flush = 0;
988 enum finish_epoch rv = FE_STILL_LIVE;
989
990 spin_lock(&mdev->epoch_lock);
991 do {
992 next_epoch = NULL;
993 finish = 0;
994
995 epoch_size = atomic_read(&epoch->epoch_size);
996
997 switch (ev & ~EV_CLEANUP) {
998 case EV_PUT:
999 atomic_dec(&epoch->active);
1000 break;
1001 case EV_GOT_BARRIER_NR:
1002 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1003
1004 /* Special case: If we just switched from WO_bio_barrier to
1005 WO_bdev_flush we should not finish the current epoch */
1006 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1007 mdev->write_ordering != WO_bio_barrier &&
1008 epoch == mdev->current_epoch)
1009 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1010 break;
1011 case EV_BARRIER_DONE:
1012 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1013 break;
1014 case EV_BECAME_LAST:
1015 /* nothing to do*/
1016 break;
1017 }
1018
1019 if (epoch_size != 0 &&
1020 atomic_read(&epoch->active) == 0 &&
1021 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1022 epoch->list.prev == &mdev->current_epoch->list &&
1023 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1024 /* Nearly all conditions are met to finish that epoch... */
1025 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1026 mdev->write_ordering == WO_none ||
1027 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1028 ev & EV_CLEANUP) {
1029 finish = 1;
1030 set_bit(DE_IS_FINISHING, &epoch->flags);
1031 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1032 mdev->write_ordering == WO_bio_barrier) {
1033 atomic_inc(&epoch->active);
1034 schedule_flush = 1;
1035 }
1036 }
1037 if (finish) {
1038 if (!(ev & EV_CLEANUP)) {
1039 spin_unlock(&mdev->epoch_lock);
1040 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1041 spin_lock(&mdev->epoch_lock);
1042 }
1043 dec_unacked(mdev);
1044
1045 if (mdev->current_epoch != epoch) {
1046 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1047 list_del(&epoch->list);
1048 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1049 mdev->epochs--;
1050 kfree(epoch);
1051
1052 if (rv == FE_STILL_LIVE)
1053 rv = FE_DESTROYED;
1054 } else {
1055 epoch->flags = 0;
1056 atomic_set(&epoch->epoch_size, 0);
1057 /* atomic_set(&epoch->active, 0); is alrady zero */
1058 if (rv == FE_STILL_LIVE)
1059 rv = FE_RECYCLED;
1060 }
1061 }
1062
1063 if (!next_epoch)
1064 break;
1065
1066 epoch = next_epoch;
1067 } while (1);
1068
1069 spin_unlock(&mdev->epoch_lock);
1070
1071 if (schedule_flush) {
1072 struct flush_work *fw;
1073 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1074 if (fw) {
1075 fw->w.cb = w_flush;
1076 fw->epoch = epoch;
1077 drbd_queue_work(&mdev->data.work, &fw->w);
1078 } else {
1079 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1080 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1081 /* That is not a recursion, only one level */
1082 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1083 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1084 }
1085 }
1086
1087 return rv;
1088}
1089
1090/**
1091 * drbd_bump_write_ordering() - Fall back to an other write ordering method
1092 * @mdev: DRBD device.
1093 * @wo: Write ordering method to try.
1094 */
1095void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1096{
1097 enum write_ordering_e pwo;
1098 static char *write_ordering_str[] = {
1099 [WO_none] = "none",
1100 [WO_drain_io] = "drain",
1101 [WO_bdev_flush] = "flush",
1102 [WO_bio_barrier] = "barrier",
1103 };
1104
1105 pwo = mdev->write_ordering;
1106 wo = min(pwo, wo);
1107 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1108 wo = WO_bdev_flush;
1109 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1110 wo = WO_drain_io;
1111 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1112 wo = WO_none;
1113 mdev->write_ordering = wo;
1114 if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1115 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1116}
1117
1118/**
1119 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1120 * @mdev: DRBD device.
1121 * @w: work object.
1122 * @cancel: The connection will be closed anyways (unused in this callback)
1123 */
1124int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1125{
1126 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1127 struct bio *bio = e->private_bio;
1128
1129 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1130 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1131 so that we can finish that epoch in drbd_may_finish_epoch().
1132 That is necessary if we already have a long chain of Epochs, before
1133 we realize that BIO_RW_BARRIER is actually not supported */
1134
1135 /* As long as the -ENOTSUPP on the barrier is reported immediately
1136 that will never trigger. If it is reported late, we will just
1137 print that warning and continue correctly for all future requests
1138 with WO_bdev_flush */
1139 if (previous_epoch(mdev, e->epoch))
1140 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1141
1142 /* prepare bio for re-submit,
1143 * re-init volatile members */
1144 /* we still have a local reference,
1145 * get_ldev was done in receive_Data. */
1146 bio->bi_bdev = mdev->ldev->backing_bdev;
1147 bio->bi_sector = e->sector;
1148 bio->bi_size = e->size;
1149 bio->bi_idx = 0;
1150
1151 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1152 bio->bi_flags |= 1 << BIO_UPTODATE;
1153
1154 /* don't know whether this is necessary: */
1155 bio->bi_phys_segments = 0;
1156 bio->bi_next = NULL;
1157
1158 /* these should be unchanged: */
1159 /* bio->bi_end_io = drbd_endio_write_sec; */
1160 /* bio->bi_vcnt = whatever; */
1161
1162 e->w.cb = e_end_block;
1163
1164 /* This is no longer a barrier request. */
1165 bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
1166
1167 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
1168
1169 return 1;
1170}
1171
1172static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1173{
1174 int rv, issue_flush;
1175 struct p_barrier *p = (struct p_barrier *)h;
1176 struct drbd_epoch *epoch;
1177
1178 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1179
1180 rv = drbd_recv(mdev, h->payload, h->length);
1181 ERR_IF(rv != h->length) return FALSE;
1182
1183 inc_unacked(mdev);
1184
1185 if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1186 drbd_kick_lo(mdev);
1187
1188 mdev->current_epoch->barrier_nr = p->barrier;
1189 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1190
1191 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1192 * the activity log, which means it would not be resynced in case the
1193 * R_PRIMARY crashes now.
1194 * Therefore we must send the barrier_ack after the barrier request was
1195 * completed. */
1196 switch (mdev->write_ordering) {
1197 case WO_bio_barrier:
1198 case WO_none:
1199 if (rv == FE_RECYCLED)
1200 return TRUE;
1201 break;
1202
1203 case WO_bdev_flush:
1204 case WO_drain_io:
1205 D_ASSERT(rv == FE_STILL_LIVE);
1206 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1207 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1208 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1209 if (rv == FE_RECYCLED)
1210 return TRUE;
1211
1212 /* The asender will send all the ACKs and barrier ACKs out, since
1213 all EEs moved from the active_ee to the done_ee. We need to
1214 provide a new epoch object for the EEs that come in soon */
1215 break;
1216 }
1217
1218 /* receiver context, in the writeout path of the other node.
1219 * avoid potential distributed deadlock */
1220 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1221 if (!epoch) {
1222 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1223 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1224 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1225 if (issue_flush) {
1226 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1227 if (rv == FE_RECYCLED)
1228 return TRUE;
1229 }
1230
1231 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1232
1233 return TRUE;
1234 }
1235
1236 epoch->flags = 0;
1237 atomic_set(&epoch->epoch_size, 0);
1238 atomic_set(&epoch->active, 0);
1239
1240 spin_lock(&mdev->epoch_lock);
1241 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1242 list_add(&epoch->list, &mdev->current_epoch->list);
1243 mdev->current_epoch = epoch;
1244 mdev->epochs++;
1245 } else {
1246 /* The current_epoch got recycled while we allocated this one... */
1247 kfree(epoch);
1248 }
1249 spin_unlock(&mdev->epoch_lock);
1250
1251 return TRUE;
1252}
1253
1254/* used from receive_RSDataReply (recv_resync_read)
1255 * and from receive_Data */
1256static struct drbd_epoch_entry *
1257read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1258{
1259 struct drbd_epoch_entry *e;
1260 struct bio_vec *bvec;
1261 struct page *page;
1262 struct bio *bio;
1263 int dgs, ds, i, rr;
1264 void *dig_in = mdev->int_dig_in;
1265 void *dig_vv = mdev->int_dig_vv;
1266
1267 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1268 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1269
1270 if (dgs) {
1271 rr = drbd_recv(mdev, dig_in, dgs);
1272 if (rr != dgs) {
1273 dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1274 rr, dgs);
1275 return NULL;
1276 }
1277 }
1278
1279 data_size -= dgs;
1280
1281 ERR_IF(data_size & 0x1ff) return NULL;
1282 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1283
1284 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1285 * "criss-cross" setup, that might cause write-out on some other DRBD,
1286 * which in turn might block on the other node at this very place. */
1287 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1288 if (!e)
1289 return NULL;
1290 bio = e->private_bio;
1291 ds = data_size;
1292 bio_for_each_segment(bvec, bio, i) {
1293 page = bvec->bv_page;
1294 rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
1295 kunmap(page);
1296 if (rr != min_t(int, ds, PAGE_SIZE)) {
1297 drbd_free_ee(mdev, e);
1298 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1299 rr, min_t(int, ds, PAGE_SIZE));
1300 return NULL;
1301 }
1302 ds -= rr;
1303 }
1304
1305 if (dgs) {
1306 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1307 if (memcmp(dig_in, dig_vv, dgs)) {
1308 dev_err(DEV, "Digest integrity check FAILED.\n");
1309 drbd_bcast_ee(mdev, "digest failed",
1310 dgs, dig_in, dig_vv, e);
1311 drbd_free_ee(mdev, e);
1312 return NULL;
1313 }
1314 }
1315 mdev->recv_cnt += data_size>>9;
1316 return e;
1317}
1318
1319/* drbd_drain_block() just takes a data block
1320 * out of the socket input buffer, and discards it.
1321 */
1322static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1323{
1324 struct page *page;
1325 int rr, rv = 1;
1326 void *data;
1327
1328 page = drbd_pp_alloc(mdev, 1);
1329
1330 data = kmap(page);
1331 while (data_size) {
1332 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1333 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1334 rv = 0;
1335 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1336 rr, min_t(int, data_size, PAGE_SIZE));
1337 break;
1338 }
1339 data_size -= rr;
1340 }
1341 kunmap(page);
1342 drbd_pp_free(mdev, page);
1343 return rv;
1344}
1345
1346static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1347 sector_t sector, int data_size)
1348{
1349 struct bio_vec *bvec;
1350 struct bio *bio;
1351 int dgs, rr, i, expect;
1352 void *dig_in = mdev->int_dig_in;
1353 void *dig_vv = mdev->int_dig_vv;
1354
1355 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1356 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1357
1358 if (dgs) {
1359 rr = drbd_recv(mdev, dig_in, dgs);
1360 if (rr != dgs) {
1361 dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1362 rr, dgs);
1363 return 0;
1364 }
1365 }
1366
1367 data_size -= dgs;
1368
1369 /* optimistically update recv_cnt. if receiving fails below,
1370 * we disconnect anyways, and counters will be reset. */
1371 mdev->recv_cnt += data_size>>9;
1372
1373 bio = req->master_bio;
1374 D_ASSERT(sector == bio->bi_sector);
1375
1376 bio_for_each_segment(bvec, bio, i) {
1377 expect = min_t(int, data_size, bvec->bv_len);
1378 rr = drbd_recv(mdev,
1379 kmap(bvec->bv_page)+bvec->bv_offset,
1380 expect);
1381 kunmap(bvec->bv_page);
1382 if (rr != expect) {
1383 dev_warn(DEV, "short read receiving data reply: "
1384 "read %d expected %d\n",
1385 rr, expect);
1386 return 0;
1387 }
1388 data_size -= rr;
1389 }
1390
1391 if (dgs) {
1392 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1393 if (memcmp(dig_in, dig_vv, dgs)) {
1394 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1395 return 0;
1396 }
1397 }
1398
1399 D_ASSERT(data_size == 0);
1400 return 1;
1401}
1402
1403/* e_end_resync_block() is called via
1404 * drbd_process_done_ee() by asender only */
1405static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1406{
1407 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1408 sector_t sector = e->sector;
1409 int ok;
1410
1411 D_ASSERT(hlist_unhashed(&e->colision));
1412
1413 if (likely(drbd_bio_uptodate(e->private_bio))) {
1414 drbd_set_in_sync(mdev, sector, e->size);
1415 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1416 } else {
1417 /* Record failure to sync */
1418 drbd_rs_failed_io(mdev, sector, e->size);
1419
1420 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1421 }
1422 dec_unacked(mdev);
1423
1424 return ok;
1425}
1426
1427static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1428{
1429 struct drbd_epoch_entry *e;
1430
1431 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1432 if (!e) {
1433 put_ldev(mdev);
1434 return FALSE;
1435 }
1436
1437 dec_rs_pending(mdev);
1438
1439 e->private_bio->bi_end_io = drbd_endio_write_sec;
1440 e->private_bio->bi_rw = WRITE;
1441 e->w.cb = e_end_resync_block;
1442
1443 inc_unacked(mdev);
1444 /* corresponding dec_unacked() in e_end_resync_block()
1445 * respective _drbd_clear_done_ee */
1446
1447 spin_lock_irq(&mdev->req_lock);
1448 list_add(&e->w.list, &mdev->sync_ee);
1449 spin_unlock_irq(&mdev->req_lock);
1450
1451 drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
1452 /* accounting done in endio */
1453
1454 maybe_kick_lo(mdev);
1455 return TRUE;
1456}
1457
1458static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1459{
1460 struct drbd_request *req;
1461 sector_t sector;
1462 unsigned int header_size, data_size;
1463 int ok;
1464 struct p_data *p = (struct p_data *)h;
1465
1466 header_size = sizeof(*p) - sizeof(*h);
1467 data_size = h->length - header_size;
1468
1469 ERR_IF(data_size == 0) return FALSE;
1470
1471 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1472 return FALSE;
1473
1474 sector = be64_to_cpu(p->sector);
1475
1476 spin_lock_irq(&mdev->req_lock);
1477 req = _ar_id_to_req(mdev, p->block_id, sector);
1478 spin_unlock_irq(&mdev->req_lock);
1479 if (unlikely(!req)) {
1480 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1481 return FALSE;
1482 }
1483
1484 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1485 * special casing it there for the various failure cases.
1486 * still no race with drbd_fail_pending_reads */
1487 ok = recv_dless_read(mdev, req, sector, data_size);
1488
1489 if (ok)
1490 req_mod(req, data_received);
1491 /* else: nothing. handled from drbd_disconnect...
1492 * I don't think we may complete this just yet
1493 * in case we are "on-disconnect: freeze" */
1494
1495 return ok;
1496}
1497
1498static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1499{
1500 sector_t sector;
1501 unsigned int header_size, data_size;
1502 int ok;
1503 struct p_data *p = (struct p_data *)h;
1504
1505 header_size = sizeof(*p) - sizeof(*h);
1506 data_size = h->length - header_size;
1507
1508 ERR_IF(data_size == 0) return FALSE;
1509
1510 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1511 return FALSE;
1512
1513 sector = be64_to_cpu(p->sector);
1514 D_ASSERT(p->block_id == ID_SYNCER);
1515
1516 if (get_ldev(mdev)) {
1517 /* data is submitted to disk within recv_resync_read.
1518 * corresponding put_ldev done below on error,
1519 * or in drbd_endio_write_sec. */
1520 ok = recv_resync_read(mdev, sector, data_size);
1521 } else {
1522 if (__ratelimit(&drbd_ratelimit_state))
1523 dev_err(DEV, "Can not write resync data to local disk.\n");
1524
1525 ok = drbd_drain_block(mdev, data_size);
1526
1527 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1528 }
1529
1530 return ok;
1531}
1532
1533/* e_end_block() is called via drbd_process_done_ee().
1534 * this means this function only runs in the asender thread
1535 */
1536static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1537{
1538 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1539 sector_t sector = e->sector;
1540 struct drbd_epoch *epoch;
1541 int ok = 1, pcmd;
1542
1543 if (e->flags & EE_IS_BARRIER) {
1544 epoch = previous_epoch(mdev, e->epoch);
1545 if (epoch)
1546 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1547 }
1548
1549 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1550 if (likely(drbd_bio_uptodate(e->private_bio))) {
1551 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1552 mdev->state.conn <= C_PAUSED_SYNC_T &&
1553 e->flags & EE_MAY_SET_IN_SYNC) ?
1554 P_RS_WRITE_ACK : P_WRITE_ACK;
1555 ok &= drbd_send_ack(mdev, pcmd, e);
1556 if (pcmd == P_RS_WRITE_ACK)
1557 drbd_set_in_sync(mdev, sector, e->size);
1558 } else {
1559 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1560 /* we expect it to be marked out of sync anyways...
1561 * maybe assert this? */
1562 }
1563 dec_unacked(mdev);
1564 }
1565 /* we delete from the conflict detection hash _after_ we sent out the
1566 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1567 if (mdev->net_conf->two_primaries) {
1568 spin_lock_irq(&mdev->req_lock);
1569 D_ASSERT(!hlist_unhashed(&e->colision));
1570 hlist_del_init(&e->colision);
1571 spin_unlock_irq(&mdev->req_lock);
1572 } else {
1573 D_ASSERT(hlist_unhashed(&e->colision));
1574 }
1575
1576 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1577
1578 return ok;
1579}
1580
1581static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1582{
1583 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1584 int ok = 1;
1585
1586 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1587 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1588
1589 spin_lock_irq(&mdev->req_lock);
1590 D_ASSERT(!hlist_unhashed(&e->colision));
1591 hlist_del_init(&e->colision);
1592 spin_unlock_irq(&mdev->req_lock);
1593
1594 dec_unacked(mdev);
1595
1596 return ok;
1597}
1598
1599/* Called from receive_Data.
1600 * Synchronize packets on sock with packets on msock.
1601 *
1602 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1603 * packet traveling on msock, they are still processed in the order they have
1604 * been sent.
1605 *
1606 * Note: we don't care for Ack packets overtaking P_DATA packets.
1607 *
1608 * In case packet_seq is larger than mdev->peer_seq number, there are
1609 * outstanding packets on the msock. We wait for them to arrive.
1610 * In case we are the logically next packet, we update mdev->peer_seq
1611 * ourselves. Correctly handles 32bit wrap around.
1612 *
1613 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1614 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1615 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1616 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1617 *
1618 * returns 0 if we may process the packet,
1619 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1620static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1621{
1622 DEFINE_WAIT(wait);
1623 unsigned int p_seq;
1624 long timeout;
1625 int ret = 0;
1626 spin_lock(&mdev->peer_seq_lock);
1627 for (;;) {
1628 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1629 if (seq_le(packet_seq, mdev->peer_seq+1))
1630 break;
1631 if (signal_pending(current)) {
1632 ret = -ERESTARTSYS;
1633 break;
1634 }
1635 p_seq = mdev->peer_seq;
1636 spin_unlock(&mdev->peer_seq_lock);
1637 timeout = schedule_timeout(30*HZ);
1638 spin_lock(&mdev->peer_seq_lock);
1639 if (timeout == 0 && p_seq == mdev->peer_seq) {
1640 ret = -ETIMEDOUT;
1641 dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1642 break;
1643 }
1644 }
1645 finish_wait(&mdev->seq_wait, &wait);
1646 if (mdev->peer_seq+1 == packet_seq)
1647 mdev->peer_seq++;
1648 spin_unlock(&mdev->peer_seq_lock);
1649 return ret;
1650}
1651
1652/* mirrored write */
1653static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1654{
1655 sector_t sector;
1656 struct drbd_epoch_entry *e;
1657 struct p_data *p = (struct p_data *)h;
1658 int header_size, data_size;
1659 int rw = WRITE;
1660 u32 dp_flags;
1661
1662 header_size = sizeof(*p) - sizeof(*h);
1663 data_size = h->length - header_size;
1664
1665 ERR_IF(data_size == 0) return FALSE;
1666
1667 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1668 return FALSE;
1669
1670 if (!get_ldev(mdev)) {
1671 if (__ratelimit(&drbd_ratelimit_state))
1672 dev_err(DEV, "Can not write mirrored data block "
1673 "to local disk.\n");
1674 spin_lock(&mdev->peer_seq_lock);
1675 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1676 mdev->peer_seq++;
1677 spin_unlock(&mdev->peer_seq_lock);
1678
1679 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1680 atomic_inc(&mdev->current_epoch->epoch_size);
1681 return drbd_drain_block(mdev, data_size);
1682 }
1683
1684 /* get_ldev(mdev) successful.
1685 * Corresponding put_ldev done either below (on various errors),
1686 * or in drbd_endio_write_sec, if we successfully submit the data at
1687 * the end of this function. */
1688
1689 sector = be64_to_cpu(p->sector);
1690 e = read_in_block(mdev, p->block_id, sector, data_size);
1691 if (!e) {
1692 put_ldev(mdev);
1693 return FALSE;
1694 }
1695
1696 e->private_bio->bi_end_io = drbd_endio_write_sec;
1697 e->w.cb = e_end_block;
1698
1699 spin_lock(&mdev->epoch_lock);
1700 e->epoch = mdev->current_epoch;
1701 atomic_inc(&e->epoch->epoch_size);
1702 atomic_inc(&e->epoch->active);
1703
1704 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1705 struct drbd_epoch *epoch;
1706 /* Issue a barrier if we start a new epoch, and the previous epoch
1707 was not a epoch containing a single request which already was
1708 a Barrier. */
1709 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1710 if (epoch == e->epoch) {
1711 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1712 rw |= (1<<BIO_RW_BARRIER);
1713 e->flags |= EE_IS_BARRIER;
1714 } else {
1715 if (atomic_read(&epoch->epoch_size) > 1 ||
1716 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1717 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1718 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1719 rw |= (1<<BIO_RW_BARRIER);
1720 e->flags |= EE_IS_BARRIER;
1721 }
1722 }
1723 }
1724 spin_unlock(&mdev->epoch_lock);
1725
1726 dp_flags = be32_to_cpu(p->dp_flags);
1727 if (dp_flags & DP_HARDBARRIER) {
1728 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1729 /* rw |= (1<<BIO_RW_BARRIER); */
1730 }
1731 if (dp_flags & DP_RW_SYNC)
1732 rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
1733 if (dp_flags & DP_MAY_SET_IN_SYNC)
1734 e->flags |= EE_MAY_SET_IN_SYNC;
1735
1736 /* I'm the receiver, I do hold a net_cnt reference. */
1737 if (!mdev->net_conf->two_primaries) {
1738 spin_lock_irq(&mdev->req_lock);
1739 } else {
1740 /* don't get the req_lock yet,
1741 * we may sleep in drbd_wait_peer_seq */
1742 const int size = e->size;
1743 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1744 DEFINE_WAIT(wait);
1745 struct drbd_request *i;
1746 struct hlist_node *n;
1747 struct hlist_head *slot;
1748 int first;
1749
1750 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1751 BUG_ON(mdev->ee_hash == NULL);
1752 BUG_ON(mdev->tl_hash == NULL);
1753
1754 /* conflict detection and handling:
1755 * 1. wait on the sequence number,
1756 * in case this data packet overtook ACK packets.
1757 * 2. check our hash tables for conflicting requests.
1758 * we only need to walk the tl_hash, since an ee can not
1759 * have a conflict with an other ee: on the submitting
1760 * node, the corresponding req had already been conflicting,
1761 * and a conflicting req is never sent.
1762 *
1763 * Note: for two_primaries, we are protocol C,
1764 * so there cannot be any request that is DONE
1765 * but still on the transfer log.
1766 *
1767 * unconditionally add to the ee_hash.
1768 *
1769 * if no conflicting request is found:
1770 * submit.
1771 *
1772 * if any conflicting request is found
1773 * that has not yet been acked,
1774 * AND I have the "discard concurrent writes" flag:
1775 * queue (via done_ee) the P_DISCARD_ACK; OUT.
1776 *
1777 * if any conflicting request is found:
1778 * block the receiver, waiting on misc_wait
1779 * until no more conflicting requests are there,
1780 * or we get interrupted (disconnect).
1781 *
1782 * we do not just write after local io completion of those
1783 * requests, but only after req is done completely, i.e.
1784 * we wait for the P_DISCARD_ACK to arrive!
1785 *
1786 * then proceed normally, i.e. submit.
1787 */
1788 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1789 goto out_interrupted;
1790
1791 spin_lock_irq(&mdev->req_lock);
1792
1793 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1794
1795#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1796 slot = tl_hash_slot(mdev, sector);
1797 first = 1;
1798 for (;;) {
1799 int have_unacked = 0;
1800 int have_conflict = 0;
1801 prepare_to_wait(&mdev->misc_wait, &wait,
1802 TASK_INTERRUPTIBLE);
1803 hlist_for_each_entry(i, n, slot, colision) {
1804 if (OVERLAPS) {
1805 /* only ALERT on first iteration,
1806 * we may be woken up early... */
1807 if (first)
1808 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1809 " new: %llus +%u; pending: %llus +%u\n",
1810 current->comm, current->pid,
1811 (unsigned long long)sector, size,
1812 (unsigned long long)i->sector, i->size);
1813 if (i->rq_state & RQ_NET_PENDING)
1814 ++have_unacked;
1815 ++have_conflict;
1816 }
1817 }
1818#undef OVERLAPS
1819 if (!have_conflict)
1820 break;
1821
1822 /* Discard Ack only for the _first_ iteration */
1823 if (first && discard && have_unacked) {
1824 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1825 (unsigned long long)sector);
1826 inc_unacked(mdev);
1827 e->w.cb = e_send_discard_ack;
1828 list_add_tail(&e->w.list, &mdev->done_ee);
1829
1830 spin_unlock_irq(&mdev->req_lock);
1831
1832 /* we could probably send that P_DISCARD_ACK ourselves,
1833 * but I don't like the receiver using the msock */
1834
1835 put_ldev(mdev);
1836 wake_asender(mdev);
1837 finish_wait(&mdev->misc_wait, &wait);
1838 return TRUE;
1839 }
1840
1841 if (signal_pending(current)) {
1842 hlist_del_init(&e->colision);
1843
1844 spin_unlock_irq(&mdev->req_lock);
1845
1846 finish_wait(&mdev->misc_wait, &wait);
1847 goto out_interrupted;
1848 }
1849
1850 spin_unlock_irq(&mdev->req_lock);
1851 if (first) {
1852 first = 0;
1853 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1854 "sec=%llus\n", (unsigned long long)sector);
1855 } else if (discard) {
1856 /* we had none on the first iteration.
1857 * there must be none now. */
1858 D_ASSERT(have_unacked == 0);
1859 }
1860 schedule();
1861 spin_lock_irq(&mdev->req_lock);
1862 }
1863 finish_wait(&mdev->misc_wait, &wait);
1864 }
1865
1866 list_add(&e->w.list, &mdev->active_ee);
1867 spin_unlock_irq(&mdev->req_lock);
1868
1869 switch (mdev->net_conf->wire_protocol) {
1870 case DRBD_PROT_C:
1871 inc_unacked(mdev);
1872 /* corresponding dec_unacked() in e_end_block()
1873 * respective _drbd_clear_done_ee */
1874 break;
1875 case DRBD_PROT_B:
1876 /* I really don't like it that the receiver thread
1877 * sends on the msock, but anyways */
1878 drbd_send_ack(mdev, P_RECV_ACK, e);
1879 break;
1880 case DRBD_PROT_A:
1881 /* nothing to do */
1882 break;
1883 }
1884
1885 if (mdev->state.pdsk == D_DISKLESS) {
1886 /* In case we have the only disk of the cluster, */
1887 drbd_set_out_of_sync(mdev, e->sector, e->size);
1888 e->flags |= EE_CALL_AL_COMPLETE_IO;
1889 drbd_al_begin_io(mdev, e->sector);
1890 }
1891
1892 e->private_bio->bi_rw = rw;
1893 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
1894 /* accounting done in endio */
1895
1896 maybe_kick_lo(mdev);
1897 return TRUE;
1898
1899out_interrupted:
1900 /* yes, the epoch_size now is imbalanced.
1901 * but we drop the connection anyways, so we don't have a chance to
1902 * receive a barrier... atomic_inc(&mdev->epoch_size); */
1903 put_ldev(mdev);
1904 drbd_free_ee(mdev, e);
1905 return FALSE;
1906}
1907
1908static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1909{
1910 sector_t sector;
1911 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1912 struct drbd_epoch_entry *e;
1913 struct digest_info *di = NULL;
1914 int size, digest_size;
1915 unsigned int fault_type;
1916 struct p_block_req *p =
1917 (struct p_block_req *)h;
1918 const int brps = sizeof(*p)-sizeof(*h);
1919
1920 if (drbd_recv(mdev, h->payload, brps) != brps)
1921 return FALSE;
1922
1923 sector = be64_to_cpu(p->sector);
1924 size = be32_to_cpu(p->blksize);
1925
1926 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
1927 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1928 (unsigned long long)sector, size);
1929 return FALSE;
1930 }
1931 if (sector + (size>>9) > capacity) {
1932 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1933 (unsigned long long)sector, size);
1934 return FALSE;
1935 }
1936
1937 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
1938 if (__ratelimit(&drbd_ratelimit_state))
1939 dev_err(DEV, "Can not satisfy peer's read request, "
1940 "no local data.\n");
1941 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
1942 P_NEG_RS_DREPLY , p);
1943 return TRUE;
1944 }
1945
1946 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1947 * "criss-cross" setup, that might cause write-out on some other DRBD,
1948 * which in turn might block on the other node at this very place. */
1949 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1950 if (!e) {
1951 put_ldev(mdev);
1952 return FALSE;
1953 }
1954
1955 e->private_bio->bi_rw = READ;
1956 e->private_bio->bi_end_io = drbd_endio_read_sec;
1957
1958 switch (h->command) {
1959 case P_DATA_REQUEST:
1960 e->w.cb = w_e_end_data_req;
1961 fault_type = DRBD_FAULT_DT_RD;
1962 break;
1963 case P_RS_DATA_REQUEST:
1964 e->w.cb = w_e_end_rsdata_req;
1965 fault_type = DRBD_FAULT_RS_RD;
1966 /* Eventually this should become asynchronously. Currently it
1967 * blocks the whole receiver just to delay the reading of a
1968 * resync data block.
1969 * the drbd_work_queue mechanism is made for this...
1970 */
1971 if (!drbd_rs_begin_io(mdev, sector)) {
1972 /* we have been interrupted,
1973 * probably connection lost! */
1974 D_ASSERT(signal_pending(current));
1975 goto out_free_e;
1976 }
1977 break;
1978
1979 case P_OV_REPLY:
1980 case P_CSUM_RS_REQUEST:
1981 fault_type = DRBD_FAULT_RS_RD;
1982 digest_size = h->length - brps ;
1983 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
1984 if (!di)
1985 goto out_free_e;
1986
1987 di->digest_size = digest_size;
1988 di->digest = (((char *)di)+sizeof(struct digest_info));
1989
1990 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
1991 goto out_free_e;
1992
1993 e->block_id = (u64)(unsigned long)di;
1994 if (h->command == P_CSUM_RS_REQUEST) {
1995 D_ASSERT(mdev->agreed_pro_version >= 89);
1996 e->w.cb = w_e_end_csum_rs_req;
1997 } else if (h->command == P_OV_REPLY) {
1998 e->w.cb = w_e_end_ov_reply;
1999 dec_rs_pending(mdev);
2000 break;
2001 }
2002
2003 if (!drbd_rs_begin_io(mdev, sector)) {
2004 /* we have been interrupted, probably connection lost! */
2005 D_ASSERT(signal_pending(current));
2006 goto out_free_e;
2007 }
2008 break;
2009
2010 case P_OV_REQUEST:
2011 if (mdev->state.conn >= C_CONNECTED &&
2012 mdev->state.conn != C_VERIFY_T)
2013 dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2014 drbd_conn_str(mdev->state.conn));
2015 if (mdev->ov_start_sector == ~(sector_t)0 &&
2016 mdev->agreed_pro_version >= 90) {
2017 mdev->ov_start_sector = sector;
2018 mdev->ov_position = sector;
2019 mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2020 dev_info(DEV, "Online Verify start sector: %llu\n",
2021 (unsigned long long)sector);
2022 }
2023 e->w.cb = w_e_end_ov_req;
2024 fault_type = DRBD_FAULT_RS_RD;
2025 /* Eventually this should become asynchronous. Currently it
2026 * blocks the whole receiver just to delay the reading of a
2027 * resync data block.
2028 * the drbd_work_queue mechanism is made for this...
2029 */
2030 if (!drbd_rs_begin_io(mdev, sector)) {
2031 /* we have been interrupted,
2032 * probably connection lost! */
2033 D_ASSERT(signal_pending(current));
2034 goto out_free_e;
2035 }
2036 break;
2037
2038
2039 default:
2040 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2041 cmdname(h->command));
2042 fault_type = DRBD_FAULT_MAX;
2043 }
2044
2045 spin_lock_irq(&mdev->req_lock);
2046 list_add(&e->w.list, &mdev->read_ee);
2047 spin_unlock_irq(&mdev->req_lock);
2048
2049 inc_unacked(mdev);
2050
2051 drbd_generic_make_request(mdev, fault_type, e->private_bio);
2052 maybe_kick_lo(mdev);
2053
2054 return TRUE;
2055
2056out_free_e:
2057 kfree(di);
2058 put_ldev(mdev);
2059 drbd_free_ee(mdev, e);
2060 return FALSE;
2061}
2062
2063static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2064{
2065 int self, peer, rv = -100;
2066 unsigned long ch_self, ch_peer;
2067
2068 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2069 peer = mdev->p_uuid[UI_BITMAP] & 1;
2070
2071 ch_peer = mdev->p_uuid[UI_SIZE];
2072 ch_self = mdev->comm_bm_set;
2073
2074 switch (mdev->net_conf->after_sb_0p) {
2075 case ASB_CONSENSUS:
2076 case ASB_DISCARD_SECONDARY:
2077 case ASB_CALL_HELPER:
2078 dev_err(DEV, "Configuration error.\n");
2079 break;
2080 case ASB_DISCONNECT:
2081 break;
2082 case ASB_DISCARD_YOUNGER_PRI:
2083 if (self == 0 && peer == 1) {
2084 rv = -1;
2085 break;
2086 }
2087 if (self == 1 && peer == 0) {
2088 rv = 1;
2089 break;
2090 }
2091 /* Else fall through to one of the other strategies... */
2092 case ASB_DISCARD_OLDER_PRI:
2093 if (self == 0 && peer == 1) {
2094 rv = 1;
2095 break;
2096 }
2097 if (self == 1 && peer == 0) {
2098 rv = -1;
2099 break;
2100 }
2101 /* Else fall through to one of the other strategies... */
2102 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
2103 "Using discard-least-changes instead\n");
2104 case ASB_DISCARD_ZERO_CHG:
2105 if (ch_peer == 0 && ch_self == 0) {
2106 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2107 ? -1 : 1;
2108 break;
2109 } else {
2110 if (ch_peer == 0) { rv = 1; break; }
2111 if (ch_self == 0) { rv = -1; break; }
2112 }
2113 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2114 break;
2115 case ASB_DISCARD_LEAST_CHG:
2116 if (ch_self < ch_peer)
2117 rv = -1;
2118 else if (ch_self > ch_peer)
2119 rv = 1;
2120 else /* ( ch_self == ch_peer ) */
2121 /* Well, then use something else. */
2122 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2123 ? -1 : 1;
2124 break;
2125 case ASB_DISCARD_LOCAL:
2126 rv = -1;
2127 break;
2128 case ASB_DISCARD_REMOTE:
2129 rv = 1;
2130 }
2131
2132 return rv;
2133}
2134
2135static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2136{
2137 int self, peer, hg, rv = -100;
2138
2139 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2140 peer = mdev->p_uuid[UI_BITMAP] & 1;
2141
2142 switch (mdev->net_conf->after_sb_1p) {
2143 case ASB_DISCARD_YOUNGER_PRI:
2144 case ASB_DISCARD_OLDER_PRI:
2145 case ASB_DISCARD_LEAST_CHG:
2146 case ASB_DISCARD_LOCAL:
2147 case ASB_DISCARD_REMOTE:
2148 dev_err(DEV, "Configuration error.\n");
2149 break;
2150 case ASB_DISCONNECT:
2151 break;
2152 case ASB_CONSENSUS:
2153 hg = drbd_asb_recover_0p(mdev);
2154 if (hg == -1 && mdev->state.role == R_SECONDARY)
2155 rv = hg;
2156 if (hg == 1 && mdev->state.role == R_PRIMARY)
2157 rv = hg;
2158 break;
2159 case ASB_VIOLENTLY:
2160 rv = drbd_asb_recover_0p(mdev);
2161 break;
2162 case ASB_DISCARD_SECONDARY:
2163 return mdev->state.role == R_PRIMARY ? 1 : -1;
2164 case ASB_CALL_HELPER:
2165 hg = drbd_asb_recover_0p(mdev);
2166 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2167 self = drbd_set_role(mdev, R_SECONDARY, 0);
2168 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2169 * we might be here in C_WF_REPORT_PARAMS which is transient.
2170 * we do not need to wait for the after state change work either. */
2171 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2172 if (self != SS_SUCCESS) {
2173 drbd_khelper(mdev, "pri-lost-after-sb");
2174 } else {
2175 dev_warn(DEV, "Successfully gave up primary role.\n");
2176 rv = hg;
2177 }
2178 } else
2179 rv = hg;
2180 }
2181
2182 return rv;
2183}
2184
2185static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2186{
2187 int self, peer, hg, rv = -100;
2188
2189 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2190 peer = mdev->p_uuid[UI_BITMAP] & 1;
2191
2192 switch (mdev->net_conf->after_sb_2p) {
2193 case ASB_DISCARD_YOUNGER_PRI:
2194 case ASB_DISCARD_OLDER_PRI:
2195 case ASB_DISCARD_LEAST_CHG:
2196 case ASB_DISCARD_LOCAL:
2197 case ASB_DISCARD_REMOTE:
2198 case ASB_CONSENSUS:
2199 case ASB_DISCARD_SECONDARY:
2200 dev_err(DEV, "Configuration error.\n");
2201 break;
2202 case ASB_VIOLENTLY:
2203 rv = drbd_asb_recover_0p(mdev);
2204 break;
2205 case ASB_DISCONNECT:
2206 break;
2207 case ASB_CALL_HELPER:
2208 hg = drbd_asb_recover_0p(mdev);
2209 if (hg == -1) {
2210 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2211 * we might be here in C_WF_REPORT_PARAMS which is transient.
2212 * we do not need to wait for the after state change work either. */
2213 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2214 if (self != SS_SUCCESS) {
2215 drbd_khelper(mdev, "pri-lost-after-sb");
2216 } else {
2217 dev_warn(DEV, "Successfully gave up primary role.\n");
2218 rv = hg;
2219 }
2220 } else
2221 rv = hg;
2222 }
2223
2224 return rv;
2225}
2226
2227static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2228 u64 bits, u64 flags)
2229{
2230 if (!uuid) {
2231 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2232 return;
2233 }
2234 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2235 text,
2236 (unsigned long long)uuid[UI_CURRENT],
2237 (unsigned long long)uuid[UI_BITMAP],
2238 (unsigned long long)uuid[UI_HISTORY_START],
2239 (unsigned long long)uuid[UI_HISTORY_END],
2240 (unsigned long long)bits,
2241 (unsigned long long)flags);
2242}
2243
2244/*
2245 100 after split brain try auto recover
2246 2 C_SYNC_SOURCE set BitMap
2247 1 C_SYNC_SOURCE use BitMap
2248 0 no Sync
2249 -1 C_SYNC_TARGET use BitMap
2250 -2 C_SYNC_TARGET set BitMap
2251 -100 after split brain, disconnect
2252-1000 unrelated data
2253 */
2254static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2255{
2256 u64 self, peer;
2257 int i, j;
2258
2259 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2260 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2261
2262 *rule_nr = 10;
2263 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2264 return 0;
2265
2266 *rule_nr = 20;
2267 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2268 peer != UUID_JUST_CREATED)
2269 return -2;
2270
2271 *rule_nr = 30;
2272 if (self != UUID_JUST_CREATED &&
2273 (peer == UUID_JUST_CREATED || peer == (u64)0))
2274 return 2;
2275
2276 if (self == peer) {
2277 int rct, dc; /* roles at crash time */
2278
2279 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2280
2281 if (mdev->agreed_pro_version < 91)
2282 return -1001;
2283
2284 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2285 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2286 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2287 drbd_uuid_set_bm(mdev, 0UL);
2288
2289 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2290 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2291 *rule_nr = 34;
2292 } else {
2293 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2294 *rule_nr = 36;
2295 }
2296
2297 return 1;
2298 }
2299
2300 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2301
2302 if (mdev->agreed_pro_version < 91)
2303 return -1001;
2304
2305 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2306 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2307 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2308
2309 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2310 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2311 mdev->p_uuid[UI_BITMAP] = 0UL;
2312
2313 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2314 *rule_nr = 35;
2315 } else {
2316 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2317 *rule_nr = 37;
2318 }
2319
2320 return -1;
2321 }
2322
2323 /* Common power [off|failure] */
2324 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2325 (mdev->p_uuid[UI_FLAGS] & 2);
2326 /* lowest bit is set when we were primary,
2327 * next bit (weight 2) is set when peer was primary */
2328 *rule_nr = 40;
2329
2330 switch (rct) {
2331 case 0: /* !self_pri && !peer_pri */ return 0;
2332 case 1: /* self_pri && !peer_pri */ return 1;
2333 case 2: /* !self_pri && peer_pri */ return -1;
2334 case 3: /* self_pri && peer_pri */
2335 dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2336 return dc ? -1 : 1;
2337 }
2338 }
2339
2340 *rule_nr = 50;
2341 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2342 if (self == peer)
2343 return -1;
2344
2345 *rule_nr = 51;
2346 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2347 if (self == peer) {
2348 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2349 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2350 if (self == peer) {
2351 /* The last P_SYNC_UUID did not get though. Undo the last start of
2352 resync as sync source modifications of the peer's UUIDs. */
2353
2354 if (mdev->agreed_pro_version < 91)
2355 return -1001;
2356
2357 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2358 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2359 return -1;
2360 }
2361 }
2362
2363 *rule_nr = 60;
2364 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2365 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2366 peer = mdev->p_uuid[i] & ~((u64)1);
2367 if (self == peer)
2368 return -2;
2369 }
2370
2371 *rule_nr = 70;
2372 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2373 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2374 if (self == peer)
2375 return 1;
2376
2377 *rule_nr = 71;
2378 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2379 if (self == peer) {
2380 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2381 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2382 if (self == peer) {
2383 /* The last P_SYNC_UUID did not get though. Undo the last start of
2384 resync as sync source modifications of our UUIDs. */
2385
2386 if (mdev->agreed_pro_version < 91)
2387 return -1001;
2388
2389 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2390 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2391
2392 dev_info(DEV, "Undid last start of resync:\n");
2393
2394 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2395 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2396
2397 return 1;
2398 }
2399 }
2400
2401
2402 *rule_nr = 80;
2403 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2404 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2405 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2406 if (self == peer)
2407 return 2;
2408 }
2409
2410 *rule_nr = 90;
2411 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2412 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2413 if (self == peer && self != ((u64)0))
2414 return 100;
2415
2416 *rule_nr = 100;
2417 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2418 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2419 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2420 peer = mdev->p_uuid[j] & ~((u64)1);
2421 if (self == peer)
2422 return -100;
2423 }
2424 }
2425
2426 return -1000;
2427}
2428
2429/* drbd_sync_handshake() returns the new conn state on success, or
2430 CONN_MASK (-1) on failure.
2431 */
2432static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2433 enum drbd_disk_state peer_disk) __must_hold(local)
2434{
2435 int hg, rule_nr;
2436 enum drbd_conns rv = C_MASK;
2437 enum drbd_disk_state mydisk;
2438
2439 mydisk = mdev->state.disk;
2440 if (mydisk == D_NEGOTIATING)
2441 mydisk = mdev->new_state_tmp.disk;
2442
2443 dev_info(DEV, "drbd_sync_handshake:\n");
2444 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2445 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2446 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2447
2448 hg = drbd_uuid_compare(mdev, &rule_nr);
2449
2450 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2451
2452 if (hg == -1000) {
2453 dev_alert(DEV, "Unrelated data, aborting!\n");
2454 return C_MASK;
2455 }
2456 if (hg == -1001) {
2457 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
2458 return C_MASK;
2459 }
2460
2461 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2462 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2463 int f = (hg == -100) || abs(hg) == 2;
2464 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2465 if (f)
2466 hg = hg*2;
2467 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2468 hg > 0 ? "source" : "target");
2469 }
2470
2471 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2472 int pcount = (mdev->state.role == R_PRIMARY)
2473 + (peer_role == R_PRIMARY);
2474 int forced = (hg == -100);
2475
2476 switch (pcount) {
2477 case 0:
2478 hg = drbd_asb_recover_0p(mdev);
2479 break;
2480 case 1:
2481 hg = drbd_asb_recover_1p(mdev);
2482 break;
2483 case 2:
2484 hg = drbd_asb_recover_2p(mdev);
2485 break;
2486 }
2487 if (abs(hg) < 100) {
2488 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2489 "automatically solved. Sync from %s node\n",
2490 pcount, (hg < 0) ? "peer" : "this");
2491 if (forced) {
2492 dev_warn(DEV, "Doing a full sync, since"
2493 " UUIDs where ambiguous.\n");
2494 hg = hg*2;
2495 }
2496 }
2497 }
2498
2499 if (hg == -100) {
2500 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2501 hg = -1;
2502 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2503 hg = 1;
2504
2505 if (abs(hg) < 100)
2506 dev_warn(DEV, "Split-Brain detected, manually solved. "
2507 "Sync from %s node\n",
2508 (hg < 0) ? "peer" : "this");
2509 }
2510
2511 if (hg == -100) {
2512 dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
2513 drbd_khelper(mdev, "split-brain");
2514 return C_MASK;
2515 }
2516
2517 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2518 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2519 return C_MASK;
2520 }
2521
2522 if (hg < 0 && /* by intention we do not use mydisk here. */
2523 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2524 switch (mdev->net_conf->rr_conflict) {
2525 case ASB_CALL_HELPER:
2526 drbd_khelper(mdev, "pri-lost");
2527 /* fall through */
2528 case ASB_DISCONNECT:
2529 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2530 return C_MASK;
2531 case ASB_VIOLENTLY:
2532 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2533 "assumption\n");
2534 }
2535 }
2536
2537 if (abs(hg) >= 2) {
2538 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2539 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2540 return C_MASK;
2541 }
2542
2543 if (hg > 0) { /* become sync source. */
2544 rv = C_WF_BITMAP_S;
2545 } else if (hg < 0) { /* become sync target */
2546 rv = C_WF_BITMAP_T;
2547 } else {
2548 rv = C_CONNECTED;
2549 if (drbd_bm_total_weight(mdev)) {
2550 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2551 drbd_bm_total_weight(mdev));
2552 }
2553 }
2554
2555 return rv;
2556}
2557
2558/* returns 1 if invalid */
2559static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2560{
2561 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2562 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2563 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2564 return 0;
2565
2566 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2567 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2568 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2569 return 1;
2570
2571 /* everything else is valid if they are equal on both sides. */
2572 if (peer == self)
2573 return 0;
2574
2575 /* everything es is invalid. */
2576 return 1;
2577}
2578
2579static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2580{
2581 struct p_protocol *p = (struct p_protocol *)h;
2582 int header_size, data_size;
2583 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2584 int p_want_lose, p_two_primaries;
2585 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2586
2587 header_size = sizeof(*p) - sizeof(*h);
2588 data_size = h->length - header_size;
2589
2590 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2591 return FALSE;
2592
2593 p_proto = be32_to_cpu(p->protocol);
2594 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2595 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2596 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
2597 p_want_lose = be32_to_cpu(p->want_lose);
2598 p_two_primaries = be32_to_cpu(p->two_primaries);
2599
2600 if (p_proto != mdev->net_conf->wire_protocol) {
2601 dev_err(DEV, "incompatible communication protocols\n");
2602 goto disconnect;
2603 }
2604
2605 if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2606 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2607 goto disconnect;
2608 }
2609
2610 if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2611 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2612 goto disconnect;
2613 }
2614
2615 if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2616 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2617 goto disconnect;
2618 }
2619
2620 if (p_want_lose && mdev->net_conf->want_lose) {
2621 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2622 goto disconnect;
2623 }
2624
2625 if (p_two_primaries != mdev->net_conf->two_primaries) {
2626 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2627 goto disconnect;
2628 }
2629
2630 if (mdev->agreed_pro_version >= 87) {
2631 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2632
2633 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2634 return FALSE;
2635
2636 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2637 if (strcmp(p_integrity_alg, my_alg)) {
2638 dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2639 goto disconnect;
2640 }
2641 dev_info(DEV, "data-integrity-alg: %s\n",
2642 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2643 }
2644
2645 return TRUE;
2646
2647disconnect:
2648 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2649 return FALSE;
2650}
2651
2652/* helper function
2653 * input: alg name, feature name
2654 * return: NULL (alg name was "")
2655 * ERR_PTR(error) if something goes wrong
2656 * or the crypto hash ptr, if it worked out ok. */
2657struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2658 const char *alg, const char *name)
2659{
2660 struct crypto_hash *tfm;
2661
2662 if (!alg[0])
2663 return NULL;
2664
2665 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2666 if (IS_ERR(tfm)) {
2667 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2668 alg, name, PTR_ERR(tfm));
2669 return tfm;
2670 }
2671 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2672 crypto_free_hash(tfm);
2673 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2674 return ERR_PTR(-EINVAL);
2675 }
2676 return tfm;
2677}
2678
2679static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2680{
2681 int ok = TRUE;
2682 struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
2683 unsigned int header_size, data_size, exp_max_sz;
2684 struct crypto_hash *verify_tfm = NULL;
2685 struct crypto_hash *csums_tfm = NULL;
2686 const int apv = mdev->agreed_pro_version;
2687
2688 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2689 : apv == 88 ? sizeof(struct p_rs_param)
2690 + SHARED_SECRET_MAX
2691 : /* 89 */ sizeof(struct p_rs_param_89);
2692
2693 if (h->length > exp_max_sz) {
2694 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2695 h->length, exp_max_sz);
2696 return FALSE;
2697 }
2698
2699 if (apv <= 88) {
2700 header_size = sizeof(struct p_rs_param) - sizeof(*h);
2701 data_size = h->length - header_size;
2702 } else /* apv >= 89 */ {
2703 header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
2704 data_size = h->length - header_size;
2705 D_ASSERT(data_size == 0);
2706 }
2707
2708 /* initialize verify_alg and csums_alg */
2709 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2710
2711 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2712 return FALSE;
2713
2714 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2715
2716 if (apv >= 88) {
2717 if (apv == 88) {
2718 if (data_size > SHARED_SECRET_MAX) {
2719 dev_err(DEV, "verify-alg too long, "
2720 "peer wants %u, accepting only %u byte\n",
2721 data_size, SHARED_SECRET_MAX);
2722 return FALSE;
2723 }
2724
2725 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2726 return FALSE;
2727
2728 /* we expect NUL terminated string */
2729 /* but just in case someone tries to be evil */
2730 D_ASSERT(p->verify_alg[data_size-1] == 0);
2731 p->verify_alg[data_size-1] = 0;
2732
2733 } else /* apv >= 89 */ {
2734 /* we still expect NUL terminated strings */
2735 /* but just in case someone tries to be evil */
2736 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2737 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2738 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2739 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2740 }
2741
2742 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2743 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2744 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2745 mdev->sync_conf.verify_alg, p->verify_alg);
2746 goto disconnect;
2747 }
2748 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2749 p->verify_alg, "verify-alg");
2750 if (IS_ERR(verify_tfm)) {
2751 verify_tfm = NULL;
2752 goto disconnect;
2753 }
2754 }
2755
2756 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2757 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2758 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2759 mdev->sync_conf.csums_alg, p->csums_alg);
2760 goto disconnect;
2761 }
2762 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2763 p->csums_alg, "csums-alg");
2764 if (IS_ERR(csums_tfm)) {
2765 csums_tfm = NULL;
2766 goto disconnect;
2767 }
2768 }
2769
2770
2771 spin_lock(&mdev->peer_seq_lock);
2772 /* lock against drbd_nl_syncer_conf() */
2773 if (verify_tfm) {
2774 strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2775 mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2776 crypto_free_hash(mdev->verify_tfm);
2777 mdev->verify_tfm = verify_tfm;
2778 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2779 }
2780 if (csums_tfm) {
2781 strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2782 mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2783 crypto_free_hash(mdev->csums_tfm);
2784 mdev->csums_tfm = csums_tfm;
2785 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2786 }
2787 spin_unlock(&mdev->peer_seq_lock);
2788 }
2789
2790 return ok;
2791disconnect:
2792 /* just for completeness: actually not needed,
2793 * as this is not reached if csums_tfm was ok. */
2794 crypto_free_hash(csums_tfm);
2795 /* but free the verify_tfm again, if csums_tfm did not work out */
2796 crypto_free_hash(verify_tfm);
2797 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2798 return FALSE;
2799}
2800
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
	/* Intentionally empty: there is currently no working
	 * implementation of distributed TCQ, so the peer's queue
	 * order type is ignored. */
}
2806
2807/* warn if the arguments differ by more than 12.5% */
2808static void warn_if_differ_considerably(struct drbd_conf *mdev,
2809 const char *s, sector_t a, sector_t b)
2810{
2811 sector_t d;
2812 if (a == 0 || b == 0)
2813 return;
2814 d = (a > b) ? (a - b) : (b - a);
2815 if (d > (a>>3) || d > (b>>3))
2816 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2817 (unsigned long long)a, (unsigned long long)b);
2818}
2819
2820static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2821{
2822 struct p_sizes *p = (struct p_sizes *)h;
2823 enum determine_dev_size dd = unchanged;
2824 unsigned int max_seg_s;
2825 sector_t p_size, p_usize, my_usize;
2826 int ldsc = 0; /* local disk size changed */
2827 enum drbd_conns nconn;
2828
2829 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2830 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2831 return FALSE;
2832
2833 p_size = be64_to_cpu(p->d_size);
2834 p_usize = be64_to_cpu(p->u_size);
2835
2836 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2837 dev_err(DEV, "some backing storage is needed\n");
2838 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2839 return FALSE;
2840 }
2841
2842 /* just store the peer's disk size for now.
2843 * we still need to figure out whether we accept that. */
2844 mdev->p_size = p_size;
2845
2846#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
2847 if (get_ldev(mdev)) {
2848 warn_if_differ_considerably(mdev, "lower level device sizes",
2849 p_size, drbd_get_max_capacity(mdev->ldev));
2850 warn_if_differ_considerably(mdev, "user requested size",
2851 p_usize, mdev->ldev->dc.disk_size);
2852
2853 /* if this is the first connect, or an otherwise expected
2854 * param exchange, choose the minimum */
2855 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2856 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
2857 p_usize);
2858
2859 my_usize = mdev->ldev->dc.disk_size;
2860
2861 if (mdev->ldev->dc.disk_size != p_usize) {
2862 mdev->ldev->dc.disk_size = p_usize;
2863 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
2864 (unsigned long)mdev->ldev->dc.disk_size);
2865 }
2866
2867 /* Never shrink a device with usable data during connect.
2868 But allow online shrinking if we are connected. */
2869 if (drbd_new_dev_size(mdev, mdev->ldev) <
2870 drbd_get_capacity(mdev->this_bdev) &&
2871 mdev->state.disk >= D_OUTDATED &&
2872 mdev->state.conn < C_CONNECTED) {
2873 dev_err(DEV, "The peer's disk size is too small!\n");
2874 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2875 mdev->ldev->dc.disk_size = my_usize;
2876 put_ldev(mdev);
2877 return FALSE;
2878 }
2879 put_ldev(mdev);
2880 }
2881#undef min_not_zero
2882
2883 if (get_ldev(mdev)) {
2884 dd = drbd_determin_dev_size(mdev);
2885 put_ldev(mdev);
2886 if (dd == dev_size_error)
2887 return FALSE;
2888 drbd_md_sync(mdev);
2889 } else {
2890 /* I am diskless, need to accept the peer's size. */
2891 drbd_set_my_capacity(mdev, p_size);
2892 }
2893
2894 if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
2895 nconn = drbd_sync_handshake(mdev,
2896 mdev->state.peer, mdev->state.pdsk);
2897 put_ldev(mdev);
2898
2899 if (nconn == C_MASK) {
2900 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2901 return FALSE;
2902 }
2903
2904 if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
2905 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2906 return FALSE;
2907 }
2908 }
2909
2910 if (get_ldev(mdev)) {
2911 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
2912 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2913 ldsc = 1;
2914 }
2915
2916 max_seg_s = be32_to_cpu(p->max_segment_size);
2917 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
2918 drbd_setup_queue_param(mdev, max_seg_s);
2919
2920 drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
2921 put_ldev(mdev);
2922 }
2923
2924 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
2925 if (be64_to_cpu(p->c_size) !=
2926 drbd_get_capacity(mdev->this_bdev) || ldsc) {
2927 /* we have different sizes, probably peer
2928 * needs to know my new size... */
2929 drbd_send_sizes(mdev, 0);
2930 }
2931 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
2932 (dd == grew && mdev->state.conn == C_CONNECTED)) {
2933 if (mdev->state.pdsk >= D_INCONSISTENT &&
2934 mdev->state.disk >= D_INCONSISTENT)
2935 resync_after_online_grow(mdev);
2936 else
2937 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
2938 }
2939 }
2940
2941 return TRUE;
2942}
2943
2944static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
2945{
2946 struct p_uuids *p = (struct p_uuids *)h;
2947 u64 *p_uuid;
2948 int i;
2949
2950 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2951 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2952 return FALSE;
2953
2954 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
2955
2956 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
2957 p_uuid[i] = be64_to_cpu(p->uuid[i]);
2958
2959 kfree(mdev->p_uuid);
2960 mdev->p_uuid = p_uuid;
2961
2962 if (mdev->state.conn < C_CONNECTED &&
2963 mdev->state.disk < D_INCONSISTENT &&
2964 mdev->state.role == R_PRIMARY &&
2965 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
2966 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
2967 (unsigned long long)mdev->ed_uuid);
2968 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2969 return FALSE;
2970 }
2971
2972 if (get_ldev(mdev)) {
2973 int skip_initial_sync =
2974 mdev->state.conn == C_CONNECTED &&
2975 mdev->agreed_pro_version >= 90 &&
2976 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
2977 (p_uuid[UI_FLAGS] & 8);
2978 if (skip_initial_sync) {
2979 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
2980 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
2981 "clear_n_write from receive_uuids");
2982 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
2983 _drbd_uuid_set(mdev, UI_BITMAP, 0);
2984 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
2985 CS_VERBOSE, NULL);
2986 drbd_md_sync(mdev);
2987 }
2988 put_ldev(mdev);
2989 }
2990
2991 /* Before we test for the disk state, we should wait until an eventually
2992 ongoing cluster wide state change is finished. That is important if
2993 we are primary and are detaching from our disk. We need to see the
2994 new disk state... */
2995 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
2996 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
2997 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
2998
2999 return TRUE;
3000}
3001
3002/**
3003 * convert_state() - Converts the peer's view of the cluster state to our point of view
3004 * @ps: The state as seen by the peer.
3005 */
3006static union drbd_state convert_state(union drbd_state ps)
3007{
3008 union drbd_state ms;
3009
3010 static enum drbd_conns c_tab[] = {
3011 [C_CONNECTED] = C_CONNECTED,
3012
3013 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3014 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3015 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3016 [C_VERIFY_S] = C_VERIFY_T,
3017 [C_MASK] = C_MASK,
3018 };
3019
3020 ms.i = ps.i;
3021
3022 ms.conn = c_tab[ps.conn];
3023 ms.peer = ps.role;
3024 ms.role = ps.peer;
3025 ms.pdsk = ps.disk;
3026 ms.disk = ps.pdsk;
3027 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3028
3029 return ms;
3030}
3031
3032static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3033{
3034 struct p_req_state *p = (struct p_req_state *)h;
3035 union drbd_state mask, val;
3036 int rv;
3037
3038 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3039 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3040 return FALSE;
3041
3042 mask.i = be32_to_cpu(p->mask);
3043 val.i = be32_to_cpu(p->val);
3044
3045 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3046 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3047 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3048 return TRUE;
3049 }
3050
3051 mask = convert_state(mask);
3052 val = convert_state(val);
3053
3054 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3055
3056 drbd_send_sr_reply(mdev, rv);
3057 drbd_md_sync(mdev);
3058
3059 return TRUE;
3060}
3061
3062static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3063{
3064 struct p_state *p = (struct p_state *)h;
3065 enum drbd_conns nconn, oconn;
3066 union drbd_state ns, peer_state;
3067 enum drbd_disk_state real_peer_disk;
3068 int rv;
3069
3070 ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3071 return FALSE;
3072
3073 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3074 return FALSE;
3075
3076 peer_state.i = be32_to_cpu(p->state);
3077
3078 real_peer_disk = peer_state.disk;
3079 if (peer_state.disk == D_NEGOTIATING) {
3080 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3081 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3082 }
3083
3084 spin_lock_irq(&mdev->req_lock);
3085 retry:
3086 oconn = nconn = mdev->state.conn;
3087 spin_unlock_irq(&mdev->req_lock);
3088
3089 if (nconn == C_WF_REPORT_PARAMS)
3090 nconn = C_CONNECTED;
3091
3092 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3093 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3094 int cr; /* consider resync */
3095
3096 /* if we established a new connection */
3097 cr = (oconn < C_CONNECTED);
3098 /* if we had an established connection
3099 * and one of the nodes newly attaches a disk */
3100 cr |= (oconn == C_CONNECTED &&
3101 (peer_state.disk == D_NEGOTIATING ||
3102 mdev->state.disk == D_NEGOTIATING));
3103 /* if we have both been inconsistent, and the peer has been
3104 * forced to be UpToDate with --overwrite-data */
3105 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3106 /* if we had been plain connected, and the admin requested to
3107 * start a sync by "invalidate" or "invalidate-remote" */
3108 cr |= (oconn == C_CONNECTED &&
3109 (peer_state.conn >= C_STARTING_SYNC_S &&
3110 peer_state.conn <= C_WF_BITMAP_T));
3111
3112 if (cr)
3113 nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3114
3115 put_ldev(mdev);
3116 if (nconn == C_MASK) {
3117 if (mdev->state.disk == D_NEGOTIATING) {
3118 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3119 nconn = C_CONNECTED;
3120 } else if (peer_state.disk == D_NEGOTIATING) {
3121 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3122 peer_state.disk = D_DISKLESS;
3123 } else {
3124 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3125 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3126 return FALSE;
3127 }
3128 }
3129 }
3130
3131 spin_lock_irq(&mdev->req_lock);
3132 if (mdev->state.conn != oconn)
3133 goto retry;
3134 clear_bit(CONSIDER_RESYNC, &mdev->flags);
3135 ns.i = mdev->state.i;
3136 ns.conn = nconn;
3137 ns.peer = peer_state.role;
3138 ns.pdsk = real_peer_disk;
3139 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3140 if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3141 ns.disk = mdev->new_state_tmp.disk;
3142
3143 rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
3144 ns = mdev->state;
3145 spin_unlock_irq(&mdev->req_lock);
3146
3147 if (rv < SS_SUCCESS) {
3148 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3149 return FALSE;
3150 }
3151
3152 if (oconn > C_WF_REPORT_PARAMS) {
3153 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3154 peer_state.disk != D_NEGOTIATING ) {
3155 /* we want resync, peer has not yet decided to sync... */
3156 /* Nowadays only used when forcing a node into primary role and
3157 setting its disk to UpToDate with that */
3158 drbd_send_uuids(mdev);
3159 drbd_send_state(mdev);
3160 }
3161 }
3162
3163 mdev->net_conf->want_lose = 0;
3164
3165 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3166
3167 return TRUE;
3168}
3169
3170static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3171{
3172 struct p_rs_uuid *p = (struct p_rs_uuid *)h;
3173
3174 wait_event(mdev->misc_wait,
3175 mdev->state.conn == C_WF_SYNC_UUID ||
3176 mdev->state.conn < C_CONNECTED ||
3177 mdev->state.disk < D_NEGOTIATING);
3178
3179 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3180
3181 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3182 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3183 return FALSE;
3184
3185 /* Here the _drbd_uuid_ functions are right, current should
3186 _not_ be rotated into the history */
3187 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3188 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3189 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3190
3191 drbd_start_resync(mdev, C_SYNC_TARGET);
3192
3193 put_ldev(mdev);
3194 } else
3195 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3196
3197 return TRUE;
3198}
3199
/* result of processing one bitmap packet */
enum receive_bitmap_ret {
	OK,	/* packet processed, more packets expected */
	DONE,	/* bitmap transfer complete */
	FAILED,	/* packet could not be received or decoded */
};
3201
3202static enum receive_bitmap_ret
3203receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
3204 unsigned long *buffer, struct bm_xfer_ctx *c)
3205{
3206 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3207 unsigned want = num_words * sizeof(long);
3208
3209 if (want != h->length) {
3210 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
3211 return FAILED;
3212 }
3213 if (want == 0)
3214 return DONE;
3215 if (drbd_recv(mdev, buffer, want) != want)
3216 return FAILED;
3217
3218 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3219
3220 c->word_offset += num_words;
3221 c->bit_offset = c->word_offset * BITS_PER_LONG;
3222 if (c->bit_offset > c->bm_bits)
3223 c->bit_offset = c->bm_bits;
3224
3225 return OK;
3226}
3227
3228static enum receive_bitmap_ret
3229recv_bm_rle_bits(struct drbd_conf *mdev,
3230 struct p_compressed_bm *p,
3231 struct bm_xfer_ctx *c)
3232{
3233 struct bitstream bs;
3234 u64 look_ahead;
3235 u64 rl;
3236 u64 tmp;
3237 unsigned long s = c->bit_offset;
3238 unsigned long e;
3239 int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3240 int toggle = DCBP_get_start(p);
3241 int have;
3242 int bits;
3243
3244 bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3245
3246 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3247 if (bits < 0)
3248 return FAILED;
3249
3250 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3251 bits = vli_decode_bits(&rl, look_ahead);
3252 if (bits <= 0)
3253 return FAILED;
3254
3255 if (toggle) {
3256 e = s + rl -1;
3257 if (e >= c->bm_bits) {
3258 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3259 return FAILED;
3260 }
3261 _drbd_bm_set_bits(mdev, s, e);
3262 }
3263
3264 if (have < bits) {
3265 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3266 have, bits, look_ahead,
3267 (unsigned int)(bs.cur.b - p->code),
3268 (unsigned int)bs.buf_len);
3269 return FAILED;
3270 }
3271 look_ahead >>= bits;
3272 have -= bits;
3273
3274 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3275 if (bits < 0)
3276 return FAILED;
3277 look_ahead |= tmp << have;
3278 have += bits;
3279 }
3280
3281 c->bit_offset = s;
3282 bm_xfer_ctx_bit_to_word_offset(c);
3283
3284 return (s == c->bm_bits) ? DONE : OK;
3285}
3286
3287static enum receive_bitmap_ret
3288decode_bitmap_c(struct drbd_conf *mdev,
3289 struct p_compressed_bm *p,
3290 struct bm_xfer_ctx *c)
3291{
3292 if (DCBP_get_code(p) == RLE_VLI_Bits)
3293 return recv_bm_rle_bits(mdev, p, c);
3294
3295 /* other variants had been implemented for evaluation,
3296 * but have been dropped as this one turned out to be "best"
3297 * during all our tests. */
3298
3299 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3300 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3301 return FAILED;
3302}
3303
3304void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3305 const char *direction, struct bm_xfer_ctx *c)
3306{
3307 /* what would it take to transfer it "plaintext" */
3308 unsigned plain = sizeof(struct p_header) *
3309 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3310 + c->bm_words * sizeof(long);
3311 unsigned total = c->bytes[0] + c->bytes[1];
3312 unsigned r;
3313
3314 /* total can not be zero. but just in case: */
3315 if (total == 0)
3316 return;
3317
3318 /* don't report if not compressed */
3319 if (total >= plain)
3320 return;
3321
3322 /* total < plain. check for overflow, still */
3323 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3324 : (1000 * total / plain);
3325
3326 if (r > 1000)
3327 r = 1000;
3328
3329 r = 1000 - r;
3330 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3331 "total %u; compression: %u.%u%%\n",
3332 direction,
3333 c->bytes[1], c->packets[1],
3334 c->bytes[0], c->packets[0],
3335 total, r/10, r % 10);
3336}
3337
3338/* Since we are processing the bitfield from lower addresses to higher,
3339 it does not matter if the process it in 32 bit chunks or 64 bit
3340 chunks as long as it is little endian. (Understand it as byte stream,
3341 beginning with the lowest byte...) If we would use big endian
3342 we would need to process it from the highest address to the lowest,
3343 in order to be agnostic to the 32 vs 64 bits issue.
3344
3345 returns 0 on failure, 1 if we successfully received it. */
3346static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3347{
3348 struct bm_xfer_ctx c;
3349 void *buffer;
3350 enum receive_bitmap_ret ret;
3351 int ok = FALSE;
3352
3353 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3354
3355 drbd_bm_lock(mdev, "receive bitmap");
3356
3357 /* maybe we should use some per thread scratch page,
3358 * and allocate that during initial device creation? */
3359 buffer = (unsigned long *) __get_free_page(GFP_NOIO);
3360 if (!buffer) {
3361 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3362 goto out;
3363 }
3364
3365 c = (struct bm_xfer_ctx) {
3366 .bm_bits = drbd_bm_bits(mdev),
3367 .bm_words = drbd_bm_words(mdev),
3368 };
3369
3370 do {
3371 if (h->command == P_BITMAP) {
3372 ret = receive_bitmap_plain(mdev, h, buffer, &c);
3373 } else if (h->command == P_COMPRESSED_BITMAP) {
3374 /* MAYBE: sanity check that we speak proto >= 90,
3375 * and the feature is enabled! */
3376 struct p_compressed_bm *p;
3377
3378 if (h->length > BM_PACKET_PAYLOAD_BYTES) {
3379 dev_err(DEV, "ReportCBitmap packet too large\n");
3380 goto out;
3381 }
3382 /* use the page buff */
3383 p = buffer;
3384 memcpy(p, h, sizeof(*h));
3385 if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
3386 goto out;
3387 if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3388 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3389 return FAILED;
3390 }
3391 ret = decode_bitmap_c(mdev, p, &c);
3392 } else {
3393 dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
3394 goto out;
3395 }
3396
3397 c.packets[h->command == P_BITMAP]++;
3398 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
3399
3400 if (ret != OK)
3401 break;
3402
3403 if (!drbd_recv_header(mdev, h))
3404 goto out;
3405 } while (ret == OK);
3406 if (ret == FAILED)
3407 goto out;
3408
3409 INFO_bm_xfer_stats(mdev, "receive", &c);
3410
3411 if (mdev->state.conn == C_WF_BITMAP_T) {
3412 ok = !drbd_send_bitmap(mdev);
3413 if (!ok)
3414 goto out;
3415 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3416 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3417 D_ASSERT(ok == SS_SUCCESS);
3418 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3419 /* admin may have requested C_DISCONNECTING,
3420 * other threads may have noticed network errors */
3421 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3422 drbd_conn_str(mdev->state.conn));
3423 }
3424
3425 ok = TRUE;
3426 out:
3427 drbd_bm_unlock(mdev);
3428 if (ok && mdev->state.conn == C_WF_BITMAP_S)
3429 drbd_start_resync(mdev, C_SYNC_SOURCE);
3430 free_page((unsigned long) buffer);
3431 return ok;
3432}
3433
3434static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
3435{
3436 /* TODO zero copy sink :) */
3437 static char sink[128];
3438 int size, want, r;
3439
3440 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3441 h->command, h->length);
3442
3443 size = h->length;
3444 while (size > 0) {
3445 want = min_t(int, size, sizeof(sink));
3446 r = drbd_recv(mdev, sink, want);
3447 ERR_IF(r <= 0) break;
3448 size -= r;
3449 }
3450 return size == 0;
3451}
3452
3453static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3454{
3455 if (mdev->state.disk >= D_INCONSISTENT)
3456 drbd_kick_lo(mdev);
3457
3458 /* Make sure we've acked all the TCP data associated
3459 * with the data requests being unplugged */
3460 drbd_tcp_quickack(mdev->data.socket);
3461
3462 return TRUE;
3463}
3464
3465typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
3466
3467static drbd_cmd_handler_f drbd_default_handler[] = {
3468 [P_DATA] = receive_Data,
3469 [P_DATA_REPLY] = receive_DataReply,
3470 [P_RS_DATA_REPLY] = receive_RSDataReply,
3471 [P_BARRIER] = receive_Barrier,
3472 [P_BITMAP] = receive_bitmap,
3473 [P_COMPRESSED_BITMAP] = receive_bitmap,
3474 [P_UNPLUG_REMOTE] = receive_UnplugRemote,
3475 [P_DATA_REQUEST] = receive_DataRequest,
3476 [P_RS_DATA_REQUEST] = receive_DataRequest,
3477 [P_SYNC_PARAM] = receive_SyncParam,
3478 [P_SYNC_PARAM89] = receive_SyncParam,
3479 [P_PROTOCOL] = receive_protocol,
3480 [P_UUIDS] = receive_uuids,
3481 [P_SIZES] = receive_sizes,
3482 [P_STATE] = receive_state,
3483 [P_STATE_CHG_REQ] = receive_req_state,
3484 [P_SYNC_UUID] = receive_sync_uuid,
3485 [P_OV_REQUEST] = receive_DataRequest,
3486 [P_OV_REPLY] = receive_DataRequest,
3487 [P_CSUM_RS_REQUEST] = receive_DataRequest,
3488 /* anything missing from this table is in
3489 * the asender_tbl, see get_asender_cmd */
3490 [P_MAX_CMD] = NULL,
3491};
3492
3493static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
3494static drbd_cmd_handler_f *drbd_opt_cmd_handler;
3495
3496static void drbdd(struct drbd_conf *mdev)
3497{
3498 drbd_cmd_handler_f handler;
3499 struct p_header *header = &mdev->data.rbuf.header;
3500
3501 while (get_t_state(&mdev->receiver) == Running) {
3502 drbd_thread_current_set_cpu(mdev);
3503 if (!drbd_recv_header(mdev, header)) {
3504 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3505 break;
3506 }
3507
3508 if (header->command < P_MAX_CMD)
3509 handler = drbd_cmd_handler[header->command];
3510 else if (P_MAY_IGNORE < header->command
3511 && header->command < P_MAX_OPT_CMD)
3512 handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3513 else if (header->command > P_MAX_OPT_CMD)
3514 handler = receive_skip;
3515 else
3516 handler = NULL;
3517
3518 if (unlikely(!handler)) {
3519 dev_err(DEV, "unknown packet type %d, l: %d!\n",
3520 header->command, header->length);
3521 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3522 break;
3523 }
3524 if (unlikely(!handler(mdev, header))) {
3525 dev_err(DEV, "error receiving %s, l: %d!\n",
3526 cmdname(header->command), header->length);
3527 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3528 break;
3529 }
3530 }
3531}
3532
3533static void drbd_fail_pending_reads(struct drbd_conf *mdev)
3534{
3535 struct hlist_head *slot;
3536 struct hlist_node *pos;
3537 struct hlist_node *tmp;
3538 struct drbd_request *req;
3539 int i;
3540
3541 /*
3542 * Application READ requests
3543 */
3544 spin_lock_irq(&mdev->req_lock);
3545 for (i = 0; i < APP_R_HSIZE; i++) {
3546 slot = mdev->app_reads_hash+i;
3547 hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
3548 /* it may (but should not any longer!)
3549 * be on the work queue; if that assert triggers,
3550 * we need to also grab the
3551 * spin_lock_irq(&mdev->data.work.q_lock);
3552 * and list_del_init here. */
3553 D_ASSERT(list_empty(&req->w.list));
3554 /* It would be nice to complete outside of spinlock.
3555 * But this is easier for now. */
3556 _req_mod(req, connection_lost_while_pending);
3557 }
3558 }
3559 for (i = 0; i < APP_R_HSIZE; i++)
3560 if (!hlist_empty(mdev->app_reads_hash+i))
3561 dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
3562 "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
3563
3564 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
3565 spin_unlock_irq(&mdev->req_lock);
3566}
3567
3568void drbd_flush_workqueue(struct drbd_conf *mdev)
3569{
3570 struct drbd_wq_barrier barr;
3571
3572 barr.w.cb = w_prev_work_done;
3573 init_completion(&barr.done);
3574 drbd_queue_work(&mdev->data.work, &barr.w);
3575 wait_for_completion(&barr.done);
3576}
3577
/*
 * drbd_disconnect() - tear down the current connection of @mdev
 *
 * Called from drbdd_init(), both after a failed connection attempt and
 * after the receiver main loop has ended.
 *
 * Steps, in order: stop the asender, free the sockets, wait for the
 * active/sync/read epoch entry lists to drain, cancel resync bookkeeping,
 * flush the worker queue, process done_ee, drop peer state, and finally
 * move the connection state to C_UNCONNECTED.  If the state was
 * C_DISCONNECTING, the hash tables and the network configuration are
 * freed as well and C_STANDALONE is requested.
 *
 * Returns nothing; does nothing at all if the device is already
 * C_STANDALONE.
 */
static void drbd_disconnect(struct drbd_conf *mdev)
{
	enum drbd_fencing_p fp;
	union drbd_state os, ns;
	/* NOTE(review): rv is assigned below but never read in this function */
	int rv = SS_UNKNOWN_ERROR;
	unsigned int i;

	if (mdev->state.conn == C_STANDALONE)
		return;
	/* only logs; execution continues even if the "assertion" fails */
	if (mdev->state.conn >= C_WF_CONNECTION)
		dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
				drbd_conn_str(mdev->state.conn));

	/* asender does not clean up anything. it must not interfere, either */
	drbd_thread_stop(&mdev->asender);

	/* sockets are released under the data mutex */
	mutex_lock(&mdev->data.mutex);
	drbd_free_sock(mdev);
	mutex_unlock(&mdev->data.mutex);

	/* wait, with req_lock held on entry, until the three lists are empty */
	spin_lock_irq(&mdev->req_lock);
	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
	_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	/* We do not have data structures that would allow us to
	 * get the rs_pending_cnt down to 0 again.
	 *  * On C_SYNC_TARGET we do not have any data structures describing
	 *    the pending RSDataRequest's we have sent.
	 *  * On C_SYNC_SOURCE there is no data structure that tracks
	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
	 *  And no, it is not the sum of the reference counts in the
	 *  resync_LRU. The resync_LRU tracks the whole operation including
	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
	 *  on the fly. */
	drbd_rs_cancel_all(mdev);
	mdev->rs_total = 0;
	mdev->rs_failed = 0;
	atomic_set(&mdev->rs_pending_cnt, 0);
	wake_up(&mdev->misc_wait);

	/* make sure syncer is stopped and w_resume_next_sg queued */
	del_timer_sync(&mdev->resync_timer);
	set_bit(STOP_SYNC_TIMER, &mdev->flags);
	/* the timer callback is invoked directly, with the flag set above */
	resync_timer_fn((unsigned long)mdev);

	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
	 * w_make_resync_request etc. which may still be on the worker queue
	 * to be "canceled" */
	drbd_flush_workqueue(mdev);

	/* This also does reclaim_net_ee().  If we do this too early, we might
	 * miss some resync ee and pages.*/
	drbd_process_done_ee(mdev);

	/* forget the peer's UUIDs */
	kfree(mdev->p_uuid);
	mdev->p_uuid = NULL;

	/* the transfer log is only cleared when IO is not suspended */
	if (!mdev->state.susp)
		tl_clear(mdev);

	drbd_fail_pending_reads(mdev);

	dev_info(DEV, "Connection closed\n");

	drbd_md_sync(mdev);

	/* read the fencing policy from the local disk config, if attached;
	 * otherwise fall back to FP_DONT_CARE */
	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* as primary with a fencing policy of at least FP_RESOURCE, try to
	 * outdate the peer and record the resulting peer disk state */
	if (mdev->state.role == R_PRIMARY) {
		if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
			enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
			drbd_request_state(mdev, NS(pdsk, nps));
		}
	}

	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;
	if (os.conn >= C_UNCONNECTED) {
		/* Do not restart in case we are C_DISCONNECTING */
		ns = os;
		ns.conn = C_UNCONNECTED;
		rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	}
	spin_unlock_irq(&mdev->req_lock);

	/* os is the snapshot taken above, before any state change here */
	if (os.conn == C_DISCONNECTING) {
		struct hlist_head *h;
		/* wait until nobody holds a reference on net_conf anymore */
		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);

		/* we must not free the tl_hash
		 * while application io is still on the fly */
		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);

		spin_lock_irq(&mdev->req_lock);
		/* paranoia code */
		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
			if (h->first)
				dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
						(int)(h - mdev->ee_hash), h->first);
		kfree(mdev->ee_hash);
		mdev->ee_hash = NULL;
		mdev->ee_hash_s = 0;

		/* paranoia code */
		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
			if (h->first)
				dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
						(int)(h - mdev->tl_hash), h->first);
		kfree(mdev->tl_hash);
		mdev->tl_hash = NULL;
		mdev->tl_hash_s = 0;
		spin_unlock_irq(&mdev->req_lock);

		/* release the HMAC transform used by drbd_do_auth() */
		crypto_free_hash(mdev->cram_hmac_tfm);
		mdev->cram_hmac_tfm = NULL;

		kfree(mdev->net_conf);
		mdev->net_conf = NULL;
		drbd_request_state(mdev, NS(conn, C_STANDALONE));
	}

	/* tcp_close and release of sendpage pages can be deferred.  I don't
	 * want to use SO_LINGER, because apparently it can be deferred for
	 * more than 20 seconds (longest time I checked).
	 *
	 * Actually we don't care for exactly when the network stack does its
	 * put_page(), but release our reference on these pages right here.
	 */
	i = drbd_release_ee(mdev, &mdev->net_ee);
	if (i)
		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
	i = atomic_read(&mdev->pp_in_use);
	if (i)
		dev_info(DEV, "pp_in_use = %u, expected 0\n", i);

	/* all epoch entry lists are expected to be empty by now */
	D_ASSERT(list_empty(&mdev->read_ee));
	D_ASSERT(list_empty(&mdev->active_ee));
	D_ASSERT(list_empty(&mdev->sync_ee));
	D_ASSERT(list_empty(&mdev->done_ee));

	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
	atomic_set(&mdev->current_epoch->epoch_size, 0);
	D_ASSERT(list_empty(&mdev->current_epoch->list));
}
3728
3729/*
3730 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3731 * we can agree on is stored in agreed_pro_version.
3732 *
3733 * feature flags and the reserved array should be enough room for future
3734 * enhancements of the handshake protocol, and possible plugins...
3735 *
3736 * for now, they are expected to be zero, but ignored.
3737 */
3738static int drbd_send_handshake(struct drbd_conf *mdev)
3739{
3740 /* ASSERT current == mdev->receiver ... */
3741 struct p_handshake *p = &mdev->data.sbuf.handshake;
3742 int ok;
3743
3744 if (mutex_lock_interruptible(&mdev->data.mutex)) {
3745 dev_err(DEV, "interrupted during initial handshake\n");
3746 return 0; /* interrupted. not ok. */
3747 }
3748
3749 if (mdev->data.socket == NULL) {
3750 mutex_unlock(&mdev->data.mutex);
3751 return 0;
3752 }
3753
3754 memset(p, 0, sizeof(*p));
3755 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3756 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3757 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3758 (struct p_header *)p, sizeof(*p), 0 );
3759 mutex_unlock(&mdev->data.mutex);
3760 return ok;
3761}
3762
3763/*
3764 * return values:
3765 * 1 yes, we have a valid connection
3766 * 0 oops, did not work out, please try again
3767 * -1 peer talks different language,
3768 * no point in trying again, please go standalone.
3769 */
3770static int drbd_do_handshake(struct drbd_conf *mdev)
3771{
3772 /* ASSERT current == mdev->receiver ... */
3773 struct p_handshake *p = &mdev->data.rbuf.handshake;
3774 const int expect = sizeof(struct p_handshake)
3775 -sizeof(struct p_header);
3776 int rv;
3777
3778 rv = drbd_send_handshake(mdev);
3779 if (!rv)
3780 return 0;
3781
3782 rv = drbd_recv_header(mdev, &p->head);
3783 if (!rv)
3784 return 0;
3785
3786 if (p->head.command != P_HAND_SHAKE) {
3787 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3788 cmdname(p->head.command), p->head.command);
3789 return -1;
3790 }
3791
3792 if (p->head.length != expect) {
3793 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3794 expect, p->head.length);
3795 return -1;
3796 }
3797
3798 rv = drbd_recv(mdev, &p->head.payload, expect);
3799
3800 if (rv != expect) {
3801 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3802 return 0;
3803 }
3804
3805 p->protocol_min = be32_to_cpu(p->protocol_min);
3806 p->protocol_max = be32_to_cpu(p->protocol_max);
3807 if (p->protocol_max == 0)
3808 p->protocol_max = p->protocol_min;
3809
3810 if (PRO_VERSION_MAX < p->protocol_min ||
3811 PRO_VERSION_MIN > p->protocol_max)
3812 goto incompat;
3813
3814 mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3815
3816 dev_info(DEV, "Handshake successful: "
3817 "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3818
3819 return 1;
3820
3821 incompat:
3822 dev_err(DEV, "incompatible DRBD dialects: "
3823 "I support %d-%d, peer supports %d-%d\n",
3824 PRO_VERSION_MIN, PRO_VERSION_MAX,
3825 p->protocol_min, p->protocol_max);
3826 return -1;
3827}
3828
3829#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3830static int drbd_do_auth(struct drbd_conf *mdev)
3831{
3832 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
3833 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
3834 return 0;
3835}
3836#else
3837#define CHALLENGE_LEN 64
3838static int drbd_do_auth(struct drbd_conf *mdev)
3839{
3840 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
3841 struct scatterlist sg;
3842 char *response = NULL;
3843 char *right_response = NULL;
3844 char *peers_ch = NULL;
3845 struct p_header p;
3846 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
3847 unsigned int resp_size;
3848 struct hash_desc desc;
3849 int rv;
3850
3851 desc.tfm = mdev->cram_hmac_tfm;
3852 desc.flags = 0;
3853
3854 rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
3855 (u8 *)mdev->net_conf->shared_secret, key_len);
3856 if (rv) {
3857 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
3858 rv = 0;
3859 goto fail;
3860 }
3861
3862 get_random_bytes(my_challenge, CHALLENGE_LEN);
3863
3864 rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
3865 if (!rv)
3866 goto fail;
3867
3868 rv = drbd_recv_header(mdev, &p);
3869 if (!rv)
3870 goto fail;
3871
3872 if (p.command != P_AUTH_CHALLENGE) {
3873 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
3874 cmdname(p.command), p.command);
3875 rv = 0;
3876 goto fail;
3877 }
3878
3879 if (p.length > CHALLENGE_LEN*2) {
3880 dev_err(DEV, "expected AuthChallenge payload too big.\n");
3881 rv = 0;
3882 goto fail;
3883 }
3884
3885 peers_ch = kmalloc(p.length, GFP_NOIO);
3886 if (peers_ch == NULL) {
3887 dev_err(DEV, "kmalloc of peers_ch failed\n");
3888 rv = 0;
3889 goto fail;
3890 }
3891
3892 rv = drbd_recv(mdev, peers_ch, p.length);
3893
3894 if (rv != p.length) {
3895 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
3896 rv = 0;
3897 goto fail;
3898 }
3899
3900 resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
3901 response = kmalloc(resp_size, GFP_NOIO);
3902 if (response == NULL) {
3903 dev_err(DEV, "kmalloc of response failed\n");
3904 rv = 0;
3905 goto fail;
3906 }
3907
3908 sg_init_table(&sg, 1);
3909 sg_set_buf(&sg, peers_ch, p.length);
3910
3911 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
3912 if (rv) {
3913 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3914 rv = 0;
3915 goto fail;
3916 }
3917
3918 rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
3919 if (!rv)
3920 goto fail;
3921
3922 rv = drbd_recv_header(mdev, &p);
3923 if (!rv)
3924 goto fail;
3925
3926 if (p.command != P_AUTH_RESPONSE) {
3927 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
3928 cmdname(p.command), p.command);
3929 rv = 0;
3930 goto fail;
3931 }
3932
3933 if (p.length != resp_size) {
3934 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
3935 rv = 0;
3936 goto fail;
3937 }
3938
3939 rv = drbd_recv(mdev, response , resp_size);
3940
3941 if (rv != resp_size) {
3942 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
3943 rv = 0;
3944 goto fail;
3945 }
3946
3947 right_response = kmalloc(resp_size, GFP_NOIO);
3948 if (response == NULL) {
3949 dev_err(DEV, "kmalloc of right_response failed\n");
3950 rv = 0;
3951 goto fail;
3952 }
3953
3954 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
3955
3956 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
3957 if (rv) {
3958 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
3959 rv = 0;
3960 goto fail;
3961 }
3962
3963 rv = !memcmp(response, right_response, resp_size);
3964
3965 if (rv)
3966 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
3967 resp_size, mdev->net_conf->cram_hmac_alg);
3968
3969 fail:
3970 kfree(peers_ch);
3971 kfree(response);
3972 kfree(right_response);
3973
3974 return rv;
3975}
3976#endif
3977
3978int drbdd_init(struct drbd_thread *thi)
3979{
3980 struct drbd_conf *mdev = thi->mdev;
3981 unsigned int minor = mdev_to_minor(mdev);
3982 int h;
3983
3984 sprintf(current->comm, "drbd%d_receiver", minor);
3985
3986 dev_info(DEV, "receiver (re)started\n");
3987
3988 do {
3989 h = drbd_connect(mdev);
3990 if (h == 0) {
3991 drbd_disconnect(mdev);
3992 __set_current_state(TASK_INTERRUPTIBLE);
3993 schedule_timeout(HZ);
3994 }
3995 if (h == -1) {
3996 dev_warn(DEV, "Discarding network configuration.\n");
3997 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3998 }
3999 } while (h == 0);
4000
4001 if (h > 0) {
4002 if (get_net_conf(mdev)) {
4003 drbdd(mdev);
4004 put_net_conf(mdev);
4005 }
4006 }
4007
4008 drbd_disconnect(mdev);
4009
4010 dev_info(DEV, "receiver terminated\n");
4011 return 0;
4012}
4013
4014/* ********* acknowledge sender ******** */
4015
4016static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4017{
4018 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4019
4020 int retcode = be32_to_cpu(p->retcode);
4021
4022 if (retcode >= SS_SUCCESS) {
4023 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4024 } else {
4025 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4026 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4027 drbd_set_st_err_str(retcode), retcode);
4028 }
4029 wake_up(&mdev->state_wait);
4030
4031 return TRUE;
4032}
4033
/* Handle P_PING: answer with a ping ack and return the send result. */
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
{
	int sent = drbd_send_ping_ack(mdev);

	return sent;
}
4039
4040static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
4041{
4042 /* restore idle timeout */
4043 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4044
4045 return TRUE;
4046}
4047
4048static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4049{
4050 struct p_block_ack *p = (struct p_block_ack *)h;
4051 sector_t sector = be64_to_cpu(p->sector);
4052 int blksize = be32_to_cpu(p->blksize);
4053
4054 D_ASSERT(mdev->agreed_pro_version >= 89);
4055
4056 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4057
4058 drbd_rs_complete_io(mdev, sector);
4059 drbd_set_in_sync(mdev, sector, blksize);
4060 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4061 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4062 dec_rs_pending(mdev);
4063
4064 return TRUE;
4065}
4066
4067/* when we receive the ACK for a write request,
4068 * verify that we actually know about it */
4069static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4070 u64 id, sector_t sector)
4071{
4072 struct hlist_head *slot = tl_hash_slot(mdev, sector);
4073 struct hlist_node *n;
4074 struct drbd_request *req;
4075
4076 hlist_for_each_entry(req, n, slot, colision) {
4077 if ((unsigned long)req == (unsigned long)id) {
4078 if (req->sector != sector) {
4079 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4080 "wrong sector (%llus versus %llus)\n", req,
4081 (unsigned long long)req->sector,
4082 (unsigned long long)sector);
4083 break;
4084 }
4085 return req;
4086 }
4087 }
4088 dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4089 (void *)(unsigned long)id, (unsigned long long)sector);
4090 return NULL;
4091}
4092
4093typedef struct drbd_request *(req_validator_fn)
4094 (struct drbd_conf *mdev, u64 id, sector_t sector);
4095
4096static int validate_req_change_req_state(struct drbd_conf *mdev,
4097 u64 id, sector_t sector, req_validator_fn validator,
4098 const char *func, enum drbd_req_event what)
4099{
4100 struct drbd_request *req;
4101 struct bio_and_error m;
4102
4103 spin_lock_irq(&mdev->req_lock);
4104 req = validator(mdev, id, sector);
4105 if (unlikely(!req)) {
4106 spin_unlock_irq(&mdev->req_lock);
4107 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4108 return FALSE;
4109 }
4110 __req_mod(req, what, &m);
4111 spin_unlock_irq(&mdev->req_lock);
4112
4113 if (m.bio)
4114 complete_master_bio(mdev, &m);
4115 return TRUE;
4116}
4117
4118static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4119{
4120 struct p_block_ack *p = (struct p_block_ack *)h;
4121 sector_t sector = be64_to_cpu(p->sector);
4122 int blksize = be32_to_cpu(p->blksize);
4123 enum drbd_req_event what;
4124
4125 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4126
4127 if (is_syncer_block_id(p->block_id)) {
4128 drbd_set_in_sync(mdev, sector, blksize);
4129 dec_rs_pending(mdev);
4130 return TRUE;
4131 }
4132 switch (be16_to_cpu(h->command)) {
4133 case P_RS_WRITE_ACK:
4134 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4135 what = write_acked_by_peer_and_sis;
4136 break;
4137 case P_WRITE_ACK:
4138 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4139 what = write_acked_by_peer;
4140 break;
4141 case P_RECV_ACK:
4142 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4143 what = recv_acked_by_peer;
4144 break;
4145 case P_DISCARD_ACK:
4146 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4147 what = conflict_discarded_by_peer;
4148 break;
4149 default:
4150 D_ASSERT(0);
4151 return FALSE;
4152 }
4153
4154 return validate_req_change_req_state(mdev, p->block_id, sector,
4155 _ack_id_to_req, __func__ , what);
4156}
4157
4158static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
4159{
4160 struct p_block_ack *p = (struct p_block_ack *)h;
4161 sector_t sector = be64_to_cpu(p->sector);
4162
4163 if (__ratelimit(&drbd_ratelimit_state))
4164 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
4165
4166 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4167
4168 if (is_syncer_block_id(p->block_id)) {
4169 int size = be32_to_cpu(p->blksize);
4170 dec_rs_pending(mdev);
4171 drbd_rs_failed_io(mdev, sector, size);
4172 return TRUE;
4173 }
4174 return validate_req_change_req_state(mdev, p->block_id, sector,
4175 _ack_id_to_req, __func__ , neg_acked);
4176}
4177
4178static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4179{
4180 struct p_block_ack *p = (struct p_block_ack *)h;
4181 sector_t sector = be64_to_cpu(p->sector);
4182
4183 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4184 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4185 (unsigned long long)sector, be32_to_cpu(p->blksize));
4186
4187 return validate_req_change_req_state(mdev, p->block_id, sector,
4188 _ar_id_to_req, __func__ , neg_acked);
4189}
4190
4191static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4192{
4193 sector_t sector;
4194 int size;
4195 struct p_block_ack *p = (struct p_block_ack *)h;
4196
4197 sector = be64_to_cpu(p->sector);
4198 size = be32_to_cpu(p->blksize);
4199 D_ASSERT(p->block_id == ID_SYNCER);
4200
4201 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4202
4203 dec_rs_pending(mdev);
4204
4205 if (get_ldev_if_state(mdev, D_FAILED)) {
4206 drbd_rs_complete_io(mdev, sector);
4207 drbd_rs_failed_io(mdev, sector, size);
4208 put_ldev(mdev);
4209 }
4210
4211 return TRUE;
4212}
4213
4214static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
4215{
4216 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4217
4218 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4219
4220 return TRUE;
4221}
4222
4223static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4224{
4225 struct p_block_ack *p = (struct p_block_ack *)h;
4226 struct drbd_work *w;
4227 sector_t sector;
4228 int size;
4229
4230 sector = be64_to_cpu(p->sector);
4231 size = be32_to_cpu(p->blksize);
4232
4233 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4234
4235 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4236 drbd_ov_oos_found(mdev, sector, size);
4237 else
4238 ov_oos_print(mdev);
4239
4240 drbd_rs_complete_io(mdev, sector);
4241 dec_rs_pending(mdev);
4242
4243 if (--mdev->ov_left == 0) {
4244 w = kmalloc(sizeof(*w), GFP_NOIO);
4245 if (w) {
4246 w->cb = w_ov_finished;
4247 drbd_queue_work_front(&mdev->data.work, w);
4248 } else {
4249 dev_err(DEV, "kmalloc(w) failed.");
4250 ov_oos_print(mdev);
4251 drbd_resync_finished(mdev);
4252 }
4253 }
4254 return TRUE;
4255}
4256
4257struct asender_cmd {
4258 size_t pkt_size;
4259 int (*process)(struct drbd_conf *mdev, struct p_header *h);
4260};
4261
4262static struct asender_cmd *get_asender_cmd(int cmd)
4263{
4264 static struct asender_cmd asender_tbl[] = {
4265 /* anything missing from this table is in
4266 * the drbd_cmd_handler (drbd_default_handler) table,
4267 * see the beginning of drbdd() */
4268 [P_PING] = { sizeof(struct p_header), got_Ping },
4269 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
4270 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4271 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4272 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4273 [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4274 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4275 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4276 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
4277 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4278 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4279 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4280 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4281 [P_MAX_CMD] = { 0, NULL },
4282 };
4283 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4284 return NULL;
4285 return &asender_tbl[cmd];
4286}
4287
/*
 * drbd_asender() - thread function of the "asender" thread
 *
 * While the thread state is Running, each iteration
 *  - sends a ping if SEND_PING is set and shortens the meta socket's
 *    receive timeout to the ping timeout,
 *  - runs drbd_process_done_ee() until done_ee is empty,
 *  - receives (possibly partial) data from the meta socket, and
 *  - once a full packet has arrived, dispatches it through the
 *    get_asender_cmd() table.
 *
 * On errors the connection state is forced to C_NETWORK_FAILURE
 * ("reconnect") or, for an unknown command, to C_DISCONNECTING
 * ("disconnect").  Always returns 0.
 */
int drbd_asender(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct p_header *h = &mdev->meta.rbuf.header;
	struct asender_cmd *cmd = NULL;

	int rv, len;
	/* receive state: buf is the write position inside the packet buffer,
	 * received/expect count bytes of the packet currently assembled */
	void *buf     = h;
	int received  = 0;
	int expect    = sizeof(struct p_header);
	int empty;

	sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));

	current->policy = SCHED_RR;  /* Make this a realtime task! */
	current->rt_priority = 2;    /* more important than all other tasks */

	while (get_t_state(thi) == Running) {
		drbd_thread_current_set_cpu(mdev);
		if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
			ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
			/* while waiting for the PingAck, use the (shorter)
			 * ping timeout; ping_timeo is scaled by HZ/10 */
			mdev->meta.socket->sk->sk_rcvtimeo =
				mdev->net_conf->ping_timeo*HZ/10;
		}

		/* conditionally cork;
		 * it may hurt latency if we cork without much to send */
		if (!mdev->net_conf->no_cork &&
			3 < atomic_read(&mdev->unacked_cnt))
			drbd_tcp_cork(mdev->meta.socket);
		while (1) {
			clear_bit(SIGNAL_ASENDER, &mdev->flags);
			flush_signals(current);
			if (!drbd_process_done_ee(mdev)) {
				dev_err(DEV, "process_done_ee() = NOT_OK\n");
				goto reconnect;
			}
			/* to avoid race with newly queued ACKs */
			set_bit(SIGNAL_ASENDER, &mdev->flags);
			spin_lock_irq(&mdev->req_lock);
			empty = list_empty(&mdev->done_ee);
			spin_unlock_irq(&mdev->req_lock);
			/* new ack may have been queued right here,
			 * but then there is also a signal pending,
			 * and we start over... */
			if (empty)
				break;
		}
		/* but unconditionally uncork unless disabled */
		if (!mdev->net_conf->no_cork)
			drbd_tcp_uncork(mdev->meta.socket);

		/* short circuit, recv_msg would return EINTR anyways. */
		if (signal_pending(current))
			continue;

		/* ask only for the bytes still missing from the current packet */
		rv = drbd_recv_short(mdev, mdev->meta.socket,
				     buf, expect-received, 0);
		clear_bit(SIGNAL_ASENDER, &mdev->flags);

		flush_signals(current);

		/* Note:
		 * -EINTR	 (on meta) we got a signal
		 * -EAGAIN	 (on meta) rcvtimeo expired
		 * -ECONNRESET	 other side closed the connection
		 * -ERESTARTSYS  (on data) we got a signal
		 * rv <  0	 other than above: unexpected error!
		 * rv == expected: full header or command
		 * rv <  expected: "woken" by signal during receive
		 * rv == 0	 : "connection shut down by peer"
		 */
		if (likely(rv > 0)) {
			received += rv;
			buf	 += rv;
		} else if (rv == 0) {
			dev_err(DEV, "meta connection shut down by peer.\n");
			goto reconnect;
		} else if (rv == -EAGAIN) {
			/* a timeout while the ping timeout was in effect means
			 * the PingAck is overdue; a timeout with any other
			 * value just triggers the next ping */
			if (mdev->meta.socket->sk->sk_rcvtimeo ==
			    mdev->net_conf->ping_timeo*HZ/10) {
				dev_err(DEV, "PingAck did not arrive in time.\n");
				goto reconnect;
			}
			set_bit(SEND_PING, &mdev->flags);
			continue;
		} else if (rv == -EINTR) {
			continue;
		} else {
			dev_err(DEV, "sock_recvmsg returned %d\n", rv);
			goto reconnect;
		}

		/* header complete, command not yet looked up */
		if (received == expect && cmd == NULL) {
			/* NOTE(review): in both messages below, command and
			 * length are printed as received, without the
			 * be16_to_cpu() conversion used for the lookup */
			if (unlikely(h->magic != BE_DRBD_MAGIC)) {
				dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
				    (long)be32_to_cpu(h->magic),
				    h->command, h->length);
				goto reconnect;
			}
			cmd = get_asender_cmd(be16_to_cpu(h->command));
			len = be16_to_cpu(h->length);
			if (unlikely(cmd == NULL)) {
				dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
				    (long)be32_to_cpu(h->magic),
				    h->command, h->length);
				goto disconnect;
			}
			/* from now on, expect the full packet of this type;
			 * the announced payload length must match it */
			expect = cmd->pkt_size;
			ERR_IF(len != expect-sizeof(struct p_header))
				goto reconnect;
		}
		/* full packet complete (for header-only packets this is
		 * already true right after the lookup above) */
		if (received == expect) {
			D_ASSERT(cmd != NULL);
			if (!cmd->process(mdev, h))
				goto reconnect;

			/* reset the receive state for the next header */
			buf	 = h;
			received = 0;
			expect	 = sizeof(struct p_header);
			cmd	 = NULL;
		}
	}

	/* the two blocks below are only entered via their labels; a normal
	 * loop exit skips both, a jump to "reconnect" skips "disconnect" */
	if (0) {
reconnect:
		drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
	}
	if (0) {
disconnect:
		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}
	clear_bit(SIGNAL_ASENDER, &mdev->flags);

	D_ASSERT(mdev->state.conn < C_CONNECTED);
	dev_info(DEV, "asender terminated\n");

	return 0;
}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000000..de81ab7b4627
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1125 @@
1/*
2 drbd_req.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27
28#include <linux/slab.h>
29#include <linux/drbd.h>
30#include "drbd_int.h"
31#include "drbd_req.h"
32
33
34/* Update disk stats at start of I/O request */
35static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
36{
37 const int rw = bio_data_dir(bio);
38 int cpu;
39 cpu = part_stat_lock();
40 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
41 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
42 part_inc_in_flight(&mdev->vdisk->part0, rw);
43 part_stat_unlock();
44}
45
46/* Update disk stats when completing request upwards */
47static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
48{
49 int rw = bio_data_dir(req->master_bio);
50 unsigned long duration = jiffies - req->start_time;
51 int cpu;
52 cpu = part_stat_lock();
53 part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
54 part_round_stats(cpu, &mdev->vdisk->part0);
55 part_dec_in_flight(&mdev->vdisk->part0, rw);
56 part_stat_unlock();
57}
58
59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
60{
61 const unsigned long s = req->rq_state;
62 /* if it was a write, we may have to set the corresponding
63 * bit(s) out-of-sync first. If it had a local part, we need to
64 * release the reference to the activity log. */
65 if (rw == WRITE) {
66 /* remove it from the transfer log.
67 * well, only if it had been there in the first
68 * place... if it had not (local only or conflicting
69 * and never sent), it should still be "empty" as
70 * initialized in drbd_req_new(), so we can list_del() it
71 * here unconditionally */
72 list_del(&req->tl_requests);
73 /* Set out-of-sync unless both OK flags are set
74 * (local only or remote failed).
75 * Other places where we set out-of-sync:
76 * READ with local io-error */
77 if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
78 drbd_set_out_of_sync(mdev, req->sector, req->size);
79
80 if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
81 drbd_set_in_sync(mdev, req->sector, req->size);
82
83 /* one might be tempted to move the drbd_al_complete_io
84 * to the local io completion callback drbd_endio_pri.
85 * but, if this was a mirror write, we may only
86 * drbd_al_complete_io after this is RQ_NET_DONE,
87 * otherwise the extent could be dropped from the al
88 * before it has actually been written on the peer.
89 * if we crash before our peer knows about the request,
90 * but after the extent has been dropped from the al,
91 * we would forget to resync the corresponding extent.
92 */
93 if (s & RQ_LOCAL_MASK) {
94 if (get_ldev_if_state(mdev, D_FAILED)) {
95 drbd_al_complete_io(mdev, req->sector);
96 put_ldev(mdev);
97 } else if (__ratelimit(&drbd_ratelimit_state)) {
98 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
99 "but my Disk seems to have failed :(\n",
100 (unsigned long long) req->sector);
101 }
102 }
103 }
104
105 /* if it was a local io error, we want to notify our
106 * peer about that, and see if we need to
107 * detach the disk and stuff.
108 * to avoid allocating some special work
109 * struct, reuse the request. */
110
111 /* THINK
112 * why do we do this not when we detect the error,
113 * but delay it until it is "done", i.e. possibly
114 * until the next barrier ack? */
115
116 if (rw == WRITE &&
117 ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
118 if (!(req->w.list.next == LIST_POISON1 ||
119 list_empty(&req->w.list))) {
120 /* DEBUG ASSERT only; if this triggers, we
121 * probably corrupt the worker list here */
122 dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
123 dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
124 }
125 req->w.cb = w_io_error;
126 drbd_queue_work(&mdev->data.work, &req->w);
127 /* drbd_req_free() is done in w_io_error */
128 } else {
129 drbd_req_free(req);
130 }
131}
132
133static void queue_barrier(struct drbd_conf *mdev)
134{
135 struct drbd_tl_epoch *b;
136
137 /* We are within the req_lock. Once we queued the barrier for sending,
138 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
139 * barrier/epoch object is added. This is the only place this bit is
140 * set. It indicates that the barrier for this epoch is already queued,
141 * and no new epoch has been created yet. */
142 if (test_bit(CREATE_BARRIER, &mdev->flags))
143 return;
144
145 b = mdev->newest_tle;
146 b->w.cb = w_send_barrier;
147 /* inc_ap_pending done here, so we won't
148 * get imbalanced on connection loss.
149 * dec_ap_pending will be done in got_BarrierAck
150 * or (on connection loss) in tl_clear. */
151 inc_ap_pending(mdev);
152 drbd_queue_work(&mdev->data.work, &b->w);
153 set_bit(CREATE_BARRIER, &mdev->flags);
154}
155
156static void _about_to_complete_local_write(struct drbd_conf *mdev,
157 struct drbd_request *req)
158{
159 const unsigned long s = req->rq_state;
160 struct drbd_request *i;
161 struct drbd_epoch_entry *e;
162 struct hlist_node *n;
163 struct hlist_head *slot;
164
165 /* before we can signal completion to the upper layers,
166 * we may need to close the current epoch */
167 if (mdev->state.conn >= C_CONNECTED &&
168 req->epoch == mdev->newest_tle->br_number)
169 queue_barrier(mdev);
170
171 /* we need to do the conflict detection stuff,
172 * if we have the ee_hash (two_primaries) and
173 * this has been on the network */
174 if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
175 const sector_t sector = req->sector;
176 const int size = req->size;
177
178 /* ASSERT:
179 * there must be no conflicting requests, since
180 * they must have been failed on the spot */
181#define OVERLAPS overlaps(sector, size, i->sector, i->size)
182 slot = tl_hash_slot(mdev, sector);
183 hlist_for_each_entry(i, n, slot, colision) {
184 if (OVERLAPS) {
185 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
186 "other: %p %llus +%u\n",
187 req, (unsigned long long)sector, size,
188 i, (unsigned long long)i->sector, i->size);
189 }
190 }
191
192 /* maybe "wake" those conflicting epoch entries
193 * that wait for this request to finish.
194 *
195 * currently, there can be only _one_ such ee
196 * (well, or some more, which would be pending
197 * P_DISCARD_ACK not yet sent by the asender...),
198 * since we block the receiver thread upon the
199 * first conflict detection, which will wait on
200 * misc_wait. maybe we want to assert that?
201 *
202 * anyways, if we found one,
203 * we just have to do a wake_up. */
204#undef OVERLAPS
205#define OVERLAPS overlaps(sector, size, e->sector, e->size)
206 slot = ee_hash_slot(mdev, req->sector);
207 hlist_for_each_entry(e, n, slot, colision) {
208 if (OVERLAPS) {
209 wake_up(&mdev->misc_wait);
210 break;
211 }
212 }
213 }
214#undef OVERLAPS
215}
216
217void complete_master_bio(struct drbd_conf *mdev,
218 struct bio_and_error *m)
219{
220 bio_endio(m->bio, m->error);
221 dec_ap_bio(mdev);
222}
223
224/* Helper for __req_mod().
225 * Set m->bio to the master bio, if it is fit to be completed,
226 * or leave it alone (it is initialized to NULL in __req_mod),
227 * if it has already been completed, or cannot be completed yet.
228 * If m->bio is set, the error status to be returned is placed in m->error.
229 */
230void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
231{
232 const unsigned long s = req->rq_state;
233 struct drbd_conf *mdev = req->mdev;
234 /* only WRITES may end up here without a master bio (on barrier ack) */
235 int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
236
237 /* we must not complete the master bio, while it is
238 * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
239 * not yet acknowledged by the peer
240 * not yet completed by the local io subsystem
241 * these flags may get cleared in any order by
242 * the worker,
243 * the receiver,
244 * the bio_endio completion callbacks.
245 */
246 if (s & RQ_NET_QUEUED)
247 return;
248 if (s & RQ_NET_PENDING)
249 return;
250 if (s & RQ_LOCAL_PENDING)
251 return;
252
253 if (req->master_bio) {
254 /* this is data_received (remote read)
255 * or protocol C P_WRITE_ACK
256 * or protocol B P_RECV_ACK
257 * or protocol A "handed_over_to_network" (SendAck)
258 * or canceled or failed,
259 * or killed from the transfer log due to connection loss.
260 */
261
262 /*
263 * figure out whether to report success or failure.
264 *
265 * report success when at least one of the operations succeeded.
266 * or, to put the other way,
267 * only report failure, when both operations failed.
268 *
269 * what to do about the failures is handled elsewhere.
270 * what we need to do here is just: complete the master_bio.
271 *
272 * local completion error, if any, has been stored as ERR_PTR
273 * in private_bio within drbd_endio_pri.
274 */
275 int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
276 int error = PTR_ERR(req->private_bio);
277
278 /* remove the request from the conflict detection
279 * respective block_id verification hash */
280 if (!hlist_unhashed(&req->colision))
281 hlist_del(&req->colision);
282 else
283 D_ASSERT((s & RQ_NET_MASK) == 0);
284
285 /* for writes we need to do some extra housekeeping */
286 if (rw == WRITE)
287 _about_to_complete_local_write(mdev, req);
288
289 /* Update disk stats */
290 _drbd_end_io_acct(mdev, req);
291
292 m->error = ok ? 0 : (error ?: -EIO);
293 m->bio = req->master_bio;
294 req->master_bio = NULL;
295 }
296
297 if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
298 /* this is disconnected (local only) operation,
299 * or protocol C P_WRITE_ACK,
300 * or protocol A or B P_BARRIER_ACK,
301 * or killed from the transfer log due to connection loss. */
302 _req_is_done(mdev, req, rw);
303 }
304 /* else: network part and not DONE yet. that is
305 * protocol A or B, barrier ack still pending... */
306}
307
308/*
309 * checks whether there was an overlapping request
310 * or ee already registered.
311 *
312 * if so, return 1, in which case this request is completed on the spot,
313 * without ever being submitted or send.
314 *
315 * return 0 if it is ok to submit this request.
316 *
317 * NOTE:
318 * paranoia: assume something above us is broken, and issues different write
319 * requests for the same block simultaneously...
320 *
321 * To ensure these won't be reordered differently on both nodes, resulting in
322 * diverging data sets, we discard the later one(s). Not that this is supposed
323 * to happen, but this is the rationale why we also have to check for
324 * conflicting requests with local origin, and why we have to do so regardless
325 * of whether we allowed multiple primaries.
326 *
327 * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
328 * second hlist_for_each_entry becomes a noop. This is even simpler than to
329 * grab a reference on the net_conf, and check for the two_primaries flag...
330 */
331static int _req_conflicts(struct drbd_request *req)
332{
333 struct drbd_conf *mdev = req->mdev;
334 const sector_t sector = req->sector;
335 const int size = req->size;
336 struct drbd_request *i;
337 struct drbd_epoch_entry *e;
338 struct hlist_node *n;
339 struct hlist_head *slot;
340
341 D_ASSERT(hlist_unhashed(&req->colision));
342
343 if (!get_net_conf(mdev))
344 return 0;
345
346 /* BUG_ON */
347 ERR_IF (mdev->tl_hash_s == 0)
348 goto out_no_conflict;
349 BUG_ON(mdev->tl_hash == NULL);
350
351#define OVERLAPS overlaps(i->sector, i->size, sector, size)
352 slot = tl_hash_slot(mdev, sector);
353 hlist_for_each_entry(i, n, slot, colision) {
354 if (OVERLAPS) {
355 dev_alert(DEV, "%s[%u] Concurrent local write detected! "
356 "[DISCARD L] new: %llus +%u; "
357 "pending: %llus +%u\n",
358 current->comm, current->pid,
359 (unsigned long long)sector, size,
360 (unsigned long long)i->sector, i->size);
361 goto out_conflict;
362 }
363 }
364
365 if (mdev->ee_hash_s) {
366 /* now, check for overlapping requests with remote origin */
367 BUG_ON(mdev->ee_hash == NULL);
368#undef OVERLAPS
369#define OVERLAPS overlaps(e->sector, e->size, sector, size)
370 slot = ee_hash_slot(mdev, sector);
371 hlist_for_each_entry(e, n, slot, colision) {
372 if (OVERLAPS) {
373 dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
374 " [DISCARD L] new: %llus +%u; "
375 "pending: %llus +%u\n",
376 current->comm, current->pid,
377 (unsigned long long)sector, size,
378 (unsigned long long)e->sector, e->size);
379 goto out_conflict;
380 }
381 }
382 }
383#undef OVERLAPS
384
385out_no_conflict:
386 /* this is like it should be, and what we expected.
387 * our users do behave after all... */
388 put_net_conf(mdev);
389 return 0;
390
391out_conflict:
392 put_net_conf(mdev);
393 return 1;
394}
395
396/* obviously this could be coded as many single functions
397 * instead of one huge switch,
398 * or by putting the code directly in the respective locations
399 * (as it has been before).
400 *
401 * but having it this way
402 * enforces that it is all in this one place, where it is easier to audit,
403 * it makes it obvious that whatever "event" "happens" to a request should
404 * happen "atomically" within the req_lock,
405 * and it enforces that we have to think in a very structured manner
406 * about the "events" that may happen to a request during its life time ...
407 */
/* __req_mod() - apply the event @what to the request @req.
 *
 * m->bio is reset to NULL first.  Then req->rq_state is updated according to
 * @what, together with whatever counters, hash lists and work queues the
 * respective case touches.  The cases that call _req_may_be_done() pass @m on
 * to it, so on return m->bio may point to a master bio, with the status to
 * report for it in m->error.
 *
 * NOTE(review): the comment above implies that callers hold req_lock; this
 * function does not take it itself -- confirm at the call sites.
 */
408void __req_mod(struct drbd_request *req, enum drbd_req_event what,
409 struct bio_and_error *m)
410{
411 struct drbd_conf *mdev = req->mdev;
412 m->bio = NULL;
413
414 switch (what) {
415 default:
416 dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
417 break;
418
419 /* does not happen...
420 * initialization done in drbd_req_new
421 case created:
422 break;
423 */
424
425 case to_be_send: /* via network */
426 /* reached via drbd_make_request_common
427 * and from w_read_retry_remote */
428 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
429 req->rq_state |= RQ_NET_PENDING;
430 inc_ap_pending(mdev);
431 break;
432
433 case to_be_submitted: /* locally */
434 /* reached via drbd_make_request_common */
435 D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
436 req->rq_state |= RQ_LOCAL_PENDING;
437 break;
438
439 case completed_ok:
    /* bump the per-device write/read counter by req->size>>9 */
440 if (bio_data_dir(req->master_bio) == WRITE)
441 mdev->writ_cnt += req->size>>9;
442 else
443 mdev->read_cnt += req->size>>9;
444
445 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
446 req->rq_state &= ~RQ_LOCAL_PENDING;
447
448 _req_may_be_done(req, m);
449 put_ldev(mdev);
450 break;
451
452 case write_completed_with_error:
    /* local part is over, but RQ_LOCAL_OK stays clear */
453 req->rq_state |= RQ_LOCAL_COMPLETED;
454 req->rq_state &= ~RQ_LOCAL_PENDING;
455
456 dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
457 (unsigned long long)req->sector, req->size);
458 /* and now: check how to handle local io error. */
459 __drbd_chk_io_error(mdev, FALSE);
460 _req_may_be_done(req, m);
461 put_ldev(mdev);
462 break;
463
464 case read_ahead_completed_with_error:
465 /* it is legal to fail READA */
466 req->rq_state |= RQ_LOCAL_COMPLETED;
467 req->rq_state &= ~RQ_LOCAL_PENDING;
468 _req_may_be_done(req, m);
469 put_ldev(mdev);
470 break;
471
472 case read_completed_with_error:
473 drbd_set_out_of_sync(mdev, req->sector, req->size);
474
475 req->rq_state |= RQ_LOCAL_COMPLETED;
476 req->rq_state &= ~RQ_LOCAL_PENDING;
477
478 dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
479 (unsigned long long)req->sector, req->size);
480 /* _req_mod(req,to_be_send); oops, recursion... */
    /* so the body of the to_be_send case is repeated here instead */
481 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
482 req->rq_state |= RQ_NET_PENDING;
483 inc_ap_pending(mdev);
484
485 __drbd_chk_io_error(mdev, FALSE);
486 put_ldev(mdev);
487 /* NOTE: if we have no connection,
488 * or know the peer has no good data either,
489 * then we don't actually need to "queue_for_net_read",
490 * but we do so anyways, since the drbd_io_error()
491 * and the potential state change to "Diskless"
492 * needs to be done from process context */
493
494 /* fall through: _req_mod(req,queue_for_net_read); */
495
496 case queue_for_net_read:
497 /* READ or READA, and
498 * no local disk,
499 * or target area marked as invalid,
500 * or just got an io-error. */
501 /* from drbd_make_request_common
502 * or from bio_endio during read io-error recovery */
503
504 /* so we can verify the handle in the answer packet
505 * corresponding hlist_del is in _req_may_be_done() */
506 hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
507
508 set_bit(UNPLUG_REMOTE, &mdev->flags);
509
510 D_ASSERT(req->rq_state & RQ_NET_PENDING);
511 req->rq_state |= RQ_NET_QUEUED;
    /* a request that has local state bits set got here via the
     * read io-error path above, so it is a retry */
512 req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
513 ? w_read_retry_remote
514 : w_send_read_req;
515 drbd_queue_work(&mdev->data.work, &req->w);
516 break;
517
518 case queue_for_net_write:
519 /* assert something? */
520 /* from drbd_make_request_common only */
521
522 hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
523 /* corresponding hlist_del is in _req_may_be_done() */
524
525 /* NOTE
526 * In case the req ended up on the transfer log before being
527 * queued on the worker, it could lead to this request being
528 * missed during cleanup after connection loss.
529 * So we have to do both operations here,
530 * within the same lock that protects the transfer log.
531 *
532 * _req_add_to_epoch(req); this has to be after the
533 * _maybe_start_new_epoch(req); which happened in
534 * drbd_make_request_common, because we now may set the bit
535 * again ourselves to close the current epoch.
536 *
537 * Add req to the (now) current epoch (barrier). */
538
539 /* otherwise we may lose an unplug, which may cause some remote
540 * io-scheduler timeout to expire, increasing maximum latency,
541 * hurting performance. */
542 set_bit(UNPLUG_REMOTE, &mdev->flags);
543
544 /* see drbd_make_request_common,
545 * just after it grabs the req_lock */
546 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
547
548 req->epoch = mdev->newest_tle->br_number;
549 list_add_tail(&req->tl_requests,
550 &mdev->newest_tle->requests);
551
552 /* increment size of current epoch */
553 mdev->newest_tle->n_req++;
554
555 /* queue work item to send data */
556 D_ASSERT(req->rq_state & RQ_NET_PENDING);
557 req->rq_state |= RQ_NET_QUEUED;
558 req->w.cb = w_send_dblock;
559 drbd_queue_work(&mdev->data.work, &req->w);
560
561 /* close the epoch, in case it outgrew the limit */
562 if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
563 queue_barrier(mdev);
564
565 break;
566
567 case send_canceled:
568 /* treat it the same */
569 case send_failed:
570 /* real cleanup will be done from tl_clear. just update flags
571 * so it is no longer marked as on the worker queue */
572 req->rq_state &= ~RQ_NET_QUEUED;
573 /* if we did it right, tl_clear should be scheduled only after
574 * this, so this should not be necessary! */
575 _req_may_be_done(req, m);
576 break;
577
578 case handed_over_to_network:
579 /* assert something? */
580 if (bio_data_dir(req->master_bio) == WRITE &&
581 mdev->net_conf->wire_protocol == DRBD_PROT_A) {
582 /* this is what is dangerous about protocol A:
583 * pretend it was successfully written on the peer. */
584 if (req->rq_state & RQ_NET_PENDING) {
585 dec_ap_pending(mdev);
586 req->rq_state &= ~RQ_NET_PENDING;
587 req->rq_state |= RQ_NET_OK;
588 } /* else: neg-ack was faster... */
589 /* it is still not yet RQ_NET_DONE until the
590 * corresponding epoch barrier got acked as well,
591 * so we know what to dirty on connection loss */
592 }
593 req->rq_state &= ~RQ_NET_QUEUED;
594 req->rq_state |= RQ_NET_SENT;
595 /* because _drbd_send_zc_bio could sleep, and may want to
596 * dereference the bio even after the "write_acked_by_peer" and
597 * "completed_ok" events came in, once we return from
598 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
599 * whether it is done already, and end it. */
600 _req_may_be_done(req, m);
601 break;
602
603 case connection_lost_while_pending:
604 /* transfer log cleanup after connection loss */
605 /* assert something? */
606 if (req->rq_state & RQ_NET_PENDING)
607 dec_ap_pending(mdev);
608 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
609 req->rq_state |= RQ_NET_DONE;
610 /* if it is still queued, we may not complete it here.
611 * it will be canceled soon. */
612 if (!(req->rq_state & RQ_NET_QUEUED))
613 _req_may_be_done(req, m);
614 break;
615
616 case write_acked_by_peer_and_sis:
617 req->rq_state |= RQ_NET_SIS;
    /* fall through */
618 case conflict_discarded_by_peer:
619 /* for discarded conflicting writes of multiple primaries,
620 * there is no need to keep anything in the tl, potential
621 * node crashes are covered by the activity log. */
622 if (what == conflict_discarded_by_peer)
623 dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
624 " DRBD is not a random data generator!\n",
625 (unsigned long long)req->sector, req->size);
626 req->rq_state |= RQ_NET_DONE;
627 /* fall through */
628 case write_acked_by_peer:
629 /* protocol C; successfully written on peer.
630 * Nothing to do here.
631 * We want to keep the tl in place for all protocols, to cater
632 * for volatile write-back caches on lower level devices.
633 *
634 * A barrier request is expected to have forced all prior
635 * requests onto stable storage, so completion of a barrier
636 * request could set NET_DONE right here, and not wait for the
637 * P_BARRIER_ACK, but that is an unnecessary optimization. */
638
639 /* this makes it effectively the same as for: */
640 case recv_acked_by_peer:
641 /* protocol B; pretends to be successfully written on peer.
642 * see also notes above in handed_over_to_network about
643 * protocol != C */
644 req->rq_state |= RQ_NET_OK;
645 D_ASSERT(req->rq_state & RQ_NET_PENDING);
646 dec_ap_pending(mdev);
647 req->rq_state &= ~RQ_NET_PENDING;
648 _req_may_be_done(req, m);
649 break;
650
651 case neg_acked:
652 /* assert something? */
653 if (req->rq_state & RQ_NET_PENDING)
654 dec_ap_pending(mdev);
655 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
656
657 req->rq_state |= RQ_NET_DONE;
658 _req_may_be_done(req, m);
659 /* else: done by handed_over_to_network */
660 break;
661
662 case barrier_acked:
663 if (req->rq_state & RQ_NET_PENDING) {
664 /* barrier came in before all requests have been acked.
665 * this is bad, because if the connection is lost now,
666 * we won't be able to clean them up... */
667 dev_err(DEV, "FIXME (barrier_acked but pending)\n");
668 list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
669 }
670 D_ASSERT(req->rq_state & RQ_NET_SENT);
671 req->rq_state |= RQ_NET_DONE;
672 _req_may_be_done(req, m);
673 break;
674
675 case data_received:
    /* answer to a remote read: network part is both OK and DONE */
676 D_ASSERT(req->rq_state & RQ_NET_PENDING);
677 dec_ap_pending(mdev);
678 req->rq_state &= ~RQ_NET_PENDING;
679 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
680 _req_may_be_done(req, m);
681 break;
682 };
683}
684
685/* we may do a local read if:
686 * - we are consistent (of course),
687 * - or we are generally inconsistent,
688 * BUT we are still/already IN SYNC for this area.
689 * since size may be bigger than BM_BLOCK_SIZE,
690 * we may need to check several bits.
691 */
692static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
693{
694 unsigned long sbnr, ebnr;
695 sector_t esector, nr_sectors;
696
697 if (mdev->state.disk == D_UP_TO_DATE)
698 return 1;
699 if (mdev->state.disk >= D_OUTDATED)
700 return 0;
701 if (mdev->state.disk < D_INCONSISTENT)
702 return 0;
703 /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
704 nr_sectors = drbd_get_capacity(mdev->this_bdev);
705 esector = sector + (size >> 9) - 1;
706
707 D_ASSERT(sector < nr_sectors);
708 D_ASSERT(esector < nr_sectors);
709
710 sbnr = BM_SECT_TO_BIT(sector);
711 ebnr = BM_SECT_TO_BIT(esector);
712
713 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
714}
715
/* drbd_make_request_common() - process one bio handed in by
 * drbd_make_request_26(), which has already taken the ap_bio reference
 * (inc_ap_bio) and split bios that cross a hash slot boundary.
 *
 * Allocates a drbd_request, decides whether the request is to be served by
 * the local disk ('local'), by the peer ('remote'), or both, registers it via
 * _req_mod() while holding req_lock, and finally submits the private bio to
 * the backing device.
 *
 * Always returns 0.  Errors are reported by ending @bio with a negative
 * errno; on those paths, and when a conflicting write is completed on the
 * spot, this function calls dec_ap_bio() itself.
 */
716static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
717{
718 const int rw = bio_rw(bio);
719 const int size = bio->bi_size;
720 const sector_t sector = bio->bi_sector;
721 struct drbd_tl_epoch *b = NULL;
722 struct drbd_request *req;
723 int local, remote;
724 int err = -EIO;
725
726 /* allocate outside of all locks; */
727 req = drbd_req_new(mdev, bio);
728 if (!req) {
729 dec_ap_bio(mdev);
730 /* only pass the error to the upper layers.
731 * if user cannot handle io errors, that's not our business. */
732 dev_err(DEV, "could not kmalloc() req\n");
733 bio_endio(bio, -ENOMEM);
734 return 0;
735 }
736
    /* nonzero if we got a reference on the local disk;
     * every path that gives up on 'local' drops it with put_ldev() */
737 local = get_ldev(mdev);
738 if (!local) {
739 bio_put(req->private_bio); /* or we get a bio leak */
740 req->private_bio = NULL;
741 }
742 if (rw == WRITE) {
743 remote = 1;
744 } else {
745 /* READ || READA */
746 if (local) {
747 if (!drbd_may_do_local_read(mdev, sector, size)) {
748 /* we could kick the syncer to
749 * sync this extent asap, wait for
750 * it, then continue locally.
751 * Or just issue the request remotely.
752 */
753 local = 0;
754 bio_put(req->private_bio);
755 req->private_bio = NULL;
756 put_ldev(mdev);
757 }
758 }
759 remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
760 }
761
762 /* If we have a disk, but a READA request is mapped to remote,
763 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
764 * Just fail that READA request right here.
765 *
766 * THINK: maybe fail all READA when not local?
767 * or make this configurable...
768 * if network is slow, READA won't do any good.
769 */
770 if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
771 err = -EWOULDBLOCK;
772 goto fail_and_free_req;
773 }
774
775 /* For WRITES going to the local disk, grab a reference on the target
776 * extent. This waits for any resync activity in the corresponding
777 * resync extent to finish, and, if necessary, pulls in the target
778 * extent into the activity log, which involves further disk io because
779 * of transactional on-disk meta data updates. */
780 if (rw == WRITE && local)
781 drbd_al_begin_io(mdev, sector);
782
783 remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
784 (mdev->state.pdsk == D_INCONSISTENT &&
785 mdev->state.conn >= C_CONNECTED));
786
787 if (!(local || remote)) {
788 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
789 goto fail_free_complete;
790 }
791
792 /* For WRITE request, we have to make sure that we have an
793 * unused_spare_tle, in case we need to start a new epoch.
794 * I try to be smart and avoid to pre-allocate always "just in case",
795 * but there is a race between testing the bit and pointer outside the
796 * spinlock, and grabbing the spinlock.
797 * if we lost that race, we retry. */
798 if (rw == WRITE && remote &&
799 mdev->unused_spare_tle == NULL &&
800 test_bit(CREATE_BARRIER, &mdev->flags)) {
    /* also the retry target of the goto further down, which is only
     * taken after req_lock has been dropped again */
801allocate_barrier:
802 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
803 if (!b) {
804 dev_err(DEV, "Failed to alloc barrier.\n");
805 err = -ENOMEM;
806 goto fail_free_complete;
807 }
808 }
809
810 /* GOOD, everything prepared, grab the spin_lock */
811 spin_lock_irq(&mdev->req_lock);
812
    /* re-evaluate 'remote' now that we hold the lock */
813 if (remote) {
814 remote = (mdev->state.pdsk == D_UP_TO_DATE ||
815 (mdev->state.pdsk == D_INCONSISTENT &&
816 mdev->state.conn >= C_CONNECTED));
817 if (!remote)
818 dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
819 if (!(local || remote)) {
820 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
821 spin_unlock_irq(&mdev->req_lock);
822 goto fail_free_complete;
823 }
824 }
825
    /* hand our pre-allocated epoch object over as the spare, if there
     * is none; otherwise b stays set and is kfree()d later */
826 if (b && mdev->unused_spare_tle == NULL) {
827 mdev->unused_spare_tle = b;
828 b = NULL;
829 }
830 if (rw == WRITE && remote &&
831 mdev->unused_spare_tle == NULL &&
832 test_bit(CREATE_BARRIER, &mdev->flags)) {
833 /* someone closed the current epoch
834 * while we were grabbing the spinlock */
835 spin_unlock_irq(&mdev->req_lock);
836 goto allocate_barrier;
837 }
838
839
840 /* Update disk stats */
841 _drbd_start_io_acct(mdev, req, bio);
842
843 /* _maybe_start_new_epoch(mdev);
844 * If we need to generate a write barrier packet, we have to add the
845 * new epoch (barrier) object, and queue the barrier packet for sending,
846 * and queue the req's data after it _within the same lock_, otherwise
847 * we have race conditions where the reorder domains could be mixed up.
848 *
849 * Even read requests may start a new epoch and queue the corresponding
850 * barrier packet. To get the write ordering right, we only have to
851 * make sure that, if this is a write request and it triggered a
852 * barrier packet, this request is queued within the same spinlock. */
853 if (remote && mdev->unused_spare_tle &&
854 test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
855 _tl_add_barrier(mdev, mdev->unused_spare_tle);
856 mdev->unused_spare_tle = NULL;
857 } else {
858 D_ASSERT(!(remote && rw == WRITE &&
859 test_bit(CREATE_BARRIER, &mdev->flags)));
860 }
861
862 /* NOTE
863 * Actually, 'local' may be wrong here already, since we may have failed
864 * to write to the meta data, and may become wrong anytime because of
865 * local io-error for some other request, which would lead to us
866 * "detaching" the local disk.
867 *
868 * 'remote' may become wrong any time because the network could fail.
869 *
870 * This is a harmless race condition, though, since it is handled
871 * correctly at the appropriate places; so it just defers the failure
872 * of the respective operation.
873 */
874
875 /* mark them early for readability.
876 * this just sets some state flags. */
877 if (remote)
878 _req_mod(req, to_be_send);
879 if (local)
880 _req_mod(req, to_be_submitted);
881
882 /* check this request on the collision detection hash tables.
883 * if we have a conflict, just complete it here.
884 * THINK do we want to check reads, too? (I don't think so...) */
885 if (rw == WRITE && _req_conflicts(req)) {
886 /* this is a conflicting request.
887 * even though it may have been only _partially_
888 * overlapping with one of the currently pending requests,
889 * without even submitting or sending it, we will
890 * pretend that it was successfully served right now.
891 */
892 if (local) {
893 bio_put(req->private_bio);
894 req->private_bio = NULL;
895 drbd_al_complete_io(mdev, req->sector);
896 put_ldev(mdev);
897 local = 0;
898 }
    /* undo the inc_ap_pending() done by the to_be_send event above */
899 if (remote)
900 dec_ap_pending(mdev);
901 _drbd_end_io_acct(mdev, req);
902 /* THINK: do we want to fail it (-EIO), or pretend success? */
903 bio_endio(req->master_bio, 0);
904 req->master_bio = NULL;
905 dec_ap_bio(mdev);
906 drbd_req_free(req);
    /* req is gone: with local and remote both 0, nothing below
     * touches it anymore */
907 remote = 0;
908 }
909
910 /* NOTE remote first: to get the concurrent write detection right,
911 * we must register the request before start of local IO. */
912 if (remote) {
913 /* either WRITE and C_CONNECTED,
914 * or READ, and no local disk,
915 * or READ, but not in sync.
916 */
917 _req_mod(req, (rw == WRITE)
918 ? queue_for_net_write
919 : queue_for_net_read);
920 }
921 spin_unlock_irq(&mdev->req_lock);
922 kfree(b); /* if someone else has beaten us to it... */
923
924 if (local) {
925 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
926
    /* with fault injection active for this kind of request, end the
     * private bio with -EIO instead of submitting it */
927 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
928 : rw == READ ? DRBD_FAULT_DT_RD
929 : DRBD_FAULT_DT_RA))
930 bio_endio(req->private_bio, -EIO);
931 else
932 generic_make_request(req->private_bio);
933 }
934
935 /* we need to plug ALWAYS since we possibly need to kick lo_dev.
936 * we plug after submit, so we won't miss an unplug event */
937 drbd_plug_device(mdev);
938
939 return 0;
940
    /* error exits: the first label additionally undoes drbd_al_begin_io() */
941fail_free_complete:
942 if (rw == WRITE && local)
943 drbd_al_complete_io(mdev, sector);
944fail_and_free_req:
945 if (local) {
946 bio_put(req->private_bio);
947 req->private_bio = NULL;
948 put_ldev(mdev);
949 }
950 bio_endio(bio, err);
951 drbd_req_free(req);
952 dec_ap_bio(mdev);
953 kfree(b);
954
955 return 0;
956}
957
958/* helper function for drbd_make_request
959 * if we can determine just by the mdev (state) that this request will fail,
960 * return 1
961 * otherwise return 0
962 */
963static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
964{
965 /* Unconfigured */
966 if (mdev->state.conn == C_DISCONNECTING &&
967 mdev->state.disk == D_DISKLESS)
968 return 1;
969
970 if (mdev->state.role != R_PRIMARY &&
971 (!allow_oos || is_write)) {
972 if (__ratelimit(&drbd_ratelimit_state)) {
973 dev_err(DEV, "Process %s[%u] tried to %s; "
974 "since we are not in Primary state, "
975 "we cannot allow this\n",
976 current->comm, current->pid,
977 is_write ? "WRITE" : "READ");
978 }
979 return 1;
980 }
981
982 /*
983 * Paranoia: we might have been primary, but sync target, or
984 * even diskless, then lost the connection.
985 * This should have been handled (panic? suspend?) somewhere
986 * else. But maybe it was not, so check again here.
987 * Caution: as long as we do not have a read/write lock on mdev,
988 * to serialize state changes, this is racy, since we may lose
989 * the connection *after* we test for the cstate.
990 */
991 if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
992 if (__ratelimit(&drbd_ratelimit_state))
993 dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
994 return 1;
995 }
996
997 return 0;
998}
999
1000int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1001{
1002 unsigned int s_enr, e_enr;
1003 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1004
1005 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
1006 bio_endio(bio, -EPERM);
1007 return 0;
1008 }
1009
1010 /* Reject barrier requests if we know the underlying device does
1011 * not support them.
1012 * XXX: Need to get this info from peer as well some how so we
1013 * XXX: reject if EITHER side/data/metadata area does not support them.
1014 *
1015 * because of those XXX, this is not yet enabled,
1016 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
1017 */
1018 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
1019 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1020 bio_endio(bio, -EOPNOTSUPP);
1021 return 0;
1022 }
1023
1024 /*
1025 * what we "blindly" assume:
1026 */
1027 D_ASSERT(bio->bi_size > 0);
1028 D_ASSERT((bio->bi_size & 0x1ff) == 0);
1029 D_ASSERT(bio->bi_idx == 0);
1030
1031 /* to make some things easier, force alignment of requests within the
1032 * granularity of our hash tables */
1033 s_enr = bio->bi_sector >> HT_SHIFT;
1034 e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
1035
1036 if (likely(s_enr == e_enr)) {
1037 inc_ap_bio(mdev, 1);
1038 return drbd_make_request_common(mdev, bio);
1039 }
1040
1041 /* can this bio be split generically?
1042 * Maybe add our own split-arbitrary-bios function. */
1043 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
1044 /* rather error out here than BUG in bio_split */
1045 dev_err(DEV, "bio would need to, but cannot, be split: "
1046 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
1047 bio->bi_vcnt, bio->bi_idx, bio->bi_size,
1048 (unsigned long long)bio->bi_sector);
1049 bio_endio(bio, -EINVAL);
1050 } else {
1051 /* This bio crosses some boundary, so we have to split it. */
1052 struct bio_pair *bp;
1053 /* works for the "do not cross hash slot boundaries" case
1054 * e.g. sector 262269, size 4096
1055 * s_enr = 262269 >> 6 = 4097
1056 * e_enr = (262269+8-1) >> 6 = 4098
1057 * HT_SHIFT = 6
1058 * sps = 64, mask = 63
1059 * first_sectors = 64 - (262269 & 63) = 3
1060 */
1061 const sector_t sect = bio->bi_sector;
1062 const int sps = 1 << HT_SHIFT; /* sectors per slot */
1063 const int mask = sps - 1;
1064 const sector_t first_sectors = sps - (sect & mask);
1065 bp = bio_split(bio,
1066#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
1067 bio_split_pool,
1068#endif
1069 first_sectors);
1070
1071 /* we need to get a "reference count" (ap_bio_cnt)
1072 * to avoid races with the disconnect/reconnect/suspend code.
1073 * In case we need to split the bio here, we need to get two references
1074 * atomically, otherwise we might deadlock when trying to submit the
1075 * second one! */
1076 inc_ap_bio(mdev, 2);
1077
1078 D_ASSERT(e_enr == s_enr + 1);
1079
1080 drbd_make_request_common(mdev, &bp->bio1);
1081 drbd_make_request_common(mdev, &bp->bio2);
1082 bio_pair_release(bp);
1083 }
1084 return 0;
1085}
1086
1087/* This is called by bio_add_page(). With this function we reduce
1088 * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
1089 * units (was AL_EXTENTs).
1090 *
1091 * we do the calculation within the lower 32bit of the byte offsets,
1092 * since we don't care for actual offset, but only check whether it
1093 * would cross "activity log extent" boundaries.
1094 *
1095 * As long as the BIO is empty we have to allow at least one bvec,
1096 * regardless of size and offset. so the resulting bio may still
1097 * cross extent boundaries. those are dealt with (bio_split) in
1098 * drbd_make_request_26.
1099 */
1100int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1101{
1102 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1103 unsigned int bio_offset =
1104 (unsigned int)bvm->bi_sector << 9; /* 32 bit */
1105 unsigned int bio_size = bvm->bi_size;
1106 int limit, backing_limit;
1107
1108 limit = DRBD_MAX_SEGMENT_SIZE
1109 - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
1110 if (limit < 0)
1111 limit = 0;
1112 if (bio_size == 0) {
1113 if (limit <= bvec->bv_len)
1114 limit = bvec->bv_len;
1115 } else if (limit && get_ldev(mdev)) {
1116 struct request_queue * const b =
1117 mdev->ldev->backing_bdev->bd_disk->queue;
1118 if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
1119 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1120 limit = min(limit, backing_limit);
1121 }
1122 put_ldev(mdev);
1123 }
1124 return limit;
1125}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000000..f22c1bc8ec7e
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,326 @@
1/*
2 drbd_req.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
8 Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9
10 DRBD is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 DRBD is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_REQ_H
26#define _DRBD_REQ_H
27
28#include <linux/module.h>
29
30#include <linux/slab.h>
31#include <linux/drbd.h>
32#include "drbd_int.h"
33#include "drbd_wrappers.h"
34
35/* The request callbacks will be called in irq context by the IDE drivers,
36 and in Softirqs/Tasklets/BH context by the SCSI drivers,
37 and by the receiver and worker in kernel-thread context.
38 Try to get the locking right :) */
39
40/*
41 * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
42 * associated with IO requests originating from the block layer above us.
43 *
44 * There are quite a few things that may happen to a drbd request
45 * during its lifetime.
46 *
47 * It will be created.
48 * It will be marked with the intention to be
49 * submitted to local disk and/or
50 * send via the network.
51 *
52 * It has to be placed on the transfer log and other housekeeping lists,
53 * In case we have a network connection.
54 *
55 * It may be identified as a concurrent (write) request
56 * and be handled accordingly.
57 *
58 * It may be handed over to the local disk subsystem.
59 * It may be completed by the local disk subsystem,
60 * either successfully or with io-error.
61 * In case it is a READ request, and it failed locally,
62 * it may be retried remotely.
63 *
64 * It may be queued for sending.
65 * It may be handed over to the network stack,
66 * which may fail.
67 * It may be acknowledged by the "peer" according to the wire_protocol in use.
68 * this may be a negative ack.
69 * It may receive a faked ack when the network connection is lost and the
70 * transfer log is cleaned up.
71 * Sending may be canceled due to network connection loss.
72 * When it finally has outlived its time,
73 * corresponding dirty bits in the resync-bitmap may be cleared or set,
74 * it will be destroyed,
75 * and completion will be signalled to the originator,
76 * with or without "success".
77 */
78
/* What may happen to a drbd_request; passed as "what" to the
 * request state modification helpers (__req_mod and friends). */
enum drbd_req_event {
	created = 0,
	to_be_send,
	to_be_submitted,

	/* XXX yes, now I am inconsistent...
	 * the next two are not "events" but "actions"
	 * oh, well... */
	queue_for_net_write,
	queue_for_net_read,

	/* network side */
	send_canceled,
	send_failed,
	handed_over_to_network,
	connection_lost_while_pending,
	recv_acked_by_peer,
	write_acked_by_peer,
	write_acked_by_peer_and_sis,	/* and set_in_sync */
	conflict_discarded_by_peer,
	neg_acked,
	barrier_acked,			/* in protocol A and B */
	data_received,			/* (remote read) */

	/* local completion */
	read_completed_with_error,
	read_ahead_completed_with_error,
	write_completed_with_error,
	completed_ok,

	nothing,			/* for tracing only */
};
108
109/* encoding of request states for now. we don't actually need that many bits.
110 * we don't need to do atomic bit operations either, since most of the time we
111 * need to look at the connection state and/or manipulate some lists at the
112 * same time, so we should hold the request lock anyways.
113 */
/* Bit numbers within the request state word; the RQ_* masks are
 * derived from these by shifting. */
enum drbd_req_state_bits {
	/* local disk part
	 * 210
	 * 000: no local possible
	 * 001: to be submitted
	 *    UNUSED, we could map: 011: submitted, completion still pending
	 * 110: completed ok
	 * 010: completed with error
	 */
	__RQ_LOCAL_PENDING	= 0,
	__RQ_LOCAL_COMPLETED	= 1,
	__RQ_LOCAL_OK		= 2,

	/* network part
	 * 76543
	 * 00000: no network possible
	 * 00001: to be send
	 * 00011: to be send, on worker queue
	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
	 * 11101: sent,
	 *        recv_ack (B) or implicit "ack" (A),
	 *        still waiting for the barrier ack.
	 *        master_bio may already be completed and invalidated.
	 * 11100: write_acked (C),
	 *        data_received (for remote read, any protocol)
	 *        or finally the barrier ack has arrived (B,A)...
	 *        request can be freed
	 * 01100: neg-acked (write, protocol C)
	 *        or neg-d-acked (read, any protocol)
	 *        or killed from the transfer log
	 *        during cleanup after connection loss
	 *        request can be freed
	 * 01000: canceled or send failed...
	 *        request can be freed
	 */

	/* if "SENT" is not set, yet, this can still fail or be canceled.
	 * if "SENT" is set already, we still wait for an Ack packet.
	 * when cleared, the master_bio may be completed.
	 * in (B,A) the request object may still linger on the transaction log
	 * until the corresponding barrier ack comes in */
	__RQ_NET_PENDING	= 3,

	/* If it is QUEUED, and it is a WRITE, it is also registered in the
	 * transfer log. Currently we need this flag to avoid conflicts between
	 * worker canceling the request and tl_clear_barrier killing it from
	 * transfer log.  We should restructure the code so this conflict does
	 * no longer occur. */
	__RQ_NET_QUEUED		= 4,

	/* well, actually only "handed over to the network stack".
	 *
	 * TODO can potentially be dropped because of the similar meaning
	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
	 * however it is not exactly the same. before we drop it
	 * we must ensure that we can tell a request with network part
	 * from a request without, regardless of what happens to it. */
	__RQ_NET_SENT		= 5,

	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
	 * basically this means the corresponding P_BARRIER_ACK was received */
	__RQ_NET_DONE		= 6,

	/* whether or not we know (C) or pretend (B,A) that the write
	 * was successfully written on the peer.
	 */
	__RQ_NET_OK		= 7,

	/* peer called drbd_set_in_sync() for this write */
	__RQ_NET_SIS		= 8,

	/* keep this last, it is needed to compute RQ_NET_MASK */
	__RQ_NET_MAX		= 9,
};
186
/* single-bit masks for the local disk part of the request state */
#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)

/* all local bits: 0x07 */
#define RQ_LOCAL_MASK \
	(RQ_LOCAL_PENDING | RQ_LOCAL_COMPLETED | RQ_LOCAL_OK)

/* single-bit masks for the network part of the request state */
#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
#define RQ_NET_OK          (1UL << __RQ_NET_OK)
#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)

/* every bit below __RQ_NET_MAX that is not a local bit: 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
202
203/* epoch entries */
204static inline
205struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
206{
207 BUG_ON(mdev->ee_hash_s == 0);
208 return mdev->ee_hash +
209 ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
210}
211
212/* transfer log (drbd_request objects) */
213static inline
214struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
215{
216 BUG_ON(mdev->tl_hash_s == 0);
217 return mdev->tl_hash +
218 ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
219}
220
221/* application reads (drbd_request objects) */
222static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
223{
224 return mdev->app_reads_hash
225 + ((unsigned int)(sector) % APP_R_HSIZE);
226}
227
228/* when we receive the answer for a read request,
229 * verify that we actually know about it */
230static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
231 u64 id, sector_t sector)
232{
233 struct hlist_head *slot = ar_hash_slot(mdev, sector);
234 struct hlist_node *n;
235 struct drbd_request *req;
236
237 hlist_for_each_entry(req, n, slot, colision) {
238 if ((unsigned long)req == (unsigned long)id) {
239 D_ASSERT(req->sector == sector);
240 return req;
241 }
242 }
243 return NULL;
244}
245
246static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
247 struct bio *bio_src)
248{
249 struct bio *bio;
250 struct drbd_request *req =
251 mempool_alloc(drbd_request_mempool, GFP_NOIO);
252 if (likely(req)) {
253 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
254
255 req->rq_state = 0;
256 req->mdev = mdev;
257 req->master_bio = bio_src;
258 req->private_bio = bio;
259 req->epoch = 0;
260 req->sector = bio->bi_sector;
261 req->size = bio->bi_size;
262 req->start_time = jiffies;
263 INIT_HLIST_NODE(&req->colision);
264 INIT_LIST_HEAD(&req->tl_requests);
265 INIT_LIST_HEAD(&req->w.list);
266
267 bio->bi_private = req;
268 bio->bi_end_io = drbd_endio_pri;
269 bio->bi_next = NULL;
270 }
271 return req;
272}
273
274static inline void drbd_req_free(struct drbd_request *req)
275{
276 mempool_free(req, drbd_request_mempool);
277}
278
279static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
280{
281 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
282}
283
/* Short lived temporary struct on the stack, used to carry a bio
 * together with the error it is to be completed with.
 * We could squirrel the error to be returned into
 * bio->bi_size, or similar.  But that would be too ugly. */
struct bio_and_error {
	struct bio *bio;	/* tested by _req_mod()/req_mod(): non-NULL means "complete this" */
	int error;
};
291
292extern void _req_may_be_done(struct drbd_request *req,
293 struct bio_and_error *m);
294extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
295 struct bio_and_error *m);
296extern void complete_master_bio(struct drbd_conf *mdev,
297 struct bio_and_error *m);
298
299/* use this if you don't want to deal with calling complete_master_bio()
300 * outside the spinlock, e.g. when walking some list on cleanup. */
301static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
302{
303 struct drbd_conf *mdev = req->mdev;
304 struct bio_and_error m;
305
306 /* __req_mod possibly frees req, do not touch req after that! */
307 __req_mod(req, what, &m);
308 if (m.bio)
309 complete_master_bio(mdev, &m);
310}
311
312/* completion of master bio is outside of spinlock.
313 * If you need it irqsave, do it your self! */
314static inline void req_mod(struct drbd_request *req,
315 enum drbd_req_event what)
316{
317 struct drbd_conf *mdev = req->mdev;
318 struct bio_and_error m;
319 spin_lock_irq(&mdev->req_lock);
320 __req_mod(req, what, &m);
321 spin_unlock_irq(&mdev->req_lock);
322
323 if (m.bio)
324 complete_master_bio(mdev, &m);
325}
326#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000000..76863e3f05be
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
1/*
2 drbd_strings.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#include <linux/drbd.h>
27
28static const char *drbd_conn_s_names[] = {
29 [C_STANDALONE] = "StandAlone",
30 [C_DISCONNECTING] = "Disconnecting",
31 [C_UNCONNECTED] = "Unconnected",
32 [C_TIMEOUT] = "Timeout",
33 [C_BROKEN_PIPE] = "BrokenPipe",
34 [C_NETWORK_FAILURE] = "NetworkFailure",
35 [C_PROTOCOL_ERROR] = "ProtocolError",
36 [C_WF_CONNECTION] = "WFConnection",
37 [C_WF_REPORT_PARAMS] = "WFReportParams",
38 [C_TEAR_DOWN] = "TearDown",
39 [C_CONNECTED] = "Connected",
40 [C_STARTING_SYNC_S] = "StartingSyncS",
41 [C_STARTING_SYNC_T] = "StartingSyncT",
42 [C_WF_BITMAP_S] = "WFBitMapS",
43 [C_WF_BITMAP_T] = "WFBitMapT",
44 [C_WF_SYNC_UUID] = "WFSyncUUID",
45 [C_SYNC_SOURCE] = "SyncSource",
46 [C_SYNC_TARGET] = "SyncTarget",
47 [C_PAUSED_SYNC_S] = "PausedSyncS",
48 [C_PAUSED_SYNC_T] = "PausedSyncT",
49 [C_VERIFY_S] = "VerifyS",
50 [C_VERIFY_T] = "VerifyT",
51};
52
53static const char *drbd_role_s_names[] = {
54 [R_PRIMARY] = "Primary",
55 [R_SECONDARY] = "Secondary",
56 [R_UNKNOWN] = "Unknown"
57};
58
59static const char *drbd_disk_s_names[] = {
60 [D_DISKLESS] = "Diskless",
61 [D_ATTACHING] = "Attaching",
62 [D_FAILED] = "Failed",
63 [D_NEGOTIATING] = "Negotiating",
64 [D_INCONSISTENT] = "Inconsistent",
65 [D_OUTDATED] = "Outdated",
66 [D_UNKNOWN] = "DUnknown",
67 [D_CONSISTENT] = "Consistent",
68 [D_UP_TO_DATE] = "UpToDate",
69};
70
71static const char *drbd_state_sw_errors[] = {
72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
73 [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
77 [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
78 [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
79 [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
80 [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
81 [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
82 [-SS_DEVICE_IN_USE] = "Device is held open by someone",
83 [-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
84 [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
85 [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
86 [-SS_NOT_SUPPORTED] = "Peer does not support protocol",
87 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
88 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
89 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
90};
91
92const char *drbd_conn_str(enum drbd_conns s)
93{
94 /* enums are unsigned... */
95 return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
96}
97
98const char *drbd_role_str(enum drbd_role s)
99{
100 return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
101}
102
103const char *drbd_disk_str(enum drbd_disk_state s)
104{
105 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
106}
107
108const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
109{
110 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
111 err > SS_TWO_PRIMARIES ? "TOO_LARGE"
112 : drbd_state_sw_errors[-err];
113}
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000000..fc824006e721
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
1/*
2-*- linux-c -*-
3 drbd_vli.h
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_VLI_H
26#define _DRBD_VLI_H
27
28/*
29 * At a granularity of 4KiB storage represented per bit,
30 * and storage sizes of several TiB,
31 * and possibly small-bandwidth replication,
32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text.
34 *
35 * We try to reduce the transferred bitmap information
36 * by encoding runlengths of bit polarity.
37 *
38 * We never actually need to encode a "zero" (runlengths are positive).
39 * But then we have to store the value of the first bit.
40 * The first bit of information thus shall encode if the first runlength
41 * gives the number of set or unset bits.
42 *
43 * We assume that large areas are either completely set or unset,
44 * which gives good compression with any runlength method,
45 * even when encoding the runlength as fixed size 32bit/64bit integers.
46 *
47 * Still, there may be areas where the polarity flips every few bits,
48 * and encoding the runlength sequence of those areas with fix size
49 * integers would be much worse than plaintext.
50 *
51 * We want to encode small runlength values with minimum code length,
52 * while still being able to encode a Huge run of all zeros.
53 *
54 * Thus we need a Variable Length Integer encoding, VLI.
55 *
56 * For some cases, we produce more code bits than plaintext input.
57 * We need to send incompressible chunks as plaintext, skip over them
58 * and then see if the next chunk compresses better.
59 *
60 * We don't care too much about "excellent" compression ratio for large
61 * runlengths (all set/all clear): whether we achieve a factor of 100
62 * or 1000 is not that much of an issue.
63 * We do not want to waste too much on short runlengths in the "noisy"
64 * parts of the bitmap, though.
65 *
66 * There are endless variants of VLI, we experimented with:
67 * * simple byte-based
68 * * various bit based with different code word length.
69 *
70 * To avoid yet an other configuration parameter (choice of bitmap compression
71 * algorithm) which was difficult to explain and tune, we just chose the one
72 * variant that turned out best in all test cases.
73 * Based on real world usage patterns, with device sizes ranging from a few GiB
74 * to several TiB, file server/mailserver/webserver/mysql/postgress,
75 * mostly idle to really busy, the all time winner (though sometimes only
76 * marginally better) is:
77 */
78
79/*
80 * encoding is "visualised" as
81 * __little endian__ bitstream, least significant bit first (left most)
82 *
83 * this particular encoding is chosen so that the prefix code
84 * starts as unary encoding the level, then modified so that
85 * 10 levels can be described in 8bit, with minimal overhead
86 * for the smaller levels.
87 *
88 * Number of data bits follow fibonacci sequence, with the exception of the
89 * last level (+1 data bit, so it makes 64bit total). The only worse code when
90 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
91prefix data bits max val Nº data bits
920 x 0x2 1
9310 x 0x4 1
94110 xx 0x8 2
951110 xxx 0x10 3
9611110 xxx xx 0x30 5
97111110 xx xxxxxx 0x130 8
9811111100 xxxxxxxx xxxxx 0x2130 13
9911111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
10011111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
10111111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
102 * maximum encodable value: 0x100000400202130 == 2**56 + some */
103
104/* compression "table":
105 transmitted x 0.29
106 as plaintext x ........................
107 x ........................
108 x ........................
109 x 0.59 0.21........................
110 x ........................................................
111 x .. c ...................................................
112 x 0.44.. o ...................................................
113 x .......... d ...................................................
114 x .......... e ...................................................
115 X............. ...................................................
116 x.............. b ...................................................
1172.0x............... i ...................................................
118 #X................ t ...................................................
119 #................. s ........................... plain bits ..........
120-+-----------------------------------------------------------------------
121 1 16 32 64
122*/
123
/* The code table: one
 *	LEVEL(total bits, prefix bits, prefix value)
 * per level, sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this.
 * LEVEL() has to be defined by whoever expands VLI_L_1_1().
 *
 * Data bits per level (total - prefix) are fibonacci: 1, 1, ... */
#define VLI_L_1_1() do {	\
	LEVEL( 2, 1, 0x00);	\
	LEVEL( 3, 2, 0x01);	\
	LEVEL( 5, 3, 0x03);	\
	LEVEL( 7, 4, 0x07);	\
	LEVEL(10, 5, 0x0f);	\
	LEVEL(14, 6, 0x1f);	\
	LEVEL(21, 8, 0x3f);	\
	LEVEL(29, 8, 0x7f);	\
	LEVEL(42, 8, 0xbf);	\
	LEVEL(64, 8, 0xff);	\
	} while (0)
141
142/* finds a suitable level to decode the least significant part of in.
143 * returns number of bits consumed.
144 *
145 * BUG() for bad input, as that would mean a buggy code table. */
146static inline int vli_decode_bits(u64 *out, const u64 in)
147{
148 u64 adj = 1;
149
150#define LEVEL(t,b,v) \
151 do { \
152 if ((in & ((1 << b) -1)) == v) { \
153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
154 return t; \
155 } \
156 adj += 1ULL << (t - b); \
157 } while (0)
158
159 VLI_L_1_1();
160
161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
162 BUG();
163#undef LEVEL
164}
165
166/* return number of code bits needed,
167 * or negative error number */
168static inline int __vli_encode_bits(u64 *out, const u64 in)
169{
170 u64 max = 0;
171 u64 adj = 1;
172
173 if (in == 0)
174 return -EINVAL;
175
176#define LEVEL(t,b,v) do { \
177 max += 1ULL << (t - b); \
178 if (in <= max) { \
179 if (out) \
180 *out = ((in - adj) << b) | v; \
181 return t; \
182 } \
183 adj = max + 1; \
184 } while (0)
185
186 VLI_L_1_1();
187
188 return -EOVERFLOW;
189#undef LEVEL
190}
191
192#undef VLI_L_1_1
193
194/* code from here down is independent of the actually used bit code */
195
196/*
197 * Code length is determined by some unique (e.g. unary) prefix.
198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
199 * not a byte stream.
200 */
201
202/* for the bitstream, we need a cursor */
203struct bitstream_cursor {
204 /* the current byte */
205 u8 *b;
206 /* the current bit within *b, nomalized: 0..7 */
207 unsigned int bit;
208};
209
210/* initialize cursor to point to first bit of stream */
211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
212{
213 cur->b = s;
214 cur->bit = 0;
215}
216
217/* advance cursor by that many bits; maximum expected input value: 64,
218 * but depending on VLI implementation, it may be more. */
219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
220{
221 bits += cur->bit;
222 cur->b = cur->b + (bits >> 3);
223 cur->bit = bits & 7;
224}
225
226/* the bitstream itself knows its length */
227struct bitstream {
228 struct bitstream_cursor cur;
229 unsigned char *buf;
230 size_t buf_len; /* in bytes */
231
232 /* for input stream:
233 * number of trailing 0 bits for padding
234 * total number of valid bits in stream: buf_len * 8 - pad_bits */
235 unsigned int pad_bits;
236};
237
238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
239{
240 bs->buf = s;
241 bs->buf_len = len;
242 bs->pad_bits = pad_bits;
243 bitstream_cursor_reset(&bs->cur, bs->buf);
244}
245
246static inline void bitstream_rewind(struct bitstream *bs)
247{
248 bitstream_cursor_reset(&bs->cur, bs->buf);
249 memset(bs->buf, 0, bs->buf_len);
250}
251
252/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
253 * Ignores "pad_bits".
254 * Returns zero if bits == 0 (nothing to do).
255 * Returns number of bits used if successful.
256 *
257 * If there is not enough room left in bitstream,
258 * leaves bitstream unchanged and returns -ENOBUFS.
259 */
260static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
261{
262 unsigned char *b = bs->cur.b;
263 unsigned int tmp;
264
265 if (bits == 0)
266 return 0;
267
268 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
269 return -ENOBUFS;
270
271 /* paranoia: strip off hi bits; they should not be set anyways. */
272 if (bits < 64)
273 val &= ~0ULL >> (64 - bits);
274
275 *b++ |= (val & 0xff) << bs->cur.bit;
276
277 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
278 *b++ |= (val >> tmp) & 0xff;
279
280 bitstream_cursor_advance(&bs->cur, bits);
281 return bits;
282}
283
284/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
285 *
286 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
287 *
288 * If there are less than the requested number of valid bits left in the
289 * bitstream, still fetches all available bits.
290 *
291 * Returns number of actually fetched bits.
292 */
293static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
294{
295 u64 val;
296 unsigned int n;
297
298 if (bits > 64)
299 return -EINVAL;
300
301 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
302 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
303 - bs->cur.bit - bs->pad_bits;
304
305 if (bits == 0) {
306 *out = 0;
307 return 0;
308 }
309
310 /* get the high bits */
311 val = 0;
312 n = (bs->cur.bit + bits + 7) >> 3;
313 /* n may be at most 9, if cur.bit + bits > 64 */
314 /* which means this copies at most 8 byte */
315 if (n) {
316 memcpy(&val, bs->cur.b+1, n - 1);
317 val = le64_to_cpu(val) << (8 - bs->cur.bit);
318 }
319
320 /* we still need the low bits */
321 val |= bs->cur.b[0] >> bs->cur.bit;
322
323 /* and mask out bits we don't want */
324 val &= ~0ULL >> (64 - bits);
325
326 bitstream_cursor_advance(&bs->cur, bits);
327 *out = val;
328
329 return bits;
330}
331
332/* encodes @in as vli into @bs;
333
334 * return values
335 * > 0: number of bits successfully stored in bitstream
336 * -ENOBUFS @bs is full
337 * -EINVAL input zero (invalid)
338 * -EOVERFLOW input too large for this vli code (invalid)
339 */
340static inline int vli_encode_bits(struct bitstream *bs, u64 in)
341{
342 u64 code = code;
343 int bits = __vli_encode_bits(&code, in);
344
345 if (bits <= 0)
346 return bits;
347
348 return bitstream_put_bits(bs, code, bits);
349}
350
351#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000000..ed8796f1112d
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1512 @@
1/*
2 drbd_worker.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/version.h>
28#include <linux/drbd.h>
29#include <linux/sched.h>
30#include <linux/smp_lock.h>
31#include <linux/wait.h>
32#include <linux/mm.h>
33#include <linux/memcontrol.h>
34#include <linux/mm_inline.h>
35#include <linux/slab.h>
36#include <linux/random.h>
37#include <linux/mm.h>
38#include <linux/string.h>
39#include <linux/scatterlist.h>
40
41#include "drbd_int.h"
42#include "drbd_req.h"
43
44#define SLEEP_TIME (HZ/10)
45
46static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
47
48
49
50/* defined here:
51 drbd_md_io_complete
52 drbd_endio_write_sec
53 drbd_endio_read_sec
54 drbd_endio_pri
55
56 * more endio handlers:
57 atodb_endio in drbd_actlog.c
58 drbd_bm_async_io_complete in drbd_bitmap.c
59
60 * For all these callbacks, note the following:
61 * The callbacks will be called in irq context by the IDE drivers,
62 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
63 * Try to get the locking right :)
64 *
65 */
66
67
68/* About the global_state_lock
69 Each state transition on a device holds a read lock. In case we have
70 to evaluate the sync after dependencies, we grab a write lock, because
71 we need stable states on all devices for that. */
72rwlock_t global_state_lock;
73
74/* used for synchronous meta data and bitmap IO
75 * submitted by drbd_md_sync_page_io()
76 */
77void drbd_md_io_complete(struct bio *bio, int error)
78{
79 struct drbd_md_io *md_io;
80
81 md_io = (struct drbd_md_io *)bio->bi_private;
82 md_io->error = error;
83
84 complete(&md_io->event);
85}
86
87/* reads on behalf of the partner,
88 * "submitted" by the receiver
89 */
90void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
91{
92 unsigned long flags = 0;
93 struct drbd_epoch_entry *e = NULL;
94 struct drbd_conf *mdev;
95 int uptodate = bio_flagged(bio, BIO_UPTODATE);
96
97 e = bio->bi_private;
98 mdev = e->mdev;
99
100 if (error)
101 dev_warn(DEV, "read: error=%d s=%llus\n", error,
102 (unsigned long long)e->sector);
103 if (!error && !uptodate) {
104 dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
105 (unsigned long long)e->sector);
106 /* strange behavior of some lower level drivers...
107 * fail the request by clearing the uptodate flag,
108 * but do not return any error?! */
109 error = -EIO;
110 }
111
112 D_ASSERT(e->block_id != ID_VACANT);
113
114 spin_lock_irqsave(&mdev->req_lock, flags);
115 mdev->read_cnt += e->size >> 9;
116 list_del(&e->w.list);
117 if (list_empty(&mdev->read_ee))
118 wake_up(&mdev->ee_wait);
119 spin_unlock_irqrestore(&mdev->req_lock, flags);
120
121 drbd_chk_io_error(mdev, error, FALSE);
122 drbd_queue_work(&mdev->data.work, &e->w);
123 put_ldev(mdev);
124}
125
126/* writes on behalf of the partner, or resync writes,
127 * "submitted" by the receiver.
128 */
129void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
130{
131 unsigned long flags = 0;
132 struct drbd_epoch_entry *e = NULL;
133 struct drbd_conf *mdev;
134 sector_t e_sector;
135 int do_wake;
136 int is_syncer_req;
137 int do_al_complete_io;
138 int uptodate = bio_flagged(bio, BIO_UPTODATE);
139 int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
140
141 e = bio->bi_private;
142 mdev = e->mdev;
143
144 if (error)
145 dev_warn(DEV, "write: error=%d s=%llus\n", error,
146 (unsigned long long)e->sector);
147 if (!error && !uptodate) {
148 dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
149 (unsigned long long)e->sector);
150 /* strange behavior of some lower level drivers...
151 * fail the request by clearing the uptodate flag,
152 * but do not return any error?! */
153 error = -EIO;
154 }
155
156 /* error == -ENOTSUPP would be a better test,
157 * alas it is not reliable */
158 if (error && is_barrier && e->flags & EE_IS_BARRIER) {
159 drbd_bump_write_ordering(mdev, WO_bdev_flush);
160 spin_lock_irqsave(&mdev->req_lock, flags);
161 list_del(&e->w.list);
162 e->w.cb = w_e_reissue;
163 /* put_ldev actually happens below, once we come here again. */
164 __release(local);
165 spin_unlock_irqrestore(&mdev->req_lock, flags);
166 drbd_queue_work(&mdev->data.work, &e->w);
167 return;
168 }
169
170 D_ASSERT(e->block_id != ID_VACANT);
171
172 spin_lock_irqsave(&mdev->req_lock, flags);
173 mdev->writ_cnt += e->size >> 9;
174 is_syncer_req = is_syncer_block_id(e->block_id);
175
176 /* after we moved e to done_ee,
177 * we may no longer access it,
178 * it may be freed/reused already!
179 * (as soon as we release the req_lock) */
180 e_sector = e->sector;
181 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
182
183 list_del(&e->w.list); /* has been on active_ee or sync_ee */
184 list_add_tail(&e->w.list, &mdev->done_ee);
185
186 /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
187 * neither did we wake possibly waiting conflicting requests.
188 * done from "drbd_process_done_ee" within the appropriate w.cb
189 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
190
191 do_wake = is_syncer_req
192 ? list_empty(&mdev->sync_ee)
193 : list_empty(&mdev->active_ee);
194
195 if (error)
196 __drbd_chk_io_error(mdev, FALSE);
197 spin_unlock_irqrestore(&mdev->req_lock, flags);
198
199 if (is_syncer_req)
200 drbd_rs_complete_io(mdev, e_sector);
201
202 if (do_wake)
203 wake_up(&mdev->ee_wait);
204
205 if (do_al_complete_io)
206 drbd_al_complete_io(mdev, e_sector);
207
208 wake_asender(mdev);
209 put_ldev(mdev);
210
211}
212
213/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
214 */
215void drbd_endio_pri(struct bio *bio, int error)
216{
217 unsigned long flags;
218 struct drbd_request *req = bio->bi_private;
219 struct drbd_conf *mdev = req->mdev;
220 struct bio_and_error m;
221 enum drbd_req_event what;
222 int uptodate = bio_flagged(bio, BIO_UPTODATE);
223
224 if (error)
225 dev_warn(DEV, "p %s: error=%d\n",
226 bio_data_dir(bio) == WRITE ? "write" : "read", error);
227 if (!error && !uptodate) {
228 dev_warn(DEV, "p %s: setting error to -EIO\n",
229 bio_data_dir(bio) == WRITE ? "write" : "read");
230 /* strange behavior of some lower level drivers...
231 * fail the request by clearing the uptodate flag,
232 * but do not return any error?! */
233 error = -EIO;
234 }
235
236 /* to avoid recursion in __req_mod */
237 if (unlikely(error)) {
238 what = (bio_data_dir(bio) == WRITE)
239 ? write_completed_with_error
240 : (bio_rw(bio) == READA)
241 ? read_completed_with_error
242 : read_ahead_completed_with_error;
243 } else
244 what = completed_ok;
245
246 bio_put(req->private_bio);
247 req->private_bio = ERR_PTR(error);
248
249 spin_lock_irqsave(&mdev->req_lock, flags);
250 __req_mod(req, what, &m);
251 spin_unlock_irqrestore(&mdev->req_lock, flags);
252
253 if (m.bio)
254 complete_master_bio(mdev, &m);
255}
256
257int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
258{
259 struct drbd_request *req = container_of(w, struct drbd_request, w);
260
261 /* NOTE: mdev->ldev can be NULL by the time we get here! */
262 /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
263
264 /* the only way this callback is scheduled is from _req_may_be_done,
265 * when it is done and had a local write error, see comments there */
266 drbd_req_free(req);
267
268 return TRUE;
269}
270
271int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
272{
273 struct drbd_request *req = container_of(w, struct drbd_request, w);
274
275 /* We should not detach for read io-error,
276 * but try to WRITE the P_DATA_REPLY to the failed location,
277 * to give the disk the chance to relocate that block */
278
279 spin_lock_irq(&mdev->req_lock);
280 if (cancel ||
281 mdev->state.conn < C_CONNECTED ||
282 mdev->state.pdsk <= D_INCONSISTENT) {
283 _req_mod(req, send_canceled);
284 spin_unlock_irq(&mdev->req_lock);
285 dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
286 return 1;
287 }
288 spin_unlock_irq(&mdev->req_lock);
289
290 return w_send_read_req(mdev, w, 0);
291}
292
293int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
294{
295 ERR_IF(cancel) return 1;
296 dev_err(DEV, "resync inactive, but callback triggered??\n");
297 return 1; /* Simply ignore this! */
298}
299
300void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
301{
302 struct hash_desc desc;
303 struct scatterlist sg;
304 struct bio_vec *bvec;
305 int i;
306
307 desc.tfm = tfm;
308 desc.flags = 0;
309
310 sg_init_table(&sg, 1);
311 crypto_hash_init(&desc);
312
313 __bio_for_each_segment(bvec, bio, i, 0) {
314 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
315 crypto_hash_update(&desc, &sg, sg.length);
316 }
317 crypto_hash_final(&desc, digest);
318}
319
320static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
321{
322 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
323 int digest_size;
324 void *digest;
325 int ok;
326
327 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
328
329 if (unlikely(cancel)) {
330 drbd_free_ee(mdev, e);
331 return 1;
332 }
333
334 if (likely(drbd_bio_uptodate(e->private_bio))) {
335 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
336 digest = kmalloc(digest_size, GFP_NOIO);
337 if (digest) {
338 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
339
340 inc_rs_pending(mdev);
341 ok = drbd_send_drequest_csum(mdev,
342 e->sector,
343 e->size,
344 digest,
345 digest_size,
346 P_CSUM_RS_REQUEST);
347 kfree(digest);
348 } else {
349 dev_err(DEV, "kmalloc() of digest failed.\n");
350 ok = 0;
351 }
352 } else
353 ok = 1;
354
355 drbd_free_ee(mdev, e);
356
357 if (unlikely(!ok))
358 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
359 return ok;
360}
361
362#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
363
364static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
365{
366 struct drbd_epoch_entry *e;
367
368 if (!get_ldev(mdev))
369 return 0;
370
371 /* GFP_TRY, because if there is no memory available right now, this may
372 * be rescheduled for later. It is "only" background resync, after all. */
373 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
374 if (!e) {
375 put_ldev(mdev);
376 return 2;
377 }
378
379 spin_lock_irq(&mdev->req_lock);
380 list_add(&e->w.list, &mdev->read_ee);
381 spin_unlock_irq(&mdev->req_lock);
382
383 e->private_bio->bi_end_io = drbd_endio_read_sec;
384 e->private_bio->bi_rw = READ;
385 e->w.cb = w_e_send_csum;
386
387 mdev->read_cnt += size >> 9;
388 drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
389
390 return 1;
391}
392
393void resync_timer_fn(unsigned long data)
394{
395 unsigned long flags;
396 struct drbd_conf *mdev = (struct drbd_conf *) data;
397 int queue;
398
399 spin_lock_irqsave(&mdev->req_lock, flags);
400
401 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
402 queue = 1;
403 if (mdev->state.conn == C_VERIFY_S)
404 mdev->resync_work.cb = w_make_ov_request;
405 else
406 mdev->resync_work.cb = w_make_resync_request;
407 } else {
408 queue = 0;
409 mdev->resync_work.cb = w_resync_inactive;
410 }
411
412 spin_unlock_irqrestore(&mdev->req_lock, flags);
413
414 /* harmless race: list_empty outside data.work.q_lock */
415 if (list_empty(&mdev->resync_work.list) && queue)
416 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
417}
418
419int w_make_resync_request(struct drbd_conf *mdev,
420 struct drbd_work *w, int cancel)
421{
422 unsigned long bit;
423 sector_t sector;
424 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
425 int max_segment_size = queue_max_segment_size(mdev->rq_queue);
426 int number, i, size, pe, mx;
427 int align, queued, sndbuf;
428
429 if (unlikely(cancel))
430 return 1;
431
432 if (unlikely(mdev->state.conn < C_CONNECTED)) {
433 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
434 return 0;
435 }
436
437 if (mdev->state.conn != C_SYNC_TARGET)
438 dev_err(DEV, "%s in w_make_resync_request\n",
439 drbd_conn_str(mdev->state.conn));
440
441 if (!get_ldev(mdev)) {
442 /* Since we only need to access mdev->rsync a
443 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
444 to continue resync with a broken disk makes no sense at
445 all */
446 dev_err(DEV, "Disk broke down during resync!\n");
447 mdev->resync_work.cb = w_resync_inactive;
448 return 1;
449 }
450
451 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
452 pe = atomic_read(&mdev->rs_pending_cnt);
453
454 mutex_lock(&mdev->data.mutex);
455 if (mdev->data.socket)
456 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
457 else
458 mx = 1;
459 mutex_unlock(&mdev->data.mutex);
460
461 /* For resync rates >160MB/sec, allow more pending RS requests */
462 if (number > mx)
463 mx = number;
464
465 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
466 if ((pe + number) > mx) {
467 number = mx - pe;
468 }
469
470 for (i = 0; i < number; i++) {
471 /* Stop generating RS requests, when half of the send buffer is filled */
472 mutex_lock(&mdev->data.mutex);
473 if (mdev->data.socket) {
474 queued = mdev->data.socket->sk->sk_wmem_queued;
475 sndbuf = mdev->data.socket->sk->sk_sndbuf;
476 } else {
477 queued = 1;
478 sndbuf = 0;
479 }
480 mutex_unlock(&mdev->data.mutex);
481 if (queued > sndbuf / 2)
482 goto requeue;
483
484next_sector:
485 size = BM_BLOCK_SIZE;
486 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
487
488 if (bit == -1UL) {
489 mdev->bm_resync_fo = drbd_bm_bits(mdev);
490 mdev->resync_work.cb = w_resync_inactive;
491 put_ldev(mdev);
492 return 1;
493 }
494
495 sector = BM_BIT_TO_SECT(bit);
496
497 if (drbd_try_rs_begin_io(mdev, sector)) {
498 mdev->bm_resync_fo = bit;
499 goto requeue;
500 }
501 mdev->bm_resync_fo = bit + 1;
502
503 if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
504 drbd_rs_complete_io(mdev, sector);
505 goto next_sector;
506 }
507
508#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
509 /* try to find some adjacent bits.
510 * we stop if we have already the maximum req size.
511 *
512 * Additionally always align bigger requests, in order to
513 * be prepared for all stripe sizes of software RAIDs.
514 *
515 * we _do_ care about the agreed-upon q->max_segment_size
516 * here, as splitting up the requests on the other side is more
517 * difficult. the consequence is, that on lvm and md and other
518 * "indirect" devices, this is dead code, since
519 * q->max_segment_size will be PAGE_SIZE.
520 */
521 align = 1;
522 for (;;) {
523 if (size + BM_BLOCK_SIZE > max_segment_size)
524 break;
525
526 /* Be always aligned */
527 if (sector & ((1<<(align+3))-1))
528 break;
529
530 /* do not cross extent boundaries */
531 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
532 break;
533 /* now, is it actually dirty, after all?
534 * caution, drbd_bm_test_bit is tri-state for some
535 * obscure reason; ( b == 0 ) would get the out-of-band
536 * only accidentally right because of the "oddly sized"
537 * adjustment below */
538 if (drbd_bm_test_bit(mdev, bit+1) != 1)
539 break;
540 bit++;
541 size += BM_BLOCK_SIZE;
542 if ((BM_BLOCK_SIZE << align) <= size)
543 align++;
544 i++;
545 }
546 /* if we merged some,
547 * reset the offset to start the next drbd_bm_find_next from */
548 if (size > BM_BLOCK_SIZE)
549 mdev->bm_resync_fo = bit + 1;
550#endif
551
552 /* adjust very last sectors, in case we are oddly sized */
553 if (sector + (size>>9) > capacity)
554 size = (capacity-sector)<<9;
555 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
556 switch (read_for_csum(mdev, sector, size)) {
557 case 0: /* Disk failure*/
558 put_ldev(mdev);
559 return 0;
560 case 2: /* Allocation failed */
561 drbd_rs_complete_io(mdev, sector);
562 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
563 goto requeue;
564 /* case 1: everything ok */
565 }
566 } else {
567 inc_rs_pending(mdev);
568 if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
569 sector, size, ID_SYNCER)) {
570 dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
571 dec_rs_pending(mdev);
572 put_ldev(mdev);
573 return 0;
574 }
575 }
576 }
577
578 if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
579 /* last syncer _request_ was sent,
580 * but the P_RS_DATA_REPLY not yet received. sync will end (and
581 * next sync group will resume), as soon as we receive the last
582 * resync data block, and the last bit is cleared.
583 * until then resync "work" is "inactive" ...
584 */
585 mdev->resync_work.cb = w_resync_inactive;
586 put_ldev(mdev);
587 return 1;
588 }
589
590 requeue:
591 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
592 put_ldev(mdev);
593 return 1;
594}
595
596static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
597{
598 int number, i, size;
599 sector_t sector;
600 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
601
602 if (unlikely(cancel))
603 return 1;
604
605 if (unlikely(mdev->state.conn < C_CONNECTED)) {
606 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
607 return 0;
608 }
609
610 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
611 if (atomic_read(&mdev->rs_pending_cnt) > number)
612 goto requeue;
613
614 number -= atomic_read(&mdev->rs_pending_cnt);
615
616 sector = mdev->ov_position;
617 for (i = 0; i < number; i++) {
618 if (sector >= capacity) {
619 mdev->resync_work.cb = w_resync_inactive;
620 return 1;
621 }
622
623 size = BM_BLOCK_SIZE;
624
625 if (drbd_try_rs_begin_io(mdev, sector)) {
626 mdev->ov_position = sector;
627 goto requeue;
628 }
629
630 if (sector + (size>>9) > capacity)
631 size = (capacity-sector)<<9;
632
633 inc_rs_pending(mdev);
634 if (!drbd_send_ov_request(mdev, sector, size)) {
635 dec_rs_pending(mdev);
636 return 0;
637 }
638 sector += BM_SECT_PER_BIT;
639 }
640 mdev->ov_position = sector;
641
642 requeue:
643 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
644 return 1;
645}
646
647
/* Worker callback run when online verify is done: frees the work item
 * @w, calls ov_oos_print() and then drbd_resync_finished().
 * Always returns 1; @cancel is not looked at.
 */
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work item is freed here, by its own callback */
	kfree(w);

	ov_oos_print(mdev);
	drbd_resync_finished(mdev);

	return 1;
}
656
/* Worker callback queued by drbd_resync_finished() to retry itself:
 * frees the kmalloc()ed work item @w and calls drbd_resync_finished()
 * again.  Always returns 1; @cancel is not looked at.
 */
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* allocated in drbd_resync_finished(), released by its own callback */
	kfree(w);

	drbd_resync_finished(mdev);

	return 1;
}
665
666int drbd_resync_finished(struct drbd_conf *mdev)
667{
668 unsigned long db, dt, dbdt;
669 unsigned long n_oos;
670 union drbd_state os, ns;
671 struct drbd_work *w;
672 char *khelper_cmd = NULL;
673
674 /* Remove all elements from the resync LRU. Since future actions
675 * might set bits in the (main) bitmap, then the entries in the
676 * resync LRU would be wrong. */
677 if (drbd_rs_del_all(mdev)) {
678 /* In case this is not possible now, most probably because
679 * there are P_RS_DATA_REPLY Packets lingering on the worker's
680 * queue (or even the read operations for those packets
681 * is not finished by now). Retry in 100ms. */
682
683 drbd_kick_lo(mdev);
684 __set_current_state(TASK_INTERRUPTIBLE);
685 schedule_timeout(HZ / 10);
686 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
687 if (w) {
688 w->cb = w_resync_finished;
689 drbd_queue_work(&mdev->data.work, w);
690 return 1;
691 }
692 dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
693 }
694
695 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
696 if (dt <= 0)
697 dt = 1;
698 db = mdev->rs_total;
699 dbdt = Bit2KB(db/dt);
700 mdev->rs_paused /= HZ;
701
702 if (!get_ldev(mdev))
703 goto out;
704
705 spin_lock_irq(&mdev->req_lock);
706 os = mdev->state;
707
708 /* This protects us against multiple calls (that can happen in the presence
709 of application IO), and against connectivity loss just before we arrive here. */
710 if (os.conn <= C_CONNECTED)
711 goto out_unlock;
712
713 ns = os;
714 ns.conn = C_CONNECTED;
715
716 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
717 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
718 "Online verify " : "Resync",
719 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
720
721 n_oos = drbd_bm_total_weight(mdev);
722
723 if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
724 if (n_oos) {
725 dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
726 n_oos, Bit2KB(1));
727 khelper_cmd = "out-of-sync";
728 }
729 } else {
730 D_ASSERT((n_oos - mdev->rs_failed) == 0);
731
732 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
733 khelper_cmd = "after-resync-target";
734
735 if (mdev->csums_tfm && mdev->rs_total) {
736 const unsigned long s = mdev->rs_same_csum;
737 const unsigned long t = mdev->rs_total;
738 const int ratio =
739 (t == 0) ? 0 :
740 (t < 100000) ? ((s*100)/t) : (s/(t/100));
741 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
742 "transferred %luK total %luK\n",
743 ratio,
744 Bit2KB(mdev->rs_same_csum),
745 Bit2KB(mdev->rs_total - mdev->rs_same_csum),
746 Bit2KB(mdev->rs_total));
747 }
748 }
749
750 if (mdev->rs_failed) {
751 dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
752
753 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
754 ns.disk = D_INCONSISTENT;
755 ns.pdsk = D_UP_TO_DATE;
756 } else {
757 ns.disk = D_UP_TO_DATE;
758 ns.pdsk = D_INCONSISTENT;
759 }
760 } else {
761 ns.disk = D_UP_TO_DATE;
762 ns.pdsk = D_UP_TO_DATE;
763
764 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
765 if (mdev->p_uuid) {
766 int i;
767 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
768 _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
769 drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
770 _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
771 } else {
772 dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
773 }
774 }
775
776 drbd_uuid_set_bm(mdev, 0UL);
777
778 if (mdev->p_uuid) {
779 /* Now the two UUID sets are equal, update what we
780 * know of the peer. */
781 int i;
782 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
783 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
784 }
785 }
786
787 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
788out_unlock:
789 spin_unlock_irq(&mdev->req_lock);
790 put_ldev(mdev);
791out:
792 mdev->rs_total = 0;
793 mdev->rs_failed = 0;
794 mdev->rs_paused = 0;
795 mdev->ov_start_sector = 0;
796
797 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
798 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
799 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
800 }
801
802 if (khelper_cmd)
803 drbd_khelper(mdev, khelper_cmd);
804
805 return 1;
806}
807
808/* helper */
809static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
810{
811 if (drbd_bio_has_active_page(e->private_bio)) {
812 /* This might happen if sendpage() has not finished */
813 spin_lock_irq(&mdev->req_lock);
814 list_add_tail(&e->w.list, &mdev->net_ee);
815 spin_unlock_irq(&mdev->req_lock);
816 } else
817 drbd_free_ee(mdev, e);
818}
819
820/**
821 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
822 * @mdev: DRBD device.
823 * @w: work object.
824 * @cancel: The connection will be closed anyways
825 */
826int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
827{
828 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
829 int ok;
830
831 if (unlikely(cancel)) {
832 drbd_free_ee(mdev, e);
833 dec_unacked(mdev);
834 return 1;
835 }
836
837 if (likely(drbd_bio_uptodate(e->private_bio))) {
838 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
839 } else {
840 if (__ratelimit(&drbd_ratelimit_state))
841 dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
842 (unsigned long long)e->sector);
843
844 ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
845 }
846
847 dec_unacked(mdev);
848
849 move_to_net_ee_or_free(mdev, e);
850
851 if (unlikely(!ok))
852 dev_err(DEV, "drbd_send_block() failed\n");
853 return ok;
854}
855
856/**
857 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
858 * @mdev: DRBD device.
859 * @w: work object.
860 * @cancel: The connection will be closed anyways
861 */
862int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
863{
864 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
865 int ok;
866
867 if (unlikely(cancel)) {
868 drbd_free_ee(mdev, e);
869 dec_unacked(mdev);
870 return 1;
871 }
872
873 if (get_ldev_if_state(mdev, D_FAILED)) {
874 drbd_rs_complete_io(mdev, e->sector);
875 put_ldev(mdev);
876 }
877
878 if (likely(drbd_bio_uptodate(e->private_bio))) {
879 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
880 inc_rs_pending(mdev);
881 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
882 } else {
883 if (__ratelimit(&drbd_ratelimit_state))
884 dev_err(DEV, "Not sending RSDataReply, "
885 "partner DISKLESS!\n");
886 ok = 1;
887 }
888 } else {
889 if (__ratelimit(&drbd_ratelimit_state))
890 dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
891 (unsigned long long)e->sector);
892
893 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
894
895 /* update resync data with failure */
896 drbd_rs_failed_io(mdev, e->sector, e->size);
897 }
898
899 dec_unacked(mdev);
900
901 move_to_net_ee_or_free(mdev, e);
902
903 if (unlikely(!ok))
904 dev_err(DEV, "drbd_send_block() failed\n");
905 return ok;
906}
907
908int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
909{
910 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
911 struct digest_info *di;
912 int digest_size;
913 void *digest = NULL;
914 int ok, eq = 0;
915
916 if (unlikely(cancel)) {
917 drbd_free_ee(mdev, e);
918 dec_unacked(mdev);
919 return 1;
920 }
921
922 drbd_rs_complete_io(mdev, e->sector);
923
924 di = (struct digest_info *)(unsigned long)e->block_id;
925
926 if (likely(drbd_bio_uptodate(e->private_bio))) {
927 /* quick hack to try to avoid a race against reconfiguration.
928 * a real fix would be much more involved,
929 * introducing more locking mechanisms */
930 if (mdev->csums_tfm) {
931 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
932 D_ASSERT(digest_size == di->digest_size);
933 digest = kmalloc(digest_size, GFP_NOIO);
934 }
935 if (digest) {
936 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
937 eq = !memcmp(digest, di->digest, digest_size);
938 kfree(digest);
939 }
940
941 if (eq) {
942 drbd_set_in_sync(mdev, e->sector, e->size);
943 mdev->rs_same_csum++;
944 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
945 } else {
946 inc_rs_pending(mdev);
947 e->block_id = ID_SYNCER;
948 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
949 }
950 } else {
951 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
952 if (__ratelimit(&drbd_ratelimit_state))
953 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
954 }
955
956 dec_unacked(mdev);
957
958 kfree(di);
959
960 move_to_net_ee_or_free(mdev, e);
961
962 if (unlikely(!ok))
963 dev_err(DEV, "drbd_send_block/ack() failed\n");
964 return ok;
965}
966
967int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
968{
969 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
970 int digest_size;
971 void *digest;
972 int ok = 1;
973
974 if (unlikely(cancel))
975 goto out;
976
977 if (unlikely(!drbd_bio_uptodate(e->private_bio)))
978 goto out;
979
980 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
981 /* FIXME if this allocation fails, online verify will not terminate! */
982 digest = kmalloc(digest_size, GFP_NOIO);
983 if (digest) {
984 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
985 inc_rs_pending(mdev);
986 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
987 digest, digest_size, P_OV_REPLY);
988 if (!ok)
989 dec_rs_pending(mdev);
990 kfree(digest);
991 }
992
993out:
994 drbd_free_ee(mdev, e);
995
996 dec_unacked(mdev);
997
998 return ok;
999}
1000
1001void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1002{
1003 if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
1004 mdev->ov_last_oos_size += size>>9;
1005 } else {
1006 mdev->ov_last_oos_start = sector;
1007 mdev->ov_last_oos_size = size>>9;
1008 }
1009 drbd_set_out_of_sync(mdev, sector, size);
1010 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1011}
1012
1013int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1014{
1015 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1016 struct digest_info *di;
1017 int digest_size;
1018 void *digest;
1019 int ok, eq = 0;
1020
1021 if (unlikely(cancel)) {
1022 drbd_free_ee(mdev, e);
1023 dec_unacked(mdev);
1024 return 1;
1025 }
1026
1027 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1028 * the resync lru has been cleaned up already */
1029 drbd_rs_complete_io(mdev, e->sector);
1030
1031 di = (struct digest_info *)(unsigned long)e->block_id;
1032
1033 if (likely(drbd_bio_uptodate(e->private_bio))) {
1034 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1035 digest = kmalloc(digest_size, GFP_NOIO);
1036 if (digest) {
1037 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
1038
1039 D_ASSERT(digest_size == di->digest_size);
1040 eq = !memcmp(digest, di->digest, digest_size);
1041 kfree(digest);
1042 }
1043 } else {
1044 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1045 if (__ratelimit(&drbd_ratelimit_state))
1046 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1047 }
1048
1049 dec_unacked(mdev);
1050
1051 kfree(di);
1052
1053 if (!eq)
1054 drbd_ov_oos_found(mdev, e->sector, e->size);
1055 else
1056 ov_oos_print(mdev);
1057
1058 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
1059 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1060
1061 drbd_free_ee(mdev, e);
1062
1063 if (--mdev->ov_left == 0) {
1064 ov_oos_print(mdev);
1065 drbd_resync_finished(mdev);
1066 }
1067
1068 return ok;
1069}
1070
1071int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1072{
1073 struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1074 complete(&b->done);
1075 return 1;
1076}
1077
1078int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1079{
1080 struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
1081 struct p_barrier *p = &mdev->data.sbuf.barrier;
1082 int ok = 1;
1083
1084 /* really avoid racing with tl_clear. w.cb may have been referenced
1085 * just before it was reassigned and re-queued, so double check that.
1086 * actually, this race was harmless, since we only try to send the
1087 * barrier packet here, and otherwise do nothing with the object.
1088 * but compare with the head of w_clear_epoch */
1089 spin_lock_irq(&mdev->req_lock);
1090 if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
1091 cancel = 1;
1092 spin_unlock_irq(&mdev->req_lock);
1093 if (cancel)
1094 return 1;
1095
1096 if (!drbd_get_data_sock(mdev))
1097 return 0;
1098 p->barrier = b->br_number;
1099 /* inc_ap_pending was done where this was queued.
1100 * dec_ap_pending will be done in got_BarrierAck
1101 * or (on connection loss) in w_clear_epoch. */
1102 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1103 (struct p_header *)p, sizeof(*p), 0);
1104 drbd_put_data_sock(mdev);
1105
1106 return ok;
1107}
1108
1109int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1110{
1111 if (cancel)
1112 return 1;
1113 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1114}
1115
1116/**
1117 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1118 * @mdev: DRBD device.
1119 * @w: work object.
1120 * @cancel: The connection will be closed anyways
1121 */
1122int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1123{
1124 struct drbd_request *req = container_of(w, struct drbd_request, w);
1125 int ok;
1126
1127 if (unlikely(cancel)) {
1128 req_mod(req, send_canceled);
1129 return 1;
1130 }
1131
1132 ok = drbd_send_dblock(mdev, req);
1133 req_mod(req, ok ? handed_over_to_network : send_failed);
1134
1135 return ok;
1136}
1137
1138/**
1139 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1140 * @mdev: DRBD device.
1141 * @w: work object.
1142 * @cancel: The connection will be closed anyways
1143 */
1144int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1145{
1146 struct drbd_request *req = container_of(w, struct drbd_request, w);
1147 int ok;
1148
1149 if (unlikely(cancel)) {
1150 req_mod(req, send_canceled);
1151 return 1;
1152 }
1153
1154 ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
1155 (unsigned long)req);
1156
1157 if (!ok) {
1158 /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
1159 * so this is probably redundant */
1160 if (mdev->state.conn >= C_CONNECTED)
1161 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
1162 }
1163 req_mod(req, ok ? handed_over_to_network : send_failed);
1164
1165 return ok;
1166}
1167
1168static int _drbd_may_sync_now(struct drbd_conf *mdev)
1169{
1170 struct drbd_conf *odev = mdev;
1171
1172 while (1) {
1173 if (odev->sync_conf.after == -1)
1174 return 1;
1175 odev = minor_to_mdev(odev->sync_conf.after);
1176 ERR_IF(!odev) return 1;
1177 if ((odev->state.conn >= C_SYNC_SOURCE &&
1178 odev->state.conn <= C_PAUSED_SYNC_T) ||
1179 odev->state.aftr_isp || odev->state.peer_isp ||
1180 odev->state.user_isp)
1181 return 0;
1182 }
1183}
1184
1185/**
1186 * _drbd_pause_after() - Pause resync on all devices that may not resync now
1187 * @mdev: DRBD device.
1188 *
1189 * Called from process context only (admin command and after_state_ch).
1190 */
1191static int _drbd_pause_after(struct drbd_conf *mdev)
1192{
1193 struct drbd_conf *odev;
1194 int i, rv = 0;
1195
1196 for (i = 0; i < minor_count; i++) {
1197 odev = minor_to_mdev(i);
1198 if (!odev)
1199 continue;
1200 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1201 continue;
1202 if (!_drbd_may_sync_now(odev))
1203 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1204 != SS_NOTHING_TO_DO);
1205 }
1206
1207 return rv;
1208}
1209
1210/**
1211 * _drbd_resume_next() - Resume resync on all devices that may resync now
1212 * @mdev: DRBD device.
1213 *
1214 * Called from process context only (admin command and worker).
1215 */
1216static int _drbd_resume_next(struct drbd_conf *mdev)
1217{
1218 struct drbd_conf *odev;
1219 int i, rv = 0;
1220
1221 for (i = 0; i < minor_count; i++) {
1222 odev = minor_to_mdev(i);
1223 if (!odev)
1224 continue;
1225 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1226 continue;
1227 if (odev->state.aftr_isp) {
1228 if (_drbd_may_sync_now(odev))
1229 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1230 CS_HARD, NULL)
1231 != SS_NOTHING_TO_DO) ;
1232 }
1233 }
1234 return rv;
1235}
1236
1237void resume_next_sg(struct drbd_conf *mdev)
1238{
1239 write_lock_irq(&global_state_lock);
1240 _drbd_resume_next(mdev);
1241 write_unlock_irq(&global_state_lock);
1242}
1243
1244void suspend_other_sg(struct drbd_conf *mdev)
1245{
1246 write_lock_irq(&global_state_lock);
1247 _drbd_pause_after(mdev);
1248 write_unlock_irq(&global_state_lock);
1249}
1250
1251static int sync_after_error(struct drbd_conf *mdev, int o_minor)
1252{
1253 struct drbd_conf *odev;
1254
1255 if (o_minor == -1)
1256 return NO_ERROR;
1257 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
1258 return ERR_SYNC_AFTER;
1259
1260 /* check for loops */
1261 odev = minor_to_mdev(o_minor);
1262 while (1) {
1263 if (odev == mdev)
1264 return ERR_SYNC_AFTER_CYCLE;
1265
1266 /* dependency chain ends here, no cycles. */
1267 if (odev->sync_conf.after == -1)
1268 return NO_ERROR;
1269
1270 /* follow the dependency chain */
1271 odev = minor_to_mdev(odev->sync_conf.after);
1272 }
1273}
1274
1275int drbd_alter_sa(struct drbd_conf *mdev, int na)
1276{
1277 int changes;
1278 int retcode;
1279
1280 write_lock_irq(&global_state_lock);
1281 retcode = sync_after_error(mdev, na);
1282 if (retcode == NO_ERROR) {
1283 mdev->sync_conf.after = na;
1284 do {
1285 changes = _drbd_pause_after(mdev);
1286 changes |= _drbd_resume_next(mdev);
1287 } while (changes);
1288 }
1289 write_unlock_irq(&global_state_lock);
1290 return retcode;
1291}
1292
1293/**
1294 * drbd_start_resync() - Start the resync process
1295 * @mdev: DRBD device.
1296 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1297 *
1298 * This function might bring you directly into one of the
1299 * C_PAUSED_SYNC_* states.
1300 */
1301void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1302{
1303 union drbd_state ns;
1304 int r;
1305
/* Refuse to start when the connection state is already C_SYNC_SOURCE or beyond. */
1306 if (mdev->state.conn >= C_SYNC_SOURCE) {
1307 dev_err(DEV, "Resync already running!\n");
1308 return;
1309 }
1310
1311 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1312 drbd_rs_cancel_all(mdev);
1313
1314 if (side == C_SYNC_TARGET) {
1315 /* Since application IO was locked out during C_WF_BITMAP_T and
1316 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1317 we check that we might make the data inconsistent. */
1318 r = drbd_khelper(mdev, "before-resync-target");
/* Keep only bits 8..15 of the helper result; presumably the handler's
 * exit code -- TODO confirm against drbd_khelper(). Any non-zero value
 * aborts the resync and forces C_DISCONNECTING. */
1319 r = (r >> 8) & 0xff;
1320 if (r > 0) {
1321 dev_info(DEV, "before-resync-target handler returned %d, "
1322 "dropping connection.\n", r);
1323 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1324 return;
1325 }
1326 }
1327
1328 drbd_state_lock(mdev);
1329
/* Balanced by put_ldev() further down; presumably takes a reference on the
 * local disk provided it is at least D_NEGOTIATING -- confirm in
 * get_ldev_if_state(). On failure only the state lock has to be dropped. */
1330 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1331 drbd_state_unlock(mdev);
1332 return;
1333 }
1334
1335 if (side == C_SYNC_TARGET) {
1336 mdev->bm_resync_fo = 0;
1337 } else /* side == C_SYNC_SOURCE */ {
1338 u64 uuid;
1339
/* As sync source: store a fresh random UUID in the UI_BITMAP slot and
 * send the same value to the peer. */
1340 get_random_bytes(&uuid, sizeof(u64));
1341 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1342 drbd_send_sync_uuid(mdev, uuid);
1343
1344 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1345 }
1346
/* The state change and the rs_* bookkeeping below run with
 * global_state_lock write-held. */
1347 write_lock_irq(&global_state_lock);
1348 ns = mdev->state;
1349
/* Start out paused (aftr_isp) when the sync-after chain does not allow
 * this device to sync right now. */
1350 ns.aftr_isp = !_drbd_may_sync_now(mdev);
1351
1352 ns.conn = side;
1353
1354 if (side == C_SYNC_TARGET)
1355 ns.disk = D_INCONSISTENT;
1356 else /* side == C_SYNC_SOURCE */
1357 ns.pdsk = D_INCONSISTENT;
1358
1359 r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
/* Re-read the state that is now in effect; it is used for the checks and
 * the log message below. */
1360 ns = mdev->state;
1361
1362 if (ns.conn < C_CONNECTED)
1363 r = SS_UNKNOWN_ERROR;
1364
1365 if (r == SS_SUCCESS) {
/* Initialise the resync counters from the current bitmap weight and
 * time, then pause other devices that may no longer sync. */
1366 mdev->rs_total =
1367 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
1368 mdev->rs_failed = 0;
1369 mdev->rs_paused = 0;
1370 mdev->rs_start =
1371 mdev->rs_mark_time = jiffies;
1372 mdev->rs_same_csum = 0;
1373 _drbd_pause_after(mdev);
1374 }
1375 write_unlock_irq(&global_state_lock);
1376 drbd_state_unlock(mdev);
1377 put_ldev(mdev);
1378
1379 if (r == SS_SUCCESS) {
1380 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1381 drbd_conn_str(ns.conn),
1382 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1383 (unsigned long) mdev->rs_total);
1384
/* Nothing to sync: call request_ping(), sleep interruptibly for up to
 * ping_timeo*HZ/9 jiffies, then finish the resync right away. */
1385 if (mdev->rs_total == 0) {
1386 /* Peer still reachable? Beware of failing before-resync-target handlers! */
1387 request_ping(mdev);
1388 __set_current_state(TASK_INTERRUPTIBLE);
1389 schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */
1390 drbd_resync_finished(mdev);
1391 return;
1392 }
1393
1394 /* ns.conn may already be != mdev->state.conn,
1395 * we may have been paused in between, or become paused until
1396 * the timer triggers.
1397 * No matter, that is handled in resync_timer_fn() */
1398 if (ns.conn == C_SYNC_TARGET)
1399 mod_timer(&mdev->resync_timer, jiffies);
1400
1401 drbd_md_sync(mdev);
1402 }
1403}
1404
1405int drbd_worker(struct drbd_thread *thi)
1406{
1407 struct drbd_conf *mdev = thi->mdev;
1408 struct drbd_work *w = NULL;
1409 LIST_HEAD(work_list);
1410 int intr = 0, i;
1411
1412 sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
1413
1414 while (get_t_state(thi) == Running) {
1415 drbd_thread_current_set_cpu(mdev);
1416
1417 if (down_trylock(&mdev->data.work.s)) {
1418 mutex_lock(&mdev->data.mutex);
1419 if (mdev->data.socket && !mdev->net_conf->no_cork)
1420 drbd_tcp_uncork(mdev->data.socket);
1421 mutex_unlock(&mdev->data.mutex);
1422
1423 intr = down_interruptible(&mdev->data.work.s);
1424
1425 mutex_lock(&mdev->data.mutex);
1426 if (mdev->data.socket && !mdev->net_conf->no_cork)
1427 drbd_tcp_cork(mdev->data.socket);
1428 mutex_unlock(&mdev->data.mutex);
1429 }
1430
1431 if (intr) {
1432 D_ASSERT(intr == -EINTR);
1433 flush_signals(current);
1434 ERR_IF (get_t_state(thi) == Running)
1435 continue;
1436 break;
1437 }
1438
1439 if (get_t_state(thi) != Running)
1440 break;
1441 /* With this break, we have done a down() but not consumed
1442 the entry from the list. The cleanup code takes care of
1443 this... */
1444
1445 w = NULL;
1446 spin_lock_irq(&mdev->data.work.q_lock);
1447 ERR_IF(list_empty(&mdev->data.work.q)) {
1448 /* something terribly wrong in our logic.
1449 * we were able to down() the semaphore,
1450 * but the list is empty... doh.
1451 *
1452 * what is the best thing to do now?
1453 * try again from scratch, restarting the receiver,
1454 * asender, whatnot? could break even more ugly,
1455 * e.g. when we are primary, but no good local data.
1456 *
1457 * I'll try to get away just starting over this loop.
1458 */
1459 spin_unlock_irq(&mdev->data.work.q_lock);
1460 continue;
1461 }
1462 w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
1463 list_del_init(&w->list);
1464 spin_unlock_irq(&mdev->data.work.q_lock);
1465
1466 if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
1467 /* dev_warn(DEV, "worker: a callback failed! \n"); */
1468 if (mdev->state.conn >= C_CONNECTED)
1469 drbd_force_state(mdev,
1470 NS(conn, C_NETWORK_FAILURE));
1471 }
1472 }
1473 D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
1474 D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
1475
1476 spin_lock_irq(&mdev->data.work.q_lock);
1477 i = 0;
1478 while (!list_empty(&mdev->data.work.q)) {
1479 list_splice_init(&mdev->data.work.q, &work_list);
1480 spin_unlock_irq(&mdev->data.work.q_lock);
1481
1482 while (!list_empty(&work_list)) {
1483 w = list_entry(work_list.next, struct drbd_work, list);
1484 list_del_init(&w->list);
1485 w->cb(mdev, w, 1);
1486 i++; /* dead debugging code */
1487 }
1488
1489 spin_lock_irq(&mdev->data.work.q_lock);
1490 }
1491 sema_init(&mdev->data.work.s, 0);
1492 /* DANGEROUS race: if someone did queue his work within the spinlock,
1493 * but up() ed outside the spinlock, we could get an up() on the
1494 * semaphore without corresponding list entry.
1495 * So don't do that.
1496 */
1497 spin_unlock_irq(&mdev->data.work.q_lock);
1498
1499 D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
1500 /* _drbd_set_state only uses stop_nowait.
1501 * wait here for the Exiting receiver. */
1502 drbd_thread_stop(&mdev->receiver);
1503 drbd_mdev_cleanup(mdev);
1504
1505 dev_info(DEV, "worker terminated\n");
1506
1507 clear_bit(DEVICE_DYING, &mdev->flags);
1508 clear_bit(CONFIG_PENDING, &mdev->flags);
1509 wake_up(&mdev->state_wait);
1510
1511 return 0;
1512}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
new file mode 100644
index 000000000000..f93fa111ce50
--- /dev/null
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -0,0 +1,91 @@
1#ifndef _DRBD_WRAPPERS_H
2#define _DRBD_WRAPPERS_H
3
4#include <linux/ctype.h>
5#include <linux/mm.h>
6
7/* see get_sb_bdev and bd_claim */
8extern char *drbd_sec_holder;
9
10/* sets the number of 512 byte sectors of our virtual device */
11static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
12 sector_t size)
13{
14 /* set_capacity(mdev->this_bdev->bd_disk, size); */
15 set_capacity(mdev->vdisk, size);
16 mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
17}
18
19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20
21static inline int drbd_bio_has_active_page(struct bio *bio)
22{
23 struct bio_vec *bvec;
24 int i;
25
26 __bio_for_each_segment(bvec, bio, i, 0) {
27 if (page_count(bvec->bv_page) > 1)
28 return 1;
29 }
30
31 return 0;
32}
33
34/* bi_end_io handlers */
35extern void drbd_md_io_complete(struct bio *bio, int error);
36extern void drbd_endio_read_sec(struct bio *bio, int error);
37extern void drbd_endio_write_sec(struct bio *bio, int error);
38extern void drbd_endio_pri(struct bio *bio, int error);
39
40/*
41 * used to submit our private bio
42 */
43static inline void drbd_generic_make_request(struct drbd_conf *mdev,
44 int fault_type, struct bio *bio)
45{
46 __release(local);
47 if (!bio->bi_bdev) {
48 printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
49 "bio->bi_bdev == NULL\n",
50 mdev_to_minor(mdev));
51 dump_stack();
52 bio_endio(bio, -ENODEV);
53 return;
54 }
55
56 if (FAULT_ACTIVE(mdev, fault_type))
57 bio_endio(bio, -EIO);
58 else
59 generic_make_request(bio);
60}
61
62static inline void drbd_plug_device(struct drbd_conf *mdev)
63{
64 struct request_queue *q;
65 q = bdev_get_queue(mdev->this_bdev);
66
67 spin_lock_irq(q->queue_lock);
68
69/* XXX the check on !blk_queue_plugged is redundant,
70 * implicitly checked in blk_plug_device */
71
72 if (!blk_queue_plugged(q)) {
73 blk_plug_device(q);
74 del_timer(&q->unplug_timer);
75 /* unplugging should not happen automatically... */
76 }
77 spin_unlock_irq(q->queue_lock);
78}
79
80static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
81{
82 return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
83 == CRYPTO_ALG_TYPE_HASH;
84}
85
86#ifndef __CHECKER__
87# undef __cond_lock
88# define __cond_lock(x,c) (c)
89#endif
90
91#endif
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 3bb7c47c869f..1fb6c3135fc8 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -123,7 +123,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
123{ 123{
124 struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); 124 struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
125 u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); 125 u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
126 unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); 126 unsigned long timeout;
127
128 for (timeout = 20; timeout; timeout--) {
129 if (!notify[3])
130 return 0;
131 udelay(10);
132 }
133
134 timeout = jiffies + msecs_to_jiffies(timeout_ms);
127 135
128 do { 136 do {
129 if (!notify[3]) 137 if (!notify[3])
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 8ca17a3e96ea..64e2b379a350 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
59 for (; nsect > 0; nsect--, block++, buf += tr->blksize) 59 for (; nsect > 0; nsect--, block++, buf += tr->blksize)
60 if (tr->readsect(dev, block, buf)) 60 if (tr->readsect(dev, block, buf))
61 return -EIO; 61 return -EIO;
62 rq_flush_dcache_pages(req);
62 return 0; 63 return 0;
63 64
64 case WRITE: 65 case WRITE:
65 if (!tr->writesect) 66 if (!tr->writesect)
66 return -EIO; 67 return -EIO;
67 68
69 rq_flush_dcache_pages(req);
68 for (; nsect > 0; nsect--, block++, buf += tr->blksize) 70 for (; nsect > 0; nsect--, block++, buf += tr->blksize)
69 if (tr->writesect(dev, block, buf)) 71 if (tr->writesect(dev, block, buf))
70 return -EIO; 72 return -EIO;
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index c94de3139223..f69b7783027f 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -143,7 +143,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
143 struct inode *inode = mapping->host; 143 struct inode *inode = mapping->host;
144 struct pohmelfs_inode *pi = POHMELFS_I(inode); 144 struct pohmelfs_inode *pi = POHMELFS_I(inode);
145 struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb); 145 struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb);
146 struct backing_dev_info *bdi = mapping->backing_dev_info;
147 int err = 0; 146 int err = 0;
148 int done = 0; 147 int done = 0;
149 int nr_pages; 148 int nr_pages;
@@ -152,11 +151,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
152 int scanned = 0; 151 int scanned = 0;
153 int range_whole = 0; 152 int range_whole = 0;
154 153
155 if (wbc->nonblocking && bdi_write_congested(bdi)) {
156 wbc->encountered_congestion = 1;
157 return 0;
158 }
159
160 if (wbc->range_cyclic) { 154 if (wbc->range_cyclic) {
161 index = mapping->writeback_index; /* Start from prev offset */ 155 index = mapping->writeback_index; /* Start from prev offset */
162 end = -1; 156 end = -1;
@@ -248,10 +242,6 @@ retry:
248 242
249 if (wbc->nr_to_write <= 0) 243 if (wbc->nr_to_write <= 0)
250 done = 1; 244 done = 1;
251 if (wbc->nonblocking && bdi_write_congested(bdi)) {
252 wbc->encountered_congestion = 1;
253 done = 1;
254 }
255 245
256 continue; 246 continue;
257out_continue: 247out_continue:
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..c30dfc006108 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
15#include <linux/aio_abi.h> 15#include <linux/aio_abi.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/backing-dev.h>
18#include <linux/uio.h> 19#include <linux/uio.h>
19 20
20#define DEBUG 0 21#define DEBUG 0
@@ -32,6 +33,9 @@
32#include <linux/workqueue.h> 33#include <linux/workqueue.h>
33#include <linux/security.h> 34#include <linux/security.h>
34#include <linux/eventfd.h> 35#include <linux/eventfd.h>
36#include <linux/blkdev.h>
37#include <linux/mempool.h>
38#include <linux/hash.h>
35 39
36#include <asm/kmap_types.h> 40#include <asm/kmap_types.h>
37#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
60static DEFINE_SPINLOCK(fput_lock); 64static DEFINE_SPINLOCK(fput_lock);
61static LIST_HEAD(fput_head); 65static LIST_HEAD(fput_head);
62 66
67#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
68#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
69struct aio_batch_entry {
70 struct hlist_node list;
71 struct address_space *mapping;
72};
73mempool_t *abe_pool;
74
63static void aio_kick_handler(struct work_struct *); 75static void aio_kick_handler(struct work_struct *);
64static void aio_queue_work(struct kioctx *); 76static void aio_queue_work(struct kioctx *);
65 77
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
73 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 85 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
74 86
75 aio_wq = create_workqueue("aio"); 87 aio_wq = create_workqueue("aio");
88 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
89 BUG_ON(!abe_pool);
76 90
77 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 91 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
78 92
@@ -1531,8 +1545,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
1531 return 1; 1545 return 1;
1532} 1546}
1533 1547
1548static void aio_batch_add(struct address_space *mapping,
1549 struct hlist_head *batch_hash)
1550{
1551 struct aio_batch_entry *abe;
1552 struct hlist_node *pos;
1553 unsigned bucket;
1554
1555 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1556 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1557 if (abe->mapping == mapping)
1558 return;
1559 }
1560
1561 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1562 BUG_ON(!igrab(mapping->host));
1563 abe->mapping = mapping;
1564 hlist_add_head(&abe->list, &batch_hash[bucket]);
1565 return;
1566}
1567
1568static void aio_batch_free(struct hlist_head *batch_hash)
1569{
1570 struct aio_batch_entry *abe;
1571 struct hlist_node *pos, *n;
1572 int i;
1573
1574 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1575 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1576 blk_run_address_space(abe->mapping);
1577 iput(abe->mapping->host);
1578 hlist_del(&abe->list);
1579 mempool_free(abe, abe_pool);
1580 }
1581 }
1582}
1583
1534static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1584static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1535 struct iocb *iocb) 1585 struct iocb *iocb, struct hlist_head *batch_hash)
1536{ 1586{
1537 struct kiocb *req; 1587 struct kiocb *req;
1538 struct file *file; 1588 struct file *file;
@@ -1608,6 +1658,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1608 ; 1658 ;
1609 } 1659 }
1610 spin_unlock_irq(&ctx->ctx_lock); 1660 spin_unlock_irq(&ctx->ctx_lock);
1661 if (req->ki_opcode == IOCB_CMD_PREAD ||
1662 req->ki_opcode == IOCB_CMD_PREADV ||
1663 req->ki_opcode == IOCB_CMD_PWRITE ||
1664 req->ki_opcode == IOCB_CMD_PWRITEV)
1665 aio_batch_add(file->f_mapping, batch_hash);
1666
1611 aio_put_req(req); /* drop extra ref to req */ 1667 aio_put_req(req); /* drop extra ref to req */
1612 return 0; 1668 return 0;
1613 1669
@@ -1635,6 +1691,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1635 struct kioctx *ctx; 1691 struct kioctx *ctx;
1636 long ret = 0; 1692 long ret = 0;
1637 int i; 1693 int i;
1694 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
1638 1695
1639 if (unlikely(nr < 0)) 1696 if (unlikely(nr < 0))
1640 return -EINVAL; 1697 return -EINVAL;
@@ -1666,10 +1723,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1666 break; 1723 break;
1667 } 1724 }
1668 1725
1669 ret = io_submit_one(ctx, user_iocb, &tmp); 1726 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
1670 if (ret) 1727 if (ret)
1671 break; 1728 break;
1672 } 1729 }
1730 aio_batch_free(batch_hash);
1673 1731
1674 put_ioctx(ctx); 1732 put_ioctx(ctx);
1675 return i ? i : ret; 1733 return i ? i : ret;
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..e23a63f4f7de 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
1393 } 1393 }
1394} 1394}
1395 1395
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/* Call flush_dcache_page() on the page behind each segment of @bi. */
void bio_flush_dcache_pages(struct bio *bi)
{
	struct bio_vec *seg;
	int idx;

	bio_for_each_segment(seg, bi, idx) {
		flush_dcache_page(seg->bv_page);
	}
}
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
1407
1396/** 1408/**
1397 * bio_endio - end I/O on a bio 1409 * bio_endio - end I/O on a bio
1398 * @bio: bio 1410 * @bio: bio
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..73d6a735b8f3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
405 405
406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
407{ 407{
408 return sync_blockdev(I_BDEV(filp->f_mapping->host)); 408 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
409 int error;
410
411 error = sync_blockdev(bdev);
412 if (error)
413 return error;
414
415 error = blkdev_issue_flush(bdev, NULL);
416 if (error == -EOPNOTSUPP)
417 error = 0;
418 return error;
409} 419}
410 420
411/* 421/*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..b912270942fa 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1028 if (dio->bio) 1028 if (dio->bio)
1029 dio_bio_submit(dio); 1029 dio_bio_submit(dio);
1030 1030
1031 /* All IO is now issued, send it on its way */
1032 blk_run_address_space(inode->i_mapping);
1033
1034 /* 1031 /*
1035 * It is possible that, we return short IO due to end of file. 1032 * It is possible that, we return short IO due to end of file.
1036 * In that case, we need to release all the pages we got hold on. 1033 * In that case, we need to release all the pages we got hold on.
@@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1057 ((rw & READ) || (dio->result == dio->size))) 1054 ((rw & READ) || (dio->result == dio->size)))
1058 ret = -EIOCBQUEUED; 1055 ret = -EIOCBQUEUED;
1059 1056
1060 if (ret != -EIOCBQUEUED) 1057 if (ret != -EIOCBQUEUED) {
1058 /* All IO is now issued, send it on its way */
1059 blk_run_address_space(inode->i_mapping);
1061 dio_await_completion(dio); 1060 dio_await_completion(dio);
1061 }
1062 1062
1063 /* 1063 /*
1064 * Sync will always be dropping the final ref and completing the 1064 * Sync will always be dropping the final ref and completing the
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1124 int acquire_i_mutex = 0; 1124 int acquire_i_mutex = 0;
1125 1125
1126 if (rw & WRITE) 1126 if (rw & WRITE)
1127 rw = WRITE_ODIRECT; 1127 rw = WRITE_ODIRECT_PLUG;
1128 1128
1129 if (bdev) 1129 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..49bc1b8e8f19 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
614 struct writeback_control *wbc) 614 struct writeback_control *wbc)
615{ 615{
616 struct super_block *sb = wbc->sb, *pin_sb = NULL; 616 struct super_block *sb = wbc->sb, *pin_sb = NULL;
617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
618 const unsigned long start = jiffies; /* livelock avoidance */ 617 const unsigned long start = jiffies; /* livelock avoidance */
619 618
620 spin_lock(&inode_lock); 619 spin_lock(&inode_lock);
@@ -635,36 +634,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
635 continue; 634 continue;
636 } 635 }
637 636
638 if (!bdi_cap_writeback_dirty(wb->bdi)) {
639 redirty_tail(inode);
640 if (is_blkdev_sb) {
641 /*
642 * Dirty memory-backed blockdev: the ramdisk
643 * driver does this. Skip just this inode
644 */
645 continue;
646 }
647 /*
648 * Dirty memory-backed inode against a filesystem other
649 * than the kernel-internal bdev filesystem. Skip the
650 * entire superblock.
651 */
652 break;
653 }
654
655 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 637 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
656 requeue_io(inode); 638 requeue_io(inode);
657 continue; 639 continue;
658 } 640 }
659 641
660 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
661 wbc->encountered_congestion = 1;
662 if (!is_blkdev_sb)
663 break; /* Skip a congested fs */
664 requeue_io(inode);
665 continue; /* Skip a congested blockdev */
666 }
667
668 /* 642 /*
669 * Was this inode dirtied after sync_sb_inodes was called? 643 * Was this inode dirtied after sync_sb_inodes was called?
670 * This keeps sync from extra jobs and livelock. 644 * This keeps sync from extra jobs and livelock.
@@ -756,6 +730,7 @@ static long wb_writeback(struct bdi_writeback *wb,
756 .sync_mode = args->sync_mode, 730 .sync_mode = args->sync_mode,
757 .older_than_this = NULL, 731 .older_than_this = NULL,
758 .for_kupdate = args->for_kupdate, 732 .for_kupdate = args->for_kupdate,
733 .for_background = args->for_background,
759 .range_cyclic = args->range_cyclic, 734 .range_cyclic = args->range_cyclic,
760 }; 735 };
761 unsigned long oldest_jif; 736 unsigned long oldest_jif;
@@ -787,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb,
787 break; 762 break;
788 763
789 wbc.more_io = 0; 764 wbc.more_io = 0;
790 wbc.encountered_congestion = 0;
791 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 765 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
792 wbc.pages_skipped = 0; 766 wbc.pages_skipped = 0;
793 writeback_inodes_wb(wb, &wbc); 767 writeback_inodes_wb(wb, &wbc);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..c84b5cc1a943 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
178{ 178{
179 if (wbc->for_reclaim) 179 if (wbc->for_reclaim)
180 return FLUSH_HIGHPRI | FLUSH_STABLE; 180 return FLUSH_HIGHPRI | FLUSH_STABLE;
181 if (wbc->for_kupdate) 181 if (wbc->for_kupdate || wbc->for_background)
182 return FLUSH_LOWPRI; 182 return FLUSH_LOWPRI;
183 return 0; 183 return 0;
184} 184}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..64bc8998ac9a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); 226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
227} 227}
228 228
229ssize_t part_discard_alignment_show(struct device *dev,
230 struct device_attribute *attr, char *buf)
231{
232 struct hd_struct *p = dev_to_part(dev);
233 return sprintf(buf, "%u\n", p->discard_alignment);
234}
235
229ssize_t part_stat_show(struct device *dev, 236ssize_t part_stat_show(struct device *dev,
230 struct device_attribute *attr, char *buf) 237 struct device_attribute *attr, char *buf)
231{ 238{
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
288static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 295static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 296static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 297static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
298static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
299 NULL);
291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 300static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 301static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
293#ifdef CONFIG_FAIL_MAKE_REQUEST 302#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
300 &dev_attr_start.attr, 309 &dev_attr_start.attr,
301 &dev_attr_size.attr, 310 &dev_attr_size.attr,
302 &dev_attr_alignment_offset.attr, 311 &dev_attr_alignment_offset.attr,
312 &dev_attr_discard_alignment.attr,
303 &dev_attr_stat.attr, 313 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr, 314 &dev_attr_inflight.attr,
305#ifdef CONFIG_FAIL_MAKE_REQUEST 315#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
403 413
404 p->start_sect = start; 414 p->start_sect = start;
405 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 415 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
416 p->discard_alignment = queue_sector_discard_alignment(disk->queue,
417 start);
406 p->nr_sects = len; 418 p->nr_sects = len;
407 p->partno = partno; 419 p->partno = partno;
408 p->policy = get_disk_ro(disk); 420 p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..49cfd5f54238 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
1/************************************************************ 1/************************************************************
2 * EFI GUID Partition Table handling 2 * EFI GUID Partition Table handling
3 * Per Intel EFI Specification v1.02 3 *
4 * http://developer.intel.com/technology/efi/efi.htm 4 * http://www.uefi.org/specs/
5 * http://www.intel.com/technology/efi/
6 *
5 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> 7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
6 * Copyright 2000,2001,2002,2004 Dell Inc. 8 * Copyright 2000,2001,2002,2004 Dell Inc.
7 * 9 *
@@ -92,6 +94,7 @@
92 * 94 *
93 ************************************************************/ 95 ************************************************************/
94#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/math64.h>
95#include "check.h" 98#include "check.h"
96#include "efi.h" 99#include "efi.h"
97 100
@@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
141{ 144{
142 if (!bdev || !bdev->bd_inode) 145 if (!bdev || !bdev->bd_inode)
143 return 0; 146 return 0;
144 return (bdev->bd_inode->i_size >> 9) - 1ULL; 147 return div_u64(bdev->bd_inode->i_size,
148 bdev_logical_block_size(bdev)) - 1ULL;
145} 149}
146 150
147static inline int 151static inline int
@@ -188,6 +192,7 @@ static size_t
188read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 192read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
189{ 193{
190 size_t totalreadcount = 0; 194 size_t totalreadcount = 0;
195 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
191 196
192 if (!bdev || !buffer || lba > last_lba(bdev)) 197 if (!bdev || !buffer || lba > last_lba(bdev))
193 return 0; 198 return 0;
@@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
195 while (count) { 200 while (count) {
196 int copied = 512; 201 int copied = 512;
197 Sector sect; 202 Sector sect;
198 unsigned char *data = read_dev_sector(bdev, lba++, &sect); 203 unsigned char *data = read_dev_sector(bdev, n++, &sect);
199 if (!data) 204 if (!data)
200 break; 205 break;
201 if (copied > count) 206 if (copied > count)
@@ -257,15 +262,16 @@ static gpt_header *
257alloc_read_gpt_header(struct block_device *bdev, u64 lba) 262alloc_read_gpt_header(struct block_device *bdev, u64 lba)
258{ 263{
259 gpt_header *gpt; 264 gpt_header *gpt;
265 unsigned ssz = bdev_logical_block_size(bdev);
266
260 if (!bdev) 267 if (!bdev)
261 return NULL; 268 return NULL;
262 269
263 gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); 270 gpt = kzalloc(ssz, GFP_KERNEL);
264 if (!gpt) 271 if (!gpt)
265 return NULL; 272 return NULL;
266 273
267 if (read_lba(bdev, lba, (u8 *) gpt, 274 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
268 sizeof (gpt_header)) < sizeof (gpt_header)) {
269 kfree(gpt); 275 kfree(gpt);
270 gpt=NULL; 276 gpt=NULL;
271 return NULL; 277 return NULL;
@@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
601 gpt_header *gpt = NULL; 607 gpt_header *gpt = NULL;
602 gpt_entry *ptes = NULL; 608 gpt_entry *ptes = NULL;
603 u32 i; 609 u32 i;
610 unsigned ssz = bdev_logical_block_size(bdev) / 512;
604 611
605 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 612 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
606 kfree(gpt); 613 kfree(gpt);
@@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
611 pr_debug("GUID Partition Table is valid! Yea!\n"); 618 pr_debug("GUID Partition Table is valid! Yea!\n");
612 619
613 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 620 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
621 u64 start = le64_to_cpu(ptes[i].starting_lba);
622 u64 size = le64_to_cpu(ptes[i].ending_lba) -
623 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
624
614 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 625 if (!is_pte_valid(&ptes[i], last_lba(bdev)))
615 continue; 626 continue;
616 627
617 put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), 628 put_partition(state, i+1, start * ssz, size * ssz);
618 (le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) +
620 1ULL));
621 629
622 /* If this is a RAID volume, tell md */ 630 /* If this is a RAID volume, tell md */
623 if (!efi_guidcmp(ptes[i].partition_type_guid, 631 if (!efi_guidcmp(ptes[i].partition_type_guid,
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
37#define EFI_PMBR_OSTYPE_EFI 0xEF 37#define EFI_PMBR_OSTYPE_EFI 0xEF
38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE 38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
39 39
40#define GPT_BLOCK_SIZE 512
41#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL 40#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
42#define GPT_HEADER_REVISION_V1 0x00010000 41#define GPT_HEADER_REVISION_V1 0x00010000
43#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 42#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
79 __le32 num_partition_entries; 78 __le32 num_partition_entries;
80 __le32 sizeof_partition_entry; 79 __le32 sizeof_partition_entry;
81 __le32 partition_entry_array_crc32; 80 __le32 partition_entry_array_crc32;
82 u8 reserved2[GPT_BLOCK_SIZE - 92]; 81
82 /* The rest of the logical block is reserved by UEFI and must be zero.
83 * EFI standard handles this by:
84 *
85 * uint8_t reserved2[ BlockSize - 92 ];
86 */
83} __attribute__ ((packed)) gpt_header; 87} __attribute__ ((packed)) gpt_header;
84 88
85typedef struct _gpt_entry_attributes { 89typedef struct _gpt_entry_attributes {
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22a..b7f4a1f94d48 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
826 if (!(out_file->f_mode & FMODE_WRITE)) 826 if (!(out_file->f_mode & FMODE_WRITE))
827 goto fput_out; 827 goto fput_out;
828 retval = -EINVAL; 828 retval = -EINVAL;
829 if (!out_file->f_op || !out_file->f_op->sendpage)
830 goto fput_out;
831 in_inode = in_file->f_path.dentry->d_inode; 829 in_inode = in_file->f_path.dentry->d_inode;
832 out_inode = out_file->f_path.dentry->d_inode; 830 out_inode = out_file->f_path.dentry->d_inode;
833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 831 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..39208663aaf1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
648 ret = buf->ops->confirm(pipe, buf); 648 ret = buf->ops->confirm(pipe, buf);
649 if (!ret) { 649 if (!ret) {
650 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 650 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
651 651 if (file->f_op && file->f_op->sendpage)
652 ret = file->f_op->sendpage(file, buf->page, buf->offset, 652 ret = file->f_op->sendpage(file, buf->page, buf->offset,
653 sd->len, &pos, more); 653 sd->len, &pos, more);
654 else
655 ret = -EINVAL;
654 } 656 }
655 657
656 return ret; 658 return ret;
@@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1068 if (unlikely(ret < 0)) 1070 if (unlikely(ret < 0))
1069 return ret; 1071 return ret;
1070 1072
1071 splice_write = out->f_op->splice_write; 1073 if (out->f_op && out->f_op->splice_write)
1072 if (!splice_write) 1074 splice_write = out->f_op->splice_write;
1075 else
1073 splice_write = default_file_splice_write; 1076 splice_write = default_file_splice_write;
1074 1077
1075 return splice_write(pipe, out, ppos, len, flags); 1078 return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
1093 if (unlikely(ret < 0)) 1096 if (unlikely(ret < 0))
1094 return ret; 1097 return ret;
1095 1098
1096 splice_read = in->f_op->splice_read; 1099 if (in->f_op && in->f_op->splice_read)
1097 if (!splice_read) 1100 splice_read = in->f_op->splice_read;
1101 else
1098 splice_read = default_file_splice_read; 1102 splice_read = default_file_splice_read;
1099 1103
1100 return splice_read(in, ppos, pipe, len, flags); 1104 return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1316 if (off_in) 1320 if (off_in)
1317 return -ESPIPE; 1321 return -ESPIPE;
1318 if (off_out) { 1322 if (off_out) {
1319 if (out->f_op->llseek == no_llseek) 1323 if (!out->f_op || !out->f_op->llseek ||
1324 out->f_op->llseek == no_llseek)
1320 return -EINVAL; 1325 return -EINVAL;
1321 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1326 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1322 return -EFAULT; 1327 return -EFAULT;
@@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1336 if (off_out) 1341 if (off_out)
1337 return -ESPIPE; 1342 return -ESPIPE;
1338 if (off_in) { 1343 if (off_in) {
1339 if (in->f_op->llseek == no_llseek) 1344 if (!in->f_op || !in->f_op->llseek ||
1345 in->f_op->llseek == no_llseek)
1340 return -EINVAL; 1346 return -EINVAL;
1341 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1347 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1342 return -EFAULT; 1348 return -EFAULT;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74dc..70f989895d15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -904,16 +904,9 @@ xfs_convert_page(
904 904
905 if (startio) { 905 if (startio) {
906 if (count) { 906 if (count) {
907 struct backing_dev_info *bdi;
908
909 bdi = inode->i_mapping->backing_dev_info;
910 wbc->nr_to_write--; 907 wbc->nr_to_write--;
911 if (bdi_write_congested(bdi)) { 908 if (wbc->nr_to_write <= 0)
912 wbc->encountered_congestion = 1;
913 done = 1;
914 } else if (wbc->nr_to_write <= 0) {
915 done = 1; 909 done = 1;
916 }
917 } 910 }
918 xfs_start_page_writeback(page, !page_dirty, count); 911 xfs_start_page_writeback(page, !page_dirty, count);
919 } 912 }
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index ba4ec39a1131..57b5c3c82e86 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -13,6 +13,7 @@
13#define flush_cache_dup_mm(mm) do { } while (0) 13#define flush_cache_dup_mm(mm) do { } while (0)
14#define flush_cache_range(vma, start, end) do { } while (0) 14#define flush_cache_range(vma, start, end) do { } while (0)
15#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 15#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
16#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
16#define flush_dcache_page(page) do { } while (0) 17#define flush_dcache_page(page) do { } while (0)
17#define flush_dcache_mmap_lock(mapping) do { } while (0) 18#define flush_dcache_mmap_lock(mapping) do { } while (0)
18#define flush_dcache_mmap_unlock(mapping) do { } while (0) 19#define flush_dcache_mmap_unlock(mapping) do { } while (0)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b449e738533a..fcbc26af00e4 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word)
331 return 0; 331 return 0;
332} 332}
333 333
334static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
335 struct page *page)
336{
337 if (bdi && bdi->unplug_io_fn)
338 bdi->unplug_io_fn(bdi, page);
339}
340
341static inline void blk_run_address_space(struct address_space *mapping)
342{
343 if (mapping)
344 blk_run_backing_dev(mapping->backing_dev_info, NULL);
345}
346
334#endif /* _LINUX_BACKING_DEV_H */ 347#endif /* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5be93f18d842..7fc5606e6ea5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -391,6 +391,18 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
391 gfp_t, int); 391 gfp_t, int);
392extern void bio_set_pages_dirty(struct bio *bio); 392extern void bio_set_pages_dirty(struct bio *bio);
393extern void bio_check_pages_dirty(struct bio *bio); 393extern void bio_check_pages_dirty(struct bio *bio);
394
395#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
396# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
397#endif
398#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
399extern void bio_flush_dcache_pages(struct bio *bi);
400#else
401static inline void bio_flush_dcache_pages(struct bio *bi)
402{
403}
404#endif
405
394extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 406extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
395 unsigned long, unsigned int, int, gfp_t); 407 unsigned long, unsigned int, int, gfp_t);
396extern struct bio *bio_copy_user_iov(struct request_queue *, 408extern struct bio *bio_copy_user_iov(struct request_queue *,
@@ -450,11 +462,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
450/* 462/*
451 * remember never ever reenable interrupts between a bvec_kmap_irq and 463 * remember never ever reenable interrupts between a bvec_kmap_irq and
452 * bvec_kunmap_irq! 464 * bvec_kunmap_irq!
453 *
454 * This function MUST be inlined - it plays with the CPU interrupt flags.
455 */ 465 */
456static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, 466static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
457 unsigned long *flags)
458{ 467{
459 unsigned long addr; 468 unsigned long addr;
460 469
@@ -470,8 +479,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
470 return (char *) addr + bvec->bv_offset; 479 return (char *) addr + bvec->bv_offset;
471} 480}
472 481
473static __always_inline void bvec_kunmap_irq(char *buffer, 482static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
474 unsigned long *flags)
475{ 483{
476 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 484 unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
477 485
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 221cecd86bd3..784a919aa0d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -312,13 +312,17 @@ struct queue_limits {
312 unsigned int io_min; 312 unsigned int io_min;
313 unsigned int io_opt; 313 unsigned int io_opt;
314 unsigned int max_discard_sectors; 314 unsigned int max_discard_sectors;
315 unsigned int discard_granularity;
316 unsigned int discard_alignment;
315 317
316 unsigned short logical_block_size; 318 unsigned short logical_block_size;
317 unsigned short max_hw_segments; 319 unsigned short max_hw_segments;
318 unsigned short max_phys_segments; 320 unsigned short max_phys_segments;
319 321
320 unsigned char misaligned; 322 unsigned char misaligned;
323 unsigned char discard_misaligned;
321 unsigned char no_cluster; 324 unsigned char no_cluster;
325 signed char discard_zeroes_data;
322}; 326};
323 327
324struct request_queue 328struct request_queue
@@ -749,6 +753,17 @@ struct req_iterator {
749#define rq_iter_last(rq, _iter) \ 753#define rq_iter_last(rq, _iter) \
750 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) 754 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
751 755
756#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
757# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
758#endif
759#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
760extern void rq_flush_dcache_pages(struct request *rq);
761#else
762static inline void rq_flush_dcache_pages(struct request *rq)
763{
764}
765#endif
766
752extern int blk_register_queue(struct gendisk *disk); 767extern int blk_register_queue(struct gendisk *disk);
753extern void blk_unregister_queue(struct gendisk *disk); 768extern void blk_unregister_queue(struct gendisk *disk);
754extern void register_disk(struct gendisk *dev); 769extern void register_disk(struct gendisk *dev);
@@ -823,19 +838,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
823 return bdev->bd_disk->queue; 838 return bdev->bd_disk->queue;
824} 839}
825 840
826static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
827 struct page *page)
828{
829 if (bdi && bdi->unplug_io_fn)
830 bdi->unplug_io_fn(bdi, page);
831}
832
833static inline void blk_run_address_space(struct address_space *mapping)
834{
835 if (mapping)
836 blk_run_backing_dev(mapping->backing_dev_info, NULL);
837}
838
839/* 841/*
840 * blk_rq_pos() : the current sector 842 * blk_rq_pos() : the current sector
841 * blk_rq_bytes() : bytes left in the entire request 843 * blk_rq_bytes() : bytes left in the entire request
@@ -1134,6 +1136,34 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
1134 return q->limits.alignment_offset; 1136 return q->limits.alignment_offset;
1135} 1137}
1136 1138
1139static inline int queue_discard_alignment(struct request_queue *q)
1140{
1141 if (q->limits.discard_misaligned)
1142 return -1;
1143
1144 return q->limits.discard_alignment;
1145}
1146
1147static inline int queue_sector_discard_alignment(struct request_queue *q,
1148 sector_t sector)
1149{
1150 return ((sector << 9) - q->limits.discard_alignment)
1151 & (q->limits.discard_granularity - 1);
1152}
1153
1154static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
1155{
1156 if (q->limits.discard_zeroes_data == 1)
1157 return 1;
1158
1159 return 0;
1160}
1161
1162static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
1163{
1164 return queue_discard_zeroes_data(bdev_get_queue(bdev));
1165}
1166
1137static inline int queue_dma_alignment(struct request_queue *q) 1167static inline int queue_dma_alignment(struct request_queue *q)
1138{ 1168{
1139 return q ? q->dma_alignment : 511; 1169 return q ? q->dma_alignment : 511;
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31bacf46..ccefff02b6cb 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,3 +60,9 @@ SUBSYS(net_cls)
60#endif 60#endif
61 61
62/* */ 62/* */
63
64#ifdef CONFIG_BLK_CGROUP
65SUBSYS(blkio)
66#endif
67
68/* */
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 3a14615fd35c..72ba63eb83c5 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -43,6 +43,8 @@
43#define CN_DST_VAL 0x1 43#define CN_DST_VAL 0x1
44#define CN_IDX_DM 0x7 /* Device Mapper */ 44#define CN_IDX_DM 0x7 /* Device Mapper */
45#define CN_VAL_DM_USERSPACE_LOG 0x1 45#define CN_VAL_DM_USERSPACE_LOG 0x1
46#define CN_IDX_DRBD 0x8
47#define CN_VAL_DRBD 0x1
46 48
47#define CN_NETLINK_USERS 8 49#define CN_NETLINK_USERS 8
48 50
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
new file mode 100644
index 000000000000..e84f4733cb55
--- /dev/null
+++ b/include/linux/drbd.h
@@ -0,0 +1,343 @@
1/*
2 drbd.h
3 Kernel module for 2.6.x Kernels
4
5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6
7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8 Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9 Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10
11 drbd is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 drbd is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with drbd; see the file COPYING. If not, write to
23 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24
25*/
26#ifndef DRBD_H
27#define DRBD_H
28#include <linux/connector.h>
29#include <asm/types.h>
30
31#ifdef __KERNEL__
32#include <linux/types.h>
33#include <asm/byteorder.h>
34#else
35#include <sys/types.h>
36#include <sys/wait.h>
37#include <limits.h>
38
39/* Although the Linux source code makes a difference between
40 generic endianness and the bitfields' endianness, there is no
41 architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
42 does not match the generic endianness. */
43
44#if __BYTE_ORDER == __LITTLE_ENDIAN
45#define __LITTLE_ENDIAN_BITFIELD
46#elif __BYTE_ORDER == __BIG_ENDIAN
47#define __BIG_ENDIAN_BITFIELD
48#else
49# error "sorry, weird endianness on this box"
50#endif
51
52#endif
53
54
55extern const char *drbd_buildtag(void);
56#define REL_VERSION "8.3.6"
57#define API_VERSION 88
58#define PRO_VERSION_MIN 86
59#define PRO_VERSION_MAX 91
60
61
62enum drbd_io_error_p {
63 EP_PASS_ON, /* FIXME should this better be named "Ignore"? */
64 EP_CALL_HELPER,
65 EP_DETACH
66};
67
68enum drbd_fencing_p {
69 FP_DONT_CARE,
70 FP_RESOURCE,
71 FP_STONITH
72};
73
74enum drbd_disconnect_p {
75 DP_RECONNECT,
76 DP_DROP_NET_CONF,
77 DP_FREEZE_IO
78};
79
80enum drbd_after_sb_p {
81 ASB_DISCONNECT,
82 ASB_DISCARD_YOUNGER_PRI,
83 ASB_DISCARD_OLDER_PRI,
84 ASB_DISCARD_ZERO_CHG,
85 ASB_DISCARD_LEAST_CHG,
86 ASB_DISCARD_LOCAL,
87 ASB_DISCARD_REMOTE,
88 ASB_CONSENSUS,
89 ASB_DISCARD_SECONDARY,
90 ASB_CALL_HELPER,
91 ASB_VIOLENTLY
92};
93
94/* KEEP the order, do not delete or insert. Only append. */
95enum drbd_ret_codes {
96 ERR_CODE_BASE = 100,
97 NO_ERROR = 101,
98 ERR_LOCAL_ADDR = 102,
99 ERR_PEER_ADDR = 103,
100 ERR_OPEN_DISK = 104,
101 ERR_OPEN_MD_DISK = 105,
102 ERR_DISK_NOT_BDEV = 107,
103 ERR_MD_NOT_BDEV = 108,
104 ERR_DISK_TO_SMALL = 111,
105 ERR_MD_DISK_TO_SMALL = 112,
106 ERR_BDCLAIM_DISK = 114,
107 ERR_BDCLAIM_MD_DISK = 115,
108 ERR_MD_IDX_INVALID = 116,
109 ERR_IO_MD_DISK = 118,
110 ERR_MD_INVALID = 119,
111 ERR_AUTH_ALG = 120,
112 ERR_AUTH_ALG_ND = 121,
113 ERR_NOMEM = 122,
114 ERR_DISCARD = 123,
115 ERR_DISK_CONFIGURED = 124,
116 ERR_NET_CONFIGURED = 125,
117 ERR_MANDATORY_TAG = 126,
118 ERR_MINOR_INVALID = 127,
119 ERR_INTR = 129, /* EINTR */
120 ERR_RESIZE_RESYNC = 130,
121 ERR_NO_PRIMARY = 131,
122 ERR_SYNC_AFTER = 132,
123 ERR_SYNC_AFTER_CYCLE = 133,
124 ERR_PAUSE_IS_SET = 134,
125 ERR_PAUSE_IS_CLEAR = 135,
126 ERR_PACKET_NR = 137,
127 ERR_NO_DISK = 138,
128 ERR_NOT_PROTO_C = 139,
129 ERR_NOMEM_BITMAP = 140,
130 ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
131 ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
132 ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
133 ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
134 ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
135 ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
136 ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
137 ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
138 ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
139 ERR_DATA_NOT_CURRENT = 150,
140 ERR_CONNECTED = 151, /* DRBD 8.3 only */
141 ERR_PERM = 152,
142
143 /* insert new ones above this line */
144 AFTER_LAST_ERR_CODE
145};
146
147#define DRBD_PROT_A 1
148#define DRBD_PROT_B 2
149#define DRBD_PROT_C 3
150
151enum drbd_role {
152 R_UNKNOWN = 0,
153 R_PRIMARY = 1, /* role */
154 R_SECONDARY = 2, /* role */
155 R_MASK = 3,
156};
157
158/* The order of these constants is important.
159 * The lower ones (<C_WF_REPORT_PARAMS) indicate
160 * that there is no socket!
161 * >=C_WF_REPORT_PARAMS ==> There is a socket
162 */
163enum drbd_conns {
164 C_STANDALONE,
165 C_DISCONNECTING, /* Temporal state on the way to StandAlone. */
166 C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
167
168 /* These temporal states are all used on the way
169 * from >= C_CONNECTED to Unconnected.
170 * The 'disconnect reason' states
171 * I do not allow to change between them. */
172 C_TIMEOUT,
173 C_BROKEN_PIPE,
174 C_NETWORK_FAILURE,
175 C_PROTOCOL_ERROR,
176 C_TEAR_DOWN,
177
178 C_WF_CONNECTION,
179 C_WF_REPORT_PARAMS, /* we have a socket */
180 C_CONNECTED, /* we have introduced each other */
181 C_STARTING_SYNC_S, /* starting full sync by admin request. */
182 C_STARTING_SYNC_T, /* starting full sync by admin request. */
183 C_WF_BITMAP_S,
184 C_WF_BITMAP_T,
185 C_WF_SYNC_UUID,
186
187 /* All SyncStates are tested with this comparison
188 * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
189 C_SYNC_SOURCE,
190 C_SYNC_TARGET,
191 C_VERIFY_S,
192 C_VERIFY_T,
193 C_PAUSED_SYNC_S,
194 C_PAUSED_SYNC_T,
195 C_MASK = 31
196};
197
198enum drbd_disk_state {
199 D_DISKLESS,
200 D_ATTACHING, /* In the process of reading the meta-data */
201 D_FAILED, /* Becomes D_DISKLESS as soon as we have told the peer */
202 /* when >= D_FAILED it is legal to access mdev->bc */
203 D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
204 D_INCONSISTENT,
205 D_OUTDATED,
206 D_UNKNOWN, /* Only used for the peer, never for myself */
207 D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
208 D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */
209 D_MASK = 15
210};
211
212union drbd_state {
213/* According to gcc's docs is the ...
214 * The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1).
215 * Determined by ABI.
216 * pointed out by Maxim Uvarov q<muvarov@ru.mvista.com>
217 * even though we transmit as "cpu_to_be32(state)",
218 * the offsets of the bitfields still need to be swapped
219 * on different endianness.
220 */
221 struct {
222#if defined(__LITTLE_ENDIAN_BITFIELD)
223 unsigned role:2 ; /* 3/4 primary/secondary/unknown */
224 unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
225 unsigned conn:5 ; /* 17/32 cstates */
226 unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
227 unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
228 unsigned susp:1 ; /* 2/2 IO suspended no/yes */
229 unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
230 unsigned peer_isp:1 ;
231 unsigned user_isp:1 ;
232 unsigned _pad:11; /* 0 unused */
233#elif defined(__BIG_ENDIAN_BITFIELD)
234 unsigned _pad:11; /* 0 unused */
235 unsigned user_isp:1 ;
236 unsigned peer_isp:1 ;
237 unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
238 unsigned susp:1 ; /* 2/2 IO suspended no/yes */
239 unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
240 unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
241 unsigned conn:5 ; /* 17/32 cstates */
242 unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
243 unsigned role:2 ; /* 3/4 primary/secondary/unknown */
244#else
245# error "this endianess is not supported"
246#endif
247 };
248 unsigned int i;
249};
250
251enum drbd_state_ret_codes {
252 SS_CW_NO_NEED = 4,
253 SS_CW_SUCCESS = 3,
254 SS_NOTHING_TO_DO = 2,
255 SS_SUCCESS = 1,
256 SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
257 SS_TWO_PRIMARIES = -1,
258 SS_NO_UP_TO_DATE_DISK = -2,
259 SS_NO_LOCAL_DISK = -4,
260 SS_NO_REMOTE_DISK = -5,
261 SS_CONNECTED_OUTDATES = -6,
262 SS_PRIMARY_NOP = -7,
263 SS_RESYNC_RUNNING = -8,
264 SS_ALREADY_STANDALONE = -9,
265 SS_CW_FAILED_BY_PEER = -10,
266 SS_IS_DISKLESS = -11,
267 SS_DEVICE_IN_USE = -12,
268 SS_NO_NET_CONFIG = -13,
269 SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
270 SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
271 SS_LOWER_THAN_OUTDATED = -16,
272 SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
273 SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
274 SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
275 SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
276};
277
278/* from drbd_strings.c */
279extern const char *drbd_conn_str(enum drbd_conns);
280extern const char *drbd_role_str(enum drbd_role);
281extern const char *drbd_disk_str(enum drbd_disk_state);
282extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
283
284#define SHARED_SECRET_MAX 64
285
286#define MDF_CONSISTENT (1 << 0)
287#define MDF_PRIMARY_IND (1 << 1)
288#define MDF_CONNECTED_IND (1 << 2)
289#define MDF_FULL_SYNC (1 << 3)
290#define MDF_WAS_UP_TO_DATE (1 << 4)
291#define MDF_PEER_OUT_DATED (1 << 5)
292#define MDF_CRASHED_PRIMARY (1 << 6)
293
294enum drbd_uuid_index {
295 UI_CURRENT,
296 UI_BITMAP,
297 UI_HISTORY_START,
298 UI_HISTORY_END,
299 UI_SIZE, /* nl-packet: number of dirty bits */
300 UI_FLAGS, /* nl-packet: flags */
301 UI_EXTENDED_SIZE /* Everything. */
302};
303
304enum drbd_timeout_flag {
305 UT_DEFAULT = 0,
306 UT_DEGRADED = 1,
307 UT_PEER_OUTDATED = 2,
308};
309
310#define UUID_JUST_CREATED ((__u64)4)
311
312#define DRBD_MAGIC 0x83740267
313#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
314
315/* these are of type "int" */
316#define DRBD_MD_INDEX_INTERNAL -1
317#define DRBD_MD_INDEX_FLEX_EXT -2
318#define DRBD_MD_INDEX_FLEX_INT -3
319
320/* Start of the new netlink/connector stuff */
321
322#define DRBD_NL_CREATE_DEVICE 0x01
323#define DRBD_NL_SET_DEFAULTS 0x02
324
325
326/* For searching a vacant cn_idx value */
327#define CN_IDX_STEP 6977
328
329struct drbd_nl_cfg_req {
330 int packet_type;
331 unsigned int drbd_minor;
332 int flags;
333 unsigned short tag_list[];
334};
335
336struct drbd_nl_cfg_reply {
337 int packet_type;
338 unsigned int minor;
339 int ret_code; /* enum ret_code or set_st_err_t */
340 unsigned short tag_list[]; /* only used with get_* calls */
341};
342
343#endif
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
new file mode 100644
index 000000000000..51f47a586ad8
--- /dev/null
+++ b/include/linux/drbd_limits.h
@@ -0,0 +1,137 @@
1/*
2 drbd_limits.h
3 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
4*/
5
6/*
7 * Our current limitations.
8 * Some of them are hard limits,
9 * some of them are arbitrary range limits, that make it easier to provide
10 * feedback about nonsense settings for certain configurable values.
11 */
12
13#ifndef DRBD_LIMITS_H
14#define DRBD_LIMITS_H 1
15
16#define DEBUG_RANGE_CHECK 0
17
18#define DRBD_MINOR_COUNT_MIN 1
19#define DRBD_MINOR_COUNT_MAX 255
20
21#define DRBD_DIALOG_REFRESH_MIN 0
22#define DRBD_DIALOG_REFRESH_MAX 600
23
24/* valid port number */
25#define DRBD_PORT_MIN 1
26#define DRBD_PORT_MAX 0xffff
27
28/* startup { */
29 /* if you want more than 3.4 days, disable */
30#define DRBD_WFC_TIMEOUT_MIN 0
31#define DRBD_WFC_TIMEOUT_MAX 300000
32#define DRBD_WFC_TIMEOUT_DEF 0
33
34#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
35#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
36#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
37
38#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
39#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
40#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
41/* }*/
42
43/* net { */
44 /* timeout, unit centi seconds
45 * more than one minute timeout is not useful */
46#define DRBD_TIMEOUT_MIN 1
47#define DRBD_TIMEOUT_MAX 600
48#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
49
50 /* active connection retries when C_WF_CONNECTION */
51#define DRBD_CONNECT_INT_MIN 1
52#define DRBD_CONNECT_INT_MAX 120
53#define DRBD_CONNECT_INT_DEF 10 /* seconds */
54
55 /* keep-alive probes when idle */
56#define DRBD_PING_INT_MIN 1
57#define DRBD_PING_INT_MAX 120
58#define DRBD_PING_INT_DEF 10
59
60 /* timeout for the ping packets.*/
61#define DRBD_PING_TIMEO_MIN 1
62#define DRBD_PING_TIMEO_MAX 100
63#define DRBD_PING_TIMEO_DEF 5
64
65 /* max number of write requests between write barriers */
66#define DRBD_MAX_EPOCH_SIZE_MIN 1
67#define DRBD_MAX_EPOCH_SIZE_MAX 20000
68#define DRBD_MAX_EPOCH_SIZE_DEF 2048
69
70 /* I don't think that a tcp send buffer of more than 10M is useful */
71#define DRBD_SNDBUF_SIZE_MIN 0
72#define DRBD_SNDBUF_SIZE_MAX (10<<20)
73#define DRBD_SNDBUF_SIZE_DEF 0
74
75#define DRBD_RCVBUF_SIZE_MIN 0
76#define DRBD_RCVBUF_SIZE_MAX (10<<20)
77#define DRBD_RCVBUF_SIZE_DEF 0
78
79 /* @4k PageSize -> 128kB - 512MB */
80#define DRBD_MAX_BUFFERS_MIN 32
81#define DRBD_MAX_BUFFERS_MAX 131072
82#define DRBD_MAX_BUFFERS_DEF 2048
83
84 /* @4k PageSize -> 4kB - 512MB */
85#define DRBD_UNPLUG_WATERMARK_MIN 1
86#define DRBD_UNPLUG_WATERMARK_MAX 131072
87#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
88
89 /* 0 is disabled.
90 * 200 should be more than enough even for very short timeouts */
91#define DRBD_KO_COUNT_MIN 0
92#define DRBD_KO_COUNT_MAX 200
93#define DRBD_KO_COUNT_DEF 0
94/* } */
95
96/* syncer { */
97 /* FIXME allow rate to be zero? */
98#define DRBD_RATE_MIN 1
99/* channel bonding 10 GbE, or other hardware */
100#define DRBD_RATE_MAX (4 << 20)
101#define DRBD_RATE_DEF 250 /* kb/second */
102
103 /* less than 7 would hit performance unnecessarily.
104 * 3833 is the largest prime that still does fit
105 * into 64 sectors of activity log */
106#define DRBD_AL_EXTENTS_MIN 7
107#define DRBD_AL_EXTENTS_MAX 3833
108#define DRBD_AL_EXTENTS_DEF 127
109
110#define DRBD_AFTER_MIN -1
111#define DRBD_AFTER_MAX 255
112#define DRBD_AFTER_DEF -1
113
114/* } */
115
116/* drbdsetup XY resize -d Z
117 * you are free to reduce the device size to nothing, if you want to.
118 * the upper limit with 64bit kernel, enough ram and flexible meta data
119 * is 16 TB, currently. */
120/* DRBD_MAX_SECTORS */
121#define DRBD_DISK_SIZE_SECT_MIN 0
122#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
123#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
124
125#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
126#define DRBD_FENCING_DEF FP_DONT_CARE
127#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
128#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
129#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
130#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
131
132#define DRBD_MAX_BIO_BVECS_MIN 0
133#define DRBD_MAX_BIO_BVECS_MAX 128
134#define DRBD_MAX_BIO_BVECS_DEF 0
135
136#undef RANGE
137#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
new file mode 100644
index 000000000000..db5721ad50d1
--- /dev/null
+++ b/include/linux/drbd_nl.h
@@ -0,0 +1,137 @@
1/*
2 PAKET( name,
3 TYPE ( pn, pr, member )
4 ...
5 )
6
7 You may never reissue one of the pn arguments
8*/
9
10#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
11#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
12#endif
13
14NL_PACKET(primary, 1,
15 NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
16)
17
18NL_PACKET(secondary, 2, )
19
20NL_PACKET(disk_conf, 3,
21 NL_INT64( 2, T_MAY_IGNORE, disk_size)
22 NL_STRING( 3, T_MANDATORY, backing_dev, 128)
23 NL_STRING( 4, T_MANDATORY, meta_dev, 128)
24 NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
25 NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
26 NL_INTEGER( 7, T_MAY_IGNORE, fencing)
27 NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
28 NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
29 NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
30 /* 55 max_bio_size was available in 8.2.6rc2 */
31 NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
32 NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
33 NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
34)
35
36NL_PACKET(detach, 4, )
37
38NL_PACKET(net_conf, 5,
39 NL_STRING( 8, T_MANDATORY, my_addr, 128)
40 NL_STRING( 9, T_MANDATORY, peer_addr, 128)
41 NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
42 NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
43 NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
44 NL_INTEGER( 14, T_MAY_IGNORE, timeout)
45 NL_INTEGER( 15, T_MANDATORY, wire_protocol)
46 NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
47 NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
48 NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
49 NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
50 NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
51 NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
52 NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
53 NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
54 NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
55 NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
56 NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
57 NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
58 NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
59 /* 59 addr_family was available in GIT, never released */
60 NL_BIT( 60, T_MANDATORY, mind_af)
61 NL_BIT( 27, T_MAY_IGNORE, want_lose)
62 NL_BIT( 28, T_MAY_IGNORE, two_primaries)
63 NL_BIT( 41, T_MAY_IGNORE, always_asbp)
64 NL_BIT( 61, T_MAY_IGNORE, no_cork)
65 NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
66)
67
68NL_PACKET(disconnect, 6, )
69
70NL_PACKET(resize, 7,
71 NL_INT64( 29, T_MAY_IGNORE, resize_size)
72)
73
74NL_PACKET(syncer_conf, 8,
75 NL_INTEGER( 30, T_MAY_IGNORE, rate)
76 NL_INTEGER( 31, T_MAY_IGNORE, after)
77 NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
78 NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
79 NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
80 NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
81 NL_BIT( 65, T_MAY_IGNORE, use_rle)
82)
83
84NL_PACKET(invalidate, 9, )
85NL_PACKET(invalidate_peer, 10, )
86NL_PACKET(pause_sync, 11, )
87NL_PACKET(resume_sync, 12, )
88NL_PACKET(suspend_io, 13, )
89NL_PACKET(resume_io, 14, )
90NL_PACKET(outdate, 15, )
91NL_PACKET(get_config, 16, )
92NL_PACKET(get_state, 17,
93 NL_INTEGER( 33, T_MAY_IGNORE, state_i)
94)
95
96NL_PACKET(get_uuids, 18,
97 NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
98 NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
99)
100
101NL_PACKET(get_timeout_flag, 19,
102 NL_BIT( 36, T_MAY_IGNORE, use_degraded)
103)
104
105NL_PACKET(call_helper, 20,
106 NL_STRING( 38, T_MAY_IGNORE, helper, 32)
107)
108
109/* Tag nr 42 already allocated in drbd-8.1 development. */
110
111NL_PACKET(sync_progress, 23,
112 NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
113)
114
115NL_PACKET(dump_ee, 24,
116 NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
117 NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
118 NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
119 NL_INT64( 48, T_MAY_IGNORE, ee_sector)
120 NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
121 NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
122)
123
124NL_PACKET(start_ov, 25,
125 NL_INT64( 66, T_MAY_IGNORE, start_sector)
126)
127
128NL_PACKET(new_c_uuid, 26,
129 NL_BIT( 63, T_MANDATORY, clear_bm)
130)
131
132#undef NL_PACKET
133#undef NL_INTEGER
134#undef NL_INT64
135#undef NL_BIT
136#undef NL_STRING
137
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
new file mode 100644
index 000000000000..fcdff8410e99
--- /dev/null
+++ b/include/linux/drbd_tag_magic.h
@@ -0,0 +1,83 @@
1#ifndef DRBD_TAG_MAGIC_H
2#define DRBD_TAG_MAGIC_H
3
4#define TT_END 0
5#define TT_REMOVED 0xE000
6
7/* declare packet_type enums */
8enum packet_types {
9#define NL_PACKET(name, number, fields) P_ ## name = number,
10#define NL_INTEGER(pn, pr, member)
11#define NL_INT64(pn, pr, member)
12#define NL_BIT(pn, pr, member)
13#define NL_STRING(pn, pr, member, len)
14#include "drbd_nl.h"
15 P_nl_after_last_packet,
16};
17
18/* These structs are used to deduce the size of the tag lists: */
19#define NL_PACKET(name, number, fields) \
20 struct name ## _tag_len_struct { fields };
21#define NL_INTEGER(pn, pr, member) \
22 int member; int tag_and_len ## member;
23#define NL_INT64(pn, pr, member) \
24 __u64 member; int tag_and_len ## member;
25#define NL_BIT(pn, pr, member) \
26 unsigned char member:1; int tag_and_len ## member;
27#define NL_STRING(pn, pr, member, len) \
28 unsigned char member[len]; int member ## _len; \
29 int tag_and_len ## member;
30#include "linux/drbd_nl.h"
31
32/* declare tag-list-sizes */
33static const int tag_list_sizes[] = {
34#define NL_PACKET(name, number, fields) 2 fields ,
35#define NL_INTEGER(pn, pr, member) + 4 + 4
36#define NL_INT64(pn, pr, member) + 4 + 8
37#define NL_BIT(pn, pr, member) + 4 + 1
38#define NL_STRING(pn, pr, member, len) + 4 + (len)
39#include "drbd_nl.h"
40};
41
42/* The two highest bits are used for the tag type */
43#define TT_MASK 0xC000
44#define TT_INTEGER 0x0000
45#define TT_INT64 0x4000
46#define TT_BIT 0x8000
47#define TT_STRING 0xC000
48/* The next bit indicates if processing of the tag is mandatory */
49#define T_MANDATORY 0x2000
50#define T_MAY_IGNORE 0x0000
51#define TN_MASK 0x1fff
52/* The remaining 13 bits are used to enumerate the tags */
53
54#define tag_type(T) ((T) & TT_MASK)
55#define tag_number(T) ((T) & TN_MASK)
56
57/* declare tag enums */
58#define NL_PACKET(name, number, fields) fields
59enum drbd_tags {
60#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
61#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
62#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
63#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
64#include "drbd_nl.h"
65};
66
67struct tag {
68 const char *name;
69 int type_n_flags;
70 int max_len;
71};
72
73/* declare tag names */
74#define NL_PACKET(name, number, fields) fields
75static const struct tag tag_descriptions[] = {
76#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
77#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
78#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
79#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
80#include "drbd_nl.h"
81};
82
83#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2620a8c63571..891f7d642e5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -129,7 +129,7 @@ struct inodes_stat_t {
129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device 129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device
130 * immediately after submission. The write equivalent 130 * immediately after submission. The write equivalent
131 * of READ_SYNC. 131 * of READ_SYNC.
132 * WRITE_ODIRECT Special case write for O_DIRECT only. 132 * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
133 * SWRITE_SYNC 133 * SWRITE_SYNC
134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. 134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
135 * See SWRITE. 135 * See SWRITE.
@@ -151,7 +151,7 @@ struct inodes_stat_t {
151#define READ_META (READ | (1 << BIO_RW_META)) 151#define READ_META (READ | (1 << BIO_RW_META))
152#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 152#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
153#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 153#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
154#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) 154#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO))
155#define SWRITE_SYNC_PLUG \ 155#define SWRITE_SYNC_PLUG \
156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
157#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 157#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
@@ -304,6 +304,7 @@ struct inodes_stat_t {
304#define BLKIOOPT _IO(0x12,121) 304#define BLKIOOPT _IO(0x12,121)
305#define BLKALIGNOFF _IO(0x12,122) 305#define BLKALIGNOFF _IO(0x12,122)
306#define BLKPBSZGET _IO(0x12,123) 306#define BLKPBSZGET _IO(0x12,123)
307#define BLKDISCARDZEROES _IO(0x12,124)
307 308
308#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 309#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
309#define FIBMAP _IO(0x00,1) /* bmap access */ 310#define FIBMAP _IO(0x00,1) /* bmap access */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45ffd0a..c6c0c41af35f 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -91,6 +91,7 @@ struct hd_struct {
91 sector_t start_sect; 91 sector_t start_sect;
92 sector_t nr_sects; 92 sector_t nr_sects;
93 sector_t alignment_offset; 93 sector_t alignment_offset;
94 unsigned int discard_alignment;
94 struct device __dev; 95 struct device __dev;
95 struct kobject *holder_dir; 96 struct kobject *holder_dir;
96 int policy, partno; 97 int policy, partno;
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 4da4a75c3f1e..a63235996309 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -40,16 +40,11 @@ struct cfq_io_context {
40 struct io_context *ioc; 40 struct io_context *ioc;
41 41
42 unsigned long last_end_request; 42 unsigned long last_end_request;
43 sector_t last_request_pos;
44 43
45 unsigned long ttime_total; 44 unsigned long ttime_total;
46 unsigned long ttime_samples; 45 unsigned long ttime_samples;
47 unsigned long ttime_mean; 46 unsigned long ttime_mean;
48 47
49 unsigned int seek_samples;
50 u64 seek_total;
51 sector_t seek_mean;
52
53 struct list_head queue_list; 48 struct list_head queue_list;
54 struct hlist_node cic_list; 49 struct hlist_node cic_list;
55 50
@@ -73,6 +68,10 @@ struct io_context {
73 unsigned short ioprio; 68 unsigned short ioprio;
74 unsigned short ioprio_changed; 69 unsigned short ioprio_changed;
75 70
71#ifdef CONFIG_BLK_CGROUP
72 unsigned short cgroup_changed;
73#endif
74
76 /* 75 /*
77 * For request batching 76 * For request batching
78 */ 77 */
@@ -99,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
99 return NULL; 98 return NULL;
100} 99}
101 100
101struct task_struct;
102#ifdef CONFIG_BLOCK 102#ifdef CONFIG_BLOCK
103int put_io_context(struct io_context *ioc); 103int put_io_context(struct io_context *ioc);
104void exit_io_context(void); 104void exit_io_context(struct task_struct *task);
105struct io_context *get_io_context(gfp_t gfp_flags, int node); 105struct io_context *get_io_context(gfp_t gfp_flags, int node);
106struct io_context *alloc_io_context(gfp_t gfp_flags, int node); 106struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
107void copy_io_context(struct io_context **pdst, struct io_context **psrc); 107void copy_io_context(struct io_context **pdst, struct io_context **psrc);
108#else 108#else
109static inline void exit_io_context(void) 109static inline void exit_io_context(struct task_struct *task)
110{ 110{
111} 111}
112 112
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
new file mode 100644
index 000000000000..3a2b2d9b0472
--- /dev/null
+++ b/include/linux/lru_cache.h
@@ -0,0 +1,294 @@
1/*
2 lru_cache.h
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#ifndef LRU_CACHE_H
27#define LRU_CACHE_H
28
29#include <linux/list.h>
30#include <linux/slab.h>
31#include <linux/bitops.h>
32#include <linux/string.h> /* for memset */
33#include <linux/seq_file.h>
34
35/*
36This header file (and its .c file; kernel-doc of functions see there)
37 define a helper framework to easily keep track of index:label associations,
38 and changes to an "active set" of objects, as well as pending transactions,
39 to persistently record those changes.
40
41 We use an LRU policy if it is necessary to "cool down" a region currently in
42 the active set before we can "heat" a previously unused region.
43
44 Because of this latter property, it is called "lru_cache".
45 As it actually Tracks Objects in an Active SeT, we could also call it
46 toast (incidentally that is what may happen to the data on the
47 backend storage upon next resync, if we don't get it right).
48
49What for?
50
51We replicate IO (more or less synchronously) to local and remote disk.
52
53For crash recovery after replication node failure,
54 we need to resync all regions that have been target of in-flight WRITE IO
55 (in use, or "hot", regions), as we don't know whether or not those WRITEs have
56 made it to stable storage.
57
58 To avoid a "full resync", we need to persistently track these regions.
59
60 This is known as "write intent log", and can be implemented as on-disk
61 (coarse or fine grained) bitmap, or other meta data.
62
63 To avoid the overhead of frequent extra writes to this meta data area,
64 usually the condition is softened to regions that _may_ have been target of
65 in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
66 bitmap, trading frequency of meta data transactions against amount of
67 (possibly unnecessary) resync traffic.
68
69 If we set a hard limit on the area that may be "hot" at any given time, we
70 limit the amount of resync traffic needed for crash recovery.
71
72For recovery after replication link failure,
73 we need to resync all blocks that have been changed on the other replica
74 in the mean time, or, if both replicas have been changed independently [*],
75 all blocks that have been changed on either replica in the mean time.
76 [*] usually as a result of a cluster split-brain and insufficient protection.
77 but there are valid use cases to do this on purpose.
78
79 Tracking those blocks can be implemented as "dirty bitmap".
80 Having it fine-grained reduces the amount of resync traffic.
81 It should also be persistent, to allow for reboots (or crashes)
82 while the replication link is down.
83
84There are various possible implementations for persistently storing
85write intent log information, three of which are mentioned here.
86
87"Chunk dirtying"
88 The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
89 To reduce the frequency of bitmap updates for write-intent log purposes,
90 one could dirty "chunks" (of some size) at a time of the (fine grained)
91 on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
92 possible, flushing it to disk again when a previously "hot" (and on-disk
93 dirtied as full chunk) area "cools down" again (no IO in flight anymore,
94 and none expected in the near future either).
95
96"Explicit (coarse) write intent bitmap"
97 Another implementation could choose a (probably coarse) explicit bitmap,
98 for write-intent log purposes, additionally to the fine grained dirty bitmap.
99
100"Activity log"
101 Yet another implementation may keep track of the hot regions, by starting
102 with an empty set, and writing down a journal of region numbers that have
103 become "hot", or have "cooled down" again.
104
105 To be able to use a ring buffer for this journal of changes to the active
106 set, we not only record the actual changes to that set, but also record the
107 not changing members of the set in a round robin fashion. To do so, we use a
108 fixed (but configurable) number of slots which we can identify by index, and
109 associate region numbers (labels) with these indices.
110 For each transaction recording a change to the active set, we record the
111 change itself (index: -old_label, +new_label), and which index is associated
112 with which label (index: current_label) within a certain sliding window that
113 is moved further over the available indices with each such transaction.
114
115 Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
116 accurately reconstruct the active set.
117
118 Sufficiently large depends only on maximum number of active objects, and the
119 size of the sliding window recording "index: current_label" associations within
120 each transaction.
121
122 This is what we call the "activity log".
123
124 Currently we need one activity log transaction per single label change, which
125 does not give much benefit over the "dirty chunks of bitmap" approach, other
126 than potentially less seeks.
127
128 We plan to change the transaction format to support multiple changes per
129 transaction, which then would reduce several (disjoint, "random") updates to
130 the bitmap into one transaction to the activity log ring buffer.
131*/
132
133/* this defines an element in a tracked set
134 * .colision is for hash table lookup.
135 * When we process a new IO request, we know its sector, thus can deduce the
136 * region number (label) easily. To do the label -> object lookup without a
137 * full list walk, we use a simple hash table.
138 *
139 * .list is on one of three lists:
140 * in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
141 * lru: unused but ready to be reused or recycled
142 * (refcnt == 0, lc_number != LC_FREE),
143 * free: unused but ready to be recycled
144 * (refcnt == 0, lc_number == LC_FREE),
145 *
146 * an element is said to be "in the active set",
147 * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
148 *
149 * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
150 * (total memory usage 2 pages), and up to 3833 elements on the act_log
151 * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages.
152 *
153 * We usually do not actually free these objects again, but only "recycle"
154 * them, as the change "index: -old_label, +LC_FREE" would need a transaction
155 * as well. Which also means that using a kmem_cache to allocate the objects
156 * from wastes some resources.
157 * But it avoids high order page allocations in kmalloc.
158 */
159struct lc_element {
160 struct hlist_node colision;
161 struct list_head list; /* on one of the in_use, lru or free lists */
162 unsigned refcnt;
163 /* back "pointer" into lru_cache->lc_element[index], for paranoia, and
164 * for "ts_element_to_index" (stale name; presumably lc_index_of()) */
165 unsigned lc_index;
166 /* if we want to track a larger set of objects,
167 * it needs to become arch independent u64 */
168 unsigned lc_number;
169
170 /* special label when on free list */
171#define LC_FREE (~0U)
172};
173
174struct lru_cache {
175 /* the least recently used item is kept at lru->prev */
176 struct list_head lru;
177 struct list_head free;
178 struct list_head in_use;
179
180 /* the pre-created kmem cache to allocate the objects from */
181 struct kmem_cache *lc_cache;
182
183 /* size of tracked objects, used to memset(,0,) them in lc_reset */
184 size_t element_size;
185 /* offset of struct lc_element member in the tracked object */
186 size_t element_off;
187
188 /* number of elements (indices) */
189 unsigned int nr_elements;
190 /* Arbitrary limit on maximum tracked objects. Practical limit is much
191 * lower due to allocation failures, probably. For typical use cases,
192 * nr_elements should be a few thousand at most.
193 * This also limits the maximum value of lc_element.lc_index, allowing the
194 * 8 high bits of .lc_index to be overloaded with flags in the future. */
195#define LC_MAX_ACTIVE (1<<24)
196
197 /* statistics */
198 unsigned used; /* number of elements currently on in_use list */
199 unsigned long hits, misses, starving, dirty, changed;
200
201 /* see below: flag-bits for lru_cache */
202 unsigned long flags;
203
204 /* when changing the label of an index element */
205 unsigned int new_number;
206
207 /* for paranoia when changing the label of an index element */
208 struct lc_element *changing_element;
209
210 void *lc_private;
211 const char *name;
212
213 /* nr_elements there */
214 struct hlist_head *lc_slot;
215 struct lc_element **lc_element;
216};
217
218
219/* flag-bits for lru_cache */
220enum {
221 /* debugging aid, to catch concurrent access early.
222 * user needs to guarantee exclusive access by proper locking! */
223 __LC_PARANOIA,
224 /* if we need to change the set, but currently there is a changing
225 * transaction pending, we are "dirty", and must defer further
226 * changing requests */
227 __LC_DIRTY,
228 /* if we need to change the set, but currently there is no free nor
229 * unused element available, we are "starving", and must not give out
230 * further references, to guarantee that eventually some refcnt will
231 * drop to zero and we will be able to make progress again, changing
232 * the set, writing the transaction.
233 * if the statistics say we are frequently starving,
234 * nr_elements is too small. */
235 __LC_STARVING,
236};
237#define LC_PARANOIA (1<<__LC_PARANOIA)
238#define LC_DIRTY (1<<__LC_DIRTY)
239#define LC_STARVING (1<<__LC_STARVING)
240
241extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
242 unsigned e_count, size_t e_size, size_t e_off);
243extern void lc_reset(struct lru_cache *lc);
244extern void lc_destroy(struct lru_cache *lc);
245extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
246extern void lc_del(struct lru_cache *lc, struct lc_element *element);
247
248extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
249extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
250extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
251extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
252extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
253
254struct seq_file;
255extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
256
257extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
258 void (*detail) (struct seq_file *, struct lc_element *));
259
260/**
261 * lc_try_lock - can be used to stop lc_get() from changing the tracked set
262 * @lc: the lru cache to operate on
263 *
264 * Note that the reference counts and order on the active and lru lists may
265 * still change. Returns true if we acquired the lock.
266 */
267static inline int lc_try_lock(struct lru_cache *lc)
268{
269 return !test_and_set_bit(__LC_DIRTY, &lc->flags);
270}
271
272/**
273 * lc_unlock - unlock @lc, allow lc_get() to change the set again
274 * @lc: the lru cache to operate on
275 */
276static inline void lc_unlock(struct lru_cache *lc)
277{
278 clear_bit(__LC_DIRTY, &lc->flags);
279 smp_mb__after_clear_bit();
280}
281
282static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) /* is label @enr referenced? */
283{
284 struct lc_element *e = lc_find(lc, enr); /* look up the element labelled enr */
285 return e && e->refcnt; /* 1 only if an element was found and its refcnt is non-zero */
286}
287
288#define lc_entry(ptr, type, member) \
289 container_of(ptr, type, member)
290
291extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
292extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
293
294#endif
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 66ebddcff664..705f01fe413a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -49,6 +49,7 @@ struct writeback_control {
49 unsigned nonblocking:1; /* Don't get stuck on request queues */ 49 unsigned nonblocking:1; /* Don't get stuck on request queues */
50 unsigned encountered_congestion:1; /* An output: a queue is full */ 50 unsigned encountered_congestion:1; /* An output: a queue is full */
51 unsigned for_kupdate:1; /* A kupdate writeback */ 51 unsigned for_kupdate:1; /* A kupdate writeback */
52 unsigned for_background:1; /* A background writeback */
52 unsigned for_reclaim:1; /* Invoked from the page allocator */ 53 unsigned for_reclaim:1; /* Invoked from the page allocator */
53 unsigned range_cyclic:1; /* range_start is cyclic */ 54 unsigned range_cyclic:1; /* range_start is cyclic */
54 unsigned more_io:1; /* more io to be dispatched */ 55 unsigned more_io:1; /* more io to be dispatched */
diff --git a/kernel/exit.c b/kernel/exit.c
index 80ae941cfd2e..1143012951e9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1009,7 +1009,7 @@ NORET_TYPE void do_exit(long code)
1009 tsk->flags |= PF_EXITPIDONE; 1009 tsk->flags |= PF_EXITPIDONE;
1010 1010
1011 if (tsk->io_context) 1011 if (tsk->io_context)
1012 exit_io_context(); 1012 exit_io_context(tsk);
1013 1013
1014 if (tsk->splice_pipe) 1014 if (tsk->splice_pipe)
1015 __free_pipe_info(tsk->splice_pipe); 1015 __free_pipe_info(tsk->splice_pipe);
diff --git a/kernel/fork.c b/kernel/fork.c
index edeff9ceaab9..1415dc4598ae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1317,7 +1317,8 @@ bad_fork_free_pid:
1317 if (pid != &init_struct_pid) 1317 if (pid != &init_struct_pid)
1318 free_pid(pid); 1318 free_pid(pid);
1319bad_fork_cleanup_io: 1319bad_fork_cleanup_io:
1320 put_io_context(p->io_context); 1320 if (p->io_context)
1321 exit_io_context(p);
1321bad_fork_cleanup_namespaces: 1322bad_fork_cleanup_namespaces:
1322 exit_task_namespaces(p); 1323 exit_task_namespaces(p);
1323bad_fork_cleanup_mm: 1324bad_fork_cleanup_mm:
diff --git a/lib/Kconfig b/lib/Kconfig
index bb1326d3839c..1cfe51628e1b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -200,4 +200,7 @@ config NLATTR
200config GENERIC_ATOMIC64 200config GENERIC_ATOMIC64
201 bool 201 bool
202 202
203config LRU_CACHE
204 tristate
205
203endmenu 206endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277eff9d..347ad8db29d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
91 91
92obj-$(CONFIG_NLATTR) += nlattr.o 92obj-$(CONFIG_NLATTR) += nlattr.o
93 93
94obj-$(CONFIG_LRU_CACHE) += lru_cache.o
95
94obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o 96obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
95 97
96obj-$(CONFIG_GENERIC_CSUM) += checksum.o 98obj-$(CONFIG_GENERIC_CSUM) += checksum.o
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
new file mode 100644
index 000000000000..270de9d31b8c
--- /dev/null
+++ b/lib/lru_cache.c
@@ -0,0 +1,560 @@
1/*
2 lru_cache.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
26#include <linux/module.h>
27#include <linux/bitops.h>
28#include <linux/slab.h>
29#include <linux/string.h> /* for memset */
30#include <linux/seq_file.h> /* for seq_printf */
31#include <linux/lru_cache.h>
32
33MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
34 "Lars Ellenberg <lars@linbit.com>");
35MODULE_DESCRIPTION("lru_cache - Track sets of hot objects");
36MODULE_LICENSE("GPL");
37
/*
 * Developer aid only.
 * Trips a BUG() when the cache is entered while the __LC_PARANOIA marker is
 * still set, i.e. when the user failed to serialize access to the cache.
 * Operates on the variable named "lc" of the calling scope.
 */
#define PARANOIA_ENTRY()					\
do {								\
	BUG_ON(!lc);						\
	BUG_ON(!lc->nr_elements);				\
	BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags));	\
} while (0)
45
/*
 * Counterpart of PARANOIA_ENTRY(): clears the __LC_PARANOIA marker on the
 * "lc" of the calling scope, then returns x (which may be left empty).
 */
#define RETURN(x...)						\
do {								\
	clear_bit(__LC_PARANOIA, &lc->flags);			\
	smp_mb__after_clear_bit();				\
	return x;						\
} while (0)
49
/*
 * BUG() unless e is exactly the element that lc records at index
 * e->lc_index, i.e. unless e is one of the elements tracked by lc.
 */
#define PARANOIA_LC_ELEMENT(lc, e)				\
do {								\
	struct lru_cache *lc_ = (lc);				\
	struct lc_element *e_ = (e);				\
	unsigned idx_ = e_->lc_index;				\
	BUG_ON(idx_ >= lc_->nr_elements);			\
	BUG_ON(lc_->lc_element[idx_] != e_);			\
} while (0)
57
58/**
59 * lc_create - prepares to track objects in an active set
60 * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
61 * @e_count: number of elements allowed to be active simultaneously
62 * @e_size: size of the tracked objects
63 * @e_off: offset to the &struct lc_element member in a tracked object
64 *
65 * Returns a pointer to a newly initialized struct lru_cache on success,
66 * or NULL on (allocation) failure.
67 */
68struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
69 unsigned e_count, size_t e_size, size_t e_off)
70{
71 struct hlist_head *slot = NULL;
72 struct lc_element **element = NULL;
73 struct lru_cache *lc;
74 struct lc_element *e;
75 unsigned cache_obj_size = kmem_cache_size(cache);
76 unsigned i;
77
78 WARN_ON(cache_obj_size < e_size);
79 if (cache_obj_size < e_size)
80 return NULL;
81
82 /* e_count too big; would probably fail the allocation below anyways.
83 * for typical use cases, e_count should be few thousand at most. */
84 if (e_count > LC_MAX_ACTIVE)
85 return NULL;
86
87 slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL);
88 if (!slot)
89 goto out_fail;
90 element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL);
91 if (!element)
92 goto out_fail;
93
94 lc = kzalloc(sizeof(*lc), GFP_KERNEL);
95 if (!lc)
96 goto out_fail;
97
98 INIT_LIST_HEAD(&lc->in_use);
99 INIT_LIST_HEAD(&lc->lru);
100 INIT_LIST_HEAD(&lc->free);
101
102 lc->name = name;
103 lc->element_size = e_size;
104 lc->element_off = e_off;
105 lc->nr_elements = e_count;
106 lc->new_number = LC_FREE;
107 lc->lc_cache = cache;
108 lc->lc_element = element;
109 lc->lc_slot = slot;
110
111 /* preallocate all objects */
112 for (i = 0; i < e_count; i++) {
113 void *p = kmem_cache_alloc(cache, GFP_KERNEL);
114 if (!p)
115 break;
116 memset(p, 0, lc->element_size);
117 e = p + e_off;
118 e->lc_index = i;
119 e->lc_number = LC_FREE;
120 list_add(&e->list, &lc->free);
121 element[i] = e;
122 }
123 if (i == e_count)
124 return lc;
125
126 /* else: could not allocate all elements, give up */
127 for (i--; i; i--) {
128 void *p = element[i];
129 kmem_cache_free(cache, p - e_off);
130 }
131 kfree(lc);
132out_fail:
133 kfree(element);
134 kfree(slot);
135 return NULL;
136}
137
138void lc_free_by_index(struct lru_cache *lc, unsigned i)
139{
140 void *p = lc->lc_element[i];
141 WARN_ON(!p);
142 if (p) {
143 p -= lc->element_off;
144 kmem_cache_free(lc->lc_cache, p);
145 }
146}
147
148/**
149 * lc_destroy - frees memory allocated by lc_create()
150 * @lc: the lru cache to destroy
151 */
152void lc_destroy(struct lru_cache *lc)
153{
154 unsigned i;
155 if (!lc)
156 return;
157 for (i = 0; i < lc->nr_elements; i++)
158 lc_free_by_index(lc, i);
159 kfree(lc->lc_element);
160 kfree(lc->lc_slot);
161 kfree(lc);
162}
163
164/**
165 * lc_reset - does a full reset for @lc and the hash table slots.
166 * @lc: the lru cache to operate on
167 *
168 * It is roughly the equivalent of re-allocating a fresh lru_cache object,
169 * basically a short cut to lc_destroy(lc); lc = lc_create(...);
170 */
171void lc_reset(struct lru_cache *lc)
172{
173 unsigned i;
174
175 INIT_LIST_HEAD(&lc->in_use);
176 INIT_LIST_HEAD(&lc->lru);
177 INIT_LIST_HEAD(&lc->free);
178 lc->used = 0;
179 lc->hits = 0;
180 lc->misses = 0;
181 lc->starving = 0;
182 lc->dirty = 0;
183 lc->changed = 0;
184 lc->flags = 0;
185 lc->changing_element = NULL;
186 lc->new_number = LC_FREE;
187 memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
188
189 for (i = 0; i < lc->nr_elements; i++) {
190 struct lc_element *e = lc->lc_element[i];
191 void *p = e;
192 p -= lc->element_off;
193 memset(p, 0, lc->element_size);
194 /* re-init it */
195 e->lc_index = i;
196 e->lc_number = LC_FREE;
197 list_add(&e->list, &lc->free);
198 }
199}
200
201/**
202 * lc_seq_printf_stats - print stats about @lc into @seq
203 * @seq: the seq_file to print into
204 * @lc: the lru cache to print statistics of
205 */
206size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
207{
208 /* NOTE:
209 * total calls to lc_get are
210 * (starving + hits + misses)
211 * misses include "dirty" count (update from an other thread in
212 * progress) and "changed", when this in fact lead to an successful
213 * update of the cache.
214 */
215 return seq_printf(seq, "\t%s: used:%u/%u "
216 "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
217 lc->name, lc->used, lc->nr_elements,
218 lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
219}
220
221static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
222{
223 return lc->lc_slot + (enr % lc->nr_elements);
224}
225
226
227/**
228 * lc_find - find element by label, if present in the hash table
229 * @lc: The lru_cache object
230 * @enr: element number
231 *
232 * Returns the pointer to an element, if the element with the requested
233 * "label" or element number is present in the hash table,
234 * or NULL if not found. Does not change the refcnt.
235 */
236struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
237{
238 struct hlist_node *n;
239 struct lc_element *e;
240
241 BUG_ON(!lc);
242 BUG_ON(!lc->nr_elements);
243 hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
244 if (e->lc_number == enr)
245 return e;
246 }
247 return NULL;
248}
249
250/* returned element will be "recycled" immediately */
251static struct lc_element *lc_evict(struct lru_cache *lc)
252{
253 struct list_head *n;
254 struct lc_element *e;
255
256 if (list_empty(&lc->lru))
257 return NULL;
258
259 n = lc->lru.prev;
260 e = list_entry(n, struct lc_element, list);
261
262 PARANOIA_LC_ELEMENT(lc, e);
263
264 list_del(&e->list);
265 hlist_del(&e->colision);
266 return e;
267}
268
269/**
270 * lc_del - removes an element from the cache
271 * @lc: The lru_cache object
272 * @e: The element to remove
273 *
274 * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list,
275 * sets @e->enr to %LC_FREE.
276 */
277void lc_del(struct lru_cache *lc, struct lc_element *e)
278{
279 PARANOIA_ENTRY();
280 PARANOIA_LC_ELEMENT(lc, e);
281 BUG_ON(e->refcnt);
282
283 e->lc_number = LC_FREE;
284 hlist_del_init(&e->colision);
285 list_move(&e->list, &lc->free);
286 RETURN();
287}
288
289static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
290{
291 struct list_head *n;
292
293 if (list_empty(&lc->free))
294 return lc_evict(lc);
295
296 n = lc->free.next;
297 list_del(n);
298 return list_entry(n, struct lc_element, list);
299}
300
301static int lc_unused_element_available(struct lru_cache *lc)
302{
303 if (!list_empty(&lc->free))
304 return 1; /* something on the free list */
305 if (!list_empty(&lc->lru))
306 return 1; /* something to evict */
307
308 return 0;
309}
310
311
312/**
313 * lc_get - get element by label, maybe change the active set
314 * @lc: the lru cache to operate on
315 * @enr: the label to look up
316 *
317 * Finds an element in the cache, increases its usage count,
318 * "touches" and returns it.
319 *
320 * In case the requested number is not present, it needs to be added to the
321 * cache. Therefore it is possible that an other element becomes evicted from
322 * the cache. In either case, the user is notified so he is able to e.g. keep
323 * a persistent log of the cache changes, and therefore the objects in use.
324 *
325 * Return values:
326 * NULL
327 * The cache was marked %LC_STARVING,
328 * or the requested label was not in the active set
329 * and a changing transaction is still pending (@lc was marked %LC_DIRTY).
330 * Or no unused or free element could be recycled (@lc will be marked as
331 * %LC_STARVING, blocking further lc_get() operations).
332 *
333 * pointer to the element with the REQUESTED element number.
334 * In this case, it can be used right away
335 *
336 * pointer to an UNUSED element with some different element number,
337 * where that different number may also be %LC_FREE.
338 *
339 * In this case, the cache is marked %LC_DIRTY (blocking further changes),
340 * and the returned element pointer is removed from the lru list and
341 * hash collision chains. The user now should do whatever housekeeping
342 * is necessary.
343 * Then he must call lc_changed(lc,element_pointer), to finish
344 * the change.
345 *
346 * NOTE: The user needs to check the lc_number on EACH use, so he recognizes
347 * any cache set change.
348 */
349struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
350{
351 struct lc_element *e;
352
353 PARANOIA_ENTRY();
354 if (lc->flags & LC_STARVING) {
355 ++lc->starving;
356 RETURN(NULL);
357 }
358
359 e = lc_find(lc, enr);
360 if (e) {
361 ++lc->hits;
362 if (e->refcnt++ == 0)
363 lc->used++;
364 list_move(&e->list, &lc->in_use); /* Not evictable... */
365 RETURN(e);
366 }
367
368 ++lc->misses;
369
370 /* In case there is nothing available and we can not kick out
371 * the LRU element, we have to wait ...
372 */
373 if (!lc_unused_element_available(lc)) {
374 __set_bit(__LC_STARVING, &lc->flags);
375 RETURN(NULL);
376 }
377
378 /* it was not present in the active set.
379 * we are going to recycle an unused (or even "free") element.
380 * user may need to commit a transaction to record that change.
381 * we serialize on flags & TF_DIRTY */
382 if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
383 ++lc->dirty;
384 RETURN(NULL);
385 }
386
387 e = lc_get_unused_element(lc);
388 BUG_ON(!e);
389
390 clear_bit(__LC_STARVING, &lc->flags);
391 BUG_ON(++e->refcnt != 1);
392 lc->used++;
393
394 lc->changing_element = e;
395 lc->new_number = enr;
396
397 RETURN(e);
398}
399
400/* similar to lc_get,
401 * but only gets a new reference on an existing element.
402 * you either get the requested element, or NULL.
403 * will be consolidated into one function.
404 */
405struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
406{
407 struct lc_element *e;
408
409 PARANOIA_ENTRY();
410 if (lc->flags & LC_STARVING) {
411 ++lc->starving;
412 RETURN(NULL);
413 }
414
415 e = lc_find(lc, enr);
416 if (e) {
417 ++lc->hits;
418 if (e->refcnt++ == 0)
419 lc->used++;
420 list_move(&e->list, &lc->in_use); /* Not evictable... */
421 }
422 RETURN(e);
423}
424
425/**
426 * lc_changed - tell @lc that the change has been recorded
427 * @lc: the lru cache to operate on
428 * @e: the element pending label change
429 */
430void lc_changed(struct lru_cache *lc, struct lc_element *e)
431{
432 PARANOIA_ENTRY();
433 BUG_ON(e != lc->changing_element);
434 PARANOIA_LC_ELEMENT(lc, e);
435 ++lc->changed;
436 e->lc_number = lc->new_number;
437 list_add(&e->list, &lc->in_use);
438 hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
439 lc->changing_element = NULL;
440 lc->new_number = LC_FREE;
441 clear_bit(__LC_DIRTY, &lc->flags);
442 smp_mb__after_clear_bit();
443 RETURN();
444}
445
446
447/**
448 * lc_put - give up refcnt of @e
449 * @lc: the lru cache to operate on
450 * @e: the element to put
451 *
452 * If refcnt reaches zero, the element is moved to the lru list,
453 * and a %LC_STARVING (if set) is cleared.
454 * Returns the new (post-decrement) refcnt.
455 */
456unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
457{
458 PARANOIA_ENTRY();
459 PARANOIA_LC_ELEMENT(lc, e);
460 BUG_ON(e->refcnt == 0);
461 BUG_ON(e == lc->changing_element);
462 if (--e->refcnt == 0) {
463 /* move it to the front of LRU. */
464 list_move(&e->list, &lc->lru);
465 lc->used--;
466 clear_bit(__LC_STARVING, &lc->flags);
467 smp_mb__after_clear_bit();
468 }
469 RETURN(e->refcnt);
470}
471
472/**
473 * lc_element_by_index
474 * @lc: the lru cache to operate on
475 * @i: the index of the element to return
476 */
477struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
478{
479 BUG_ON(i >= lc->nr_elements);
480 BUG_ON(lc->lc_element[i] == NULL);
481 BUG_ON(lc->lc_element[i]->lc_index != i);
482 return lc->lc_element[i];
483}
484
485/**
486 * lc_index_of
487 * @lc: the lru cache to operate on
488 * @e: the element to query for its index position in lc->element
489 */
490unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
491{
492 PARANOIA_LC_ELEMENT(lc, e);
493 return e->lc_index;
494}
495
496/**
497 * lc_set - associate index with label
498 * @lc: the lru cache to operate on
499 * @enr: the label to set
500 * @index: the element index to associate label with.
501 *
502 * Used to initialize the active set to some previously recorded state.
503 */
504void lc_set(struct lru_cache *lc, unsigned int enr, int index)
505{
506 struct lc_element *e;
507
508 if (index < 0 || index >= lc->nr_elements)
509 return;
510
511 e = lc_element_by_index(lc, index);
512 e->lc_number = enr;
513
514 hlist_del_init(&e->colision);
515 hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
516 list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
517}
518
519/**
520 * lc_dump - Dump a complete LRU cache to seq in textual form.
521 * @lc: the lru cache to operate on
522 * @seq: the &struct seq_file pointer to seq_printf into
523 * @utext: user supplied "heading" or other info
524 * @detail: function pointer the user may provide to dump further details
525 * of the object the lc_element is embedded in.
526 */
527void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
528 void (*detail) (struct seq_file *, struct lc_element *))
529{
530 unsigned int nr_elements = lc->nr_elements;
531 struct lc_element *e;
532 int i;
533
534 seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
535 for (i = 0; i < nr_elements; i++) {
536 e = lc_element_by_index(lc, i);
537 if (e->lc_number == LC_FREE) {
538 seq_printf(seq, "\t%2d: FREE\n", i);
539 } else {
540 seq_printf(seq, "\t%2d: %4u %4u ", i,
541 e->lc_number, e->refcnt);
542 detail(seq, e);
543 }
544 }
545}
546
/* life cycle */
EXPORT_SYMBOL(lc_create);
EXPORT_SYMBOL(lc_destroy);
EXPORT_SYMBOL(lc_reset);

/* lookup and reference counting */
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_put);

/* changing the active set */
EXPORT_SYMBOL(lc_changed);
EXPORT_SYMBOL(lc_set);
EXPORT_SYMBOL(lc_del);

/* index based access */
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);

/* seq_file output */
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 67a33a5a1a93..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -609,7 +609,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
609 * it would never exet if it is currently stuck in the refrigerator. 609 * it would never exet if it is currently stuck in the refrigerator.
610 */ 610 */
611 list_for_each_entry(wb, &bdi->wb_list, list) { 611 list_for_each_entry(wb, &bdi->wb_list, list) {
612 wb->task->flags &= ~PF_FROZEN; 612 thaw_process(wb->task);
613 kthread_stop(wb->task); 613 kthread_stop(wb->task);
614 } 614 }
615} 615}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d79236ead..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
821 struct writeback_control *wbc, writepage_t writepage, 821 struct writeback_control *wbc, writepage_t writepage,
822 void *data) 822 void *data)
823{ 823{
824 struct backing_dev_info *bdi = mapping->backing_dev_info;
825 int ret = 0; 824 int ret = 0;
826 int done = 0; 825 int done = 0;
827 struct pagevec pvec; 826 struct pagevec pvec;
@@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
834 int range_whole = 0; 833 int range_whole = 0;
835 long nr_to_write = wbc->nr_to_write; 834 long nr_to_write = wbc->nr_to_write;
836 835
837 if (wbc->nonblocking && bdi_write_congested(bdi)) {
838 wbc->encountered_congestion = 1;
839 return 0;
840 }
841
842 pagevec_init(&pvec, 0); 836 pagevec_init(&pvec, 0);
843 if (wbc->range_cyclic) { 837 if (wbc->range_cyclic) {
844 writeback_index = mapping->writeback_index; /* prev offset */ 838 writeback_index = mapping->writeback_index; /* prev offset */
@@ -957,12 +951,6 @@ continue_unlock:
957 break; 951 break;
958 } 952 }
959 } 953 }
960
961 if (wbc->nonblocking && bdi_write_congested(bdi)) {
962 wbc->encountered_congestion = 1;
963 done = 1;
964 break;
965 }
966 } 954 }
967 pagevec_release(&pvec); 955 pagevec_release(&pvec);
968 cond_resched(); 956 cond_resched();