diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/block/Kconfig | 2 | ||||
-rw-r--r-- | drivers/block/Makefile | 1 | ||||
-rw-r--r-- | drivers/block/drbd/Kconfig | 71 | ||||
-rw-r--r-- | drivers/block/drbd/Makefile | 5 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 1424 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 1327 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 2252 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 3700 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 2360 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 265 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 4427 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 1120 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 326 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_strings.c | 113 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_vli.h | 351 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 1512 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_wrappers.h | 91 |
17 files changed, 19347 insertions, 0 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 1d886e079c58..77bfce52e9ca 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP | |||
271 | instead, which can be configured to be on-disk compatible with the | 271 | instead, which can be configured to be on-disk compatible with the |
272 | cryptoloop device. | 272 | cryptoloop device. |
273 | 273 | ||
274 | source "drivers/block/drbd/Kconfig" | ||
275 | |||
274 | config BLK_DEV_NBD | 276 | config BLK_DEV_NBD |
275 | tristate "Network block device support" | 277 | tristate "Network block device support" |
276 | depends on NET | 278 | depends on NET |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile index cdaa3f8fddf0..aff5ac925c34 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile | |||
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o | |||
36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o | 36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o |
37 | 37 | ||
38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o | 38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o |
39 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ | ||
39 | 40 | ||
40 | swim_mod-objs := swim.o swim_asm.o | 41 | swim_mod-objs := swim.o swim_asm.o |
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig new file mode 100644 index 000000000000..f4acd04ebeef --- /dev/null +++ b/drivers/block/drbd/Kconfig | |||
@@ -0,0 +1,71 @@ | |||
1 | # | ||
2 | # DRBD device driver configuration | ||
3 | # | ||
4 | |||
5 | comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" | ||
6 | depends on !PROC_FS || !INET || !CONNECTOR | ||
7 | |||
8 | config BLK_DEV_DRBD | ||
9 | tristate "DRBD Distributed Replicated Block Device support" | ||
10 | depends on PROC_FS && INET && CONNECTOR | ||
11 | select LRU_CACHE | ||
12 | default n | ||
13 | help | ||
14 | |||
15 | NOTE: In order to authenticate connections you have to select | ||
16 | CRYPTO_HMAC and a hash function as well. | ||
17 | |||
18 | DRBD is a shared-nothing, synchronously replicated block device. It | ||
19 | is designed to serve as a building block for high availability | ||
20 | clusters and in this context, is a "drop-in" replacement for shared | ||
21 | storage. Simplistically, you could see it as a network RAID 1. | ||
22 | |||
23 | Each minor device has a role, which can be 'primary' or 'secondary'. | ||
24 | On the node with the primary device the application is supposed to | ||
25 | run and to access the device (/dev/drbdX). Every write is sent to | ||
26 | the local 'lower level block device' and, across the network, to the | ||
27 | node with the device in 'secondary' state. The secondary device | ||
28 | simply writes the data to its lower level block device. | ||
29 | |||
30 | DRBD can also be used in dual-Primary mode (device writable on both | ||
31 | nodes), which means it can exhibit shared disk semantics in a | ||
32 | shared-nothing cluster. Needless to say, on top of dual-Primary | ||
33 | DRBD utilizing a cluster file system is necessary to maintain | ||
34 | cache coherency. | ||
35 | |||
36 | For automatic failover you need a cluster manager (e.g. heartbeat). | ||
37 | See also: http://www.drbd.org/, http://www.linux-ha.org | ||
38 | |||
39 | If unsure, say N. | ||
40 | |||
41 | config DRBD_FAULT_INJECTION | ||
42 | bool "DRBD fault injection" | ||
43 | depends on BLK_DEV_DRBD | ||
44 | help | ||
45 | |||
46 | Say Y here if you want to simulate IO errors, in order to test DRBD's | ||
47 | behavior. | ||
48 | |||
49 | The actual simulation of IO errors is done by writing 3 values to | ||
50 | /sys/module/drbd/parameters/ | ||
51 | |||
52 | enable_faults: bitmask of... | ||
53 | 1 meta data write | ||
54 | 2 meta data read | ||
55 | 4 resync data write | ||
56 | 8 resync data read | ||
57 | 16 data write | ||
58 | 32 data read | ||
59 | 64 read ahead | ||
60 | 128 kmalloc of bitmap | ||
61 | 256 allocation of EE (epoch_entries) | ||
62 | |||
63 | fault_devs: bitmask of minor numbers | ||
64 | fault_rate: frequency in percent | ||
65 | |||
66 | Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. | ||
67 | echo 16 > /sys/module/drbd/parameters/enable_faults | ||
68 | echo 1 > /sys/module/drbd/parameters/fault_devs | ||
69 | echo 5 > /sys/module/drbd/parameters/fault_rate | ||
70 | |||
71 | If unsure, say N. | ||
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile new file mode 100644 index 000000000000..0d3f337ff5ff --- /dev/null +++ b/drivers/block/drbd/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | drbd-y := drbd_bitmap.o drbd_proc.o | ||
2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o | ||
3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o | ||
4 | |||
5 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o | ||
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c new file mode 100644 index 000000000000..17956ff6a08d --- /dev/null +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -0,0 +1,1424 @@ | |||
1 | /* | ||
2 | drbd_actlog.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/slab.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include "drbd_int.h" | ||
29 | #include "drbd_wrappers.h" | ||
30 | |||
31 | /* We maintain a trivial check sum in our on disk activity log. | ||
32 | * With that we can ensure correct operation even when the storage | ||
33 | * device might do a partial (last) sector write while loosing power. | ||
34 | */ | ||
35 | struct __packed al_transaction { | ||
36 | u32 magic; | ||
37 | u32 tr_number; | ||
38 | struct __packed { | ||
39 | u32 pos; | ||
40 | u32 extent; } updates[1 + AL_EXTENTS_PT]; | ||
41 | u32 xor_sum; | ||
42 | }; | ||
43 | |||
44 | struct update_odbm_work { | ||
45 | struct drbd_work w; | ||
46 | unsigned int enr; | ||
47 | }; | ||
48 | |||
49 | struct update_al_work { | ||
50 | struct drbd_work w; | ||
51 | struct lc_element *al_ext; | ||
52 | struct completion event; | ||
53 | unsigned int enr; | ||
54 | /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ | ||
55 | unsigned int old_enr; | ||
56 | }; | ||
57 | |||
58 | struct drbd_atodb_wait { | ||
59 | atomic_t count; | ||
60 | struct completion io_done; | ||
61 | struct drbd_conf *mdev; | ||
62 | int error; | ||
63 | }; | ||
64 | |||
65 | |||
/* worker callback, defined below; queued by drbd_al_begin_io() */
int w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w,
			   int unused);
67 | |||
68 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | ||
69 | struct drbd_backing_dev *bdev, | ||
70 | struct page *page, sector_t sector, | ||
71 | int rw, int size) | ||
72 | { | ||
73 | struct bio *bio; | ||
74 | struct drbd_md_io md_io; | ||
75 | int ok; | ||
76 | |||
77 | md_io.mdev = mdev; | ||
78 | init_completion(&md_io.event); | ||
79 | md_io.error = 0; | ||
80 | |||
81 | if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) | ||
82 | rw |= (1 << BIO_RW_BARRIER); | ||
83 | rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)); | ||
84 | |||
85 | retry: | ||
86 | bio = bio_alloc(GFP_NOIO, 1); | ||
87 | bio->bi_bdev = bdev->md_bdev; | ||
88 | bio->bi_sector = sector; | ||
89 | ok = (bio_add_page(bio, page, size, 0) == size); | ||
90 | if (!ok) | ||
91 | goto out; | ||
92 | bio->bi_private = &md_io; | ||
93 | bio->bi_end_io = drbd_md_io_complete; | ||
94 | bio->bi_rw = rw; | ||
95 | |||
96 | if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) | ||
97 | bio_endio(bio, -EIO); | ||
98 | else | ||
99 | submit_bio(rw, bio); | ||
100 | wait_for_completion(&md_io.event); | ||
101 | ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; | ||
102 | |||
103 | /* check for unsupported barrier op. | ||
104 | * would rather check on EOPNOTSUPP, but that is not reliable. | ||
105 | * don't try again for ANY return value != 0 */ | ||
106 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { | ||
107 | /* Try again with no barrier */ | ||
108 | dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); | ||
109 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
110 | rw &= ~(1 << BIO_RW_BARRIER); | ||
111 | bio_put(bio); | ||
112 | goto retry; | ||
113 | } | ||
114 | out: | ||
115 | bio_put(bio); | ||
116 | return ok; | ||
117 | } | ||
118 | |||
119 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | ||
120 | sector_t sector, int rw) | ||
121 | { | ||
122 | int logical_block_size, mask, ok; | ||
123 | int offset = 0; | ||
124 | struct page *iop = mdev->md_io_page; | ||
125 | |||
126 | D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); | ||
127 | |||
128 | BUG_ON(!bdev->md_bdev); | ||
129 | |||
130 | logical_block_size = bdev_logical_block_size(bdev->md_bdev); | ||
131 | if (logical_block_size == 0) | ||
132 | logical_block_size = MD_SECTOR_SIZE; | ||
133 | |||
134 | /* in case logical_block_size != 512 [ s390 only? ] */ | ||
135 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
136 | mask = (logical_block_size / MD_SECTOR_SIZE) - 1; | ||
137 | D_ASSERT(mask == 1 || mask == 3 || mask == 7); | ||
138 | D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); | ||
139 | offset = sector & mask; | ||
140 | sector = sector & ~mask; | ||
141 | iop = mdev->md_io_tmpp; | ||
142 | |||
143 | if (rw & WRITE) { | ||
144 | /* these are GFP_KERNEL pages, pre-allocated | ||
145 | * on device initialization */ | ||
146 | void *p = page_address(mdev->md_io_page); | ||
147 | void *hp = page_address(mdev->md_io_tmpp); | ||
148 | |||
149 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, | ||
150 | READ, logical_block_size); | ||
151 | |||
152 | if (unlikely(!ok)) { | ||
153 | dev_err(DEV, "drbd_md_sync_page_io(,%llus," | ||
154 | "READ [logical_block_size!=512]) failed!\n", | ||
155 | (unsigned long long)sector); | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); | ||
160 | } | ||
161 | } | ||
162 | |||
163 | if (sector < drbd_md_first_sector(bdev) || | ||
164 | sector > drbd_md_last_sector(bdev)) | ||
165 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", | ||
166 | current->comm, current->pid, __func__, | ||
167 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | ||
168 | |||
169 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); | ||
170 | if (unlikely(!ok)) { | ||
171 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", | ||
172 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { | ||
177 | void *p = page_address(mdev->md_io_page); | ||
178 | void *hp = page_address(mdev->md_io_tmpp); | ||
179 | |||
180 | memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); | ||
181 | } | ||
182 | |||
183 | return ok; | ||
184 | } | ||
185 | |||
186 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | ||
187 | { | ||
188 | struct lc_element *al_ext; | ||
189 | struct lc_element *tmp; | ||
190 | unsigned long al_flags = 0; | ||
191 | |||
192 | spin_lock_irq(&mdev->al_lock); | ||
193 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | ||
194 | if (unlikely(tmp != NULL)) { | ||
195 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | ||
196 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
197 | spin_unlock_irq(&mdev->al_lock); | ||
198 | return NULL; | ||
199 | } | ||
200 | } | ||
201 | al_ext = lc_get(mdev->act_log, enr); | ||
202 | al_flags = mdev->act_log->flags; | ||
203 | spin_unlock_irq(&mdev->al_lock); | ||
204 | |||
205 | /* | ||
206 | if (!al_ext) { | ||
207 | if (al_flags & LC_STARVING) | ||
208 | dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); | ||
209 | if (al_flags & LC_DIRTY) | ||
210 | dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); | ||
211 | } | ||
212 | */ | ||
213 | |||
214 | return al_ext; | ||
215 | } | ||
216 | |||
217 | void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
218 | { | ||
219 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
220 | struct lc_element *al_ext; | ||
221 | struct update_al_work al_work; | ||
222 | |||
223 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | ||
224 | |||
225 | wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); | ||
226 | |||
227 | if (al_ext->lc_number != enr) { | ||
228 | /* drbd_al_write_transaction(mdev,al_ext,enr); | ||
229 | * recurses into generic_make_request(), which | ||
230 | * disallows recursion, bios being serialized on the | ||
231 | * current->bio_tail list now. | ||
232 | * we have to delegate updates to the activity log | ||
233 | * to the worker thread. */ | ||
234 | init_completion(&al_work.event); | ||
235 | al_work.al_ext = al_ext; | ||
236 | al_work.enr = enr; | ||
237 | al_work.old_enr = al_ext->lc_number; | ||
238 | al_work.w.cb = w_al_write_transaction; | ||
239 | drbd_queue_work_front(&mdev->data.work, &al_work.w); | ||
240 | wait_for_completion(&al_work.event); | ||
241 | |||
242 | mdev->al_writ_cnt++; | ||
243 | |||
244 | spin_lock_irq(&mdev->al_lock); | ||
245 | lc_changed(mdev->act_log, al_ext); | ||
246 | spin_unlock_irq(&mdev->al_lock); | ||
247 | wake_up(&mdev->al_wait); | ||
248 | } | ||
249 | } | ||
250 | |||
251 | void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) | ||
252 | { | ||
253 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
254 | struct lc_element *extent; | ||
255 | unsigned long flags; | ||
256 | |||
257 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
258 | |||
259 | extent = lc_find(mdev->act_log, enr); | ||
260 | |||
261 | if (!extent) { | ||
262 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
263 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); | ||
264 | return; | ||
265 | } | ||
266 | |||
267 | if (lc_put(mdev->act_log, extent) == 0) | ||
268 | wake_up(&mdev->al_wait); | ||
269 | |||
270 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
271 | } | ||
272 | |||
273 | int | ||
274 | w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
275 | { | ||
276 | struct update_al_work *aw = container_of(w, struct update_al_work, w); | ||
277 | struct lc_element *updated = aw->al_ext; | ||
278 | const unsigned int new_enr = aw->enr; | ||
279 | const unsigned int evicted = aw->old_enr; | ||
280 | struct al_transaction *buffer; | ||
281 | sector_t sector; | ||
282 | int i, n, mx; | ||
283 | unsigned int extent_nr; | ||
284 | u32 xor_sum = 0; | ||
285 | |||
286 | if (!get_ldev(mdev)) { | ||
287 | dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); | ||
288 | complete(&((struct update_al_work *)w)->event); | ||
289 | return 1; | ||
290 | } | ||
291 | /* do we have to do a bitmap write, first? | ||
292 | * TODO reduce maximum latency: | ||
293 | * submit both bios, then wait for both, | ||
294 | * instead of doing two synchronous sector writes. */ | ||
295 | if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) | ||
296 | drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); | ||
297 | |||
298 | mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ | ||
299 | buffer = (struct al_transaction *)page_address(mdev->md_io_page); | ||
300 | |||
301 | buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); | ||
302 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); | ||
303 | |||
304 | n = lc_index_of(mdev->act_log, updated); | ||
305 | |||
306 | buffer->updates[0].pos = cpu_to_be32(n); | ||
307 | buffer->updates[0].extent = cpu_to_be32(new_enr); | ||
308 | |||
309 | xor_sum ^= new_enr; | ||
310 | |||
311 | mx = min_t(int, AL_EXTENTS_PT, | ||
312 | mdev->act_log->nr_elements - mdev->al_tr_cycle); | ||
313 | for (i = 0; i < mx; i++) { | ||
314 | unsigned idx = mdev->al_tr_cycle + i; | ||
315 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; | ||
316 | buffer->updates[i+1].pos = cpu_to_be32(idx); | ||
317 | buffer->updates[i+1].extent = cpu_to_be32(extent_nr); | ||
318 | xor_sum ^= extent_nr; | ||
319 | } | ||
320 | for (; i < AL_EXTENTS_PT; i++) { | ||
321 | buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); | ||
322 | buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); | ||
323 | xor_sum ^= LC_FREE; | ||
324 | } | ||
325 | mdev->al_tr_cycle += AL_EXTENTS_PT; | ||
326 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | ||
327 | mdev->al_tr_cycle = 0; | ||
328 | |||
329 | buffer->xor_sum = cpu_to_be32(xor_sum); | ||
330 | |||
331 | sector = mdev->ldev->md.md_offset | ||
332 | + mdev->ldev->md.al_offset + mdev->al_tr_pos; | ||
333 | |||
334 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) | ||
335 | drbd_chk_io_error(mdev, 1, TRUE); | ||
336 | |||
337 | if (++mdev->al_tr_pos > | ||
338 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
339 | mdev->al_tr_pos = 0; | ||
340 | |||
341 | D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); | ||
342 | mdev->al_tr_number++; | ||
343 | |||
344 | mutex_unlock(&mdev->md_io_mutex); | ||
345 | |||
346 | complete(&((struct update_al_work *)w)->event); | ||
347 | put_ldev(mdev); | ||
348 | |||
349 | return 1; | ||
350 | } | ||
351 | |||
352 | /** | ||
353 | * drbd_al_read_tr() - Read a single transaction from the on disk activity log | ||
354 | * @mdev: DRBD device. | ||
355 | * @bdev: Block device to read form. | ||
356 | * @b: pointer to an al_transaction. | ||
357 | * @index: On disk slot of the transaction to read. | ||
358 | * | ||
359 | * Returns -1 on IO error, 0 on checksum error and 1 upon success. | ||
360 | */ | ||
361 | static int drbd_al_read_tr(struct drbd_conf *mdev, | ||
362 | struct drbd_backing_dev *bdev, | ||
363 | struct al_transaction *b, | ||
364 | int index) | ||
365 | { | ||
366 | sector_t sector; | ||
367 | int rv, i; | ||
368 | u32 xor_sum = 0; | ||
369 | |||
370 | sector = bdev->md.md_offset + bdev->md.al_offset + index; | ||
371 | |||
372 | /* Dont process error normally, | ||
373 | * as this is done before disk is attached! */ | ||
374 | if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) | ||
375 | return -1; | ||
376 | |||
377 | rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); | ||
378 | |||
379 | for (i = 0; i < AL_EXTENTS_PT + 1; i++) | ||
380 | xor_sum ^= be32_to_cpu(b->updates[i].extent); | ||
381 | rv &= (xor_sum == be32_to_cpu(b->xor_sum)); | ||
382 | |||
383 | return rv; | ||
384 | } | ||
385 | |||
386 | /** | ||
387 | * drbd_al_read_log() - Restores the activity log from its on disk representation. | ||
388 | * @mdev: DRBD device. | ||
389 | * @bdev: Block device to read form. | ||
390 | * | ||
391 | * Returns 1 on success, returns 0 when reading the log failed due to IO errors. | ||
392 | */ | ||
393 | int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
394 | { | ||
395 | struct al_transaction *buffer; | ||
396 | int i; | ||
397 | int rv; | ||
398 | int mx; | ||
399 | int active_extents = 0; | ||
400 | int transactions = 0; | ||
401 | int found_valid = 0; | ||
402 | int from = 0; | ||
403 | int to = 0; | ||
404 | u32 from_tnr = 0; | ||
405 | u32 to_tnr = 0; | ||
406 | u32 cnr; | ||
407 | |||
408 | mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); | ||
409 | |||
410 | /* lock out all other meta data io for now, | ||
411 | * and make sure the page is mapped. | ||
412 | */ | ||
413 | mutex_lock(&mdev->md_io_mutex); | ||
414 | buffer = page_address(mdev->md_io_page); | ||
415 | |||
416 | /* Find the valid transaction in the log */ | ||
417 | for (i = 0; i <= mx; i++) { | ||
418 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
419 | if (rv == 0) | ||
420 | continue; | ||
421 | if (rv == -1) { | ||
422 | mutex_unlock(&mdev->md_io_mutex); | ||
423 | return 0; | ||
424 | } | ||
425 | cnr = be32_to_cpu(buffer->tr_number); | ||
426 | |||
427 | if (++found_valid == 1) { | ||
428 | from = i; | ||
429 | to = i; | ||
430 | from_tnr = cnr; | ||
431 | to_tnr = cnr; | ||
432 | continue; | ||
433 | } | ||
434 | if ((int)cnr - (int)from_tnr < 0) { | ||
435 | D_ASSERT(from_tnr - cnr + i - from == mx+1); | ||
436 | from = i; | ||
437 | from_tnr = cnr; | ||
438 | } | ||
439 | if ((int)cnr - (int)to_tnr > 0) { | ||
440 | D_ASSERT(cnr - to_tnr == i - to); | ||
441 | to = i; | ||
442 | to_tnr = cnr; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | if (!found_valid) { | ||
447 | dev_warn(DEV, "No usable activity log found.\n"); | ||
448 | mutex_unlock(&mdev->md_io_mutex); | ||
449 | return 1; | ||
450 | } | ||
451 | |||
452 | /* Read the valid transactions. | ||
453 | * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ | ||
454 | i = from; | ||
455 | while (1) { | ||
456 | int j, pos; | ||
457 | unsigned int extent_nr; | ||
458 | unsigned int trn; | ||
459 | |||
460 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
461 | ERR_IF(rv == 0) goto cancel; | ||
462 | if (rv == -1) { | ||
463 | mutex_unlock(&mdev->md_io_mutex); | ||
464 | return 0; | ||
465 | } | ||
466 | |||
467 | trn = be32_to_cpu(buffer->tr_number); | ||
468 | |||
469 | spin_lock_irq(&mdev->al_lock); | ||
470 | |||
471 | /* This loop runs backwards because in the cyclic | ||
472 | elements there might be an old version of the | ||
473 | updated element (in slot 0). So the element in slot 0 | ||
474 | can overwrite old versions. */ | ||
475 | for (j = AL_EXTENTS_PT; j >= 0; j--) { | ||
476 | pos = be32_to_cpu(buffer->updates[j].pos); | ||
477 | extent_nr = be32_to_cpu(buffer->updates[j].extent); | ||
478 | |||
479 | if (extent_nr == LC_FREE) | ||
480 | continue; | ||
481 | |||
482 | lc_set(mdev->act_log, extent_nr, pos); | ||
483 | active_extents++; | ||
484 | } | ||
485 | spin_unlock_irq(&mdev->al_lock); | ||
486 | |||
487 | transactions++; | ||
488 | |||
489 | cancel: | ||
490 | if (i == to) | ||
491 | break; | ||
492 | i++; | ||
493 | if (i > mx) | ||
494 | i = 0; | ||
495 | } | ||
496 | |||
497 | mdev->al_tr_number = to_tnr+1; | ||
498 | mdev->al_tr_pos = to; | ||
499 | if (++mdev->al_tr_pos > | ||
500 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
501 | mdev->al_tr_pos = 0; | ||
502 | |||
503 | /* ok, we are done with it */ | ||
504 | mutex_unlock(&mdev->md_io_mutex); | ||
505 | |||
506 | dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", | ||
507 | transactions, active_extents); | ||
508 | |||
509 | return 1; | ||
510 | } | ||
511 | |||
512 | static void atodb_endio(struct bio *bio, int error) | ||
513 | { | ||
514 | struct drbd_atodb_wait *wc = bio->bi_private; | ||
515 | struct drbd_conf *mdev = wc->mdev; | ||
516 | struct page *page; | ||
517 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
518 | |||
519 | /* strange behavior of some lower level drivers... | ||
520 | * fail the request by clearing the uptodate flag, | ||
521 | * but do not return any error?! */ | ||
522 | if (!error && !uptodate) | ||
523 | error = -EIO; | ||
524 | |||
525 | drbd_chk_io_error(mdev, error, TRUE); | ||
526 | if (error && wc->error == 0) | ||
527 | wc->error = error; | ||
528 | |||
529 | if (atomic_dec_and_test(&wc->count)) | ||
530 | complete(&wc->io_done); | ||
531 | |||
532 | page = bio->bi_io_vec[0].bv_page; | ||
533 | put_page(page); | ||
534 | bio_put(bio); | ||
535 | mdev->bm_writ_cnt++; | ||
536 | put_ldev(mdev); | ||
537 | } | ||
538 | |||
539 | #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
540 | /* activity log to on disk bitmap -- prepare bio unless that sector | ||
541 | * is already covered by previously prepared bios */ | ||
542 | static int atodb_prepare_unless_covered(struct drbd_conf *mdev, | ||
543 | struct bio **bios, | ||
544 | unsigned int enr, | ||
545 | struct drbd_atodb_wait *wc) __must_hold(local) | ||
546 | { | ||
547 | struct bio *bio; | ||
548 | struct page *page; | ||
549 | sector_t on_disk_sector = enr + mdev->ldev->md.md_offset | ||
550 | + mdev->ldev->md.bm_offset; | ||
551 | unsigned int page_offset = PAGE_SIZE; | ||
552 | int offset; | ||
553 | int i = 0; | ||
554 | int err = -ENOMEM; | ||
555 | |||
556 | /* Check if that enr is already covered by an already created bio. | ||
557 | * Caution, bios[] is not NULL terminated, | ||
558 | * but only initialized to all NULL. | ||
559 | * For completely scattered activity log, | ||
560 | * the last invocation iterates over all bios, | ||
561 | * and finds the last NULL entry. | ||
562 | */ | ||
563 | while ((bio = bios[i])) { | ||
564 | if (bio->bi_sector == on_disk_sector) | ||
565 | return 0; | ||
566 | i++; | ||
567 | } | ||
568 | /* bios[i] == NULL, the next not yet used slot */ | ||
569 | |||
570 | /* GFP_KERNEL, we are not in the write-out path */ | ||
571 | bio = bio_alloc(GFP_KERNEL, 1); | ||
572 | if (bio == NULL) | ||
573 | return -ENOMEM; | ||
574 | |||
575 | if (i > 0) { | ||
576 | const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; | ||
577 | page_offset = prev_bv->bv_offset + prev_bv->bv_len; | ||
578 | page = prev_bv->bv_page; | ||
579 | } | ||
580 | if (page_offset == PAGE_SIZE) { | ||
581 | page = alloc_page(__GFP_HIGHMEM); | ||
582 | if (page == NULL) | ||
583 | goto out_bio_put; | ||
584 | page_offset = 0; | ||
585 | } else { | ||
586 | get_page(page); | ||
587 | } | ||
588 | |||
589 | offset = S2W(enr); | ||
590 | drbd_bm_get_lel(mdev, offset, | ||
591 | min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), | ||
592 | kmap(page) + page_offset); | ||
593 | kunmap(page); | ||
594 | |||
595 | bio->bi_private = wc; | ||
596 | bio->bi_end_io = atodb_endio; | ||
597 | bio->bi_bdev = mdev->ldev->md_bdev; | ||
598 | bio->bi_sector = on_disk_sector; | ||
599 | |||
600 | if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) | ||
601 | goto out_put_page; | ||
602 | |||
603 | atomic_inc(&wc->count); | ||
604 | /* we already know that we may do this... | ||
605 | * get_ldev_if_state(mdev,D_ATTACHING); | ||
606 | * just get the extra reference, so that the local_cnt reflects | ||
607 | * the number of pending IO requests DRBD at its backing device. | ||
608 | */ | ||
609 | atomic_inc(&mdev->local_cnt); | ||
610 | |||
611 | bios[i] = bio; | ||
612 | |||
613 | return 0; | ||
614 | |||
615 | out_put_page: | ||
616 | err = -EINVAL; | ||
617 | put_page(page); | ||
618 | out_bio_put: | ||
619 | bio_put(bio); | ||
620 | return err; | ||
621 | } | ||
622 | |||
623 | /** | ||
624 | * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents | ||
625 | * @mdev: DRBD device. | ||
626 | * | ||
627 | * Called when we detach (unconfigure) local storage, | ||
628 | * or when we go from R_PRIMARY to R_SECONDARY role. | ||
629 | */ | ||
630 | void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) | ||
631 | { | ||
632 | int i, nr_elements; | ||
633 | unsigned int enr; | ||
634 | struct bio **bios; | ||
635 | struct drbd_atodb_wait wc; | ||
636 | |||
637 | ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
638 | return; /* sorry, I don't have any act_log etc... */ | ||
639 | |||
640 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
641 | |||
642 | nr_elements = mdev->act_log->nr_elements; | ||
643 | |||
644 | /* GFP_KERNEL, we are not in anyone's write-out path */ | ||
645 | bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); | ||
646 | if (!bios) | ||
647 | goto submit_one_by_one; | ||
648 | |||
649 | atomic_set(&wc.count, 0); | ||
650 | init_completion(&wc.io_done); | ||
651 | wc.mdev = mdev; | ||
652 | wc.error = 0; | ||
653 | |||
654 | for (i = 0; i < nr_elements; i++) { | ||
655 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
656 | if (enr == LC_FREE) | ||
657 | continue; | ||
658 | /* next statement also does atomic_inc wc.count and local_cnt */ | ||
659 | if (atodb_prepare_unless_covered(mdev, bios, | ||
660 | enr/AL_EXT_PER_BM_SECT, | ||
661 | &wc)) | ||
662 | goto free_bios_submit_one_by_one; | ||
663 | } | ||
664 | |||
665 | /* unnecessary optimization? */ | ||
666 | lc_unlock(mdev->act_log); | ||
667 | wake_up(&mdev->al_wait); | ||
668 | |||
669 | /* all prepared, submit them */ | ||
670 | for (i = 0; i < nr_elements; i++) { | ||
671 | if (bios[i] == NULL) | ||
672 | break; | ||
673 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { | ||
674 | bios[i]->bi_rw = WRITE; | ||
675 | bio_endio(bios[i], -EIO); | ||
676 | } else { | ||
677 | submit_bio(WRITE, bios[i]); | ||
678 | } | ||
679 | } | ||
680 | |||
681 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); | ||
682 | |||
683 | /* always (try to) flush bitmap to stable storage */ | ||
684 | drbd_md_flush(mdev); | ||
685 | |||
686 | /* In case we did not submit a single IO do not wait for | ||
687 | * them to complete. ( Because we would wait forever here. ) | ||
688 | * | ||
689 | * In case we had IOs and they are already complete, there | ||
690 | * is not point in waiting anyways. | ||
691 | * Therefore this if () ... */ | ||
692 | if (atomic_read(&wc.count)) | ||
693 | wait_for_completion(&wc.io_done); | ||
694 | |||
695 | put_ldev(mdev); | ||
696 | |||
697 | kfree(bios); | ||
698 | return; | ||
699 | |||
700 | free_bios_submit_one_by_one: | ||
701 | /* free everything by calling the endio callback directly. */ | ||
702 | for (i = 0; i < nr_elements && bios[i]; i++) | ||
703 | bio_endio(bios[i], 0); | ||
704 | |||
705 | kfree(bios); | ||
706 | |||
707 | submit_one_by_one: | ||
708 | dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); | ||
709 | |||
710 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
711 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
712 | if (enr == LC_FREE) | ||
713 | continue; | ||
714 | /* Really slow: if we have al-extents 16..19 active, | ||
715 | * sector 4 will be written four times! Synchronous! */ | ||
716 | drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); | ||
717 | } | ||
718 | |||
719 | lc_unlock(mdev->act_log); | ||
720 | wake_up(&mdev->al_wait); | ||
721 | put_ldev(mdev); | ||
722 | } | ||
723 | |||
724 | /** | ||
725 | * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents | ||
726 | * @mdev: DRBD device. | ||
727 | */ | ||
728 | void drbd_al_apply_to_bm(struct drbd_conf *mdev) | ||
729 | { | ||
730 | unsigned int enr; | ||
731 | unsigned long add = 0; | ||
732 | char ppb[10]; | ||
733 | int i; | ||
734 | |||
735 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
736 | |||
737 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
738 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
739 | if (enr == LC_FREE) | ||
740 | continue; | ||
741 | add += drbd_bm_ALe_set_all(mdev, enr); | ||
742 | } | ||
743 | |||
744 | lc_unlock(mdev->act_log); | ||
745 | wake_up(&mdev->al_wait); | ||
746 | |||
747 | dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", | ||
748 | ppsize(ppb, Bit2KB(add))); | ||
749 | } | ||
750 | |||
751 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | ||
752 | { | ||
753 | int rv; | ||
754 | |||
755 | spin_lock_irq(&mdev->al_lock); | ||
756 | rv = (al_ext->refcnt == 0); | ||
757 | if (likely(rv)) | ||
758 | lc_del(mdev->act_log, al_ext); | ||
759 | spin_unlock_irq(&mdev->al_lock); | ||
760 | |||
761 | return rv; | ||
762 | } | ||
763 | |||
764 | /** | ||
765 | * drbd_al_shrink() - Removes all active extents form the activity log | ||
766 | * @mdev: DRBD device. | ||
767 | * | ||
768 | * Removes all active extents form the activity log, waiting until | ||
769 | * the reference count of each entry dropped to 0 first, of course. | ||
770 | * | ||
771 | * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() | ||
772 | */ | ||
773 | void drbd_al_shrink(struct drbd_conf *mdev) | ||
774 | { | ||
775 | struct lc_element *al_ext; | ||
776 | int i; | ||
777 | |||
778 | D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); | ||
779 | |||
780 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
781 | al_ext = lc_element_by_index(mdev->act_log, i); | ||
782 | if (al_ext->lc_number == LC_FREE) | ||
783 | continue; | ||
784 | wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); | ||
785 | } | ||
786 | |||
787 | wake_up(&mdev->al_wait); | ||
788 | } | ||
789 | |||
790 | static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
791 | { | ||
792 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | ||
793 | |||
794 | if (!get_ldev(mdev)) { | ||
795 | if (__ratelimit(&drbd_ratelimit_state)) | ||
796 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); | ||
797 | kfree(udw); | ||
798 | return 1; | ||
799 | } | ||
800 | |||
801 | drbd_bm_write_sect(mdev, udw->enr); | ||
802 | put_ldev(mdev); | ||
803 | |||
804 | kfree(udw); | ||
805 | |||
806 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { | ||
807 | switch (mdev->state.conn) { | ||
808 | case C_SYNC_SOURCE: case C_SYNC_TARGET: | ||
809 | case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: | ||
810 | drbd_resync_finished(mdev); | ||
811 | default: | ||
812 | /* nothing to do */ | ||
813 | break; | ||
814 | } | ||
815 | } | ||
816 | drbd_bcast_sync_progress(mdev); | ||
817 | |||
818 | return 1; | ||
819 | } | ||
820 | |||
821 | |||
822 | /* ATTENTION. The AL's extents are 4MB each, while the extents in the | ||
823 | * resync LRU-cache are 16MB each. | ||
824 | * The caller of this function has to hold an get_ldev() reference. | ||
825 | * | ||
826 | * TODO will be obsoleted once we have a caching lru of the on disk bitmap | ||
827 | */ | ||
828 | static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | ||
829 | int count, int success) | ||
830 | { | ||
831 | struct lc_element *e; | ||
832 | struct update_odbm_work *udw; | ||
833 | |||
834 | unsigned int enr; | ||
835 | |||
836 | D_ASSERT(atomic_read(&mdev->local_cnt)); | ||
837 | |||
838 | /* I simply assume that a sector/size pair never crosses | ||
839 | * a 16 MB extent border. (Currently this is true...) */ | ||
840 | enr = BM_SECT_TO_EXT(sector); | ||
841 | |||
842 | e = lc_get(mdev->resync, enr); | ||
843 | if (e) { | ||
844 | struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); | ||
845 | if (ext->lce.lc_number == enr) { | ||
846 | if (success) | ||
847 | ext->rs_left -= count; | ||
848 | else | ||
849 | ext->rs_failed += count; | ||
850 | if (ext->rs_left < ext->rs_failed) { | ||
851 | dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " | ||
852 | "rs_failed=%d count=%d\n", | ||
853 | (unsigned long long)sector, | ||
854 | ext->lce.lc_number, ext->rs_left, | ||
855 | ext->rs_failed, count); | ||
856 | dump_stack(); | ||
857 | |||
858 | lc_put(mdev->resync, &ext->lce); | ||
859 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
860 | return; | ||
861 | } | ||
862 | } else { | ||
863 | /* Normally this element should be in the cache, | ||
864 | * since drbd_rs_begin_io() pulled it already in. | ||
865 | * | ||
866 | * But maybe an application write finished, and we set | ||
867 | * something outside the resync lru_cache in sync. | ||
868 | */ | ||
869 | int rs_left = drbd_bm_e_weight(mdev, enr); | ||
870 | if (ext->flags != 0) { | ||
871 | dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" | ||
872 | " -> %d[%u;00]\n", | ||
873 | ext->lce.lc_number, ext->rs_left, | ||
874 | ext->flags, enr, rs_left); | ||
875 | ext->flags = 0; | ||
876 | } | ||
877 | if (ext->rs_failed) { | ||
878 | dev_warn(DEV, "Kicking resync_lru element enr=%u " | ||
879 | "out with rs_failed=%d\n", | ||
880 | ext->lce.lc_number, ext->rs_failed); | ||
881 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
882 | } | ||
883 | ext->rs_left = rs_left; | ||
884 | ext->rs_failed = success ? 0 : count; | ||
885 | lc_changed(mdev->resync, &ext->lce); | ||
886 | } | ||
887 | lc_put(mdev->resync, &ext->lce); | ||
888 | /* no race, we are within the al_lock! */ | ||
889 | |||
890 | if (ext->rs_left == ext->rs_failed) { | ||
891 | ext->rs_failed = 0; | ||
892 | |||
893 | udw = kmalloc(sizeof(*udw), GFP_ATOMIC); | ||
894 | if (udw) { | ||
895 | udw->enr = ext->lce.lc_number; | ||
896 | udw->w.cb = w_update_odbm; | ||
897 | drbd_queue_work_front(&mdev->data.work, &udw->w); | ||
898 | } else { | ||
899 | dev_warn(DEV, "Could not kmalloc an udw\n"); | ||
900 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
901 | } | ||
902 | } | ||
903 | } else { | ||
904 | dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", | ||
905 | mdev->resync_locked, | ||
906 | mdev->resync->nr_elements, | ||
907 | mdev->resync->flags); | ||
908 | } | ||
909 | } | ||
910 | |||
911 | /* clear the bit corresponding to the piece of storage in question: | ||
912 | * size byte of data starting from sector. Only clear a bits of the affected | ||
913 | * one ore more _aligned_ BM_BLOCK_SIZE blocks. | ||
914 | * | ||
915 | * called by worker on C_SYNC_TARGET and receiver on SyncSource. | ||
916 | * | ||
917 | */ | ||
918 | void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | ||
919 | const char *file, const unsigned int line) | ||
920 | { | ||
921 | /* Is called from worker and receiver context _only_ */ | ||
922 | unsigned long sbnr, ebnr, lbnr; | ||
923 | unsigned long count = 0; | ||
924 | sector_t esector, nr_sectors; | ||
925 | int wake_up = 0; | ||
926 | unsigned long flags; | ||
927 | |||
928 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
929 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | ||
930 | (unsigned long long)sector, size); | ||
931 | return; | ||
932 | } | ||
933 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
934 | esector = sector + (size >> 9) - 1; | ||
935 | |||
936 | ERR_IF(sector >= nr_sectors) return; | ||
937 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | ||
938 | |||
939 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
940 | |||
941 | /* we clear it (in sync). | ||
942 | * round up start sector, round down end sector. we make sure we only | ||
943 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
944 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
945 | return; | ||
946 | if (unlikely(esector == (nr_sectors-1))) | ||
947 | ebnr = lbnr; | ||
948 | else | ||
949 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
950 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
951 | |||
952 | if (sbnr > ebnr) | ||
953 | return; | ||
954 | |||
955 | /* | ||
956 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
957 | * we count rs_{total,left} in bits, not sectors. | ||
958 | */ | ||
959 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
960 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); | ||
961 | if (count) { | ||
962 | /* we need the lock for drbd_try_clear_on_disk_bm */ | ||
963 | if (jiffies - mdev->rs_mark_time > HZ*10) { | ||
964 | /* should be rolling marks, | ||
965 | * but we estimate only anyways. */ | ||
966 | if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && | ||
967 | mdev->state.conn != C_PAUSED_SYNC_T && | ||
968 | mdev->state.conn != C_PAUSED_SYNC_S) { | ||
969 | mdev->rs_mark_time = jiffies; | ||
970 | mdev->rs_mark_left = drbd_bm_total_weight(mdev); | ||
971 | } | ||
972 | } | ||
973 | if (get_ldev(mdev)) { | ||
974 | drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); | ||
975 | put_ldev(mdev); | ||
976 | } | ||
977 | /* just wake_up unconditional now, various lc_chaged(), | ||
978 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
979 | wake_up = 1; | ||
980 | } | ||
981 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
982 | if (wake_up) | ||
983 | wake_up(&mdev->al_wait); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * this is intended to set one request worth of data out of sync. | ||
988 | * affects at least 1 bit, | ||
989 | * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. | ||
990 | * | ||
991 | * called by tl_clear and drbd_send_dblock (==drbd_make_request). | ||
992 | * so this can be _any_ process. | ||
993 | */ | ||
994 | void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | ||
995 | const char *file, const unsigned int line) | ||
996 | { | ||
997 | unsigned long sbnr, ebnr, lbnr, flags; | ||
998 | sector_t esector, nr_sectors; | ||
999 | unsigned int enr, count; | ||
1000 | struct lc_element *e; | ||
1001 | |||
1002 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1003 | dev_err(DEV, "sector: %llus, size: %d\n", | ||
1004 | (unsigned long long)sector, size); | ||
1005 | return; | ||
1006 | } | ||
1007 | |||
1008 | if (!get_ldev(mdev)) | ||
1009 | return; /* no disk, no metadata, no bitmap to set bits in */ | ||
1010 | |||
1011 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
1012 | esector = sector + (size >> 9) - 1; | ||
1013 | |||
1014 | ERR_IF(sector >= nr_sectors) | ||
1015 | goto out; | ||
1016 | ERR_IF(esector >= nr_sectors) | ||
1017 | esector = (nr_sectors-1); | ||
1018 | |||
1019 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1020 | |||
1021 | /* we set it out of sync, | ||
1022 | * we do not need to round anything here */ | ||
1023 | sbnr = BM_SECT_TO_BIT(sector); | ||
1024 | ebnr = BM_SECT_TO_BIT(esector); | ||
1025 | |||
1026 | /* ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1027 | * we count rs_{total,left} in bits, not sectors. */ | ||
1028 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
1029 | count = drbd_bm_set_bits(mdev, sbnr, ebnr); | ||
1030 | |||
1031 | enr = BM_SECT_TO_EXT(sector); | ||
1032 | e = lc_find(mdev->resync, enr); | ||
1033 | if (e) | ||
1034 | lc_entry(e, struct bm_extent, lce)->rs_left += count; | ||
1035 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1036 | |||
1037 | out: | ||
1038 | put_ldev(mdev); | ||
1039 | } | ||
1040 | |||
1041 | static | ||
1042 | struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | ||
1043 | { | ||
1044 | struct lc_element *e; | ||
1045 | struct bm_extent *bm_ext; | ||
1046 | int wakeup = 0; | ||
1047 | unsigned long rs_flags; | ||
1048 | |||
1049 | spin_lock_irq(&mdev->al_lock); | ||
1050 | if (mdev->resync_locked > mdev->resync->nr_elements/2) { | ||
1051 | spin_unlock_irq(&mdev->al_lock); | ||
1052 | return NULL; | ||
1053 | } | ||
1054 | e = lc_get(mdev->resync, enr); | ||
1055 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1056 | if (bm_ext) { | ||
1057 | if (bm_ext->lce.lc_number != enr) { | ||
1058 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | ||
1059 | bm_ext->rs_failed = 0; | ||
1060 | lc_changed(mdev->resync, &bm_ext->lce); | ||
1061 | wakeup = 1; | ||
1062 | } | ||
1063 | if (bm_ext->lce.refcnt == 1) | ||
1064 | mdev->resync_locked++; | ||
1065 | set_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1066 | } | ||
1067 | rs_flags = mdev->resync->flags; | ||
1068 | spin_unlock_irq(&mdev->al_lock); | ||
1069 | if (wakeup) | ||
1070 | wake_up(&mdev->al_wait); | ||
1071 | |||
1072 | if (!bm_ext) { | ||
1073 | if (rs_flags & LC_STARVING) | ||
1074 | dev_warn(DEV, "Have to wait for element" | ||
1075 | " (resync LRU too small?)\n"); | ||
1076 | BUG_ON(rs_flags & LC_DIRTY); | ||
1077 | } | ||
1078 | |||
1079 | return bm_ext; | ||
1080 | } | ||
1081 | |||
1082 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) | ||
1083 | { | ||
1084 | struct lc_element *al_ext; | ||
1085 | int rv = 0; | ||
1086 | |||
1087 | spin_lock_irq(&mdev->al_lock); | ||
1088 | if (unlikely(enr == mdev->act_log->new_number)) | ||
1089 | rv = 1; | ||
1090 | else { | ||
1091 | al_ext = lc_find(mdev->act_log, enr); | ||
1092 | if (al_ext) { | ||
1093 | if (al_ext->refcnt) | ||
1094 | rv = 1; | ||
1095 | } | ||
1096 | } | ||
1097 | spin_unlock_irq(&mdev->al_lock); | ||
1098 | |||
1099 | /* | ||
1100 | if (unlikely(rv)) { | ||
1101 | dev_info(DEV, "Delaying sync read until app's write is done\n"); | ||
1102 | } | ||
1103 | */ | ||
1104 | return rv; | ||
1105 | } | ||
1106 | |||
1107 | /** | ||
1108 | * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED | ||
1109 | * @mdev: DRBD device. | ||
1110 | * @sector: The sector number. | ||
1111 | * | ||
1112 | * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. | ||
1113 | */ | ||
1114 | int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
1115 | { | ||
1116 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1117 | struct bm_extent *bm_ext; | ||
1118 | int i, sig; | ||
1119 | |||
1120 | sig = wait_event_interruptible(mdev->al_wait, | ||
1121 | (bm_ext = _bme_get(mdev, enr))); | ||
1122 | if (sig) | ||
1123 | return 0; | ||
1124 | |||
1125 | if (test_bit(BME_LOCKED, &bm_ext->flags)) | ||
1126 | return 1; | ||
1127 | |||
1128 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | ||
1129 | sig = wait_event_interruptible(mdev->al_wait, | ||
1130 | !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); | ||
1131 | if (sig) { | ||
1132 | spin_lock_irq(&mdev->al_lock); | ||
1133 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) { | ||
1134 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1135 | mdev->resync_locked--; | ||
1136 | wake_up(&mdev->al_wait); | ||
1137 | } | ||
1138 | spin_unlock_irq(&mdev->al_lock); | ||
1139 | return 0; | ||
1140 | } | ||
1141 | } | ||
1142 | |||
1143 | set_bit(BME_LOCKED, &bm_ext->flags); | ||
1144 | |||
1145 | return 1; | ||
1146 | } | ||
1147 | |||
/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area, or if no resync
 * LRU element could be obtained right now.
 *
 * When -EAGAIN is returned while a reference on the extent is held, the
 * extent number is remembered in mdev->resync_wenr; a later call for a
 * different extent drops that reference first.
 */
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	/* first activity log extent number covered by resync extent enr */
	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&mdev->al_lock);
	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
		/* in case you have very heavy scattered io, it may
		 * stall the syncer indefinitely if we give up the ref count
		 * when we try again and requeue.
		 *
		 * if we don't give up the refcount, but the next time
		 * we are scheduled this extent has been "synced" by new
		 * application writes, we'd miss the lc_put on the
		 * extent we keep the refcount on.
		 * so we remembered which extent we had to try again, and
		 * if the next requested one is something else, we do
		 * the lc_put here...
		 * we also have to wake_up
		 */
		e = lc_find(mdev->resync, mdev->resync_wenr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (bm_ext) {
			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			mdev->resync_wenr = LC_FREE;
			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
				mdev->resync_locked--;
			wake_up(&mdev->al_wait);
		} else {
			/* resync_wenr names an extent that is not in the
			 * resync LRU */
			dev_alert(DEV, "LOGIC BUG\n");
		}
	}
	/* TRY. */
	e = lc_try_get(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		/* already locked: succeed right away */
		if (test_bit(BME_LOCKED, &bm_ext->flags))
			goto proceed;
		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
			/* BME_NO_WRITES was not set before: count this
			 * extent in resync_locked */
			mdev->resync_locked++;
		} else {
			/* we did set the BME_NO_WRITES,
			 * but then could not set BME_LOCKED,
			 * so we tried again.
			 * drop the extra reference. */
			bm_ext->lce.refcnt--;
			D_ASSERT(bm_ext->lce.refcnt > 0);
		}
		goto check_al;
	} else {
		/* do we rather want to try later? */
		if (mdev->resync_locked > mdev->resync->nr_elements-3)
			goto try_again;
		/* Do or do not. There is no try. -- Yoda */
		e = lc_get(mdev->resync, enr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (!bm_ext) {
			const unsigned long rs_flags = mdev->resync->flags;
			if (rs_flags & LC_STARVING)
				dev_warn(DEV, "Have to wait for element"
				     " (resync LRU too small?)\n");
			BUG_ON(rs_flags & LC_DIRTY);
			/* bm_ext is NULL here, so try_again will not
			 * record enr in resync_wenr */
			goto try_again;
		}
		if (bm_ext->lce.lc_number != enr) {
			/* element was recycled for enr: reset its counters */
			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
			bm_ext->rs_failed = 0;
			lc_changed(mdev->resync, &bm_ext->lce);
			wake_up(&mdev->al_wait);
			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
		}
		set_bit(BME_NO_WRITES, &bm_ext->flags);
		D_ASSERT(bm_ext->lce.refcnt == 1);
		mdev->resync_locked++;
		goto check_al;
	}
check_al:
	/* any of the covered activity log extents still in use (or about to
	 * become the new AL extent)?  then we cannot lock yet. */
	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		if (unlikely(al_enr+i == mdev->act_log->new_number))
			goto try_again;
		if (lc_is_used(mdev->act_log, al_enr+i))
			goto try_again;
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
	mdev->resync_wenr = LC_FREE;
	spin_unlock_irq(&mdev->al_lock);
	return 0;

try_again:
	/* remember the extent we still hold a reference on, so that the
	 * reference can be dropped if the next request is for another one */
	if (bm_ext)
		mdev->resync_wenr = enr;
	spin_unlock_irq(&mdev->al_lock);
	return -EAGAIN;
}
1257 | |||
1258 | void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) | ||
1259 | { | ||
1260 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1261 | struct lc_element *e; | ||
1262 | struct bm_extent *bm_ext; | ||
1263 | unsigned long flags; | ||
1264 | |||
1265 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
1266 | e = lc_find(mdev->resync, enr); | ||
1267 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1268 | if (!bm_ext) { | ||
1269 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1270 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1271 | dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); | ||
1272 | return; | ||
1273 | } | ||
1274 | |||
1275 | if (bm_ext->lce.refcnt == 0) { | ||
1276 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1277 | dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " | ||
1278 | "but refcnt is 0!?\n", | ||
1279 | (unsigned long long)sector, enr); | ||
1280 | return; | ||
1281 | } | ||
1282 | |||
1283 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) { | ||
1284 | clear_bit(BME_LOCKED, &bm_ext->flags); | ||
1285 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1286 | mdev->resync_locked--; | ||
1287 | wake_up(&mdev->al_wait); | ||
1288 | } | ||
1289 | |||
1290 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1291 | } | ||
1292 | |||
1293 | /** | ||
1294 | * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) | ||
1295 | * @mdev: DRBD device. | ||
1296 | */ | ||
1297 | void drbd_rs_cancel_all(struct drbd_conf *mdev) | ||
1298 | { | ||
1299 | spin_lock_irq(&mdev->al_lock); | ||
1300 | |||
1301 | if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ | ||
1302 | lc_reset(mdev->resync); | ||
1303 | put_ldev(mdev); | ||
1304 | } | ||
1305 | mdev->resync_locked = 0; | ||
1306 | mdev->resync_wenr = LC_FREE; | ||
1307 | spin_unlock_irq(&mdev->al_lock); | ||
1308 | wake_up(&mdev->al_wait); | ||
1309 | } | ||
1310 | |||
1311 | /** | ||
1312 | * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU | ||
1313 | * @mdev: DRBD device. | ||
1314 | * | ||
1315 | * Returns 0 upon success, -EAGAIN if at least one reference count was | ||
1316 | * not zero. | ||
1317 | */ | ||
1318 | int drbd_rs_del_all(struct drbd_conf *mdev) | ||
1319 | { | ||
1320 | struct lc_element *e; | ||
1321 | struct bm_extent *bm_ext; | ||
1322 | int i; | ||
1323 | |||
1324 | spin_lock_irq(&mdev->al_lock); | ||
1325 | |||
1326 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
1327 | /* ok, ->resync is there. */ | ||
1328 | for (i = 0; i < mdev->resync->nr_elements; i++) { | ||
1329 | e = lc_element_by_index(mdev->resync, i); | ||
1330 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1331 | if (bm_ext->lce.lc_number == LC_FREE) | ||
1332 | continue; | ||
1333 | if (bm_ext->lce.lc_number == mdev->resync_wenr) { | ||
1334 | dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" | ||
1335 | " got 'synced' by application io\n", | ||
1336 | mdev->resync_wenr); | ||
1337 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1338 | D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1339 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1340 | mdev->resync_wenr = LC_FREE; | ||
1341 | lc_put(mdev->resync, &bm_ext->lce); | ||
1342 | } | ||
1343 | if (bm_ext->lce.refcnt != 0) { | ||
1344 | dev_info(DEV, "Retrying drbd_rs_del_all() later. " | ||
1345 | "refcnt=%d\n", bm_ext->lce.refcnt); | ||
1346 | put_ldev(mdev); | ||
1347 | spin_unlock_irq(&mdev->al_lock); | ||
1348 | return -EAGAIN; | ||
1349 | } | ||
1350 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1351 | D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1352 | lc_del(mdev->resync, &bm_ext->lce); | ||
1353 | } | ||
1354 | D_ASSERT(mdev->resync->used == 0); | ||
1355 | put_ldev(mdev); | ||
1356 | } | ||
1357 | spin_unlock_irq(&mdev->al_lock); | ||
1358 | |||
1359 | return 0; | ||
1360 | } | ||
1361 | |||
1362 | /** | ||
1363 | * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks | ||
1364 | * @mdev: DRBD device. | ||
1365 | * @sector: The sector number. | ||
1366 | * @size: Size of failed IO operation, in byte. | ||
1367 | */ | ||
1368 | void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | ||
1369 | { | ||
1370 | /* Is called from worker and receiver context _only_ */ | ||
1371 | unsigned long sbnr, ebnr, lbnr; | ||
1372 | unsigned long count; | ||
1373 | sector_t esector, nr_sectors; | ||
1374 | int wake_up = 0; | ||
1375 | |||
1376 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1377 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | ||
1378 | (unsigned long long)sector, size); | ||
1379 | return; | ||
1380 | } | ||
1381 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
1382 | esector = sector + (size >> 9) - 1; | ||
1383 | |||
1384 | ERR_IF(sector >= nr_sectors) return; | ||
1385 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | ||
1386 | |||
1387 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1388 | |||
1389 | /* | ||
1390 | * round up start sector, round down end sector. we make sure we only | ||
1391 | * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
1392 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
1393 | return; | ||
1394 | if (unlikely(esector == (nr_sectors-1))) | ||
1395 | ebnr = lbnr; | ||
1396 | else | ||
1397 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
1398 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
1399 | |||
1400 | if (sbnr > ebnr) | ||
1401 | return; | ||
1402 | |||
1403 | /* | ||
1404 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1405 | * we count rs_{total,left} in bits, not sectors. | ||
1406 | */ | ||
1407 | spin_lock_irq(&mdev->al_lock); | ||
1408 | count = drbd_bm_count_bits(mdev, sbnr, ebnr); | ||
1409 | if (count) { | ||
1410 | mdev->rs_failed += count; | ||
1411 | |||
1412 | if (get_ldev(mdev)) { | ||
1413 | drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); | ||
1414 | put_ldev(mdev); | ||
1415 | } | ||
1416 | |||
1417 | /* just wake_up unconditional now, various lc_chaged(), | ||
1418 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
1419 | wake_up = 1; | ||
1420 | } | ||
1421 | spin_unlock_irq(&mdev->al_lock); | ||
1422 | if (wake_up) | ||
1423 | wake_up(&mdev->al_wait); | ||
1424 | } | ||
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 000000000000..b61057e77882 --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -0,0 +1,1327 @@ | |||
1 | /* | ||
2 | drbd_bitmap.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #include <linux/bitops.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/drbd.h> | ||
29 | #include <asm/kmap_types.h> | ||
30 | #include "drbd_int.h" | ||
31 | |||
32 | /* OPAQUE outside this file! | ||
33 | * interface defined in drbd_int.h | ||
34 | |||
35 | * convention: | ||
36 | * function name drbd_bm_... => used elsewhere, "public". | ||
37 | * function name bm_... => internal to implementation, "private". | ||
38 | |||
39 | * Note that since find_first_bit returns int, at the current granularity of | ||
40 | * the bitmap (4KB per bit), this implementation "only" supports up to | ||
41 | * 1<<(32+12) == 16 TB... | ||
42 | */ | ||
43 | |||
44 | /* | ||
45 | * NOTE | ||
46 | * Access to the *bm_pages is protected by bm_lock. | ||
47 | * It is safe to read the other members within the lock. | ||
48 | * | ||
49 | * drbd_bm_set_bits is called from bio_endio callbacks, | ||
50 | * We may be called with irq already disabled, | ||
51 | * so we need spin_lock_irqsave(). | ||
52 | * And we need the kmap_atomic. | ||
53 | */ | ||
/*
 * Per-device bitmap state. Allocated by drbd_bm_init() and stored in
 * mdev->bitmap; the layout is private to this file.
 */
struct drbd_bitmap {
	/* array of bm_number_of_pages page pointers holding the bitmap words */
	struct page **bm_pages;
	/* protects bm_pages and the counters below (see NOTE above) */
	spinlock_t bm_lock;
	/* WARNING unsigned long bm_*:
	 * 32bit number of bit offset is just enough for 512 MB bitmap.
	 * it will blow up if we make the bitmap bigger...
	 * not that it makes much sense to have a bitmap that large,
	 * rather change the granularity to 16k or 64k or something.
	 * (that implies other problems, however...)
	 */
	unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
	/* number of valid bits, derived from the capacity in drbd_bm_resize() */
	unsigned long bm_bits;
	/* number of longs in use; drbd_bm_resize() rounds bm_bits up to 64 bit */
	size_t bm_words;
	/* number of entries in bm_pages */
	size_t bm_number_of_pages;
	/* capacity (in sectors) last passed to drbd_bm_resize() */
	sector_t bm_dev_capacity;
	struct semaphore bm_change; /* serializes resize operations */

	/* outstanding page IOs of bm_rw(); bm_io_wait is woken when it hits 0 */
	atomic_t bm_async_io;
	wait_queue_head_t bm_io_wait;

	/* BM_* bit numbers defined below */
	unsigned long bm_flags;

	/* debugging aid, in case we are still racy somewhere */
	/* both are recorded by drbd_bm_lock() and reset by drbd_bm_unlock() */
	char *bm_why;
	struct task_struct *bm_task;
};
80 | |||
/* definition of bits in bm_flags */
/* set by drbd_bm_lock(), cleared by drbd_bm_unlock() */
#define BM_LOCKED 0
/* set by bm_async_io_complete() when a bitmap bio failed; tested in bm_rw() */
#define BM_MD_IO_ERROR 1
/* the bm_pages pointer array was obtained from vmalloc(), not kmalloc() */
#define BM_P_VMALLOCED 2
85 | |||
86 | static int bm_is_locked(struct drbd_bitmap *b) | ||
87 | { | ||
88 | return test_bit(BM_LOCKED, &b->bm_flags); | ||
89 | } | ||
90 | |||
91 | #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) | ||
92 | static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) | ||
93 | { | ||
94 | struct drbd_bitmap *b = mdev->bitmap; | ||
95 | if (!__ratelimit(&drbd_ratelimit_state)) | ||
96 | return; | ||
97 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", | ||
98 | current == mdev->receiver.task ? "receiver" : | ||
99 | current == mdev->asender.task ? "asender" : | ||
100 | current == mdev->worker.task ? "worker" : current->comm, | ||
101 | func, b->bm_why ?: "?", | ||
102 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
103 | b->bm_task == mdev->asender.task ? "asender" : | ||
104 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
105 | } | ||
106 | |||
107 | void drbd_bm_lock(struct drbd_conf *mdev, char *why) | ||
108 | { | ||
109 | struct drbd_bitmap *b = mdev->bitmap; | ||
110 | int trylock_failed; | ||
111 | |||
112 | if (!b) { | ||
113 | dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n"); | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | trylock_failed = down_trylock(&b->bm_change); | ||
118 | |||
119 | if (trylock_failed) { | ||
120 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", | ||
121 | current == mdev->receiver.task ? "receiver" : | ||
122 | current == mdev->asender.task ? "asender" : | ||
123 | current == mdev->worker.task ? "worker" : current->comm, | ||
124 | why, b->bm_why ?: "?", | ||
125 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
126 | b->bm_task == mdev->asender.task ? "asender" : | ||
127 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
128 | down(&b->bm_change); | ||
129 | } | ||
130 | if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) | ||
131 | dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); | ||
132 | |||
133 | b->bm_why = why; | ||
134 | b->bm_task = current; | ||
135 | } | ||
136 | |||
137 | void drbd_bm_unlock(struct drbd_conf *mdev) | ||
138 | { | ||
139 | struct drbd_bitmap *b = mdev->bitmap; | ||
140 | if (!b) { | ||
141 | dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n"); | ||
142 | return; | ||
143 | } | ||
144 | |||
145 | if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) | ||
146 | dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); | ||
147 | |||
148 | b->bm_why = NULL; | ||
149 | b->bm_task = NULL; | ||
150 | up(&b->bm_change); | ||
151 | } | ||
152 | |||
153 | /* word offset to long pointer */ | ||
154 | static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) | ||
155 | { | ||
156 | struct page *page; | ||
157 | unsigned long page_nr; | ||
158 | |||
159 | /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ | ||
160 | page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); | ||
161 | BUG_ON(page_nr >= b->bm_number_of_pages); | ||
162 | page = b->bm_pages[page_nr]; | ||
163 | |||
164 | return (unsigned long *) kmap_atomic(page, km); | ||
165 | } | ||
166 | |||
167 | static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) | ||
168 | { | ||
169 | return __bm_map_paddr(b, offset, KM_IRQ1); | ||
170 | } | ||
171 | |||
172 | static void __bm_unmap(unsigned long *p_addr, const enum km_type km) | ||
173 | { | ||
174 | kunmap_atomic(p_addr, km); | ||
175 | }; | ||
176 | |||
177 | static void bm_unmap(unsigned long *p_addr) | ||
178 | { | ||
179 | return __bm_unmap(p_addr, KM_IRQ1); | ||
180 | } | ||
181 | |||
/* long word offset of _bitmap_ sector */
#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
/* word offset from start of bitmap to word number _in_page_
 * modulo longs per page
 *	#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
 * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
 * so do it explicitly:
 */
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))

/* Long words per page */
#define LWPP (PAGE_SIZE/sizeof(long))
194 | |||
195 | /* | ||
196 | * actually most functions herein should take a struct drbd_bitmap*, not a | ||
197 | * struct drbd_conf*, but for the debug macros I like to have the mdev around | ||
198 | * to be able to report device specific. | ||
199 | */ | ||
200 | |||
201 | static void bm_free_pages(struct page **pages, unsigned long number) | ||
202 | { | ||
203 | unsigned long i; | ||
204 | if (!pages) | ||
205 | return; | ||
206 | |||
207 | for (i = 0; i < number; i++) { | ||
208 | if (!pages[i]) { | ||
209 | printk(KERN_ALERT "drbd: bm_free_pages tried to free " | ||
210 | "a NULL pointer; i=%lu n=%lu\n", | ||
211 | i, number); | ||
212 | continue; | ||
213 | } | ||
214 | __free_page(pages[i]); | ||
215 | pages[i] = NULL; | ||
216 | } | ||
217 | } | ||
218 | |||
/* Free @ptr with vfree() if @v is non-zero, with kfree() otherwise. */
static void bm_vk_free(void *ptr, int v)
{
	if (!v) {
		kfree(ptr);
		return;
	}
	vfree(ptr);
}
226 | |||
227 | /* | ||
228 | * "have" and "want" are NUMBER OF PAGES. | ||
229 | */ | ||
230 | static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | ||
231 | { | ||
232 | struct page **old_pages = b->bm_pages; | ||
233 | struct page **new_pages, *page; | ||
234 | unsigned int i, bytes, vmalloced = 0; | ||
235 | unsigned long have = b->bm_number_of_pages; | ||
236 | |||
237 | BUG_ON(have == 0 && old_pages != NULL); | ||
238 | BUG_ON(have != 0 && old_pages == NULL); | ||
239 | |||
240 | if (have == want) | ||
241 | return old_pages; | ||
242 | |||
243 | /* Trying kmalloc first, falling back to vmalloc. | ||
244 | * GFP_KERNEL is ok, as this is done when a lower level disk is | ||
245 | * "attached" to the drbd. Context is receiver thread or cqueue | ||
246 | * thread. As we have no disk yet, we are not in the IO path, | ||
247 | * not even the IO path of the peer. */ | ||
248 | bytes = sizeof(struct page *)*want; | ||
249 | new_pages = kmalloc(bytes, GFP_KERNEL); | ||
250 | if (!new_pages) { | ||
251 | new_pages = vmalloc(bytes); | ||
252 | if (!new_pages) | ||
253 | return NULL; | ||
254 | vmalloced = 1; | ||
255 | } | ||
256 | |||
257 | memset(new_pages, 0, bytes); | ||
258 | if (want >= have) { | ||
259 | for (i = 0; i < have; i++) | ||
260 | new_pages[i] = old_pages[i]; | ||
261 | for (; i < want; i++) { | ||
262 | page = alloc_page(GFP_HIGHUSER); | ||
263 | if (!page) { | ||
264 | bm_free_pages(new_pages + have, i - have); | ||
265 | bm_vk_free(new_pages, vmalloced); | ||
266 | return NULL; | ||
267 | } | ||
268 | new_pages[i] = page; | ||
269 | } | ||
270 | } else { | ||
271 | for (i = 0; i < want; i++) | ||
272 | new_pages[i] = old_pages[i]; | ||
273 | /* NOT HERE, we are outside the spinlock! | ||
274 | bm_free_pages(old_pages + want, have - want); | ||
275 | */ | ||
276 | } | ||
277 | |||
278 | if (vmalloced) | ||
279 | set_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
280 | else | ||
281 | clear_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
282 | |||
283 | return new_pages; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * called on driver init only. TODO call when a device is created. | ||
288 | * allocates the drbd_bitmap, and stores it in mdev->bitmap. | ||
289 | */ | ||
290 | int drbd_bm_init(struct drbd_conf *mdev) | ||
291 | { | ||
292 | struct drbd_bitmap *b = mdev->bitmap; | ||
293 | WARN_ON(b != NULL); | ||
294 | b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); | ||
295 | if (!b) | ||
296 | return -ENOMEM; | ||
297 | spin_lock_init(&b->bm_lock); | ||
298 | init_MUTEX(&b->bm_change); | ||
299 | init_waitqueue_head(&b->bm_io_wait); | ||
300 | |||
301 | mdev->bitmap = b; | ||
302 | |||
303 | return 0; | ||
304 | } | ||
305 | |||
306 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) | ||
307 | { | ||
308 | ERR_IF(!mdev->bitmap) return 0; | ||
309 | return mdev->bitmap->bm_dev_capacity; | ||
310 | } | ||
311 | |||
312 | /* called on driver unload. TODO: call when a device is destroyed. | ||
313 | */ | ||
314 | void drbd_bm_cleanup(struct drbd_conf *mdev) | ||
315 | { | ||
316 | ERR_IF (!mdev->bitmap) return; | ||
317 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); | ||
318 | bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); | ||
319 | kfree(mdev->bitmap); | ||
320 | mdev->bitmap = NULL; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * since (b->bm_bits % BITS_PER_LONG) != 0, | ||
325 | * this masks out the remaining bits. | ||
326 | * Returns the number of bits cleared. | ||
327 | */ | ||
328 | static int bm_clear_surplus(struct drbd_bitmap *b) | ||
329 | { | ||
330 | const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; | ||
331 | size_t w = b->bm_bits >> LN2_BPL; | ||
332 | int cleared = 0; | ||
333 | unsigned long *p_addr, *bm; | ||
334 | |||
335 | p_addr = bm_map_paddr(b, w); | ||
336 | bm = p_addr + MLPP(w); | ||
337 | if (w < b->bm_words) { | ||
338 | cleared = hweight_long(*bm & ~mask); | ||
339 | *bm &= mask; | ||
340 | w++; bm++; | ||
341 | } | ||
342 | |||
343 | if (w < b->bm_words) { | ||
344 | cleared += hweight_long(*bm); | ||
345 | *bm = 0; | ||
346 | } | ||
347 | bm_unmap(p_addr); | ||
348 | return cleared; | ||
349 | } | ||
350 | |||
351 | static void bm_set_surplus(struct drbd_bitmap *b) | ||
352 | { | ||
353 | const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; | ||
354 | size_t w = b->bm_bits >> LN2_BPL; | ||
355 | unsigned long *p_addr, *bm; | ||
356 | |||
357 | p_addr = bm_map_paddr(b, w); | ||
358 | bm = p_addr + MLPP(w); | ||
359 | if (w < b->bm_words) { | ||
360 | *bm |= ~mask; | ||
361 | bm++; w++; | ||
362 | } | ||
363 | |||
364 | if (w < b->bm_words) { | ||
365 | *bm = ~(0UL); | ||
366 | } | ||
367 | bm_unmap(p_addr); | ||
368 | } | ||
369 | |||
370 | static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) | ||
371 | { | ||
372 | unsigned long *p_addr, *bm, offset = 0; | ||
373 | unsigned long bits = 0; | ||
374 | unsigned long i, do_now; | ||
375 | |||
376 | while (offset < b->bm_words) { | ||
377 | i = do_now = min_t(size_t, b->bm_words-offset, LWPP); | ||
378 | p_addr = __bm_map_paddr(b, offset, KM_USER0); | ||
379 | bm = p_addr + MLPP(offset); | ||
380 | while (i--) { | ||
381 | #ifndef __LITTLE_ENDIAN | ||
382 | if (swap_endian) | ||
383 | *bm = lel_to_cpu(*bm); | ||
384 | #endif | ||
385 | bits += hweight_long(*bm++); | ||
386 | } | ||
387 | __bm_unmap(p_addr, KM_USER0); | ||
388 | offset += do_now; | ||
389 | cond_resched(); | ||
390 | } | ||
391 | |||
392 | return bits; | ||
393 | } | ||
394 | |||
/* Count the set bits; the bitmap words are left untouched. */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
	const unsigned long bits = __bm_count_bits(b, 0);

	return bits;
}
399 | |||
/* Count the set bits, requesting the lel_to_cpu conversion on the way. */
static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
	const unsigned long bits = __bm_count_bits(b, 1);

	return bits;
}
404 | |||
405 | /* offset and len in long words.*/ | ||
406 | static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) | ||
407 | { | ||
408 | unsigned long *p_addr, *bm; | ||
409 | size_t do_now, end; | ||
410 | |||
411 | #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) | ||
412 | |||
413 | end = offset + len; | ||
414 | |||
415 | if (end > b->bm_words) { | ||
416 | printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); | ||
417 | return; | ||
418 | } | ||
419 | |||
420 | while (offset < end) { | ||
421 | do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; | ||
422 | p_addr = bm_map_paddr(b, offset); | ||
423 | bm = p_addr + MLPP(offset); | ||
424 | if (bm+do_now > p_addr + LWPP) { | ||
425 | printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", | ||
426 | p_addr, bm, (int)do_now); | ||
427 | break; /* breaks to after catch_oob_access_end() only! */ | ||
428 | } | ||
429 | memset(bm, c, do_now * sizeof(long)); | ||
430 | bm_unmap(p_addr); | ||
431 | offset += do_now; | ||
432 | } | ||
433 | } | ||
434 | |||
/*
 * make sure the bitmap has enough room for the attached storage,
 * if necessary, resize.
 * called whenever we may have changed the device size.
 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
 * In case this is actually a resize, we copy the old bitmap into the new one.
 * Otherwise, the bitmap is initialized to all bits set.
 *
 * @capacity is the new device size in sectors; 0 releases the bitmap pages.
 * The whole operation runs under drbd_bm_lock(mdev, "resize").
 */
int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long bits, words, owords, obits, *p_addr, *bm;
	unsigned long want, have, onpages; /* number of pages */
	struct page **npages, **opages = NULL;
	int err = 0, growing;
	int opages_vmalloced;

	ERR_IF(!b) return -ENOMEM;

	drbd_bm_lock(mdev, "resize");

	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
			(unsigned long long)capacity);

	/* nothing to do if the size did not change */
	if (capacity == b->bm_dev_capacity)
		goto out;

	/* remember how the OLD pointer array was allocated;
	 * bm_realloc_pages() below may change the flag */
	opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);

	if (capacity == 0) {
		/* detach everything under the spinlock, free outside of it */
		spin_lock_irq(&b->bm_lock);
		opages = b->bm_pages;
		onpages = b->bm_number_of_pages;
		owords = b->bm_words;
		b->bm_pages = NULL;
		b->bm_number_of_pages =
		b->bm_set =
		b->bm_bits =
		b->bm_words =
		b->bm_dev_capacity = 0;
		spin_unlock_irq(&b->bm_lock);
		bm_free_pages(opages, onpages);
		bm_vk_free(opages, opages_vmalloced);
		goto out;
	}
	bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));

	/* if we would use
	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
	   a 32bit host could present the wrong number of words
	   to a 64bit host.
	*/
	words = ALIGN(bits, 64) >> LN2_BPL;

	/* NOTE(review): looks like a check that the on-disk bitmap area
	 * is large enough for this many bits -- confirm */
	if (get_ldev(mdev)) {
		D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
		put_ldev(mdev);
	}

	/* one extra long to catch off by one errors */
	want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
	have = b->bm_number_of_pages;
	if (want == have) {
		D_ASSERT(b->bm_pages != NULL);
		npages = b->bm_pages;
	} else {
		/* fault injection can simulate the allocation failure */
		if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
			npages = NULL;
		else
			npages = bm_realloc_pages(b, want);
	}

	if (!npages) {
		err = -ENOMEM;
		goto out;
	}

	spin_lock_irq(&b->bm_lock);
	opages = b->bm_pages;
	owords = b->bm_words;
	obits = b->bm_bits;

	growing = bits > obits;
	/* uses the OLD geometry: sets the bits beyond the old bm_bits */
	if (opages)
		bm_set_surplus(b);

	/* switch over to the new geometry */
	b->bm_pages = npages;
	b->bm_number_of_pages = want;
	b->bm_bits = bits;
	b->bm_words = words;
	b->bm_dev_capacity = capacity;

	if (growing) {
		/* the words that were added start out as all bits set */
		bm_memset(b, owords, 0xff, words-owords);
		b->bm_set += bits - obits;
	}

	if (want < have) {
		/* implicit: (opages != NULL) && (opages != npages) */
		bm_free_pages(opages + want, have - want);
	}

	/* marker in the one extra long behind the last word */
	p_addr = bm_map_paddr(b, words);
	bm = p_addr + MLPP(words);
	*bm = DRBD_MAGIC;
	bm_unmap(p_addr);

	/* uses the NEW geometry: clears the bits beyond the new bm_bits */
	(void)bm_clear_surplus(b);

	spin_unlock_irq(&b->bm_lock);
	/* the old pointer array is freed only if it was replaced */
	if (opages != npages)
		bm_vk_free(opages, opages_vmalloced);
	/* when not growing, recount instead of adjusting incrementally */
	if (!growing)
		b->bm_set = bm_count_bits(b);
	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);

 out:
	drbd_bm_unlock(mdev);
	return err;
}
555 | |||
556 | /* inherently racy: | ||
557 | * if not protected by other means, return value may be out of date when | ||
558 | * leaving this function... | ||
559 | * we still need to lock it, since it is important that this returns | ||
560 | * bm_set == 0 precisely. | ||
561 | * | ||
562 | * maybe bm_set should be atomic_t ? | ||
563 | */ | ||
564 | static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) | ||
565 | { | ||
566 | struct drbd_bitmap *b = mdev->bitmap; | ||
567 | unsigned long s; | ||
568 | unsigned long flags; | ||
569 | |||
570 | ERR_IF(!b) return 0; | ||
571 | ERR_IF(!b->bm_pages) return 0; | ||
572 | |||
573 | spin_lock_irqsave(&b->bm_lock, flags); | ||
574 | s = b->bm_set; | ||
575 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
576 | |||
577 | return s; | ||
578 | } | ||
579 | |||
580 | unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) | ||
581 | { | ||
582 | unsigned long s; | ||
583 | /* if I don't have a disk, I don't know about out-of-sync status */ | ||
584 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | ||
585 | return 0; | ||
586 | s = _drbd_bm_total_weight(mdev); | ||
587 | put_ldev(mdev); | ||
588 | return s; | ||
589 | } | ||
590 | |||
591 | size_t drbd_bm_words(struct drbd_conf *mdev) | ||
592 | { | ||
593 | struct drbd_bitmap *b = mdev->bitmap; | ||
594 | ERR_IF(!b) return 0; | ||
595 | ERR_IF(!b->bm_pages) return 0; | ||
596 | |||
597 | return b->bm_words; | ||
598 | } | ||
599 | |||
600 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) | ||
601 | { | ||
602 | struct drbd_bitmap *b = mdev->bitmap; | ||
603 | ERR_IF(!b) return 0; | ||
604 | |||
605 | return b->bm_bits; | ||
606 | } | ||
607 | |||
608 | /* merge number words from buffer into the bitmap starting at offset. | ||
609 | * buffer[i] is expected to be little endian unsigned long. | ||
610 | * bitmap must be locked by drbd_bm_lock. | ||
611 | * currently only used from receive_bitmap. | ||
612 | */ | ||
613 | void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, | ||
614 | unsigned long *buffer) | ||
615 | { | ||
616 | struct drbd_bitmap *b = mdev->bitmap; | ||
617 | unsigned long *p_addr, *bm; | ||
618 | unsigned long word, bits; | ||
619 | size_t end, do_now; | ||
620 | |||
621 | end = offset + number; | ||
622 | |||
623 | ERR_IF(!b) return; | ||
624 | ERR_IF(!b->bm_pages) return; | ||
625 | if (number == 0) | ||
626 | return; | ||
627 | WARN_ON(offset >= b->bm_words); | ||
628 | WARN_ON(end > b->bm_words); | ||
629 | |||
630 | spin_lock_irq(&b->bm_lock); | ||
631 | while (offset < end) { | ||
632 | do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; | ||
633 | p_addr = bm_map_paddr(b, offset); | ||
634 | bm = p_addr + MLPP(offset); | ||
635 | offset += do_now; | ||
636 | while (do_now--) { | ||
637 | bits = hweight_long(*bm); | ||
638 | word = *bm | lel_to_cpu(*buffer++); | ||
639 | *bm++ = word; | ||
640 | b->bm_set += hweight_long(word) - bits; | ||
641 | } | ||
642 | bm_unmap(p_addr); | ||
643 | } | ||
644 | /* with 32bit <-> 64bit cross-platform connect | ||
645 | * this is only correct for current usage, | ||
646 | * where we _know_ that we are 64 bit aligned, | ||
647 | * and know that this function is used in this way, too... | ||
648 | */ | ||
649 | if (end == b->bm_words) | ||
650 | b->bm_set -= bm_clear_surplus(b); | ||
651 | |||
652 | spin_unlock_irq(&b->bm_lock); | ||
653 | } | ||
654 | |||
655 | /* copy number words from the bitmap starting at offset into the buffer. | ||
656 | * buffer[i] will be little endian unsigned long. | ||
657 | */ | ||
658 | void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | ||
659 | unsigned long *buffer) | ||
660 | { | ||
661 | struct drbd_bitmap *b = mdev->bitmap; | ||
662 | unsigned long *p_addr, *bm; | ||
663 | size_t end, do_now; | ||
664 | |||
665 | end = offset + number; | ||
666 | |||
667 | ERR_IF(!b) return; | ||
668 | ERR_IF(!b->bm_pages) return; | ||
669 | |||
670 | spin_lock_irq(&b->bm_lock); | ||
671 | if ((offset >= b->bm_words) || | ||
672 | (end > b->bm_words) || | ||
673 | (number <= 0)) | ||
674 | dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n", | ||
675 | (unsigned long) offset, | ||
676 | (unsigned long) number, | ||
677 | (unsigned long) b->bm_words); | ||
678 | else { | ||
679 | while (offset < end) { | ||
680 | do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; | ||
681 | p_addr = bm_map_paddr(b, offset); | ||
682 | bm = p_addr + MLPP(offset); | ||
683 | offset += do_now; | ||
684 | while (do_now--) | ||
685 | *buffer++ = cpu_to_lel(*bm++); | ||
686 | bm_unmap(p_addr); | ||
687 | } | ||
688 | } | ||
689 | spin_unlock_irq(&b->bm_lock); | ||
690 | } | ||
691 | |||
692 | /* set all bits in the bitmap */ | ||
693 | void drbd_bm_set_all(struct drbd_conf *mdev) | ||
694 | { | ||
695 | struct drbd_bitmap *b = mdev->bitmap; | ||
696 | ERR_IF(!b) return; | ||
697 | ERR_IF(!b->bm_pages) return; | ||
698 | |||
699 | spin_lock_irq(&b->bm_lock); | ||
700 | bm_memset(b, 0, 0xff, b->bm_words); | ||
701 | (void)bm_clear_surplus(b); | ||
702 | b->bm_set = b->bm_bits; | ||
703 | spin_unlock_irq(&b->bm_lock); | ||
704 | } | ||
705 | |||
706 | /* clear all bits in the bitmap */ | ||
707 | void drbd_bm_clear_all(struct drbd_conf *mdev) | ||
708 | { | ||
709 | struct drbd_bitmap *b = mdev->bitmap; | ||
710 | ERR_IF(!b) return; | ||
711 | ERR_IF(!b->bm_pages) return; | ||
712 | |||
713 | spin_lock_irq(&b->bm_lock); | ||
714 | bm_memset(b, 0, 0, b->bm_words); | ||
715 | b->bm_set = 0; | ||
716 | spin_unlock_irq(&b->bm_lock); | ||
717 | } | ||
718 | |||
719 | static void bm_async_io_complete(struct bio *bio, int error) | ||
720 | { | ||
721 | struct drbd_bitmap *b = bio->bi_private; | ||
722 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
723 | |||
724 | |||
725 | /* strange behavior of some lower level drivers... | ||
726 | * fail the request by clearing the uptodate flag, | ||
727 | * but do not return any error?! | ||
728 | * do we want to WARN() on this? */ | ||
729 | if (!error && !uptodate) | ||
730 | error = -EIO; | ||
731 | |||
732 | if (error) { | ||
733 | /* doh. what now? | ||
734 | * for now, set all bits, and flag MD_IO_ERROR */ | ||
735 | __set_bit(BM_MD_IO_ERROR, &b->bm_flags); | ||
736 | } | ||
737 | if (atomic_dec_and_test(&b->bm_async_io)) | ||
738 | wake_up(&b->bm_io_wait); | ||
739 | |||
740 | bio_put(bio); | ||
741 | } | ||
742 | |||
743 | static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) | ||
744 | { | ||
745 | /* we are process context. we always get a bio */ | ||
746 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); | ||
747 | unsigned int len; | ||
748 | sector_t on_disk_sector = | ||
749 | mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; | ||
750 | on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); | ||
751 | |||
752 | /* this might happen with very small | ||
753 | * flexible external meta data device */ | ||
754 | len = min_t(unsigned int, PAGE_SIZE, | ||
755 | (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); | ||
756 | |||
757 | bio->bi_bdev = mdev->ldev->md_bdev; | ||
758 | bio->bi_sector = on_disk_sector; | ||
759 | bio_add_page(bio, b->bm_pages[page_nr], len, 0); | ||
760 | bio->bi_private = b; | ||
761 | bio->bi_end_io = bm_async_io_complete; | ||
762 | |||
763 | if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { | ||
764 | bio->bi_rw |= rw; | ||
765 | bio_endio(bio, -EIO); | ||
766 | } else { | ||
767 | submit_bio(rw, bio); | ||
768 | } | ||
769 | } | ||
770 | |||
771 | # if defined(__LITTLE_ENDIAN) | ||
772 | /* nothing to do, on disk == in memory */ | ||
773 | # define bm_cpu_to_lel(x) ((void)0) | ||
774 | # else | ||
775 | void bm_cpu_to_lel(struct drbd_bitmap *b) | ||
776 | { | ||
777 | /* need to cpu_to_lel all the pages ... | ||
778 | * this may be optimized by using | ||
779 | * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; | ||
780 | * the following is still not optimal, but better than nothing */ | ||
781 | unsigned int i; | ||
782 | unsigned long *p_addr, *bm; | ||
783 | if (b->bm_set == 0) { | ||
784 | /* no page at all; avoid swap if all is 0 */ | ||
785 | i = b->bm_number_of_pages; | ||
786 | } else if (b->bm_set == b->bm_bits) { | ||
787 | /* only the last page */ | ||
788 | i = b->bm_number_of_pages - 1; | ||
789 | } else { | ||
790 | /* all pages */ | ||
791 | i = 0; | ||
792 | } | ||
793 | for (; i < b->bm_number_of_pages; i++) { | ||
794 | p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); | ||
795 | for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) | ||
796 | *bm = cpu_to_lel(*bm); | ||
797 | kunmap_atomic(p_addr, KM_USER0); | ||
798 | } | ||
799 | } | ||
800 | # endif | ||
801 | /* lel_to_cpu == cpu_to_lel */ | ||
802 | # define bm_lel_to_cpu(x) bm_cpu_to_lel(x) | ||
803 | |||
/*
 * bm_rw: read/write the whole bitmap from/to its on disk location.
 *
 * @rw is READ or WRITE. One bio per bitmap page is submitted and the
 * function waits until all of them have completed.
 * Returns 0 on success, -EIO if at least one of the page IOs failed.
 * After a READ, bm_set is recounted from the pages that were read.
 */
static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
{
	struct drbd_bitmap *b = mdev->bitmap;
	/* sector_t sector; */
	int bm_words, num_pages, i;
	unsigned long now;
	char ppb[10];
	int err = 0;

	/* only complains; the IO is done regardless */
	WARN_ON(!bm_is_locked(b));

	/* no spinlock here, the drbd_bm_lock should be enough! */

	bm_words = drbd_bm_words(mdev);
	/* pages needed for bm_words longs, rounded up */
	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;

	/* on disk bitmap is little endian */
	if (rw == WRITE)
		bm_cpu_to_lel(b);

	/* this value is overwritten after the IO, before it is ever read */
	now = jiffies;
	/* every completing bio decrements this; see bm_async_io_complete() */
	atomic_set(&b->bm_async_io, num_pages);
	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);

	/* let the layers below us try to merge these bios... */
	for (i = 0; i < num_pages; i++)
		bm_page_io_async(mdev, b, i, rw);

	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);

	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
		drbd_chk_io_error(mdev, 1, TRUE);
		err = -EIO;
	}

	/* start of the recount timing reported in the READ branch */
	now = jiffies;
	if (rw == WRITE) {
		/* swap back endianness */
		bm_lel_to_cpu(b);
		/* flush bitmap to stable storage */
		drbd_md_flush(mdev);
	} else /* rw == READ */ {
		/* just read, if necessary adjust endianness */
		b->bm_set = bm_count_bits_swap_endian(b);
		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
		     jiffies - now);
	}
	/* "now" is reused here: from this point on it holds the number of set bits */
	now = b->bm_set;

	/* printed for READ and for WRITE alike */
	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);

	return err;
}
863 | |||
864 | /** | ||
865 | * drbd_bm_read() - Read the whole bitmap from its on disk location. | ||
866 | * @mdev: DRBD device. | ||
867 | */ | ||
868 | int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) | ||
869 | { | ||
870 | return bm_rw(mdev, READ); | ||
871 | } | ||
872 | |||
873 | /** | ||
874 | * drbd_bm_write() - Write the whole bitmap to its on disk location. | ||
875 | * @mdev: DRBD device. | ||
876 | */ | ||
877 | int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) | ||
878 | { | ||
879 | return bm_rw(mdev, WRITE); | ||
880 | } | ||
881 | |||
882 | /** | ||
883 | * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap | ||
884 | * @mdev: DRBD device. | ||
885 | * @enr: Extent number in the resync lru (happens to be sector offset) | ||
886 | * | ||
887 | * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered | ||
888 | * by a single sector write. Therefore enr == sector offset from the | ||
889 | * start of the bitmap. | ||
890 | */ | ||
891 | int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) | ||
892 | { | ||
893 | sector_t on_disk_sector = enr + mdev->ldev->md.md_offset | ||
894 | + mdev->ldev->md.bm_offset; | ||
895 | int bm_words, num_words, offset; | ||
896 | int err = 0; | ||
897 | |||
898 | mutex_lock(&mdev->md_io_mutex); | ||
899 | bm_words = drbd_bm_words(mdev); | ||
900 | offset = S2W(enr); /* word offset into bitmap */ | ||
901 | num_words = min(S2W(1), bm_words - offset); | ||
902 | if (num_words < S2W(1)) | ||
903 | memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); | ||
904 | drbd_bm_get_lel(mdev, offset, num_words, | ||
905 | page_address(mdev->md_io_page)); | ||
906 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { | ||
907 | int i; | ||
908 | err = -EIO; | ||
909 | dev_err(DEV, "IO ERROR writing bitmap sector %lu " | ||
910 | "(meta-disk sector %llus)\n", | ||
911 | enr, (unsigned long long)on_disk_sector); | ||
912 | drbd_chk_io_error(mdev, 1, TRUE); | ||
913 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) | ||
914 | drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); | ||
915 | } | ||
916 | mdev->bm_writ_cnt++; | ||
917 | mutex_unlock(&mdev->md_io_mutex); | ||
918 | return err; | ||
919 | } | ||
920 | |||
921 | /* NOTE | ||
922 | * find_first_bit returns int, we return unsigned long. | ||
923 | * should not make much difference anyways, but ... | ||
924 | * | ||
925 | * this returns a bit number, NOT a sector! | ||
926 | */ | ||
927 | #define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) | ||
928 | static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, | ||
929 | const int find_zero_bit, const enum km_type km) | ||
930 | { | ||
931 | struct drbd_bitmap *b = mdev->bitmap; | ||
932 | unsigned long i = -1UL; | ||
933 | unsigned long *p_addr; | ||
934 | unsigned long bit_offset; /* bit offset of the mapped page. */ | ||
935 | |||
936 | if (bm_fo > b->bm_bits) { | ||
937 | dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); | ||
938 | } else { | ||
939 | while (bm_fo < b->bm_bits) { | ||
940 | unsigned long offset; | ||
941 | bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ | ||
942 | offset = bit_offset >> LN2_BPL; /* word offset of the page */ | ||
943 | p_addr = __bm_map_paddr(b, offset, km); | ||
944 | |||
945 | if (find_zero_bit) | ||
946 | i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); | ||
947 | else | ||
948 | i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); | ||
949 | |||
950 | __bm_unmap(p_addr, km); | ||
951 | if (i < PAGE_SIZE*8) { | ||
952 | i = bit_offset + i; | ||
953 | if (i >= b->bm_bits) | ||
954 | break; | ||
955 | goto found; | ||
956 | } | ||
957 | bm_fo = bit_offset + PAGE_SIZE*8; | ||
958 | } | ||
959 | i = -1UL; | ||
960 | } | ||
961 | found: | ||
962 | return i; | ||
963 | } | ||
964 | |||
965 | static unsigned long bm_find_next(struct drbd_conf *mdev, | ||
966 | unsigned long bm_fo, const int find_zero_bit) | ||
967 | { | ||
968 | struct drbd_bitmap *b = mdev->bitmap; | ||
969 | unsigned long i = -1UL; | ||
970 | |||
971 | ERR_IF(!b) return i; | ||
972 | ERR_IF(!b->bm_pages) return i; | ||
973 | |||
974 | spin_lock_irq(&b->bm_lock); | ||
975 | if (bm_is_locked(b)) | ||
976 | bm_print_lock_info(mdev); | ||
977 | |||
978 | i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); | ||
979 | |||
980 | spin_unlock_irq(&b->bm_lock); | ||
981 | return i; | ||
982 | } | ||
983 | |||
/* Find the next set bit at or after bit number bm_fo.
 * Goes through bm_find_next(), which does the locking.
 * Returns a bit number, or -1UL if none was found. */
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	const int find_zero_bit = 0;

	return bm_find_next(mdev, bm_fo, find_zero_bit);
}
988 | |||
989 | #if 0 | ||
990 | /* not yet needed for anything. */ | ||
991 | unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) | ||
992 | { | ||
993 | return bm_find_next(mdev, bm_fo, 1); | ||
994 | } | ||
995 | #endif | ||
996 | |||
997 | /* does not spin_lock_irqsave. | ||
998 | * you must take drbd_bm_lock() first */ | ||
999 | unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) | ||
1000 | { | ||
1001 | /* WARN_ON(!bm_is_locked(mdev)); */ | ||
1002 | return __bm_find_next(mdev, bm_fo, 0, KM_USER1); | ||
1003 | } | ||
1004 | |||
1005 | unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) | ||
1006 | { | ||
1007 | /* WARN_ON(!bm_is_locked(mdev)); */ | ||
1008 | return __bm_find_next(mdev, bm_fo, 1, KM_USER1); | ||
1009 | } | ||
1010 | |||
1011 | /* returns number of bits actually changed. | ||
1012 | * for val != 0, we change 0 -> 1, return code positive | ||
1013 | * for val == 0, we change 1 -> 0, return code negative | ||
1014 | * wants bitnr, not sector. | ||
1015 | * expected to be called for only a few bits (e - s about BITS_PER_LONG). | ||
1016 | * Must hold bitmap lock already. */ | ||
1017 | int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
1018 | unsigned long e, int val, const enum km_type km) | ||
1019 | { | ||
1020 | struct drbd_bitmap *b = mdev->bitmap; | ||
1021 | unsigned long *p_addr = NULL; | ||
1022 | unsigned long bitnr; | ||
1023 | unsigned long last_page_nr = -1UL; | ||
1024 | int c = 0; | ||
1025 | |||
1026 | if (e >= b->bm_bits) { | ||
1027 | dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", | ||
1028 | s, e, b->bm_bits); | ||
1029 | e = b->bm_bits ? b->bm_bits -1 : 0; | ||
1030 | } | ||
1031 | for (bitnr = s; bitnr <= e; bitnr++) { | ||
1032 | unsigned long offset = bitnr>>LN2_BPL; | ||
1033 | unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); | ||
1034 | if (page_nr != last_page_nr) { | ||
1035 | if (p_addr) | ||
1036 | __bm_unmap(p_addr, km); | ||
1037 | p_addr = __bm_map_paddr(b, offset, km); | ||
1038 | last_page_nr = page_nr; | ||
1039 | } | ||
1040 | if (val) | ||
1041 | c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); | ||
1042 | else | ||
1043 | c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); | ||
1044 | } | ||
1045 | if (p_addr) | ||
1046 | __bm_unmap(p_addr, km); | ||
1047 | b->bm_set += c; | ||
1048 | return c; | ||
1049 | } | ||
1050 | |||
1051 | /* returns number of bits actually changed. | ||
1052 | * for val != 0, we change 0 -> 1, return code positive | ||
1053 | * for val == 0, we change 1 -> 0, return code negative | ||
1054 | * wants bitnr, not sector */ | ||
1055 | int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
1056 | const unsigned long e, int val) | ||
1057 | { | ||
1058 | unsigned long flags; | ||
1059 | struct drbd_bitmap *b = mdev->bitmap; | ||
1060 | int c = 0; | ||
1061 | |||
1062 | ERR_IF(!b) return 1; | ||
1063 | ERR_IF(!b->bm_pages) return 0; | ||
1064 | |||
1065 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1066 | if (bm_is_locked(b)) | ||
1067 | bm_print_lock_info(mdev); | ||
1068 | |||
1069 | c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); | ||
1070 | |||
1071 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1072 | return c; | ||
1073 | } | ||
1074 | |||
/* Set bits s..e (inclusive).
 * Returns the number of bits changed 0 -> 1. */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	int newly_set = bm_change_bits_to(mdev, s, e, 1);

	return newly_set;
}
1080 | |||
/* Clear bits s..e (inclusive).
 * bm_change_bits_to() reports cleared bits as a negative count;
 * flip the sign so that we return the number of bits changed 1 -> 0. */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	int delta = bm_change_bits_to(mdev, s, e, 0);

	return -delta;
}
1086 | |||
1087 | /* sets all bits in full words, | ||
1088 | * from first_word up to, but not including, last_word */ | ||
1089 | static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, | ||
1090 | int page_nr, int first_word, int last_word) | ||
1091 | { | ||
1092 | int i; | ||
1093 | int bits; | ||
1094 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); | ||
1095 | for (i = first_word; i < last_word; i++) { | ||
1096 | bits = hweight_long(paddr[i]); | ||
1097 | paddr[i] = ~0UL; | ||
1098 | b->bm_set += BITS_PER_LONG - bits; | ||
1099 | } | ||
1100 | kunmap_atomic(paddr, KM_USER0); | ||
1101 | } | ||
1102 | |||
1103 | /* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. | ||
1104 | * You must first drbd_bm_lock(). | ||
1105 | * Can be called to set the whole bitmap in one go. | ||
1106 | * Sets bits from s to e _inclusive_. */ | ||
1107 | void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1108 | { | ||
1109 | /* First set_bit from the first bit (s) | ||
1110 | * up to the next long boundary (sl), | ||
1111 | * then assign full words up to the last long boundary (el), | ||
1112 | * then set_bit up to and including the last bit (e). | ||
1113 | * | ||
1114 | * Do not use memset, because we must account for changes, | ||
1115 | * so we need to loop over the words with hweight() anyways. | ||
1116 | */ | ||
1117 | unsigned long sl = ALIGN(s,BITS_PER_LONG); | ||
1118 | unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); | ||
1119 | int first_page; | ||
1120 | int last_page; | ||
1121 | int page_nr; | ||
1122 | int first_word; | ||
1123 | int last_word; | ||
1124 | |||
1125 | if (e - s <= 3*BITS_PER_LONG) { | ||
1126 | /* don't bother; el and sl may even be wrong. */ | ||
1127 | __bm_change_bits_to(mdev, s, e, 1, KM_USER0); | ||
1128 | return; | ||
1129 | } | ||
1130 | |||
1131 | /* difference is large enough that we can trust sl and el */ | ||
1132 | |||
1133 | /* bits filling the current long */ | ||
1134 | if (sl) | ||
1135 | __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); | ||
1136 | |||
1137 | first_page = sl >> (3 + PAGE_SHIFT); | ||
1138 | last_page = el >> (3 + PAGE_SHIFT); | ||
1139 | |||
1140 | /* MLPP: modulo longs per page */ | ||
1141 | /* LWPP: long words per page */ | ||
1142 | first_word = MLPP(sl >> LN2_BPL); | ||
1143 | last_word = LWPP; | ||
1144 | |||
1145 | /* first and full pages, unless first page == last page */ | ||
1146 | for (page_nr = first_page; page_nr < last_page; page_nr++) { | ||
1147 | bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); | ||
1148 | cond_resched(); | ||
1149 | first_word = 0; | ||
1150 | } | ||
1151 | |||
1152 | /* last page (respectively only page, for first page == last page) */ | ||
1153 | last_word = MLPP(el >> LN2_BPL); | ||
1154 | bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); | ||
1155 | |||
1156 | /* possibly trailing bits. | ||
1157 | * example: (e & 63) == 63, el will be e+1. | ||
1158 | * if that even was the very last bit, | ||
1159 | * it would trigger an assert in __bm_change_bits_to() | ||
1160 | */ | ||
1161 | if (el <= e) | ||
1162 | __bm_change_bits_to(mdev, el, e, 1, KM_USER0); | ||
1163 | } | ||
1164 | |||
1165 | /* returns bit state | ||
1166 | * wants bitnr, NOT sector. | ||
1167 | * inherently racy... area needs to be locked by means of {al,rs}_lru | ||
1168 | * 1 ... bit set | ||
1169 | * 0 ... bit not set | ||
1170 | * -1 ... first out of bounds access, stop testing for bits! | ||
1171 | */ | ||
1172 | int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) | ||
1173 | { | ||
1174 | unsigned long flags; | ||
1175 | struct drbd_bitmap *b = mdev->bitmap; | ||
1176 | unsigned long *p_addr; | ||
1177 | int i; | ||
1178 | |||
1179 | ERR_IF(!b) return 0; | ||
1180 | ERR_IF(!b->bm_pages) return 0; | ||
1181 | |||
1182 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1183 | if (bm_is_locked(b)) | ||
1184 | bm_print_lock_info(mdev); | ||
1185 | if (bitnr < b->bm_bits) { | ||
1186 | unsigned long offset = bitnr>>LN2_BPL; | ||
1187 | p_addr = bm_map_paddr(b, offset); | ||
1188 | i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0; | ||
1189 | bm_unmap(p_addr); | ||
1190 | } else if (bitnr == b->bm_bits) { | ||
1191 | i = -1; | ||
1192 | } else { /* (bitnr > b->bm_bits) */ | ||
1193 | dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1194 | i = 0; | ||
1195 | } | ||
1196 | |||
1197 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1198 | return i; | ||
1199 | } | ||
1200 | |||
1201 | /* returns number of bits set in the range [s, e] */ | ||
1202 | int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1203 | { | ||
1204 | unsigned long flags; | ||
1205 | struct drbd_bitmap *b = mdev->bitmap; | ||
1206 | unsigned long *p_addr = NULL, page_nr = -1; | ||
1207 | unsigned long bitnr; | ||
1208 | int c = 0; | ||
1209 | size_t w; | ||
1210 | |||
1211 | /* If this is called without a bitmap, that is a bug. But just to be | ||
1212 | * robust in case we screwed up elsewhere, in that case pretend there | ||
1213 | * was one dirty bit in the requested area, so we won't try to do a | ||
1214 | * local read there (no bitmap probably implies no disk) */ | ||
1215 | ERR_IF(!b) return 1; | ||
1216 | ERR_IF(!b->bm_pages) return 1; | ||
1217 | |||
1218 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1219 | if (bm_is_locked(b)) | ||
1220 | bm_print_lock_info(mdev); | ||
1221 | for (bitnr = s; bitnr <= e; bitnr++) { | ||
1222 | w = bitnr >> LN2_BPL; | ||
1223 | if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { | ||
1224 | page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); | ||
1225 | if (p_addr) | ||
1226 | bm_unmap(p_addr); | ||
1227 | p_addr = bm_map_paddr(b, w); | ||
1228 | } | ||
1229 | ERR_IF (bitnr >= b->bm_bits) { | ||
1230 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1231 | } else { | ||
1232 | c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); | ||
1233 | } | ||
1234 | } | ||
1235 | if (p_addr) | ||
1236 | bm_unmap(p_addr); | ||
1237 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1238 | return c; | ||
1239 | } | ||
1240 | |||
1241 | |||
1242 | /* inherently racy... | ||
1243 | * return value may be already out-of-date when this function returns. | ||
1244 | * but the general usage is that this is only use during a cstate when bits are | ||
1245 | * only cleared, not set, and typically only care for the case when the return | ||
1246 | * value is zero, or we already "locked" this "bitmap extent" by other means. | ||
1247 | * | ||
1248 | * enr is bm-extent number, since we chose to name one sector (512 bytes) | ||
1249 | * worth of the bitmap a "bitmap extent". | ||
1250 | * | ||
1251 | * TODO | ||
1252 | * I think since we use it like a reference count, we should use the real | ||
1253 | * reference count of some bitmap extent element from some lru instead... | ||
1254 | * | ||
1255 | */ | ||
1256 | int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | ||
1257 | { | ||
1258 | struct drbd_bitmap *b = mdev->bitmap; | ||
1259 | int count, s, e; | ||
1260 | unsigned long flags; | ||
1261 | unsigned long *p_addr, *bm; | ||
1262 | |||
1263 | ERR_IF(!b) return 0; | ||
1264 | ERR_IF(!b->bm_pages) return 0; | ||
1265 | |||
1266 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1267 | if (bm_is_locked(b)) | ||
1268 | bm_print_lock_info(mdev); | ||
1269 | |||
1270 | s = S2W(enr); | ||
1271 | e = min((size_t)S2W(enr+1), b->bm_words); | ||
1272 | count = 0; | ||
1273 | if (s < b->bm_words) { | ||
1274 | int n = e-s; | ||
1275 | p_addr = bm_map_paddr(b, s); | ||
1276 | bm = p_addr + MLPP(s); | ||
1277 | while (n--) | ||
1278 | count += hweight_long(*bm++); | ||
1279 | bm_unmap(p_addr); | ||
1280 | } else { | ||
1281 | dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); | ||
1282 | } | ||
1283 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1284 | return count; | ||
1285 | } | ||
1286 | |||
1287 | /* set all bits covered by the AL-extent al_enr */ | ||
1288 | unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) | ||
1289 | { | ||
1290 | struct drbd_bitmap *b = mdev->bitmap; | ||
1291 | unsigned long *p_addr, *bm; | ||
1292 | unsigned long weight; | ||
1293 | int count, s, e, i, do_now; | ||
1294 | ERR_IF(!b) return 0; | ||
1295 | ERR_IF(!b->bm_pages) return 0; | ||
1296 | |||
1297 | spin_lock_irq(&b->bm_lock); | ||
1298 | if (bm_is_locked(b)) | ||
1299 | bm_print_lock_info(mdev); | ||
1300 | weight = b->bm_set; | ||
1301 | |||
1302 | s = al_enr * BM_WORDS_PER_AL_EXT; | ||
1303 | e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); | ||
1304 | /* assert that s and e are on the same page */ | ||
1305 | D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) | ||
1306 | == s >> (PAGE_SHIFT - LN2_BPL + 3)); | ||
1307 | count = 0; | ||
1308 | if (s < b->bm_words) { | ||
1309 | i = do_now = e-s; | ||
1310 | p_addr = bm_map_paddr(b, s); | ||
1311 | bm = p_addr + MLPP(s); | ||
1312 | while (i--) { | ||
1313 | count += hweight_long(*bm); | ||
1314 | *bm = -1UL; | ||
1315 | bm++; | ||
1316 | } | ||
1317 | bm_unmap(p_addr); | ||
1318 | b->bm_set += do_now*BITS_PER_LONG - count; | ||
1319 | if (e == b->bm_words) | ||
1320 | b->bm_set -= bm_clear_surplus(b); | ||
1321 | } else { | ||
1322 | dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); | ||
1323 | } | ||
1324 | weight = b->bm_set - weight; | ||
1325 | spin_unlock_irq(&b->bm_lock); | ||
1326 | return weight; | ||
1327 | } | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 index 000000000000..2312d782fe99 --- /dev/null +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -0,0 +1,2252 @@ | |||
1 | /* | ||
2 | drbd_int.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #ifndef _DRBD_INT_H | ||
27 | #define _DRBD_INT_H | ||
28 | |||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/version.h> | ||
32 | #include <linux/list.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <linux/bitops.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/crypto.h> | ||
37 | #include <linux/ratelimit.h> | ||
38 | #include <linux/tcp.h> | ||
39 | #include <linux/mutex.h> | ||
40 | #include <linux/major.h> | ||
41 | #include <linux/blkdev.h> | ||
42 | #include <linux/genhd.h> | ||
43 | #include <net/tcp.h> | ||
44 | #include <linux/lru_cache.h> | ||
45 | |||
46 | #ifdef __CHECKER__ | ||
47 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | ||
48 | # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) | ||
49 | # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) | ||
50 | # define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) | ||
51 | #else | ||
52 | # define __protected_by(x) | ||
53 | # define __protected_read_by(x) | ||
54 | # define __protected_write_by(x) | ||
55 | # define __must_hold(x) | ||
56 | #endif | ||
57 | |||
58 | #define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) | ||
59 | |||
60 | /* module parameter, defined in drbd_main.c */ | ||
61 | extern unsigned int minor_count; | ||
62 | extern int disable_sendpage; | ||
63 | extern int allow_oos; | ||
64 | extern unsigned int cn_idx; | ||
65 | |||
66 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
67 | extern int enable_faults; | ||
68 | extern int fault_rate; | ||
69 | extern int fault_devs; | ||
70 | #endif | ||
71 | |||
72 | extern char usermode_helper[]; | ||
73 | |||
74 | |||
75 | #ifndef TRUE | ||
76 | #define TRUE 1 | ||
77 | #endif | ||
78 | #ifndef FALSE | ||
79 | #define FALSE 0 | ||
80 | #endif | ||
81 | |||
82 | /* I don't remember why XCPU ... | ||
83 | * This is used to wake the asender, | ||
84 | * and to interrupt the sending task | ||
85 | * on disconnect. | ||
86 | */ | ||
87 | #define DRBD_SIG SIGXCPU | ||
88 | |||
89 | /* This is used to stop/restart our threads. | ||
90 | * Cannot use SIGTERM nor SIGKILL, since these | ||
91 | * are sent out by init on runlevel changes | ||
92 | * I choose SIGHUP for now. | ||
93 | */ | ||
94 | #define DRBD_SIGKILL SIGHUP | ||
95 | |||
96 | /* All EEs on the free list should have ID_VACANT (== 0) | ||
97 | * freshly allocated EEs get !ID_VACANT (== 1) | ||
98 | * so if it says "cannot dereference null pointer at address 0x00000001", | ||
99 | * it is most likely one of these :( */ | ||
100 | |||
101 | #define ID_IN_SYNC (4711ULL) | ||
102 | #define ID_OUT_OF_SYNC (4712ULL) | ||
103 | |||
104 | #define ID_SYNCER (-1ULL) | ||
105 | #define ID_VACANT 0 | ||
106 | #define is_syncer_block_id(id) ((id) == ID_SYNCER) | ||
107 | |||
108 | struct drbd_conf; | ||
109 | |||
110 | |||
111 | /* to shorten dev_warn(DEV, "msg"); and related statements */ | ||
112 | #define DEV (disk_to_dev(mdev->vdisk)) | ||
113 | |||
/* Non-fatal assertion: if exp is false, log the expression text together
 * with file and line through dev_err(); execution continues either way.
 * Relies on DEV, i.e. a variable named mdev must be in scope at the call
 * site.
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement: a bare "if" here would capture the "else" of an enclosing
 * if/else written around D_ASSERT(). */
#define D_ASSERT(exp)	do {						\
	if (!(exp))							\
		dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n",		\
			__FILE__, __LINE__);				\
} while (0)
116 | |||
117 | #define ERR_IF(exp) if (({ \ | ||
118 | int _b = (exp) != 0; \ | ||
119 | if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ | ||
120 | __func__, #exp, __FILE__, __LINE__); \ | ||
121 | _b; \ | ||
122 | })) | ||
123 | |||
124 | /* Defines to control fault insertion */ | ||
125 | enum { | ||
126 | DRBD_FAULT_MD_WR = 0, /* meta data write */ | ||
127 | DRBD_FAULT_MD_RD = 1, /* read */ | ||
128 | DRBD_FAULT_RS_WR = 2, /* resync */ | ||
129 | DRBD_FAULT_RS_RD = 3, | ||
130 | DRBD_FAULT_DT_WR = 4, /* data */ | ||
131 | DRBD_FAULT_DT_RD = 5, | ||
132 | DRBD_FAULT_DT_RA = 6, /* data read ahead */ | ||
133 | DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ | ||
134 | DRBD_FAULT_AL_EE = 8, /* alloc ee */ | ||
135 | |||
136 | DRBD_FAULT_MAX, | ||
137 | }; | ||
138 | |||
139 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
140 | extern unsigned int | ||
141 | _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); | ||
142 | static inline int | ||
143 | drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { | ||
144 | return fault_rate && | ||
145 | (enable_faults & (1<<type)) && | ||
146 | _drbd_insert_fault(mdev, type); | ||
147 | } | ||
148 | #define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t))) | ||
149 | |||
150 | #else | ||
151 | #define FAULT_ACTIVE(_m, _t) (0) | ||
152 | #endif | ||
153 | |||
154 | /* integer division, round _UP_ to the next integer */ | ||
155 | #define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) | ||
156 | /* usual integer division */ | ||
157 | #define div_floor(A, B) ((A)/(B)) | ||
158 | |||
159 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
160 | /* 4th incarnation of the disk layout. */ | ||
161 | #define DRBD_MD_MAGIC (DRBD_MAGIC+4) | ||
162 | |||
163 | extern struct drbd_conf **minor_table; | ||
164 | extern struct ratelimit_state drbd_ratelimit_state; | ||
165 | |||
/* on the wire */
enum drbd_packets {
	/* receiver (data socket) */
	P_DATA		      = 0x00,
	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
	P_BARRIER	      = 0x03,
	P_BITMAP	      = 0x04,
	P_BECOME_SYNC_TARGET  = 0x05,
	P_BECOME_SYNC_SOURCE  = 0x06,
	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
	P_SYNC_PARAM	      = 0x0a,
	P_PROTOCOL	      = 0x0b,
	P_UUIDS		      = 0x0c,
	P_SIZES		      = 0x0d,
	P_STATE		      = 0x0e,
	P_SYNC_UUID	      = 0x0f,
	P_AUTH_CHALLENGE      = 0x10,
	P_AUTH_RESPONSE	      = 0x11,
	P_STATE_CHG_REQ	      = 0x12,

	/* asender (meta socket) */
	P_PING		      = 0x13,
	P_PING_ACK	      = 0x14,
	P_RECV_ACK	      = 0x15, /* Used in protocol B */
	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
	P_BARRIER_ACK	      = 0x1c,
	P_STATE_CHG_REPLY     = 0x1d,

	/* "new" commands, no longer fitting into the ordering scheme above */

	P_OV_REQUEST	      = 0x1e, /* data socket */
	P_OV_REPLY	      = 0x1f,
	P_OV_RESULT	      = 0x20, /* meta socket */
	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */

	P_MAX_CMD	      = 0x25,
	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
	P_MAX_OPT_CMD	      = 0x101,

	/* special command ids for handshake */

	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */

	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
};

/* Map a packet command to a human readable name, for log messages.
 *
 * The three handshake ids are handled explicitly.  Anything else at or
 * above P_MAX_CMD yields "Unknown".  Every command below P_MAX_CMD has a
 * table entry; should one ever be missing, "Unknown" is returned rather
 * than NULL, so the result can always be handed to a "%s" format.
 */
static inline const char *cmdname(enum drbd_packets cmd)
{
	/* THINK may need to become several global tables
	 * when we want to support more than
	 * one PRO_VERSION */
	static const char * const cmdnames[] = {
		[P_DATA]	        = "Data",
		[P_DATA_REPLY]	        = "DataReply",
		[P_RS_DATA_REPLY]	= "RSDataReply",
		[P_BARRIER]	        = "Barrier",
		[P_BITMAP]	        = "ReportBitMap",
		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
		[P_UNPLUG_REMOTE]	= "UnplugRemote",
		[P_DATA_REQUEST]	= "DataRequest",
		[P_RS_DATA_REQUEST]     = "RSDataRequest",
		[P_SYNC_PARAM]	        = "SyncParam",
		[P_SYNC_PARAM89]	= "SyncParam89",
		[P_PROTOCOL]            = "ReportProtocol",
		[P_UUIDS]	        = "ReportUUIDs",
		[P_SIZES]	        = "ReportSizes",
		[P_STATE]	        = "ReportState",
		[P_SYNC_UUID]           = "ReportSyncUUID",
		[P_AUTH_CHALLENGE]      = "AuthChallenge",
		[P_AUTH_RESPONSE]	= "AuthResponse",
		[P_PING]		= "Ping",
		[P_PING_ACK]	        = "PingAck",
		[P_RECV_ACK]	        = "RecvAck",
		[P_WRITE_ACK]	        = "WriteAck",
		[P_RS_WRITE_ACK]	= "RSWriteAck",
		[P_DISCARD_ACK]	        = "DiscardAck",
		[P_NEG_ACK]	        = "NegAck",
		[P_NEG_DREPLY]	        = "NegDReply",
		[P_NEG_RS_DREPLY]	= "NegRSDReply",
		[P_BARRIER_ACK]	        = "BarrierAck",
		[P_STATE_CHG_REQ]       = "StateChgRequest",
		[P_STATE_CHG_REPLY]     = "StateChgReply",
		[P_OV_REQUEST]          = "OVRequest",
		[P_OV_REPLY]            = "OVReply",
		[P_OV_RESULT]           = "OVResult",
		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
		[P_COMPRESSED_BITMAP]   = "CBitmap",
		[P_MAX_CMD]	        = NULL,
	};

	if (cmd == P_HAND_SHAKE_M)
		return "HandShakeM";
	if (cmd == P_HAND_SHAKE_S)
		return "HandShakeS";
	if (cmd == P_HAND_SHAKE)
		return "HandShake";
	if (cmd >= P_MAX_CMD)
		return "Unknown";
	/* never hand out NULL, even if the table has a hole */
	return cmdnames[cmd] ? cmdnames[cmd] : "Unknown";
}
277 | |||
278 | /* for sending/receiving the bitmap, | ||
279 | * possibly in some encoding scheme */ | ||
280 | struct bm_xfer_ctx { | ||
281 | /* "const" | ||
282 | * stores total bits and long words | ||
283 | * of the bitmap, so we don't need to | ||
284 | * call the accessor functions over and again. */ | ||
285 | unsigned long bm_bits; | ||
286 | unsigned long bm_words; | ||
287 | /* during xfer, current position within the bitmap */ | ||
288 | unsigned long bit_offset; | ||
289 | unsigned long word_offset; | ||
290 | |||
291 | /* statistics; index: (h->command == P_BITMAP) */ | ||
292 | unsigned packets[2]; | ||
293 | unsigned bytes[2]; | ||
294 | }; | ||
295 | |||
296 | extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, | ||
297 | const char *direction, struct bm_xfer_ctx *c); | ||
298 | |||
299 | static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) | ||
300 | { | ||
301 | /* word_offset counts "native long words" (32 or 64 bit), | ||
302 | * aligned at 64 bit. | ||
303 | * Encoded packet may end at an unaligned bit offset. | ||
304 | * In case a fallback clear text packet is transmitted in | ||
305 | * between, we adjust this offset back to the last 64bit | ||
306 | * aligned "native long word", which makes coding and decoding | ||
307 | * the plain text bitmap much more convenient. */ | ||
308 | #if BITS_PER_LONG == 64 | ||
309 | c->word_offset = c->bit_offset >> 6; | ||
310 | #elif BITS_PER_LONG == 32 | ||
311 | c->word_offset = c->bit_offset >> 5; | ||
312 | c->word_offset &= ~(1UL); | ||
313 | #else | ||
314 | # error "unsupported BITS_PER_LONG" | ||
315 | #endif | ||
316 | } | ||
317 | |||
318 | #ifndef __packed | ||
319 | #define __packed __attribute__((packed)) | ||
320 | #endif | ||
321 | |||
322 | /* This is the layout for a packet on the wire. | ||
323 | * The byteorder is the network byte order. | ||
324 | * (except block_id and barrier fields. | ||
325 | * these are pointers to local structs | ||
326 | * and have no relevance for the partner, | ||
327 | * which just echoes them as received.) | ||
328 | * | ||
329 | * NOTE that the payload starts at a long aligned offset, | ||
330 | * regardless of 32 or 64 bit arch! | ||
331 | */ | ||
332 | struct p_header { | ||
333 | u32 magic; | ||
334 | u16 command; | ||
335 | u16 length; /* bytes of data after this header */ | ||
336 | u8 payload[0]; | ||
337 | } __packed; | ||
338 | /* 8 bytes. packet FIXED for the next century! */ | ||
339 | |||
340 | /* | ||
341 | * short commands, packets without payload, plain p_header: | ||
342 | * P_PING | ||
343 | * P_PING_ACK | ||
344 | * P_BECOME_SYNC_TARGET | ||
345 | * P_BECOME_SYNC_SOURCE | ||
346 | * P_UNPLUG_REMOTE | ||
347 | */ | ||
348 | |||
349 | /* | ||
350 | * commands with out-of-struct payload: | ||
351 | * P_BITMAP (no additional fields) | ||
352 | * P_DATA, P_DATA_REPLY (see p_data) | ||
353 | * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) | ||
354 | */ | ||
355 | |||
356 | /* these defines must not be changed without changing the protocol version */ | ||
357 | #define DP_HARDBARRIER 1 | ||
358 | #define DP_RW_SYNC 2 | ||
359 | #define DP_MAY_SET_IN_SYNC 4 | ||
360 | |||
361 | struct p_data { | ||
362 | struct p_header head; | ||
363 | u64 sector; /* 64 bits sector number */ | ||
364 | u64 block_id; /* to identify the request in protocol B&C */ | ||
365 | u32 seq_num; | ||
366 | u32 dp_flags; | ||
367 | } __packed; | ||
368 | |||
369 | /* | ||
370 | * commands which share a struct: | ||
371 | * p_block_ack: | ||
372 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), | ||
373 | * P_DISCARD_ACK (proto C, two-primaries conflict detection) | ||
374 | * p_block_req: | ||
375 | * P_DATA_REQUEST, P_RS_DATA_REQUEST | ||
376 | */ | ||
377 | struct p_block_ack { | ||
378 | struct p_header head; | ||
379 | u64 sector; | ||
380 | u64 block_id; | ||
381 | u32 blksize; | ||
382 | u32 seq_num; | ||
383 | } __packed; | ||
384 | |||
385 | |||
386 | struct p_block_req { /* shared by P_DATA_REQUEST and P_RS_DATA_REQUEST */ | ||
387 | struct p_header head; | ||
388 | u64 sector; | ||
389 | u64 block_id; | ||
390 | u32 blksize; | ||
391 | u32 pad; /* to multiple of 8 Byte */ | ||
392 | } __packed; /* 8 (head) + 8 + 8 + 4 + 4 = 32 bytes */ | ||
393 | |||
394 | /* | ||
395 | * commands with their own struct for additional fields: | ||
396 | * P_HAND_SHAKE | ||
397 | * P_BARRIER | ||
398 | * P_BARRIER_ACK | ||
399 | * P_SYNC_PARAM | ||
400 | * ReportParams | ||
401 | */ | ||
402 | |||
403 | struct p_handshake { /* fixed-size packet; presumably the body of P_HAND_SHAKE -- confirm against the sender */ | ||
404 | struct p_header head; /* 8 bytes */ | ||
405 | u32 protocol_min; | ||
406 | u32 feature_flags; | ||
407 | u32 protocol_max; | ||
408 | |||
409 | /* should be more than enough for future enhancements | ||
410 | * for now, feature_flags and the reserverd array shall be zero. | ||
411 | */ | ||
412 | |||
413 | u32 _pad; | ||
414 | u64 reserverd[7]; /* [sic] the misspelling is the actual field name */ | ||
415 | } __packed; /* 8 (head) + 3 * 4 + 4 (_pad) + 7 * 8 = 80 bytes */ | ||
416 | /* 80 bytes, FIXED for the next century */ | ||
417 | |||
418 | struct p_barrier { /* presumably the body of P_BARRIER -- confirm against the sender */ | ||
419 | struct p_header head; | ||
420 | u32 barrier; /* barrier number _handle_ only */ | ||
421 | u32 pad; /* to multiple of 8 Byte */ | ||
422 | } __packed; /* 8 (head) + 4 + 4 = 16 bytes */ | ||
423 | |||
424 | struct p_barrier_ack { /* presumably the body of P_BARRIER_ACK -- confirm against the sender */ | ||
425 | struct p_header head; | ||
426 | u32 barrier; | ||
427 | u32 set_size; | ||
428 | } __packed; /* 8 (head) + 4 + 4 = 16 bytes */ | ||
429 | |||
430 | struct p_rs_param { /* variable-length variant; see p_rs_param_89 for the fixed-size protocol 89 layout */ | ||
431 | struct p_header head; | ||
432 | u32 rate; | ||
433 | |||
434 | /* Since protocol version 88 and higher. */ | ||
435 | char verify_alg[0]; /* zero-length array: contributes no size, marks where the algorithm name starts */ | ||
436 | } __packed; /* fixed part: 8 (head) + 4 = 12 bytes */ | ||
437 | |||
438 | struct p_rs_param_89 { /* same leading fields as p_rs_param, but with two fixed-size name buffers */ | ||
439 | struct p_header head; | ||
440 | u32 rate; | ||
441 | /* protocol version 89: */ | ||
442 | char verify_alg[SHARED_SECRET_MAX]; | ||
443 | char csums_alg[SHARED_SECRET_MAX]; | ||
444 | } __packed; /* 8 (head) + 4 + 2 * SHARED_SECRET_MAX bytes */ | ||
445 | |||
446 | struct p_protocol { /* six u32 settings after the header, optionally followed by an algorithm name */ | ||
447 | struct p_header head; | ||
448 | u32 protocol; | ||
449 | u32 after_sb_0p; | ||
450 | u32 after_sb_1p; | ||
451 | u32 after_sb_2p; | ||
452 | u32 want_lose; | ||
453 | u32 two_primaries; | ||
454 | |||
455 | /* Since protocol version 87 and higher. */ | ||
456 | char integrity_alg[0]; /* zero-length array: contributes no size, marks where the name starts */ | ||
457 | |||
458 | } __packed; /* fixed part: 8 (head) + 6 * 4 = 32 bytes */ | ||
459 | |||
460 | struct p_uuids { | ||
461 | struct p_header head; | ||
462 | u64 uuid[UI_EXTENDED_SIZE]; | ||
463 | } __packed; | ||
464 | |||
465 | struct p_rs_uuid { | ||
466 | struct p_header head; | ||
467 | u64 uuid; | ||
468 | } __packed; | ||
469 | |||
470 | struct p_sizes { /* three 64-bit sizes plus two 32-bit queue parameters after the header */ | ||
471 | struct p_header head; | ||
472 | u64 d_size; /* size of disk */ | ||
473 | u64 u_size; /* user requested size */ | ||
474 | u64 c_size; /* current exported size */ | ||
475 | u32 max_segment_size; /* Maximal size of a BIO */ | ||
476 | u32 queue_order_type; | ||
477 | } __packed; /* 8 (head) + 3 * 8 + 2 * 4 = 40 bytes */ | ||
478 | |||
479 | struct p_state { | ||
480 | struct p_header head; | ||
481 | u32 state; | ||
482 | } __packed; | ||
483 | |||
484 | struct p_req_state { | ||
485 | struct p_header head; | ||
486 | u32 mask; | ||
487 | u32 val; | ||
488 | } __packed; | ||
489 | |||
490 | struct p_req_state_reply { | ||
491 | struct p_header head; | ||
492 | u32 retcode; | ||
493 | } __packed; | ||
494 | |||
495 | struct p_drbd06_param { | ||
496 | u64 size; | ||
497 | u32 state; | ||
498 | u32 blksize; | ||
499 | u32 protocol; | ||
500 | u32 version; | ||
501 | u32 gen_cnt[5]; | ||
502 | u32 bit_map_gen[5]; | ||
503 | } __packed; | ||
504 | |||
505 | struct p_discard { | ||
506 | struct p_header head; | ||
507 | u64 block_id; | ||
508 | u32 seq_num; | ||
509 | u32 pad; | ||
510 | } __packed; | ||
511 | |||
512 | /* Valid values for the encoding field. | ||
513 | * Bump proto version when changing this. */ | ||
514 | enum drbd_bitmap_code { /* value kept in the low nibble of p_compressed_bm.encoding, see DCBP_get_code() */ | ||
515 | /* RLE_VLI_Bytes = 0, | ||
516 | * and other bit variants had been defined during | ||
517 | * algorithm evaluation. */ | ||
518 | RLE_VLI_Bits = 2, /* the only encoding defined in this enum */ | ||
519 | }; | ||
520 | |||
521 | struct p_compressed_bm { /* header + one descriptor byte; the encoded data follows out-of-struct */ | ||
522 | struct p_header head; | ||
523 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code | ||
524 | * (encoding & 0x80): polarity (set/unset) of first runlength | ||
525 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits | ||
526 | * used to pad up to head.length bytes | ||
527 | */ | ||
528 | u8 encoding; /* bit 7: start polarity, bits 6..4: pad_bits, bits 3..0: code; use the DCBP_* accessors */ | ||
529 | |||
530 | u8 code[0]; /* zero-length array: contributes no size, names the first encoded byte */ | ||
531 | } __packed; /* 8 (head) + 1 = 9 bytes */ | ||
532 | |||
533 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | ||
534 | static inline enum drbd_bitmap_code DCBP_get_code(struct p_compressed_bm *p) | ||
535 | { | ||
536 | const unsigned int scheme = p->encoding & 0x0f; /* low nibble holds the encoding scheme */ | ||
537 | return (enum drbd_bitmap_code)scheme; | ||
538 | } | ||
539 | |||
540 | static inline void DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) | ||
541 | { | ||
542 | BUG_ON(code & ~0xf); /* the scheme has to fit into the low nibble */ | ||
543 | p->encoding &= ~0xf; /* drop the previous scheme, keep the start and pad bits */ | ||
544 | p->encoding |= code; | ||
545 | } | ||
546 | |||
547 | static inline int DCBP_get_start(struct p_compressed_bm *p) | ||
548 | { | ||
549 | /* Bit 7 of the encoding byte is the polarity (set/unset) of the first runlength. */ | ||
550 | return !!(p->encoding & 0x80); | ||
551 | } | ||
552 | |||
553 | static inline void DCBP_set_start(struct p_compressed_bm *p, int set) | ||
554 | { | ||
555 | const int start_bit = set ? 0x80 : 0; /* any non-zero `set` turns bit 7 on */ | ||
556 | p->encoding = (p->encoding & ~0x80) | start_bit; | ||
557 | } | ||
558 | |||
559 | static inline int DCBP_get_pad_bits(struct p_compressed_bm *p) | ||
560 | { | ||
561 | /* pad_bits occupy bits 6..4 of the encoding byte: isolate them, then shift down. */ | ||
562 | return (p->encoding & 0x70) >> 4; | ||
563 | } | ||
564 | |||
565 | static inline void /* store n (0..7) in bits 6..4 of p->encoding; bit 7 (start) and bits 3..0 (code) are preserved */ | ||
566 | DCBP_set_pad_bits(struct p_compressed_bm *p, int n) | ||
567 | { | ||
568 | BUG_ON(n & ~0x7); /* pad_bits is a 3-bit field */ | ||
569 | p->encoding = (p->encoding & ~(0x7 << 4)) | (n << 4); /* mask must be ~0x70; (~0x7 << 4) would be ~0x7f and also clear the code nibble */ | ||
570 | } | ||
571 | |||
572 | /* One bitmap packet, including the p_header, | ||
573 | * should fit within one _architecture independent_ page, | ||
574 | * so we need to use the fixed 4KiB page size | ||
575 | * most architectures have used for a long time. | ||
576 | */ | ||
577 | #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) /* 4096 minus the 8-byte header */ | ||
578 | #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) /* payload counted in longs, so the value depends on sizeof(long) */ | ||
579 | #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) /* 4096 minus header and encoding byte */ | ||
580 | #if (PAGE_SIZE < 4096) /* build-time guard: the 4096-byte packet assumption above needs at least a 4 KiB page */ | ||
581 | /* drbd_send_bitmap / receive_bitmap would break horribly */ | ||
582 | #error "PAGE_SIZE too small" | ||
583 | #endif | ||
584 | |||
585 | union p_polymorph { /* overlay of the packet structs listed here; its size is that of the largest member */ | ||
586 | struct p_header header; /* every other member starts with a p_header, so this view is always valid */ | ||
587 | struct p_handshake handshake; | ||
588 | struct p_data data; | ||
589 | struct p_block_ack block_ack; | ||
590 | struct p_barrier barrier; | ||
591 | struct p_barrier_ack barrier_ack; | ||
592 | struct p_rs_param_89 rs_param_89; | ||
593 | struct p_protocol protocol; | ||
594 | struct p_sizes sizes; | ||
595 | struct p_uuids uuids; | ||
596 | struct p_state state; | ||
597 | struct p_req_state req_state; | ||
598 | struct p_req_state_reply req_state_reply; | ||
599 | struct p_block_req block_req; | ||
600 | } __packed; /* used as the sbuf/rbuf send and receive buffers in struct drbd_socket */ | ||
601 | |||
602 | /**********************************************************************/ | ||
603 | enum drbd_thread_state { | ||
604 | None, | ||
605 | Running, | ||
606 | Exiting, | ||
607 | Restarting | ||
608 | }; | ||
609 | |||
610 | struct drbd_thread { | ||
611 | spinlock_t t_lock; | ||
612 | struct task_struct *task; | ||
613 | struct completion stop; | ||
614 | enum drbd_thread_state t_state; | ||
615 | int (*function) (struct drbd_thread *); | ||
616 | struct drbd_conf *mdev; | ||
617 | int reset_cpu_mask; | ||
618 | }; | ||
619 | |||
620 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | ||
621 | { | ||
622 | enum drbd_thread_state snapshot; | ||
623 | /* Lockless read: t_lock is intentionally not taken. The original author (--lge) | ||
624 | * judged testing t_state uncritical everywhere except thread_{start,stop}. */ | ||
625 | smp_rmb(); | ||
626 | snapshot = thi->t_state; | ||
627 | return snapshot; | ||
628 | } | ||
629 | |||
630 | |||
631 | /* | ||
632 | * Having this as the first member of a struct provides sort of "inheritance". | ||
633 | * "derived" structs can be "drbd_queue_work()"ed. | ||
634 | * The callback should know and cast back to the descendant struct. | ||
635 | * drbd_request and drbd_epoch_entry are descendants of drbd_work. | ||
636 | */ | ||
637 | struct drbd_work; | ||
638 | typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); | ||
639 | struct drbd_work { | ||
640 | struct list_head list; | ||
641 | drbd_work_cb cb; | ||
642 | }; | ||
643 | |||
644 | struct drbd_tl_epoch; | ||
645 | struct drbd_request { | ||
646 | struct drbd_work w; | ||
647 | struct drbd_conf *mdev; | ||
648 | |||
649 | /* if local IO is not allowed, will be NULL. | ||
650 | * if local IO _is_ allowed, holds the locally submitted bio clone, | ||
651 | * or, after local IO completion, the ERR_PTR(error). | ||
652 | * see drbd_endio_pri(). */ | ||
653 | struct bio *private_bio; | ||
654 | |||
655 | struct hlist_node colision; | ||
656 | sector_t sector; | ||
657 | unsigned int size; | ||
658 | unsigned int epoch; /* barrier_nr */ | ||
659 | |||
660 | /* barrier_nr: used to check on "completion" whether this req was in | ||
661 | * the current epoch, and we therefore have to close it, | ||
662 | * starting a new epoch... | ||
663 | */ | ||
664 | |||
665 | /* up to here, the struct layout is identical to drbd_epoch_entry; | ||
666 | * we might be able to use that to our advantage... */ | ||
667 | |||
668 | struct list_head tl_requests; /* ring list in the transfer log */ | ||
669 | struct bio *master_bio; /* master bio pointer */ | ||
670 | unsigned long rq_state; /* see comments above _req_mod() */ | ||
671 | int seq_num; | ||
672 | unsigned long start_time; | ||
673 | }; | ||
674 | |||
675 | struct drbd_tl_epoch { | ||
676 | struct drbd_work w; | ||
677 | struct list_head requests; /* requests before */ | ||
678 | struct drbd_tl_epoch *next; /* pointer to the next barrier */ | ||
679 | unsigned int br_number; /* the barriers identifier. */ | ||
680 | int n_req; /* number of requests attached before this barrier */ | ||
681 | }; | ||
682 | |||
683 | struct drbd_request; | ||
684 | |||
685 | /* These Tl_epoch_entries may be in one of these lists (see struct drbd_conf): | ||
686 | active_ee .. data packet being written | ||
687 | sync_ee .. syncer block being written | ||
688 | done_ee .. block written, need to send P_WRITE_ACK | ||
689 | read_ee .. [RS]P_DATA_REQUEST being read | ||
690 | net_ee .. zero-copy network send in progress */ | ||
691 | |||
692 | struct drbd_epoch { | ||
693 | struct list_head list; | ||
694 | unsigned int barrier_nr; | ||
695 | atomic_t epoch_size; /* increased on every request added. */ | ||
696 | atomic_t active; /* increased on every req. added, and dec on every finished. */ | ||
697 | unsigned long flags; | ||
698 | }; | ||
699 | |||
700 | /* drbd_epoch flag bits */ | ||
701 | enum { | ||
702 | DE_BARRIER_IN_NEXT_EPOCH_ISSUED, | ||
703 | DE_BARRIER_IN_NEXT_EPOCH_DONE, | ||
704 | DE_CONTAINS_A_BARRIER, | ||
705 | DE_HAVE_BARRIER_NUMBER, | ||
706 | DE_IS_FINISHING, | ||
707 | }; | ||
708 | |||
709 | enum epoch_event { | ||
710 | EV_PUT, | ||
711 | EV_GOT_BARRIER_NR, | ||
712 | EV_BARRIER_DONE, | ||
713 | EV_BECAME_LAST, | ||
714 | EV_CLEANUP = 32, /* used as flag */ | ||
715 | }; | ||
716 | |||
717 | struct drbd_epoch_entry { /* starts with a struct drbd_work, so it can be handed to the work queue like other drbd_work descendants */ | ||
718 | struct drbd_work w; | ||
719 | struct drbd_conf *mdev; | ||
720 | struct bio *private_bio; | ||
721 | struct hlist_node colision; | ||
722 | sector_t sector; | ||
723 | unsigned int size; | ||
724 | struct drbd_epoch *epoch; /* NOTE(review): a pointer here, but `unsigned int epoch` in struct drbd_request */ | ||
725 | |||
726 | /* up to here, the struct layout is identical to drbd_request; | ||
727 | * we might be able to use that to our advantage... | ||
728 | * NOTE(review): given the differing `epoch` types, only the members through `size` actually match -- confirm before relying on it. */ | ||
729 | unsigned int flags; | ||
730 | u64 block_id; | ||
731 | }; | ||
732 | |||
733 | struct drbd_wq_barrier { | ||
734 | struct drbd_work w; | ||
735 | struct completion done; | ||
736 | }; | ||
737 | |||
738 | struct digest_info { | ||
739 | int digest_size; | ||
740 | void *digest; | ||
741 | }; | ||
742 | |||
743 | /* ee flag bits */ | ||
744 | enum { | ||
745 | __EE_CALL_AL_COMPLETE_IO, | ||
746 | __EE_CONFLICT_PENDING, | ||
747 | __EE_MAY_SET_IN_SYNC, | ||
748 | __EE_IS_BARRIER, | ||
749 | }; | ||
750 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | ||
751 | #define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) | ||
752 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | ||
753 | #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) | ||
754 | |||
755 | /* global flag bits */ | ||
756 | enum { | ||
757 | CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ | ||
758 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
759 | SEND_PING, /* whether asender should send a ping asap */ | ||
760 | |||
761 | STOP_SYNC_TIMER, /* tell timer to cancel itself */ | ||
762 | UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ | ||
763 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ | ||
764 | MD_DIRTY, /* current uuids and flags not yet on disk */ | ||
765 | DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ | ||
766 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ | ||
767 | CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ | ||
768 | CL_ST_CHG_SUCCESS, | ||
769 | CL_ST_CHG_FAIL, | ||
770 | CRASHED_PRIMARY, /* This node was a crashed primary. | ||
771 | * Gets cleared when the state.conn | ||
772 | * goes into C_CONNECTED state. */ | ||
773 | WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ | ||
774 | NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ | ||
775 | CONSIDER_RESYNC, | ||
776 | |||
777 | MD_NO_BARRIER, /* meta data device does not support barriers, | ||
778 | so don't even try */ | ||
779 | SUSPEND_IO, /* suspend application io */ | ||
780 | BITMAP_IO, /* suspend application io; | ||
781 | once no more io in flight, start bitmap io */ | ||
782 | BITMAP_IO_QUEUED, /* Started bitmap IO */ | ||
783 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | ||
784 | NET_CONGESTED, /* The data socket is congested */ | ||
785 | |||
786 | CONFIG_PENDING, /* serialization of (re)configuration requests. | ||
787 | * if set, also prevents the device from dying */ | ||
788 | DEVICE_DYING, /* device became unconfigured, | ||
789 | * but worker thread is still handling the cleanup. | ||
790 | * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, | ||
791 | * while this is set. */ | ||
792 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from | ||
793 | * the peer, if it changed there as well. */ | ||
794 | }; | ||
795 | |||
796 | struct drbd_bitmap; /* opaque for drbd_conf */ | ||
797 | |||
798 | /* TODO sort members for performance | ||
799 | * MAYBE group them further */ | ||
800 | |||
801 | /* THINK maybe we actually want to use the default "event/%s" worker threads | ||
802 | * or similar in linux 2.6, which uses per cpu data and threads. | ||
803 | * | ||
804 | * To be general, this might need a spin_lock member. | ||
805 | * For now, please use the mdev->req_lock to protect list_head, | ||
806 | * see drbd_queue_work below. | ||
807 | * NOTE(review): q_lock below is documented as protecting the list, so the req_lock advice above looks stale -- confirm against drbd_queue_work. */ | ||
808 | struct drbd_work_queue { /* list of queued work items plus the semaphore and lock that go with it */ | ||
809 | struct list_head q; | ||
810 | struct semaphore s; /* producers up it, worker down()s it */ | ||
811 | spinlock_t q_lock; /* to protect the list. */ | ||
812 | }; | ||
813 | |||
814 | struct drbd_socket { | ||
815 | struct drbd_work_queue work; | ||
816 | struct mutex mutex; | ||
817 | struct socket *socket; | ||
818 | /* this way we get our | ||
819 | * send/receive buffers off the stack */ | ||
820 | union p_polymorph sbuf; | ||
821 | union p_polymorph rbuf; | ||
822 | }; | ||
823 | |||
824 | struct drbd_md { | ||
825 | u64 md_offset; /* sector offset to 'super' block */ | ||
826 | |||
827 | u64 la_size_sect; /* last agreed size, unit sectors */ | ||
828 | u64 uuid[UI_SIZE]; | ||
829 | u64 device_uuid; | ||
830 | u32 flags; | ||
831 | u32 md_size_sect; | ||
832 | |||
833 | s32 al_offset; /* signed relative sector offset to al area */ | ||
834 | s32 bm_offset; /* signed relative sector offset to bitmap */ | ||
835 | |||
836 | /* u32 al_nr_extents; important for restoring the AL | ||
837 | * is stored into sync_conf.al_extents, which in turn | ||
838 | * gets applied to act_log->nr_elements | ||
839 | */ | ||
840 | }; | ||
841 | |||
842 | /* for sync_conf and other types... */ | ||
843 | #define NL_PACKET(name, number, fields) struct name { fields }; | ||
844 | #define NL_INTEGER(pn,pr,member) int member; | ||
845 | #define NL_INT64(pn,pr,member) __u64 member; | ||
846 | #define NL_BIT(pn,pr,member) unsigned member:1; | ||
847 | #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; | ||
848 | #include "linux/drbd_nl.h" | ||
849 | |||
850 | struct drbd_backing_dev { | ||
851 | struct block_device *backing_bdev; | ||
852 | struct block_device *md_bdev; | ||
853 | struct file *lo_file; | ||
854 | struct file *md_file; | ||
855 | struct drbd_md md; | ||
856 | struct disk_conf dc; /* The user provided config... */ | ||
857 | sector_t known_size; /* last known size of that backing device */ | ||
858 | }; | ||
859 | |||
860 | struct drbd_md_io { | ||
861 | struct drbd_conf *mdev; | ||
862 | struct completion event; | ||
863 | int error; | ||
864 | }; | ||
865 | |||
866 | struct bm_io_work { | ||
867 | struct drbd_work w; | ||
868 | char *why; | ||
869 | int (*io_fn)(struct drbd_conf *mdev); | ||
870 | void (*done)(struct drbd_conf *mdev, int rv); | ||
871 | }; | ||
872 | |||
873 | enum write_ordering_e { | ||
874 | WO_none, | ||
875 | WO_drain_io, | ||
876 | WO_bdev_flush, | ||
877 | WO_bio_barrier | ||
878 | }; | ||
879 | |||
880 | struct drbd_conf { | ||
881 | /* things that are stored as / read from meta data on disk */ | ||
882 | unsigned long flags; | ||
883 | |||
884 | /* configured by drbdsetup */ | ||
885 | struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ | ||
886 | struct syncer_conf sync_conf; | ||
887 | struct drbd_backing_dev *ldev __protected_by(local); | ||
888 | |||
889 | sector_t p_size; /* partner's disk size */ | ||
890 | struct request_queue *rq_queue; | ||
891 | struct block_device *this_bdev; | ||
892 | struct gendisk *vdisk; | ||
893 | |||
894 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | ||
895 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
896 | int agreed_pro_version; /* actually used protocol version */ | ||
897 | unsigned long last_received; /* in jiffies, either socket */ | ||
898 | unsigned int ko_count; | ||
899 | struct drbd_work resync_work, | ||
900 | unplug_work, | ||
901 | md_sync_work; | ||
902 | struct timer_list resync_timer; | ||
903 | struct timer_list md_sync_timer; | ||
904 | |||
905 | /* Used after attach while negotiating new disk state. */ | ||
906 | union drbd_state new_state_tmp; | ||
907 | |||
908 | union drbd_state state; | ||
909 | wait_queue_head_t misc_wait; | ||
910 | wait_queue_head_t state_wait; /* upon each state change. */ | ||
911 | unsigned int send_cnt; | ||
912 | unsigned int recv_cnt; | ||
913 | unsigned int read_cnt; | ||
914 | unsigned int writ_cnt; | ||
915 | unsigned int al_writ_cnt; | ||
916 | unsigned int bm_writ_cnt; | ||
917 | atomic_t ap_bio_cnt; /* Requests we need to complete */ | ||
918 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ | ||
919 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ | ||
920 | atomic_t unacked_cnt; /* Need to send replys for */ | ||
921 | atomic_t local_cnt; /* Waiting for local completion */ | ||
922 | atomic_t net_cnt; /* Users of net_conf */ | ||
923 | spinlock_t req_lock; | ||
924 | struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ | ||
925 | struct drbd_tl_epoch *newest_tle; | ||
926 | struct drbd_tl_epoch *oldest_tle; | ||
927 | struct list_head out_of_sequence_requests; | ||
928 | struct hlist_head *tl_hash; | ||
929 | unsigned int tl_hash_s; | ||
930 | |||
931 | /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ | ||
932 | unsigned long rs_total; | ||
933 | /* number of sync IOs that failed in this run */ | ||
934 | unsigned long rs_failed; | ||
935 | /* Syncer's start time [unit jiffies] */ | ||
936 | unsigned long rs_start; | ||
937 | /* cumulated time in PausedSyncX state [unit jiffies] */ | ||
938 | unsigned long rs_paused; | ||
939 | /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ | ||
940 | unsigned long rs_mark_left; | ||
941 | /* marks's time [unit jiffies] */ | ||
942 | unsigned long rs_mark_time; | ||
943 | /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ | ||
944 | unsigned long rs_same_csum; | ||
945 | |||
946 | /* where does the admin want us to start? (sector) */ | ||
947 | sector_t ov_start_sector; | ||
948 | /* where are we now? (sector) */ | ||
949 | sector_t ov_position; | ||
950 | /* Start sector of out of sync range (to merge printk reporting). */ | ||
951 | sector_t ov_last_oos_start; | ||
952 | /* size of out-of-sync range in sectors. */ | ||
953 | sector_t ov_last_oos_size; | ||
954 | unsigned long ov_left; /* in bits */ | ||
955 | struct crypto_hash *csums_tfm; | ||
956 | struct crypto_hash *verify_tfm; | ||
957 | |||
958 | struct drbd_thread receiver; | ||
959 | struct drbd_thread worker; | ||
960 | struct drbd_thread asender; | ||
961 | struct drbd_bitmap *bitmap; | ||
962 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ | ||
963 | |||
964 | /* Used to track operations of resync... */ | ||
965 | struct lru_cache *resync; | ||
966 | /* Number of locked elements in resync LRU */ | ||
967 | unsigned int resync_locked; | ||
968 | /* resync extent number waiting for application requests */ | ||
969 | unsigned int resync_wenr; | ||
970 | |||
971 | int open_cnt; | ||
972 | u64 *p_uuid; | ||
973 | struct drbd_epoch *current_epoch; | ||
974 | spinlock_t epoch_lock; | ||
975 | unsigned int epochs; | ||
976 | enum write_ordering_e write_ordering; | ||
977 | struct list_head active_ee; /* IO in progress */ | ||
978 | struct list_head sync_ee; /* IO in progress */ | ||
979 | struct list_head done_ee; /* send ack */ | ||
980 | struct list_head read_ee; /* IO in progress */ | ||
981 | struct list_head net_ee; /* zero-copy network send in progress */ | ||
982 | struct hlist_head *ee_hash; /* is proteced by req_lock! */ | ||
983 | unsigned int ee_hash_s; | ||
984 | |||
985 | /* this one is protected by ee_lock, single thread */ | ||
986 | struct drbd_epoch_entry *last_write_w_barrier; | ||
987 | |||
988 | int next_barrier_nr; | ||
989 | struct hlist_head *app_reads_hash; /* is proteced by req_lock */ | ||
990 | struct list_head resync_reads; | ||
991 | atomic_t pp_in_use; | ||
992 | wait_queue_head_t ee_wait; | ||
993 | struct page *md_io_page; /* one page buffer for md_io */ | ||
994 | struct page *md_io_tmpp; /* for logical_block_size != 512 */ | ||
995 | struct mutex md_io_mutex; /* protects the md_io_buffer */ | ||
996 | spinlock_t al_lock; | ||
997 | wait_queue_head_t al_wait; | ||
998 | struct lru_cache *act_log; /* activity log */ | ||
999 | unsigned int al_tr_number; | ||
1000 | int al_tr_cycle; | ||
1001 | int al_tr_pos; /* position of the next transaction in the journal */ | ||
1002 | struct crypto_hash *cram_hmac_tfm; | ||
1003 | struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ | ||
1004 | struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ | ||
1005 | void *int_dig_out; | ||
1006 | void *int_dig_in; | ||
1007 | void *int_dig_vv; | ||
1008 | wait_queue_head_t seq_wait; | ||
1009 | atomic_t packet_seq; | ||
1010 | unsigned int peer_seq; | ||
1011 | spinlock_t peer_seq_lock; | ||
1012 | unsigned int minor; | ||
1013 | unsigned long comm_bm_set; /* communicated number of set bits. */ | ||
1014 | cpumask_var_t cpu_mask; | ||
1015 | struct bm_io_work bm_io_work; | ||
1016 | u64 ed_uuid; /* UUID of the exposed data */ | ||
1017 | struct mutex state_mutex; | ||
1018 | char congestion_reason; /* Why we where congested... */ | ||
1019 | }; | ||
1020 | |||
1021 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | ||
1022 | { | ||
1023 | /* Minors at or beyond minor_count have no minor_table slot: report NULL. */ | ||
1024 | if (minor >= minor_count) | ||
1025 | return NULL; | ||
1026 | |||
1027 | return minor_table[minor]; | ||
1028 | } | ||
1029 | |||
1030 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) /* counterpart of minor_to_mdev(): device -> minor number */ | ||
1031 | { | ||
1032 | return mdev->minor; /* plain field read; mdev must not be NULL */ | ||
1033 | } | ||
1034 | |||
1035 | /* Take mdev->data.mutex for working with the data socket. | ||
1036 | * Returns 1, with the mutex held, if there is a data socket; | ||
1037 | * returns 0, with the mutex already released, if there is none. | ||
1038 | * So wherever you are going to use the data.socket, do e.g. | ||
1039 | * if (!drbd_get_data_sock(mdev)) | ||
1040 | * return 0; | ||
1041 | * CODE(); | ||
1042 | * drbd_put_data_sock(mdev); */ | ||
1043 | static inline int drbd_get_data_sock(struct drbd_conf *mdev) | ||
1044 | { | ||
1045 | struct mutex *sock_mutex = &mdev->data.mutex; | ||
1046 | mutex_lock(sock_mutex); | ||
1047 | /* drbd_disconnect() could have called drbd_free_sock() while we were waiting for the mutex */ | ||
1048 | if (unlikely(!mdev->data.socket)) { | ||
1049 | mutex_unlock(sock_mutex); | ||
1050 | return 0; | ||
1051 | } | ||
1052 | return 1; | ||
1053 | } | ||
1054 | |||
1055 | static inline void drbd_put_data_sock(struct drbd_conf *mdev) /* pairs with a successful (return 1) drbd_get_data_sock() */ | ||
1056 | { | ||
1057 | mutex_unlock(&mdev->data.mutex); /* releases the mutex drbd_get_data_sock() left held */ | ||
1058 | } | ||
1059 | |||
1060 | /* | ||
1061 | * function declarations | ||
1062 | *************************/ | ||
1063 | |||
1064 | /* drbd_main.c */ | ||
1065 | |||
1066 | enum chg_state_flags { /* single-bit flags that can be combined; taken by drbd_change_state() and friends below */ | ||
1067 | CS_HARD = 1, | ||
1068 | CS_VERBOSE = 2, | ||
1069 | CS_WAIT_COMPLETE = 4, | ||
1070 | CS_SERIALIZE = 8, | ||
1071 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, /* = 12; the bits are disjoint, so `+` equals `|` here */ | ||
1072 | }; | ||
1073 | |||
1074 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | ||
1075 | extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
1076 | union drbd_state mask, union drbd_state val); | ||
1077 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
1078 | union drbd_state); | ||
1079 | extern int _drbd_request_state(struct drbd_conf *, union drbd_state, | ||
1080 | union drbd_state, enum chg_state_flags); | ||
1081 | extern int __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
1082 | enum chg_state_flags, struct completion *done); | ||
1083 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
1084 | union drbd_state, int); | ||
1085 | extern int drbd_thread_start(struct drbd_thread *thi); | ||
1086 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); | ||
1087 | #ifdef CONFIG_SMP | ||
1088 | extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); | ||
1089 | extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); | ||
1090 | #else | ||
1091 | #define drbd_thread_current_set_cpu(A) ({}) | ||
1092 | #define drbd_calc_cpu_mask(A) ({}) | ||
1093 | #endif | ||
1094 | extern void drbd_free_resources(struct drbd_conf *mdev); | ||
1095 | extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
1096 | unsigned int set_size); | ||
1097 | extern void tl_clear(struct drbd_conf *mdev); | ||
1098 | extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); | ||
1099 | extern void drbd_free_sock(struct drbd_conf *mdev); | ||
1100 | extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, | ||
1101 | void *buf, size_t size, unsigned msg_flags); | ||
1102 | extern int drbd_send_protocol(struct drbd_conf *mdev); | ||
1103 | extern int drbd_send_uuids(struct drbd_conf *mdev); | ||
1104 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | ||
1105 | extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); | ||
1106 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); | ||
1107 | extern int _drbd_send_state(struct drbd_conf *mdev); | ||
1108 | extern int drbd_send_state(struct drbd_conf *mdev); | ||
1109 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | ||
1110 | enum drbd_packets cmd, struct p_header *h, | ||
1111 | size_t size, unsigned msg_flags); | ||
1112 | #define USE_DATA_SOCKET 1 | ||
1113 | #define USE_META_SOCKET 0 | ||
1114 | extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1115 | enum drbd_packets cmd, struct p_header *h, | ||
1116 | size_t size); | ||
1117 | extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1118 | char *data, size_t size); | ||
1119 | extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); | ||
1120 | extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, | ||
1121 | u32 set_size); | ||
1122 | extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1123 | struct drbd_epoch_entry *e); | ||
1124 | extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1125 | struct p_block_req *rp); | ||
1126 | extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1127 | struct p_data *dp); | ||
1128 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1129 | sector_t sector, int blksize, u64 block_id); | ||
1130 | extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1131 | struct drbd_epoch_entry *e); | ||
1132 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); | ||
1133 | extern int _drbd_send_barrier(struct drbd_conf *mdev, | ||
1134 | struct drbd_tl_epoch *barrier); | ||
1135 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | ||
1136 | sector_t sector, int size, u64 block_id); | ||
1137 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, | ||
1138 | sector_t sector,int size, | ||
1139 | void *digest, int digest_size, | ||
1140 | enum drbd_packets cmd); | ||
1141 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); | ||
1142 | |||
1143 | extern int drbd_send_bitmap(struct drbd_conf *mdev); | ||
1144 | extern int _drbd_send_bitmap(struct drbd_conf *mdev); | ||
1145 | extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); | ||
1146 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); | ||
1147 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | ||
1148 | |||
1149 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
1150 | extern void drbd_md_sync(struct drbd_conf *mdev); | ||
1151 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | ||
1152 | /* maybe define them below as inline? */ | ||
1153 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1154 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1155 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1156 | extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1157 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); | ||
1158 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); | ||
1159 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); | ||
1160 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); | ||
1161 | extern void drbd_md_mark_dirty(struct drbd_conf *mdev); | ||
1162 | extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, | ||
1163 | int (*io_fn)(struct drbd_conf *), | ||
1164 | void (*done)(struct drbd_conf *, int), | ||
1165 | char *why); | ||
1166 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | ||
1167 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | ||
1168 | extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); | ||
1169 | |||
1170 | |||
1171 | /* Meta data layout | ||
1172 | We reserve a 128MB Block (4k aligned) | ||
1173 | * either at the end of the backing device | ||
1174 | * or on a separate meta data device. */ | ||
1175 | |||
1176 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | ||
1177 | /* The following numbers are sectors */ | ||
1178 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | ||
1179 | #define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ | ||
1180 | /* Allows up to about 3.8TB */ | ||
1181 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) | ||
1182 | |||
1183 | /* Since the smallest IO unit is usually 512 bytes */ | ||
1184 | #define MD_SECTOR_SHIFT 9 | ||
1185 | #define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) | ||
1186 | |||
1187 | /* activity log */ | ||
1188 | #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ | ||
1189 | #define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ | ||
1190 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) | ||
1191 | |||
1192 | #if BITS_PER_LONG == 32 | ||
1193 | #define LN2_BPL 5 | ||
1194 | #define cpu_to_lel(A) cpu_to_le32(A) | ||
1195 | #define lel_to_cpu(A) le32_to_cpu(A) | ||
1196 | #elif BITS_PER_LONG == 64 | ||
1197 | #define LN2_BPL 6 | ||
1198 | #define cpu_to_lel(A) cpu_to_le64(A) | ||
1199 | #define lel_to_cpu(A) le64_to_cpu(A) | ||
1200 | #else | ||
1201 | #error "LN2 of BITS_PER_LONG unknown!" | ||
1202 | #endif | ||
1203 | |||
1204 | /* resync bitmap */ | ||
1205 | /* 16MB sized 'bitmap extent' to track syncer usage */ | ||
1206 | struct bm_extent { | ||
1207 | int rs_left; /* number of bits set (out of sync) in this extent. */ | ||
1208 | int rs_failed; /* number of failed resync requests in this extent. */ | ||
1209 | unsigned long flags; | ||
1210 | struct lc_element lce; | ||
1211 | }; | ||
1212 | |||
1213 | #define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ | ||
1214 | #define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ | ||
1215 | |||
1216 | /* drbd_bitmap.c */ | ||
1217 | /* | ||
1218 | * We need to store one bit for a block. | ||
1219 | * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. | ||
1220 | * Bit 0 ==> local node thinks this block is binary identical on both nodes | ||
1221 | * Bit 1 ==> local node thinks this block needs to be synced. | ||
1222 | */ | ||
1223 | |||
1224 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | ||
1225 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) | ||
1226 | /* (9+3) : 512 bytes @ 8 bits; representing 16M storage | ||
1227 | * per sector of on disk bitmap */ | ||
1228 | #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ | ||
1229 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) | ||
1230 | |||
1231 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) | ||
1232 | #error "HAVE YOU FIXED drbdmeta AS WELL??" | ||
1233 | #endif | ||
1234 | |||
1235 | /* thus many _storage_ sectors are described by one bit */ | ||
1236 | #define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9)) | ||
1237 | #define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) | ||
1238 | #define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) | ||
1239 | |||
1240 | /* bit to represented kilo byte conversion */ | ||
1241 | #define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) | ||
1242 | |||
1243 | /* in which _bitmap_ extent (resp. sector) the bit for a certain | ||
1244 | * _storage_ sector is located in */ | ||
1245 | #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) | ||
1246 | |||
1247 | /* how many _storage_ sectors we have per bitmap sector */ | ||
1248 | #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) | ||
1249 | #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) | ||
1250 | |||
1251 | /* in one sector of the bitmap, we have this many activity_log extents. */ | ||
1252 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) | ||
1253 | #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
1254 | |||
1255 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) | ||
1256 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) | ||
1257 | |||
1258 | /* the extent in "PER_EXTENT" below is an activity log extent | ||
1259 | * we need that many (long words/bytes) to store the bitmap | ||
1260 | * of one AL_EXTENT_SIZE chunk of storage. | ||
1261 | * we can store the bitmap for that many AL_EXTENTS within | ||
1262 | * one sector of the _on_disk_ bitmap: | ||
1263 | * bit 0 bit 37 bit 38 bit (512*8)-1 | ||
1264 | * ...|........|........|.. // ..|........| | ||
1265 | * sect. 0 `296 `304 ^(512*8*8)-1 | ||
1266 | * | ||
1267 | #define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) | ||
1268 | #define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 | ||
1269 | #define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 | ||
1270 | */ | ||
1271 | |||
1272 | #define DRBD_MAX_SECTORS_32 (0xffffffffLU) | ||
1273 | #define DRBD_MAX_SECTORS_BM \ | ||
1274 | ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) | ||
1275 | #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 | ||
1276 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | ||
1277 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM | ||
1278 | #elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32 | ||
1279 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 | ||
1280 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 | ||
1281 | #else | ||
1282 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | ||
1283 | /* 16 TB in units of sectors */ | ||
1284 | #if BITS_PER_LONG == 32 | ||
1285 | /* adjust by one page worth of bitmap, | ||
1286 | * so we won't wrap around in drbd_bm_find_next_bit. | ||
1287 | * you should use 64bit OS for that much storage, anyways. */ | ||
1288 | #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) | ||
1289 | #else | ||
1290 | #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32) | ||
1291 | #endif | ||
1292 | #endif | ||
1293 | |||
1294 | /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. | ||
1295 | * With a value of 6 all IO in one 32K block make it to the same slot of the | ||
1296 | * hash table. */ | ||
1297 | #define HT_SHIFT 6 | ||
1298 | #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) | ||
1299 | |||
1300 | /* Number of elements in the app_reads_hash */ | ||
1301 | #define APP_R_HSIZE 15 | ||
1302 | |||
1303 | extern int drbd_bm_init(struct drbd_conf *mdev); | ||
1304 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); | ||
1305 | extern void drbd_bm_cleanup(struct drbd_conf *mdev); | ||
1306 | extern void drbd_bm_set_all(struct drbd_conf *mdev); | ||
1307 | extern void drbd_bm_clear_all(struct drbd_conf *mdev); | ||
1308 | extern int drbd_bm_set_bits( | ||
1309 | struct drbd_conf *mdev, unsigned long s, unsigned long e); | ||
1310 | extern int drbd_bm_clear_bits( | ||
1311 | struct drbd_conf *mdev, unsigned long s, unsigned long e); | ||
1312 | /* bm_set_bits variant for use while holding drbd_bm_lock */ | ||
1313 | extern void _drbd_bm_set_bits(struct drbd_conf *mdev, | ||
1314 | const unsigned long s, const unsigned long e); | ||
1315 | extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); | ||
1316 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | ||
1317 | extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); | ||
1318 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | ||
1319 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | ||
1320 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | ||
1321 | unsigned long al_enr); | ||
1322 | extern size_t drbd_bm_words(struct drbd_conf *mdev); | ||
1323 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); | ||
1324 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); | ||
1325 | extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1326 | /* bm_find_next variants for use while you hold drbd_bm_lock() */ | ||
1327 | extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1328 | extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1329 | extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); | ||
1330 | extern int drbd_bm_rs_done(struct drbd_conf *mdev); | ||
1331 | /* for receive_bitmap */ | ||
1332 | extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, | ||
1333 | size_t number, unsigned long *buffer); | ||
1334 | /* for _drbd_send_bitmap and drbd_bm_write_sect */ | ||
1335 | extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, | ||
1336 | size_t number, unsigned long *buffer); | ||
1337 | |||
1338 | extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); | ||
1339 | extern void drbd_bm_unlock(struct drbd_conf *mdev); | ||
1340 | |||
1341 | extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); | ||
1342 | /* drbd_main.c */ | ||
1343 | |||
1344 | extern struct kmem_cache *drbd_request_cache; | ||
1345 | extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ | ||
1346 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | ||
1347 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | ||
1348 | extern mempool_t *drbd_request_mempool; | ||
1349 | extern mempool_t *drbd_ee_mempool; | ||
1350 | |||
1351 | extern struct page *drbd_pp_pool; /* drbd's page pool */ | ||
1352 | extern spinlock_t drbd_pp_lock; | ||
1353 | extern int drbd_pp_vacant; | ||
1354 | extern wait_queue_head_t drbd_pp_wait; | ||
1355 | |||
1356 | extern rwlock_t global_state_lock; | ||
1357 | |||
1358 | extern struct drbd_conf *drbd_new_device(unsigned int minor); | ||
1359 | extern void drbd_free_mdev(struct drbd_conf *mdev); | ||
1360 | |||
1361 | extern int proc_details; | ||
1362 | |||
1363 | /* drbd_req */ | ||
1364 | extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); | ||
1365 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | ||
1366 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); | ||
1367 | extern int is_valid_ar_handle(struct drbd_request *, sector_t); | ||
1368 | |||
1369 | |||
1370 | /* drbd_nl.c */ | ||
1371 | extern void drbd_suspend_io(struct drbd_conf *mdev); | ||
1372 | extern void drbd_resume_io(struct drbd_conf *mdev); | ||
1373 | extern char *ppsize(char *buf, unsigned long long size); | ||
1374 | extern sector_t drbd_new_dev_size(struct drbd_conf *, | ||
1375 | struct drbd_backing_dev *); | ||
1376 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | ||
1377 | extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local); | ||
1378 | extern void resync_after_online_grow(struct drbd_conf *); | ||
1379 | extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); | ||
1380 | extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, | ||
1381 | int force); | ||
1382 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); | ||
1383 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); | ||
1384 | |||
1385 | /* drbd_worker.c */ | ||
1386 | extern int drbd_worker(struct drbd_thread *thi); | ||
1387 | extern int drbd_alter_sa(struct drbd_conf *mdev, int na); | ||
1388 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); | ||
1389 | extern void resume_next_sg(struct drbd_conf *mdev); | ||
1390 | extern void suspend_other_sg(struct drbd_conf *mdev); | ||
1391 | extern int drbd_resync_finished(struct drbd_conf *mdev); | ||
1392 | /* maybe rather drbd_main.c ? */ | ||
1393 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, | ||
1394 | struct drbd_backing_dev *bdev, sector_t sector, int rw); | ||
1395 | extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); | ||
1396 | |||
1397 | static inline void ov_oos_print(struct drbd_conf *mdev) | ||
1398 | { | ||
1399 | if (mdev->ov_last_oos_size) { | ||
1400 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", | ||
1401 | (unsigned long long)mdev->ov_last_oos_start, | ||
1402 | (unsigned long)mdev->ov_last_oos_size); | ||
1403 | } | ||
1404 | mdev->ov_last_oos_size=0; | ||
1405 | } | ||
1406 | |||
1407 | |||
1408 | extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); | ||
1409 | /* worker callbacks */ | ||
1410 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); | ||
1411 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); | ||
1412 | extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); | ||
1413 | extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); | ||
1414 | extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); | ||
1415 | extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); | ||
1416 | extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); | ||
1417 | extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); | ||
1418 | extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); | ||
1419 | extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); | ||
1420 | extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); | ||
1421 | extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); | ||
1422 | extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); | ||
1423 | extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); | ||
1424 | extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); | ||
1425 | extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); | ||
1426 | extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); | ||
1427 | extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); | ||
1428 | |||
1429 | extern void resync_timer_fn(unsigned long data); | ||
1430 | |||
1431 | /* drbd_receiver.c */ | ||
1432 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | ||
1433 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | ||
1434 | u64 id, | ||
1435 | sector_t sector, | ||
1436 | unsigned int data_size, | ||
1437 | gfp_t gfp_mask) __must_hold(local); | ||
1438 | extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); | ||
1439 | extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1440 | struct list_head *head); | ||
1441 | extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1442 | struct list_head *head); | ||
1443 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); | ||
1444 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); | ||
1445 | extern void drbd_flush_workqueue(struct drbd_conf *mdev); | ||
1446 | |||
1447 | /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to | ||
1448 | * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ | ||
1449 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, | ||
1450 | char __user *optval, int optlen) | ||
1451 | { | ||
1452 | int err; | ||
1453 | if (level == SOL_SOCKET) | ||
1454 | err = sock_setsockopt(sock, level, optname, optval, optlen); | ||
1455 | else | ||
1456 | err = sock->ops->setsockopt(sock, level, optname, optval, | ||
1457 | optlen); | ||
1458 | return err; | ||
1459 | } | ||
1460 | |||
1461 | static inline void drbd_tcp_cork(struct socket *sock) | ||
1462 | { | ||
1463 | int __user val = 1; | ||
1464 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | ||
1465 | (char __user *)&val, sizeof(val)); | ||
1466 | } | ||
1467 | |||
1468 | static inline void drbd_tcp_uncork(struct socket *sock) | ||
1469 | { | ||
1470 | int __user val = 0; | ||
1471 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | ||
1472 | (char __user *)&val, sizeof(val)); | ||
1473 | } | ||
1474 | |||
1475 | static inline void drbd_tcp_nodelay(struct socket *sock) | ||
1476 | { | ||
1477 | int __user val = 1; | ||
1478 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, | ||
1479 | (char __user *)&val, sizeof(val)); | ||
1480 | } | ||
1481 | |||
1482 | static inline void drbd_tcp_quickack(struct socket *sock) | ||
1483 | { | ||
1484 | int __user val = 1; | ||
1485 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, | ||
1486 | (char __user *)&val, sizeof(val)); | ||
1487 | } | ||
1488 | |||
1489 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); | ||
1490 | |||
1491 | /* drbd_proc.c */ | ||
1492 | extern struct proc_dir_entry *drbd_proc; | ||
1493 | extern struct file_operations drbd_proc_fops; | ||
1494 | extern const char *drbd_conn_str(enum drbd_conns s); | ||
1495 | extern const char *drbd_role_str(enum drbd_role s); | ||
1496 | |||
1497 | /* drbd_actlog.c */ | ||
1498 | extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1499 | extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); | ||
1500 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | ||
1501 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1502 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1503 | extern void drbd_rs_cancel_all(struct drbd_conf *mdev); | ||
1504 | extern int drbd_rs_del_all(struct drbd_conf *mdev); | ||
1505 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, | ||
1506 | sector_t sector, int size); | ||
1507 | extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); | ||
1508 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, | ||
1509 | int size, const char *file, const unsigned int line); | ||
1510 | #define drbd_set_in_sync(mdev, sector, size) \ | ||
1511 | __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) | ||
1512 | extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | ||
1513 | int size, const char *file, const unsigned int line); | ||
1514 | #define drbd_set_out_of_sync(mdev, sector, size) \ | ||
1515 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | ||
1516 | extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); | ||
1517 | extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); | ||
1518 | extern void drbd_al_shrink(struct drbd_conf *mdev); | ||
1519 | |||
1520 | |||
1521 | /* drbd_nl.c */ | ||
1522 | |||
1523 | void drbd_nl_cleanup(void); | ||
1524 | int __init drbd_nl_init(void); | ||
1525 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); | ||
1526 | void drbd_bcast_sync_progress(struct drbd_conf *mdev); | ||
1527 | void drbd_bcast_ee(struct drbd_conf *mdev, | ||
1528 | const char *reason, const int dgs, | ||
1529 | const char* seen_hash, const char* calc_hash, | ||
1530 | const struct drbd_epoch_entry* e); | ||
1531 | |||
1532 | |||
1533 | /** | ||
1534 | * DOC: DRBD State macros | ||
1535 | * | ||
1536 | * These macros are used to express state changes in easily readable form. | ||
1537 | * | ||
1538 | * The NS macros expand to a mask and a value, that can be bit ored onto the | ||
1539 | * current state as soon as the spinlock (req_lock) was taken. | ||
1540 | * | ||
1541 | * The _NS macros are used for state functions that get called with the | ||
1542 | * spinlock. These macros expand directly to the new state value. | ||
1543 | * | ||
1544 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
1545 | * to express state changes that affect more than one aspect of the state. | ||
1546 | * | ||
1547 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
1548 | * Means that the network connection was established and that the peer | ||
1549 | * is in secondary role. | ||
1550 | */ | ||
1551 | #define role_MASK R_MASK | ||
1552 | #define peer_MASK R_MASK | ||
1553 | #define disk_MASK D_MASK | ||
1554 | #define pdsk_MASK D_MASK | ||
1555 | #define conn_MASK C_MASK | ||
1556 | #define susp_MASK 1 | ||
1557 | #define user_isp_MASK 1 | ||
1558 | #define aftr_isp_MASK 1 | ||
1559 | |||
/*
 * NS(), NS2(), NS3(): each expands to TWO comma separated expressions of
 * type union drbd_state, a mask followed by a value, meant to be used as
 * two consecutive function arguments.
 *
 * The mask has T##_MASK stored in every named field T, all other bits zero.
 * The value has the requested state S stored in every named field T, all
 * other bits zero.
 *
 * The temporaries carry a __ns_ prefix so that a state expression S which
 * mentions a caller variable called "mask" or "val" is not captured by the
 * macro's own locals.
 */
#define NS(T, S)							\
	({ union drbd_state __ns_mask; __ns_mask.i = 0;			\
	   __ns_mask.T = T##_MASK; __ns_mask; }),			\
	({ union drbd_state __ns_val; __ns_val.i = 0;			\
	   __ns_val.T = (S); __ns_val; })
#define NS2(T1, S1, T2, S2)						\
	({ union drbd_state __ns_mask; __ns_mask.i = 0;			\
	   __ns_mask.T1 = T1##_MASK; __ns_mask.T2 = T2##_MASK;		\
	   __ns_mask; }),						\
	({ union drbd_state __ns_val; __ns_val.i = 0;			\
	   __ns_val.T1 = (S1); __ns_val.T2 = (S2); __ns_val; })
#define NS3(T1, S1, T2, S2, T3, S3)					\
	({ union drbd_state __ns_mask; __ns_mask.i = 0;			\
	   __ns_mask.T1 = T1##_MASK; __ns_mask.T2 = T2##_MASK;		\
	   __ns_mask.T3 = T3##_MASK; __ns_mask; }),			\
	({ union drbd_state __ns_val; __ns_val.i = 0;			\
	   __ns_val.T1 = (S1); __ns_val.T2 = (S2);			\
	   __ns_val.T3 = (S3); __ns_val; })
1573 | |||
/*
 * _NS(), _NS2(), _NS3(): each expands to TWO comma separated expressions,
 * the device pointer D itself followed by a union drbd_state that starts
 * as a copy of D->state and has the named field(s) T replaced by S.
 *
 * D is parenthesized wherever it is dereferenced so that an argument such
 * as "&dev" is evaluated as a whole before "->" is applied.
 */
#define _NS(D, T, S)							\
	(D), ({ union drbd_state __ns; __ns.i = (D)->state.i;		\
		__ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2)						\
	(D), ({ union drbd_state __ns; __ns.i = (D)->state.i;		\
		__ns.T1 = (S1); __ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3)					\
	(D), ({ union drbd_state __ns; __ns.i = (D)->state.i;		\
		__ns.T1 = (S1); __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
1582 | |||
1583 | /* | ||
1584 | * inline helper functions | ||
1585 | *************************/ | ||
1586 | |||
1587 | static inline void drbd_state_lock(struct drbd_conf *mdev) | ||
1588 | { | ||
1589 | wait_event(mdev->misc_wait, | ||
1590 | !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
1591 | } | ||
1592 | |||
1593 | static inline void drbd_state_unlock(struct drbd_conf *mdev) | ||
1594 | { | ||
1595 | clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); | ||
1596 | wake_up(&mdev->misc_wait); | ||
1597 | } | ||
1598 | |||
1599 | static inline int _drbd_set_state(struct drbd_conf *mdev, | ||
1600 | union drbd_state ns, enum chg_state_flags flags, | ||
1601 | struct completion *done) | ||
1602 | { | ||
1603 | int rv; | ||
1604 | |||
1605 | read_lock(&global_state_lock); | ||
1606 | rv = __drbd_set_state(mdev, ns, flags, done); | ||
1607 | read_unlock(&global_state_lock); | ||
1608 | |||
1609 | return rv; | ||
1610 | } | ||
1611 | |||
1612 | /** | ||
1613 | * drbd_request_state() - Reqest a state change | ||
1614 | * @mdev: DRBD device. | ||
1615 | * @mask: mask of state bits to change. | ||
1616 | * @val: value of new state bits. | ||
1617 | * | ||
1618 | * This is the most graceful way of requesting a state change. It is verbose | ||
1619 | * quite verbose in case the state change is not possible, and all those | ||
1620 | * state changes are globally serialized. | ||
1621 | */ | ||
1622 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
1623 | union drbd_state mask, | ||
1624 | union drbd_state val) | ||
1625 | { | ||
1626 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | ||
1627 | } | ||
1628 | |||
1629 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) | ||
1630 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) | ||
1631 | { | ||
1632 | switch (mdev->ldev->dc.on_io_error) { | ||
1633 | case EP_PASS_ON: | ||
1634 | if (!forcedetach) { | ||
1635 | if (printk_ratelimit()) | ||
1636 | dev_err(DEV, "Local IO failed in %s." | ||
1637 | "Passing error on...\n", where); | ||
1638 | break; | ||
1639 | } | ||
1640 | /* NOTE fall through to detach case if forcedetach set */ | ||
1641 | case EP_DETACH: | ||
1642 | case EP_CALL_HELPER: | ||
1643 | if (mdev->state.disk > D_FAILED) { | ||
1644 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | ||
1645 | dev_err(DEV, "Local IO failed in %s." | ||
1646 | "Detaching...\n", where); | ||
1647 | } | ||
1648 | break; | ||
1649 | } | ||
1650 | } | ||
1651 | |||
1652 | /** | ||
1653 | * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers | ||
1654 | * @mdev: DRBD device. | ||
1655 | * @error: Error code passed to the IO completion callback | ||
1656 | * @forcedetach: Force detach. I.e. the error happened while accessing the meta data | ||
1657 | * | ||
1658 | * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) | ||
1659 | */ | ||
1660 | #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) | ||
1661 | static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | ||
1662 | int error, int forcedetach, const char *where) | ||
1663 | { | ||
1664 | if (error) { | ||
1665 | unsigned long flags; | ||
1666 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
1667 | __drbd_chk_io_error_(mdev, forcedetach, where); | ||
1668 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
1669 | } | ||
1670 | } | ||
1671 | |||
1672 | |||
1673 | /** | ||
1674 | * drbd_md_first_sector() - Returns the first sector number of the meta data area | ||
1675 | * @bdev: Meta data block device. | ||
1676 | * | ||
1677 | * BTW, for internal meta data, this happens to be the maximum capacity | ||
1678 | * we could agree upon with our peer node. | ||
1679 | */ | ||
1680 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1681 | { | ||
1682 | switch (bdev->dc.meta_dev_idx) { | ||
1683 | case DRBD_MD_INDEX_INTERNAL: | ||
1684 | case DRBD_MD_INDEX_FLEX_INT: | ||
1685 | return bdev->md.md_offset + bdev->md.bm_offset; | ||
1686 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1687 | default: | ||
1688 | return bdev->md.md_offset; | ||
1689 | } | ||
1690 | } | ||
1691 | |||
1692 | /** | ||
1693 | * drbd_md_last_sector() - Return the last sector number of the meta data area | ||
1694 | * @bdev: Meta data block device. | ||
1695 | */ | ||
1696 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | ||
1697 | { | ||
1698 | switch (bdev->dc.meta_dev_idx) { | ||
1699 | case DRBD_MD_INDEX_INTERNAL: | ||
1700 | case DRBD_MD_INDEX_FLEX_INT: | ||
1701 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | ||
1702 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1703 | default: | ||
1704 | return bdev->md.md_offset + bdev->md.md_size_sect; | ||
1705 | } | ||
1706 | } | ||
1707 | |||
1708 | /* Returns the number of 512 byte sectors of the device */ | ||
1709 | static inline sector_t drbd_get_capacity(struct block_device *bdev) | ||
1710 | { | ||
1711 | /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ | ||
1712 | return bdev ? bdev->bd_inode->i_size >> 9 : 0; | ||
1713 | } | ||
1714 | |||
1715 | /** | ||
1716 | * drbd_get_max_capacity() - Returns the capacity we announce to out peer | ||
1717 | * @bdev: Meta data block device. | ||
1718 | * | ||
1719 | * returns the capacity we announce to out peer. we clip ourselves at the | ||
1720 | * various MAX_SECTORS, because if we don't, current implementation will | ||
1721 | * oops sooner or later | ||
1722 | */ | ||
1723 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | ||
1724 | { | ||
1725 | sector_t s; | ||
1726 | switch (bdev->dc.meta_dev_idx) { | ||
1727 | case DRBD_MD_INDEX_INTERNAL: | ||
1728 | case DRBD_MD_INDEX_FLEX_INT: | ||
1729 | s = drbd_get_capacity(bdev->backing_bdev) | ||
1730 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | ||
1731 | drbd_md_first_sector(bdev)) | ||
1732 | : 0; | ||
1733 | break; | ||
1734 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1735 | s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | ||
1736 | drbd_get_capacity(bdev->backing_bdev)); | ||
1737 | /* clip at maximum size the meta device can support */ | ||
1738 | s = min_t(sector_t, s, | ||
1739 | BM_EXT_TO_SECT(bdev->md.md_size_sect | ||
1740 | - bdev->md.bm_offset)); | ||
1741 | break; | ||
1742 | default: | ||
1743 | s = min_t(sector_t, DRBD_MAX_SECTORS, | ||
1744 | drbd_get_capacity(bdev->backing_bdev)); | ||
1745 | } | ||
1746 | return s; | ||
1747 | } | ||
1748 | |||
1749 | /** | ||
1750 | * drbd_md_ss__() - Return the sector number of our meta data super block | ||
1751 | * @mdev: DRBD device. | ||
1752 | * @bdev: Meta data block device. | ||
1753 | */ | ||
1754 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | ||
1755 | struct drbd_backing_dev *bdev) | ||
1756 | { | ||
1757 | switch (bdev->dc.meta_dev_idx) { | ||
1758 | default: /* external, some index */ | ||
1759 | return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; | ||
1760 | case DRBD_MD_INDEX_INTERNAL: | ||
1761 | /* with drbd08, internal meta data is always "flexible" */ | ||
1762 | case DRBD_MD_INDEX_FLEX_INT: | ||
1763 | /* sizeof(struct md_on_disk_07) == 4k | ||
1764 | * position: last 4k aligned block of 4k size */ | ||
1765 | if (!bdev->backing_bdev) { | ||
1766 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1767 | dev_err(DEV, "bdev->backing_bdev==NULL\n"); | ||
1768 | dump_stack(); | ||
1769 | } | ||
1770 | return 0; | ||
1771 | } | ||
1772 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) | ||
1773 | - MD_AL_OFFSET; | ||
1774 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1775 | return 0; | ||
1776 | } | ||
1777 | } | ||
1778 | |||
1779 | static inline void | ||
1780 | _drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | ||
1781 | { | ||
1782 | list_add_tail(&w->list, &q->q); | ||
1783 | up(&q->s); | ||
1784 | } | ||
1785 | |||
1786 | static inline void | ||
1787 | drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) | ||
1788 | { | ||
1789 | unsigned long flags; | ||
1790 | spin_lock_irqsave(&q->q_lock, flags); | ||
1791 | list_add(&w->list, &q->q); | ||
1792 | up(&q->s); /* within the spinlock, | ||
1793 | see comment near end of drbd_worker() */ | ||
1794 | spin_unlock_irqrestore(&q->q_lock, flags); | ||
1795 | } | ||
1796 | |||
1797 | static inline void | ||
1798 | drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | ||
1799 | { | ||
1800 | unsigned long flags; | ||
1801 | spin_lock_irqsave(&q->q_lock, flags); | ||
1802 | list_add_tail(&w->list, &q->q); | ||
1803 | up(&q->s); /* within the spinlock, | ||
1804 | see comment near end of drbd_worker() */ | ||
1805 | spin_unlock_irqrestore(&q->q_lock, flags); | ||
1806 | } | ||
1807 | |||
1808 | static inline void wake_asender(struct drbd_conf *mdev) | ||
1809 | { | ||
1810 | if (test_bit(SIGNAL_ASENDER, &mdev->flags)) | ||
1811 | force_sig(DRBD_SIG, mdev->asender.task); | ||
1812 | } | ||
1813 | |||
1814 | static inline void request_ping(struct drbd_conf *mdev) | ||
1815 | { | ||
1816 | set_bit(SEND_PING, &mdev->flags); | ||
1817 | wake_asender(mdev); | ||
1818 | } | ||
1819 | |||
1820 | static inline int drbd_send_short_cmd(struct drbd_conf *mdev, | ||
1821 | enum drbd_packets cmd) | ||
1822 | { | ||
1823 | struct p_header h; | ||
1824 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); | ||
1825 | } | ||
1826 | |||
1827 | static inline int drbd_send_ping(struct drbd_conf *mdev) | ||
1828 | { | ||
1829 | struct p_header h; | ||
1830 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); | ||
1831 | } | ||
1832 | |||
1833 | static inline int drbd_send_ping_ack(struct drbd_conf *mdev) | ||
1834 | { | ||
1835 | struct p_header h; | ||
1836 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); | ||
1837 | } | ||
1838 | |||
1839 | static inline void drbd_thread_stop(struct drbd_thread *thi) | ||
1840 | { | ||
1841 | _drbd_thread_stop(thi, FALSE, TRUE); | ||
1842 | } | ||
1843 | |||
1844 | static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) | ||
1845 | { | ||
1846 | _drbd_thread_stop(thi, FALSE, FALSE); | ||
1847 | } | ||
1848 | |||
1849 | static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) | ||
1850 | { | ||
1851 | _drbd_thread_stop(thi, TRUE, FALSE); | ||
1852 | } | ||
1853 | |||
1854 | /* counts how many answer packets packets we expect from our peer, | ||
1855 | * for either explicit application requests, | ||
1856 | * or implicit barrier packets as necessary. | ||
1857 | * increased: | ||
1858 | * w_send_barrier | ||
1859 | * _req_mod(req, queue_for_net_write or queue_for_net_read); | ||
1860 | * it is much easier and equally valid to count what we queue for the | ||
1861 | * worker, even before it actually was queued or send. | ||
1862 | * (drbd_make_request_common; recovery path on read io-error) | ||
1863 | * decreased: | ||
1864 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) | ||
1865 | * _req_mod(req, data_received) | ||
1866 | * [from receive_DataReply] | ||
1867 | * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) | ||
1868 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] | ||
1869 | * for some reason it is NOT decreased in got_NegAck, | ||
1870 | * but in the resulting cleanup code from report_params. | ||
1871 | * we should try to remember the reason for that... | ||
1872 | * _req_mod(req, send_failed or send_canceled) | ||
1873 | * _req_mod(req, connection_lost_while_pending) | ||
1874 | * [from tl_clear_barrier] | ||
1875 | */ | ||
1876 | static inline void inc_ap_pending(struct drbd_conf *mdev) | ||
1877 | { | ||
1878 | atomic_inc(&mdev->ap_pending_cnt); | ||
1879 | } | ||
1880 | |||
/*
 * ERR_IF_CNT_IS_NEGATIVE(which) - complain if atomic counter mdev->which < 0
 *
 * Expects a variable named "mdev" in the calling scope (it is used both
 * directly and through DEV).
 *
 * Wrapped in do { } while (0) so the macro is a single statement and cannot
 * capture a following "else".  The counter is read exactly once, so the
 * value that is logged is the very value that was tested.
 */
#define ERR_IF_CNT_IS_NEGATIVE(which) do {				\
	int _cnt_val = atomic_read(&mdev->which);			\
	if (_cnt_val < 0)						\
		dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n",	\
			__func__ , __LINE__ ,				\
			_cnt_val);					\
} while (0)
1886 | |||
/*
 * dec_ap_pending(mdev) - drop one count from mdev->ap_pending_cnt
 *
 * Wakes mdev->misc_wait when the counter reaches zero, then reports an
 * error if the counter went negative.  The argument has to be a variable
 * literally named "mdev": ERR_IF_CNT_IS_NEGATIVE() refers to that name.
 */
#define dec_ap_pending(mdev)					\
	do {							\
		typecheck(struct drbd_conf *, mdev);		\
		if (atomic_dec_and_test(&mdev->ap_pending_cnt))	\
			wake_up(&mdev->misc_wait);		\
		ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt);		\
	} while (0)
1892 | |||
1893 | /* counts how many resync-related answers we still expect from the peer | ||
1894 | * increase decrease | ||
1895 | * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) | ||
1896 | * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) | ||
1897 | * (or P_NEG_ACK with ID_SYNCER) | ||
1898 | */ | ||
1899 | static inline void inc_rs_pending(struct drbd_conf *mdev) | ||
1900 | { | ||
1901 | atomic_inc(&mdev->rs_pending_cnt); | ||
1902 | } | ||
1903 | |||
/*
 * dec_rs_pending(mdev) - drop one count from mdev->rs_pending_cnt
 *
 * Reports an error if the counter went negative.  The argument has to be a
 * variable literally named "mdev": ERR_IF_CNT_IS_NEGATIVE() uses that name.
 */
#define dec_rs_pending(mdev)					\
	do {							\
		typecheck(struct drbd_conf *, mdev);		\
		atomic_dec(&mdev->rs_pending_cnt);		\
		ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt);		\
	} while (0)
1908 | |||
1909 | /* counts how many answers we still need to send to the peer. | ||
1910 | * increased on | ||
1911 | * receive_Data unless protocol A; | ||
1912 | * we need to send a P_RECV_ACK (proto B) | ||
1913 | * or P_WRITE_ACK (proto C) | ||
1914 | * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK | ||
1915 | * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA | ||
1916 | * receive_Barrier_* we need to send a P_BARRIER_ACK | ||
1917 | */ | ||
1918 | static inline void inc_unacked(struct drbd_conf *mdev) | ||
1919 | { | ||
1920 | atomic_inc(&mdev->unacked_cnt); | ||
1921 | } | ||
1922 | |||
/*
 * dec_unacked(mdev) - drop one count from mdev->unacked_cnt
 *
 * Reports an error if the counter went negative.  The argument has to be a
 * variable literally named "mdev": ERR_IF_CNT_IS_NEGATIVE() uses that name.
 */
#define dec_unacked(mdev)					\
	do {							\
		typecheck(struct drbd_conf *, mdev);		\
		atomic_dec(&mdev->unacked_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);		\
	} while (0)
1927 | |||
/*
 * sub_unacked(mdev, n) - subtract n from mdev->unacked_cnt
 *
 * Reports an error if the counter went negative.  The first argument has to
 * be a variable literally named "mdev": ERR_IF_CNT_IS_NEGATIVE() uses that
 * name.
 */
#define sub_unacked(mdev, n)					\
	do {							\
		typecheck(struct drbd_conf *, mdev);		\
		atomic_sub(n, &mdev->unacked_cnt);		\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);		\
	} while (0)
1932 | |||
1933 | |||
1934 | static inline void put_net_conf(struct drbd_conf *mdev) | ||
1935 | { | ||
1936 | if (atomic_dec_and_test(&mdev->net_cnt)) | ||
1937 | wake_up(&mdev->misc_wait); | ||
1938 | } | ||
1939 | |||
1940 | /** | ||
1941 | * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there | ||
1942 | * @mdev: DRBD device. | ||
1943 | * | ||
1944 | * You have to call put_net_conf() when finished working with mdev->net_conf. | ||
1945 | */ | ||
1946 | static inline int get_net_conf(struct drbd_conf *mdev) | ||
1947 | { | ||
1948 | int have_net_conf; | ||
1949 | |||
1950 | atomic_inc(&mdev->net_cnt); | ||
1951 | have_net_conf = mdev->state.conn >= C_UNCONNECTED; | ||
1952 | if (!have_net_conf) | ||
1953 | put_net_conf(mdev); | ||
1954 | return have_net_conf; | ||
1955 | } | ||
1956 | |||
1957 | /** | ||
1958 | * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev | ||
1959 | * @M: DRBD device. | ||
1960 | * | ||
1961 | * You have to call put_ldev() when finished working with mdev->ldev. | ||
1962 | */ | ||
/* get_ldev(M) is get_ldev_if_state(M, D_INCONSISTENT); both are wrapped in
 * __cond_lock(local, ...) around _get_ldev_if_state(). */
#define get_ldev(M) \
	__cond_lock(local, _get_ldev_if_state(M, D_INCONSISTENT))
#define get_ldev_if_state(M, MINS) \
	__cond_lock(local, _get_ldev_if_state(M, MINS))
1965 | |||
1966 | static inline void put_ldev(struct drbd_conf *mdev) | ||
1967 | { | ||
1968 | __release(local); | ||
1969 | if (atomic_dec_and_test(&mdev->local_cnt)) | ||
1970 | wake_up(&mdev->misc_wait); | ||
1971 | D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); | ||
1972 | } | ||
1973 | |||
1974 | #ifndef __CHECKER__ | ||
1975 | static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | ||
1976 | { | ||
1977 | int io_allowed; | ||
1978 | |||
1979 | atomic_inc(&mdev->local_cnt); | ||
1980 | io_allowed = (mdev->state.disk >= mins); | ||
1981 | if (!io_allowed) | ||
1982 | put_ldev(mdev); | ||
1983 | return io_allowed; | ||
1984 | } | ||
1985 | #else | ||
1986 | extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins); | ||
1987 | #endif | ||
1988 | |||
1989 | /* you must have a "get_ldev" reference */ | ||
1990 | static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, | ||
1991 | unsigned long *bits_left, unsigned int *per_mil_done) | ||
1992 | { | ||
1993 | /* | ||
1994 | * this is to break it at compile time when we change that | ||
1995 | * (we may feel 4TB maximum storage per drbd is not enough) | ||
1996 | */ | ||
1997 | typecheck(unsigned long, mdev->rs_total); | ||
1998 | |||
1999 | /* note: both rs_total and rs_left are in bits, i.e. in | ||
2000 | * units of BM_BLOCK_SIZE. | ||
2001 | * for the percentage, we don't care. */ | ||
2002 | |||
2003 | *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; | ||
2004 | /* >> 10 to prevent overflow, | ||
2005 | * +1 to prevent division by zero */ | ||
2006 | if (*bits_left > mdev->rs_total) { | ||
2007 | /* doh. maybe a logic bug somewhere. | ||
2008 | * may also be just a race condition | ||
2009 | * between this and a disconnect during sync. | ||
2010 | * for now, just prevent in-kernel buffer overflow. | ||
2011 | */ | ||
2012 | smp_rmb(); | ||
2013 | dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", | ||
2014 | drbd_conn_str(mdev->state.conn), | ||
2015 | *bits_left, mdev->rs_total, mdev->rs_failed); | ||
2016 | *per_mil_done = 0; | ||
2017 | } else { | ||
2018 | /* make sure the calculation happens in long context */ | ||
2019 | unsigned long tmp = 1000UL - | ||
2020 | (*bits_left >> 10)*1000UL | ||
2021 | / ((mdev->rs_total >> 10) + 1UL); | ||
2022 | *per_mil_done = tmp; | ||
2023 | } | ||
2024 | } | ||
2025 | |||
2026 | |||
2027 | /* this throttles on-the-fly application requests | ||
2028 | * according to max_buffers settings; | ||
2029 | * maybe re-implement using semaphores? */ | ||
2030 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) | ||
2031 | { | ||
2032 | int mxb = 1000000; /* arbitrary limit on open requests */ | ||
2033 | if (get_net_conf(mdev)) { | ||
2034 | mxb = mdev->net_conf->max_buffers; | ||
2035 | put_net_conf(mdev); | ||
2036 | } | ||
2037 | return mxb; | ||
2038 | } | ||
2039 | |||
2040 | static inline int drbd_state_is_stable(union drbd_state s) | ||
2041 | { | ||
2042 | |||
2043 | /* DO NOT add a default clause, we want the compiler to warn us | ||
2044 | * for any newly introduced state we may have forgotten to add here */ | ||
2045 | |||
2046 | switch ((enum drbd_conns)s.conn) { | ||
2047 | /* new io only accepted when there is no connection, ... */ | ||
2048 | case C_STANDALONE: | ||
2049 | case C_WF_CONNECTION: | ||
2050 | /* ... or there is a well established connection. */ | ||
2051 | case C_CONNECTED: | ||
2052 | case C_SYNC_SOURCE: | ||
2053 | case C_SYNC_TARGET: | ||
2054 | case C_VERIFY_S: | ||
2055 | case C_VERIFY_T: | ||
2056 | case C_PAUSED_SYNC_S: | ||
2057 | case C_PAUSED_SYNC_T: | ||
2058 | /* maybe stable, look at the disk state */ | ||
2059 | break; | ||
2060 | |||
2061 | /* no new io accepted during tansitional states | ||
2062 | * like handshake or teardown */ | ||
2063 | case C_DISCONNECTING: | ||
2064 | case C_UNCONNECTED: | ||
2065 | case C_TIMEOUT: | ||
2066 | case C_BROKEN_PIPE: | ||
2067 | case C_NETWORK_FAILURE: | ||
2068 | case C_PROTOCOL_ERROR: | ||
2069 | case C_TEAR_DOWN: | ||
2070 | case C_WF_REPORT_PARAMS: | ||
2071 | case C_STARTING_SYNC_S: | ||
2072 | case C_STARTING_SYNC_T: | ||
2073 | case C_WF_BITMAP_S: | ||
2074 | case C_WF_BITMAP_T: | ||
2075 | case C_WF_SYNC_UUID: | ||
2076 | case C_MASK: | ||
2077 | /* not "stable" */ | ||
2078 | return 0; | ||
2079 | } | ||
2080 | |||
2081 | switch ((enum drbd_disk_state)s.disk) { | ||
2082 | case D_DISKLESS: | ||
2083 | case D_INCONSISTENT: | ||
2084 | case D_OUTDATED: | ||
2085 | case D_CONSISTENT: | ||
2086 | case D_UP_TO_DATE: | ||
2087 | /* disk state is stable as well. */ | ||
2088 | break; | ||
2089 | |||
2090 | /* no new io accepted during tansitional states */ | ||
2091 | case D_ATTACHING: | ||
2092 | case D_FAILED: | ||
2093 | case D_NEGOTIATING: | ||
2094 | case D_UNKNOWN: | ||
2095 | case D_MASK: | ||
2096 | /* not "stable" */ | ||
2097 | return 0; | ||
2098 | } | ||
2099 | |||
2100 | return 1; | ||
2101 | } | ||
2102 | |||
2103 | static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) | ||
2104 | { | ||
2105 | int mxb = drbd_get_max_buffers(mdev); | ||
2106 | |||
2107 | if (mdev->state.susp) | ||
2108 | return 0; | ||
2109 | if (test_bit(SUSPEND_IO, &mdev->flags)) | ||
2110 | return 0; | ||
2111 | |||
2112 | /* to avoid potential deadlock or bitmap corruption, | ||
2113 | * in various places, we only allow new application io | ||
2114 | * to start during "stable" states. */ | ||
2115 | |||
2116 | /* no new io accepted when attaching or detaching the disk */ | ||
2117 | if (!drbd_state_is_stable(mdev->state)) | ||
2118 | return 0; | ||
2119 | |||
2120 | /* since some older kernels don't have atomic_add_unless, | ||
2121 | * and we are within the spinlock anyways, we have this workaround. */ | ||
2122 | if (atomic_read(&mdev->ap_bio_cnt) > mxb) | ||
2123 | return 0; | ||
2124 | if (test_bit(BITMAP_IO, &mdev->flags)) | ||
2125 | return 0; | ||
2126 | return 1; | ||
2127 | } | ||
2128 | |||
2129 | /* I'd like to use wait_event_lock_irq, | ||
2130 | * but I'm not sure when it got introduced, | ||
2131 | * and not sure when it has 3 or 4 arguments */ | ||
2132 | static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) | ||
2133 | { | ||
2134 | /* compare with after_state_ch, | ||
2135 | * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ | ||
2136 | DEFINE_WAIT(wait); | ||
2137 | |||
2138 | /* we wait here | ||
2139 | * as long as the device is suspended | ||
2140 | * until the bitmap is no longer on the fly during connection | ||
2141 | * handshake as long as we would exeed the max_buffer limit. | ||
2142 | * | ||
2143 | * to avoid races with the reconnect code, | ||
2144 | * we need to atomic_inc within the spinlock. */ | ||
2145 | |||
2146 | spin_lock_irq(&mdev->req_lock); | ||
2147 | while (!__inc_ap_bio_cond(mdev)) { | ||
2148 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
2149 | spin_unlock_irq(&mdev->req_lock); | ||
2150 | schedule(); | ||
2151 | finish_wait(&mdev->misc_wait, &wait); | ||
2152 | spin_lock_irq(&mdev->req_lock); | ||
2153 | } | ||
2154 | atomic_add(one_or_two, &mdev->ap_bio_cnt); | ||
2155 | spin_unlock_irq(&mdev->req_lock); | ||
2156 | } | ||
2157 | |||
2158 | static inline void dec_ap_bio(struct drbd_conf *mdev) | ||
2159 | { | ||
2160 | int mxb = drbd_get_max_buffers(mdev); | ||
2161 | int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); | ||
2162 | |||
2163 | D_ASSERT(ap_bio >= 0); | ||
2164 | /* this currently does wake_up for every dec_ap_bio! | ||
2165 | * maybe rather introduce some type of hysteresis? | ||
2166 | * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ | ||
2167 | if (ap_bio < mxb) | ||
2168 | wake_up(&mdev->misc_wait); | ||
2169 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { | ||
2170 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | ||
2171 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | ||
2172 | } | ||
2173 | } | ||
2174 | |||
2175 | static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | ||
2176 | { | ||
2177 | mdev->ed_uuid = val; | ||
2178 | } | ||
2179 | |||
/**
 * seq_cmp() - Compare two sequence numbers, tolerating 32bit wrap around
 * @a:	first sequence number.
 * @b:	second sequence number.
 *
 * Return: negative, zero or positive if @a is before, equal to or after @b.
 */
static inline int seq_cmp(u32 a, u32 b)
{
	/* we assume wrap around at 32bit.
	 * for wrap around at 24bit (old atomic_t),
	 * we'd have to
	 *  a <<= 8; b <<= 8;
	 *
	 * The subtraction is done on the unsigned values and only the
	 * difference is reinterpreted as signed: subtracting two s32 can
	 * overflow for sequence numbers that are far apart, which ISO C
	 * leaves undefined; the unsigned difference wraps by definition
	 * and yields the same bit pattern.
	 */
	return (s32)(a - b);
}
#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
/* CAUTION: please no side effects in arguments! */
#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
2195 | |||
2196 | static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) | ||
2197 | { | ||
2198 | unsigned int m; | ||
2199 | spin_lock(&mdev->peer_seq_lock); | ||
2200 | m = seq_max(mdev->peer_seq, new_seq); | ||
2201 | mdev->peer_seq = m; | ||
2202 | spin_unlock(&mdev->peer_seq_lock); | ||
2203 | if (m == new_seq) | ||
2204 | wake_up(&mdev->seq_wait); | ||
2205 | } | ||
2206 | |||
2207 | static inline void drbd_update_congested(struct drbd_conf *mdev) | ||
2208 | { | ||
2209 | struct sock *sk = mdev->data.socket->sk; | ||
2210 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
2211 | set_bit(NET_CONGESTED, &mdev->flags); | ||
2212 | } | ||
2213 | |||
/* fallback in case the block layer headers do not provide it */
#ifndef QUEUE_ORDERED_NONE
#define QUEUE_ORDERED_NONE 0
#endif

/*
 * Always returns QUEUE_ORDERED_NONE; @mdev is not looked at.
 * sorry, we currently have no working implementation
 * of distributed TCQ stuff
 */
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
	return QUEUE_ORDERED_NONE;
}
2223 | |||
2224 | static inline void drbd_blk_run_queue(struct request_queue *q) | ||
2225 | { | ||
2226 | if (q && q->unplug_fn) | ||
2227 | q->unplug_fn(q); | ||
2228 | } | ||
2229 | |||
2230 | static inline void drbd_kick_lo(struct drbd_conf *mdev) | ||
2231 | { | ||
2232 | if (get_ldev(mdev)) { | ||
2233 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); | ||
2234 | put_ldev(mdev); | ||
2235 | } | ||
2236 | } | ||
2237 | |||
2238 | static inline void drbd_md_flush(struct drbd_conf *mdev) | ||
2239 | { | ||
2240 | int r; | ||
2241 | |||
2242 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) | ||
2243 | return; | ||
2244 | |||
2245 | r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); | ||
2246 | if (r) { | ||
2247 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
2248 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | ||
2249 | } | ||
2250 | } | ||
2251 | |||
2252 | #endif | ||
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 000000000000..11d8ff6016ac --- /dev/null +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -0,0 +1,3700 @@ | |||
1 | /* | ||
2 | drbd.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev | ||
11 | from Logicworks, Inc. for making SDP replication support possible. | ||
12 | |||
13 | drbd is free software; you can redistribute it and/or modify | ||
14 | it under the terms of the GNU General Public License as published by | ||
15 | the Free Software Foundation; either version 2, or (at your option) | ||
16 | any later version. | ||
17 | |||
18 | drbd is distributed in the hope that it will be useful, | ||
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
21 | GNU General Public License for more details. | ||
22 | |||
23 | You should have received a copy of the GNU General Public License | ||
24 | along with drbd; see the file COPYING. If not, write to | ||
25 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | |||
27 | */ | ||
28 | |||
29 | #include <linux/module.h> | ||
30 | #include <linux/version.h> | ||
31 | #include <linux/drbd.h> | ||
32 | #include <asm/uaccess.h> | ||
33 | #include <asm/types.h> | ||
34 | #include <net/sock.h> | ||
35 | #include <linux/ctype.h> | ||
36 | #include <linux/smp_lock.h> | ||
37 | #include <linux/fs.h> | ||
38 | #include <linux/file.h> | ||
39 | #include <linux/proc_fs.h> | ||
40 | #include <linux/init.h> | ||
41 | #include <linux/mm.h> | ||
42 | #include <linux/memcontrol.h> | ||
43 | #include <linux/mm_inline.h> | ||
44 | #include <linux/slab.h> | ||
45 | #include <linux/random.h> | ||
46 | #include <linux/reboot.h> | ||
47 | #include <linux/notifier.h> | ||
48 | #include <linux/kthread.h> | ||
49 | |||
50 | #define __KERNEL_SYSCALLS__ | ||
51 | #include <linux/unistd.h> | ||
52 | #include <linux/vmalloc.h> | ||
53 | |||
54 | #include <linux/drbd_limits.h> | ||
55 | #include "drbd_int.h" | ||
56 | #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ | ||
57 | |||
58 | #include "drbd_vli.h" | ||
59 | |||
60 | struct after_state_chg_work { | ||
61 | struct drbd_work w; | ||
62 | union drbd_state os; | ||
63 | union drbd_state ns; | ||
64 | enum chg_state_flags flags; | ||
65 | struct completion *done; | ||
66 | }; | ||
67 | |||
68 | int drbdd_init(struct drbd_thread *); | ||
69 | int drbd_worker(struct drbd_thread *); | ||
70 | int drbd_asender(struct drbd_thread *); | ||
71 | |||
72 | int drbd_init(void); | ||
73 | static int drbd_open(struct block_device *bdev, fmode_t mode); | ||
74 | static int drbd_release(struct gendisk *gd, fmode_t mode); | ||
75 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
76 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
77 | union drbd_state ns, enum chg_state_flags flags); | ||
78 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
79 | static void md_sync_timer_fn(unsigned long data); | ||
80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
81 | |||
82 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | ||
83 | "Lars Ellenberg <lars@linbit.com>"); | ||
84 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); | ||
85 | MODULE_VERSION(REL_VERSION); | ||
86 | MODULE_LICENSE("GPL"); | ||
87 | MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); | ||
88 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); | ||
89 | |||
90 | #include <linux/moduleparam.h> | ||
91 | /* allow_open_on_secondary */ | ||
92 | MODULE_PARM_DESC(allow_oos, "DONT USE!"); | ||
93 | /* thanks to these macros, if compiled into the kernel (not-module), | ||
94 | * this becomes the boot parameter drbd.minor_count */ | ||
95 | module_param(minor_count, uint, 0444); | ||
96 | module_param(disable_sendpage, bool, 0644); | ||
97 | module_param(allow_oos, bool, 0); | ||
98 | module_param(cn_idx, uint, 0444); | ||
99 | module_param(proc_details, int, 0644); | ||
100 | |||
101 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
102 | int enable_faults; | ||
103 | int fault_rate; | ||
104 | static int fault_count; | ||
105 | int fault_devs; | ||
106 | /* bitmap of enabled faults */ | ||
107 | module_param(enable_faults, int, 0664); | ||
108 | /* fault rate % value - applies to all enabled faults */ | ||
109 | module_param(fault_rate, int, 0664); | ||
110 | /* count of faults inserted */ | ||
111 | module_param(fault_count, int, 0664); | ||
112 | /* bitmap of devices to insert faults on */ | ||
113 | module_param(fault_devs, int, 0644); | ||
114 | #endif | ||
115 | |||
116 | /* module parameter, defined */ | ||
117 | unsigned int minor_count = 32; | ||
118 | int disable_sendpage; | ||
119 | int allow_oos; | ||
120 | unsigned int cn_idx = CN_IDX_DRBD; | ||
121 | int proc_details; /* Detail level in proc drbd*/ | ||
122 | |||
123 | /* Module parameter for setting the user mode helper program | ||
124 | * to run. Default is /sbin/drbdadm */ | ||
125 | char usermode_helper[80] = "/sbin/drbdadm"; | ||
126 | |||
127 | module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); | ||
128 | |||
129 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks | ||
130 | * as member "struct gendisk *vdisk;" | ||
131 | */ | ||
132 | struct drbd_conf **minor_table; | ||
133 | |||
134 | struct kmem_cache *drbd_request_cache; | ||
135 | struct kmem_cache *drbd_ee_cache; /* epoch entries */ | ||
136 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | ||
137 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | ||
138 | mempool_t *drbd_request_mempool; | ||
139 | mempool_t *drbd_ee_mempool; | ||
140 | |||
141 | /* I do not use a standard mempool, because: | ||
142 | 1) I want to hand out the pre-allocated objects first. | ||
143 | 2) I want to be able to interrupt sleeping allocation with a signal. | ||
144 | Note: This is a single linked list, the next pointer is the private | ||
145 | member of struct page. | ||
146 | */ | ||
147 | struct page *drbd_pp_pool; | ||
148 | spinlock_t drbd_pp_lock; | ||
149 | int drbd_pp_vacant; | ||
150 | wait_queue_head_t drbd_pp_wait; | ||
151 | |||
152 | DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); | ||
153 | |||
154 | static struct block_device_operations drbd_ops = { | ||
155 | .owner = THIS_MODULE, | ||
156 | .open = drbd_open, | ||
157 | .release = drbd_release, | ||
158 | }; | ||
159 | |||
160 | #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) | ||
161 | |||
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	/* take the reference first, only then look at the disk state */
	atomic_inc(&mdev->local_cnt);

	if (mdev->state.disk >= mins)
		return 1;

	/* not allowed: drop the reference again, waking misc_wait if we
	 * were the last holder */
	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);

	return 0;
}

#endif
180 | |||
181 | /** | ||
182 | * DOC: The transfer log | ||
183 | * | ||
184 | * The transfer log is a single linked list of &struct drbd_tl_epoch objects. | ||
185 | * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail | ||
186 | * of the list. There is always at least one &struct drbd_tl_epoch object. | ||
187 | * | ||
188 | * Each &struct drbd_tl_epoch has a circular double linked list of requests | ||
189 | * attached. | ||
190 | */ | ||
191 | static int tl_init(struct drbd_conf *mdev) | ||
192 | { | ||
193 | struct drbd_tl_epoch *b; | ||
194 | |||
195 | /* during device minor initialization, we may well use GFP_KERNEL */ | ||
196 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); | ||
197 | if (!b) | ||
198 | return 0; | ||
199 | INIT_LIST_HEAD(&b->requests); | ||
200 | INIT_LIST_HEAD(&b->w.list); | ||
201 | b->next = NULL; | ||
202 | b->br_number = 4711; | ||
203 | b->n_req = 0; | ||
204 | b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
205 | |||
206 | mdev->oldest_tle = b; | ||
207 | mdev->newest_tle = b; | ||
208 | INIT_LIST_HEAD(&mdev->out_of_sequence_requests); | ||
209 | |||
210 | mdev->tl_hash = NULL; | ||
211 | mdev->tl_hash_s = 0; | ||
212 | |||
213 | return 1; | ||
214 | } | ||
215 | |||
216 | static void tl_cleanup(struct drbd_conf *mdev) | ||
217 | { | ||
218 | D_ASSERT(mdev->oldest_tle == mdev->newest_tle); | ||
219 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
220 | kfree(mdev->oldest_tle); | ||
221 | mdev->oldest_tle = NULL; | ||
222 | kfree(mdev->unused_spare_tle); | ||
223 | mdev->unused_spare_tle = NULL; | ||
224 | kfree(mdev->tl_hash); | ||
225 | mdev->tl_hash = NULL; | ||
226 | mdev->tl_hash_s = 0; | ||
227 | } | ||
228 | |||
229 | /** | ||
230 | * _tl_add_barrier() - Adds a barrier to the transfer log | ||
231 | * @mdev: DRBD device. | ||
232 | * @new: Barrier to be added before the current head of the TL. | ||
233 | * | ||
234 | * The caller must hold the req_lock. | ||
235 | */ | ||
236 | void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) | ||
237 | { | ||
238 | struct drbd_tl_epoch *newest_before; | ||
239 | |||
240 | INIT_LIST_HEAD(&new->requests); | ||
241 | INIT_LIST_HEAD(&new->w.list); | ||
242 | new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
243 | new->next = NULL; | ||
244 | new->n_req = 0; | ||
245 | |||
246 | newest_before = mdev->newest_tle; | ||
247 | /* never send a barrier number == 0, because that is special-cased | ||
248 | * when using TCQ for our write ordering code */ | ||
249 | new->br_number = (newest_before->br_number+1) ?: 1; | ||
250 | if (mdev->newest_tle != new) { | ||
251 | mdev->newest_tle->next = new; | ||
252 | mdev->newest_tle = new; | ||
253 | } | ||
254 | } | ||
255 | |||
256 | /** | ||
257 | * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL | ||
258 | * @mdev: DRBD device. | ||
259 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. | ||
260 | * @set_size: Expected number of requests before that barrier. | ||
261 | * | ||
262 | * In case the passed barrier_nr or set_size does not match the oldest | ||
263 | * &struct drbd_tl_epoch objects this function will cause a termination | ||
264 | * of the connection. | ||
265 | */ | ||
266 | void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
267 | unsigned int set_size) | ||
268 | { | ||
269 | struct drbd_tl_epoch *b, *nob; /* next old barrier */ | ||
270 | struct list_head *le, *tle; | ||
271 | struct drbd_request *r; | ||
272 | |||
273 | spin_lock_irq(&mdev->req_lock); | ||
274 | |||
275 | b = mdev->oldest_tle; | ||
276 | |||
277 | /* first some paranoia code */ | ||
278 | if (b == NULL) { | ||
279 | dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", | ||
280 | barrier_nr); | ||
281 | goto bail; | ||
282 | } | ||
283 | if (b->br_number != barrier_nr) { | ||
284 | dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", | ||
285 | barrier_nr, b->br_number); | ||
286 | goto bail; | ||
287 | } | ||
288 | if (b->n_req != set_size) { | ||
289 | dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", | ||
290 | barrier_nr, set_size, b->n_req); | ||
291 | goto bail; | ||
292 | } | ||
293 | |||
294 | /* Clean up list of requests processed during current epoch */ | ||
295 | list_for_each_safe(le, tle, &b->requests) { | ||
296 | r = list_entry(le, struct drbd_request, tl_requests); | ||
297 | _req_mod(r, barrier_acked); | ||
298 | } | ||
299 | /* There could be requests on the list waiting for completion | ||
300 | of the write to the local disk. To avoid corruptions of | ||
301 | slab's data structures we have to remove the lists head. | ||
302 | |||
303 | Also there could have been a barrier ack out of sequence, overtaking | ||
304 | the write acks - which would be a bug and violating write ordering. | ||
305 | To not deadlock in case we lose connection while such requests are | ||
306 | still pending, we need some way to find them for the | ||
307 | _req_mode(connection_lost_while_pending). | ||
308 | |||
309 | These have been list_move'd to the out_of_sequence_requests list in | ||
310 | _req_mod(, barrier_acked) above. | ||
311 | */ | ||
312 | list_del_init(&b->requests); | ||
313 | |||
314 | nob = b->next; | ||
315 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
316 | _tl_add_barrier(mdev, b); | ||
317 | if (nob) | ||
318 | mdev->oldest_tle = nob; | ||
319 | /* if nob == NULL b was the only barrier, and becomes the new | ||
320 | barrier. Therefore mdev->oldest_tle points already to b */ | ||
321 | } else { | ||
322 | D_ASSERT(nob != NULL); | ||
323 | mdev->oldest_tle = nob; | ||
324 | kfree(b); | ||
325 | } | ||
326 | |||
327 | spin_unlock_irq(&mdev->req_lock); | ||
328 | dec_ap_pending(mdev); | ||
329 | |||
330 | return; | ||
331 | |||
332 | bail: | ||
333 | spin_unlock_irq(&mdev->req_lock); | ||
334 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
335 | } | ||
336 | |||
337 | |||
338 | /** | ||
339 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL | ||
340 | * @mdev: DRBD device. | ||
341 | * | ||
342 | * This is called after the connection to the peer was lost. The storage covered | ||
343 | * by the requests on the transfer log gets marked as out of sync. Called from the | ||
344 | * receiver thread and the worker thread. | ||
345 | */ | ||
346 | void tl_clear(struct drbd_conf *mdev) | ||
347 | { | ||
348 | struct drbd_tl_epoch *b, *tmp; | ||
349 | struct list_head *le, *tle; | ||
350 | struct drbd_request *r; | ||
351 | int new_initial_bnr = net_random(); | ||
352 | |||
353 | spin_lock_irq(&mdev->req_lock); | ||
354 | |||
355 | b = mdev->oldest_tle; | ||
356 | while (b) { | ||
357 | list_for_each_safe(le, tle, &b->requests) { | ||
358 | r = list_entry(le, struct drbd_request, tl_requests); | ||
359 | /* It would be nice to complete outside of spinlock. | ||
360 | * But this is easier for now. */ | ||
361 | _req_mod(r, connection_lost_while_pending); | ||
362 | } | ||
363 | tmp = b->next; | ||
364 | |||
365 | /* there could still be requests on that ring list, | ||
366 | * in case local io is still pending */ | ||
367 | list_del(&b->requests); | ||
368 | |||
369 | /* dec_ap_pending corresponding to queue_barrier. | ||
370 | * the newest barrier may not have been queued yet, | ||
371 | * in which case w.cb is still NULL. */ | ||
372 | if (b->w.cb != NULL) | ||
373 | dec_ap_pending(mdev); | ||
374 | |||
375 | if (b == mdev->newest_tle) { | ||
376 | /* recycle, but reinit! */ | ||
377 | D_ASSERT(tmp == NULL); | ||
378 | INIT_LIST_HEAD(&b->requests); | ||
379 | INIT_LIST_HEAD(&b->w.list); | ||
380 | b->w.cb = NULL; | ||
381 | b->br_number = new_initial_bnr; | ||
382 | b->n_req = 0; | ||
383 | |||
384 | mdev->oldest_tle = b; | ||
385 | break; | ||
386 | } | ||
387 | kfree(b); | ||
388 | b = tmp; | ||
389 | } | ||
390 | |||
391 | /* we expect this list to be empty. */ | ||
392 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
393 | |||
394 | /* but just in case, clean it up anyways! */ | ||
395 | list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { | ||
396 | r = list_entry(le, struct drbd_request, tl_requests); | ||
397 | /* It would be nice to complete outside of spinlock. | ||
398 | * But this is easier for now. */ | ||
399 | _req_mod(r, connection_lost_while_pending); | ||
400 | } | ||
401 | |||
402 | /* ensure bit indicating barrier is required is clear */ | ||
403 | clear_bit(CREATE_BARRIER, &mdev->flags); | ||
404 | |||
405 | spin_unlock_irq(&mdev->req_lock); | ||
406 | } | ||
407 | |||
408 | /** | ||
409 | * cl_wide_st_chg() - TRUE if the state change is a cluster wide one | ||
410 | * @mdev: DRBD device. | ||
411 | * @os: old (current) state. | ||
412 | * @ns: new (wanted) state. | ||
413 | */ | ||
414 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
415 | union drbd_state os, union drbd_state ns) | ||
416 | { | ||
417 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
418 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
419 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
420 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
421 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || | ||
422 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
423 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); | ||
424 | } | ||
425 | |||
426 | int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
427 | union drbd_state mask, union drbd_state val) | ||
428 | { | ||
429 | unsigned long flags; | ||
430 | union drbd_state os, ns; | ||
431 | int rv; | ||
432 | |||
433 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
434 | os = mdev->state; | ||
435 | ns.i = (os.i & ~mask.i) | val.i; | ||
436 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
437 | ns = mdev->state; | ||
438 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
439 | |||
440 | return rv; | ||
441 | } | ||
442 | |||
443 | /** | ||
444 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
445 | * @mdev: DRBD device. | ||
446 | * @mask: mask of state bits to change. | ||
447 | * @val: value of new state bits. | ||
448 | */ | ||
449 | void drbd_force_state(struct drbd_conf *mdev, | ||
450 | union drbd_state mask, union drbd_state val) | ||
451 | { | ||
452 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
453 | } | ||
454 | |||
/* Forward declarations for the state validation helpers defined below. */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *mdev,
				     union drbd_state ns,
				     union drbd_state os);
static union drbd_state sanitize_state(struct drbd_conf *mdev,
				       union drbd_state os,
				       union drbd_state ns,
				       int *warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *mdev,
			union drbd_state mask, union drbd_state val);
462 | |||
463 | static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, | ||
464 | union drbd_state mask, union drbd_state val) | ||
465 | { | ||
466 | union drbd_state os, ns; | ||
467 | unsigned long flags; | ||
468 | int rv; | ||
469 | |||
470 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
471 | return SS_CW_SUCCESS; | ||
472 | |||
473 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
474 | return SS_CW_FAILED_BY_PEER; | ||
475 | |||
476 | rv = 0; | ||
477 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
478 | os = mdev->state; | ||
479 | ns.i = (os.i & ~mask.i) | val.i; | ||
480 | ns = sanitize_state(mdev, os, ns, NULL); | ||
481 | |||
482 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
483 | rv = SS_CW_NO_NEED; | ||
484 | if (!rv) { | ||
485 | rv = is_valid_state(mdev, ns); | ||
486 | if (rv == SS_SUCCESS) { | ||
487 | rv = is_valid_state_transition(mdev, ns, os); | ||
488 | if (rv == SS_SUCCESS) | ||
489 | rv = 0; /* cont waiting, otherwise fail. */ | ||
490 | } | ||
491 | } | ||
492 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
493 | |||
494 | return rv; | ||
495 | } | ||
496 | |||
497 | /** | ||
498 | * drbd_req_state() - Perform an eventually cluster wide state change | ||
499 | * @mdev: DRBD device. | ||
500 | * @mask: mask of state bits to change. | ||
501 | * @val: value of new state bits. | ||
502 | * @f: flags | ||
503 | * | ||
504 | * Should not be called directly, use drbd_request_state() or | ||
505 | * _drbd_request_state(). | ||
506 | */ | ||
507 | static int drbd_req_state(struct drbd_conf *mdev, | ||
508 | union drbd_state mask, union drbd_state val, | ||
509 | enum chg_state_flags f) | ||
510 | { | ||
511 | struct completion done; | ||
512 | unsigned long flags; | ||
513 | union drbd_state os, ns; | ||
514 | int rv; | ||
515 | |||
516 | init_completion(&done); | ||
517 | |||
518 | if (f & CS_SERIALIZE) | ||
519 | mutex_lock(&mdev->state_mutex); | ||
520 | |||
521 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
522 | os = mdev->state; | ||
523 | ns.i = (os.i & ~mask.i) | val.i; | ||
524 | ns = sanitize_state(mdev, os, ns, NULL); | ||
525 | |||
526 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
527 | rv = is_valid_state(mdev, ns); | ||
528 | if (rv == SS_SUCCESS) | ||
529 | rv = is_valid_state_transition(mdev, ns, os); | ||
530 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
531 | |||
532 | if (rv < SS_SUCCESS) { | ||
533 | if (f & CS_VERBOSE) | ||
534 | print_st_err(mdev, os, ns, rv); | ||
535 | goto abort; | ||
536 | } | ||
537 | |||
538 | drbd_state_lock(mdev); | ||
539 | if (!drbd_send_state_req(mdev, mask, val)) { | ||
540 | drbd_state_unlock(mdev); | ||
541 | rv = SS_CW_FAILED_BY_PEER; | ||
542 | if (f & CS_VERBOSE) | ||
543 | print_st_err(mdev, os, ns, rv); | ||
544 | goto abort; | ||
545 | } | ||
546 | |||
547 | wait_event(mdev->state_wait, | ||
548 | (rv = _req_st_cond(mdev, mask, val))); | ||
549 | |||
550 | if (rv < SS_SUCCESS) { | ||
551 | drbd_state_unlock(mdev); | ||
552 | if (f & CS_VERBOSE) | ||
553 | print_st_err(mdev, os, ns, rv); | ||
554 | goto abort; | ||
555 | } | ||
556 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
557 | os = mdev->state; | ||
558 | ns.i = (os.i & ~mask.i) | val.i; | ||
559 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
560 | drbd_state_unlock(mdev); | ||
561 | } else { | ||
562 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
563 | } | ||
564 | |||
565 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
566 | |||
567 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
568 | D_ASSERT(current != mdev->worker.task); | ||
569 | wait_for_completion(&done); | ||
570 | } | ||
571 | |||
572 | abort: | ||
573 | if (f & CS_SERIALIZE) | ||
574 | mutex_unlock(&mdev->state_mutex); | ||
575 | |||
576 | return rv; | ||
577 | } | ||
578 | |||
579 | /** | ||
580 | * _drbd_request_state() - Request a state change (with flags) | ||
581 | * @mdev: DRBD device. | ||
582 | * @mask: mask of state bits to change. | ||
583 | * @val: value of new state bits. | ||
584 | * @f: flags | ||
585 | * | ||
586 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
587 | * flag, or when logging of failed state change requests is not desired. | ||
588 | */ | ||
589 | int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
590 | union drbd_state val, enum chg_state_flags f) | ||
591 | { | ||
592 | int rv; | ||
593 | |||
594 | wait_event(mdev->state_wait, | ||
595 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
596 | |||
597 | return rv; | ||
598 | } | ||
599 | |||
600 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
601 | { | ||
602 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", | ||
603 | name, | ||
604 | drbd_conn_str(ns.conn), | ||
605 | drbd_role_str(ns.role), | ||
606 | drbd_role_str(ns.peer), | ||
607 | drbd_disk_str(ns.disk), | ||
608 | drbd_disk_str(ns.pdsk), | ||
609 | ns.susp ? 's' : 'r', | ||
610 | ns.aftr_isp ? 'a' : '-', | ||
611 | ns.peer_isp ? 'p' : '-', | ||
612 | ns.user_isp ? 'u' : '-' | ||
613 | ); | ||
614 | } | ||
615 | |||
616 | void print_st_err(struct drbd_conf *mdev, | ||
617 | union drbd_state os, union drbd_state ns, int err) | ||
618 | { | ||
619 | if (err == SS_IN_TRANSIENT_STATE) | ||
620 | return; | ||
621 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
622 | print_st(mdev, " state", os); | ||
623 | print_st(mdev, "wanted", ns); | ||
624 | } | ||
625 | |||
626 | |||
/*
 * Per-field stringifiers so that PSC() can paste drbd_<field>_str for
 * every member of union drbd_state it prints.
 */
#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

#define drbd_susp_str(A)	((A) ? "1" : "0")
#define drbd_aftr_isp_str(A)	((A) ? "1" : "0")
#define drbd_peer_isp_str(A)	((A) ? "1" : "0")
#define drbd_user_isp_str(A)	((A) ? "1" : "0")

/*
 * Print State Change: if field A differs between the local variables
 * 'os' and 'ns', append "A( old -> new ) " at the local cursor 'pbp'
 * and advance it.  Relies on those three names being in scope.
 */
#define PSC(A)							\
	do {							\
		if (ns.A != os.A) {				\
			pbp += sprintf(pbp, #A "( %s -> %s ) ",	\
				       drbd_##A##_str(os.A),	\
				       drbd_##A##_str(ns.A));	\
		}						\
	} while (0)
641 | |||
642 | /** | ||
643 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
644 | * @mdev: DRBD device. | ||
645 | * @ns: State to consider. | ||
646 | */ | ||
647 | static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
648 | { | ||
649 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
650 | |||
651 | enum drbd_fencing_p fp; | ||
652 | int rv = SS_SUCCESS; | ||
653 | |||
654 | fp = FP_DONT_CARE; | ||
655 | if (get_ldev(mdev)) { | ||
656 | fp = mdev->ldev->dc.fencing; | ||
657 | put_ldev(mdev); | ||
658 | } | ||
659 | |||
660 | if (get_net_conf(mdev)) { | ||
661 | if (!mdev->net_conf->two_primaries && | ||
662 | ns.role == R_PRIMARY && ns.peer == R_PRIMARY) | ||
663 | rv = SS_TWO_PRIMARIES; | ||
664 | put_net_conf(mdev); | ||
665 | } | ||
666 | |||
667 | if (rv <= 0) | ||
668 | /* already found a reason to abort */; | ||
669 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
670 | rv = SS_DEVICE_IN_USE; | ||
671 | |||
672 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
673 | rv = SS_NO_UP_TO_DATE_DISK; | ||
674 | |||
675 | else if (fp >= FP_RESOURCE && | ||
676 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
677 | rv = SS_PRIMARY_NOP; | ||
678 | |||
679 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
680 | rv = SS_NO_UP_TO_DATE_DISK; | ||
681 | |||
682 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
683 | rv = SS_NO_LOCAL_DISK; | ||
684 | |||
685 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
686 | rv = SS_NO_REMOTE_DISK; | ||
687 | |||
688 | else if ((ns.conn == C_CONNECTED || | ||
689 | ns.conn == C_WF_BITMAP_S || | ||
690 | ns.conn == C_SYNC_SOURCE || | ||
691 | ns.conn == C_PAUSED_SYNC_S) && | ||
692 | ns.disk == D_OUTDATED) | ||
693 | rv = SS_CONNECTED_OUTDATES; | ||
694 | |||
695 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
696 | (mdev->sync_conf.verify_alg[0] == 0)) | ||
697 | rv = SS_NO_VERIFY_ALG; | ||
698 | |||
699 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
700 | mdev->agreed_pro_version < 88) | ||
701 | rv = SS_NOT_SUPPORTED; | ||
702 | |||
703 | return rv; | ||
704 | } | ||
705 | |||
706 | /** | ||
707 | * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible | ||
708 | * @mdev: DRBD device. | ||
709 | * @ns: new state. | ||
710 | * @os: old state. | ||
711 | */ | ||
712 | static int is_valid_state_transition(struct drbd_conf *mdev, | ||
713 | union drbd_state ns, union drbd_state os) | ||
714 | { | ||
715 | int rv = SS_SUCCESS; | ||
716 | |||
717 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
718 | os.conn > C_CONNECTED) | ||
719 | rv = SS_RESYNC_RUNNING; | ||
720 | |||
721 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
722 | rv = SS_ALREADY_STANDALONE; | ||
723 | |||
724 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
725 | rv = SS_IS_DISKLESS; | ||
726 | |||
727 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
728 | rv = SS_NO_NET_CONFIG; | ||
729 | |||
730 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
731 | rv = SS_LOWER_THAN_OUTDATED; | ||
732 | |||
733 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
734 | rv = SS_IN_TRANSIENT_STATE; | ||
735 | |||
736 | if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
737 | rv = SS_IN_TRANSIENT_STATE; | ||
738 | |||
739 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
740 | rv = SS_NEED_CONNECTION; | ||
741 | |||
742 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
743 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
744 | rv = SS_RESYNC_RUNNING; | ||
745 | |||
746 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
747 | os.conn < C_CONNECTED) | ||
748 | rv = SS_NEED_CONNECTION; | ||
749 | |||
750 | return rv; | ||
751 | } | ||
752 | |||
753 | /** | ||
754 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
755 | * @mdev: DRBD device. | ||
756 | * @os: old state. | ||
757 | * @ns: new state. | ||
758 | * @warn_sync_abort: | ||
759 | * | ||
760 | * When we loose connection, we have to set the state of the peers disk (pdsk) | ||
761 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
762 | */ | ||
763 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
764 | union drbd_state ns, int *warn_sync_abort) | ||
765 | { | ||
766 | enum drbd_fencing_p fp; | ||
767 | |||
768 | fp = FP_DONT_CARE; | ||
769 | if (get_ldev(mdev)) { | ||
770 | fp = mdev->ldev->dc.fencing; | ||
771 | put_ldev(mdev); | ||
772 | } | ||
773 | |||
774 | /* Disallow Network errors to configure a device's network part */ | ||
775 | if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && | ||
776 | os.conn <= C_DISCONNECTING) | ||
777 | ns.conn = os.conn; | ||
778 | |||
779 | /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ | ||
780 | if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && | ||
781 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) | ||
782 | ns.conn = os.conn; | ||
783 | |||
784 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
785 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | ||
786 | ns.conn = os.conn; | ||
787 | |||
788 | if (ns.conn < C_CONNECTED) { | ||
789 | ns.peer_isp = 0; | ||
790 | ns.peer = R_UNKNOWN; | ||
791 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
792 | ns.pdsk = D_UNKNOWN; | ||
793 | } | ||
794 | |||
795 | /* Clear the aftr_isp when becoming unconfigured */ | ||
796 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
797 | ns.aftr_isp = 0; | ||
798 | |||
799 | if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) | ||
800 | ns.pdsk = D_UNKNOWN; | ||
801 | |||
802 | /* Abort resync if a disk fails/detaches */ | ||
803 | if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && | ||
804 | (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
805 | if (warn_sync_abort) | ||
806 | *warn_sync_abort = 1; | ||
807 | ns.conn = C_CONNECTED; | ||
808 | } | ||
809 | |||
810 | if (ns.conn >= C_CONNECTED && | ||
811 | ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || | ||
812 | (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { | ||
813 | switch (ns.conn) { | ||
814 | case C_WF_BITMAP_T: | ||
815 | case C_PAUSED_SYNC_T: | ||
816 | ns.disk = D_OUTDATED; | ||
817 | break; | ||
818 | case C_CONNECTED: | ||
819 | case C_WF_BITMAP_S: | ||
820 | case C_SYNC_SOURCE: | ||
821 | case C_PAUSED_SYNC_S: | ||
822 | ns.disk = D_UP_TO_DATE; | ||
823 | break; | ||
824 | case C_SYNC_TARGET: | ||
825 | ns.disk = D_INCONSISTENT; | ||
826 | dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); | ||
827 | break; | ||
828 | } | ||
829 | if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE) | ||
830 | dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); | ||
831 | } | ||
832 | |||
833 | if (ns.conn >= C_CONNECTED && | ||
834 | (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { | ||
835 | switch (ns.conn) { | ||
836 | case C_CONNECTED: | ||
837 | case C_WF_BITMAP_T: | ||
838 | case C_PAUSED_SYNC_T: | ||
839 | case C_SYNC_TARGET: | ||
840 | ns.pdsk = D_UP_TO_DATE; | ||
841 | break; | ||
842 | case C_WF_BITMAP_S: | ||
843 | case C_PAUSED_SYNC_S: | ||
844 | ns.pdsk = D_OUTDATED; | ||
845 | break; | ||
846 | case C_SYNC_SOURCE: | ||
847 | ns.pdsk = D_INCONSISTENT; | ||
848 | dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); | ||
849 | break; | ||
850 | } | ||
851 | if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) | ||
852 | dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); | ||
853 | } | ||
854 | |||
855 | /* Connection breaks down before we finished "Negotiating" */ | ||
856 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
857 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
858 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
859 | ns.disk = mdev->new_state_tmp.disk; | ||
860 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
861 | } else { | ||
862 | dev_alert(DEV, "Connection lost while negotiating, no data!\n"); | ||
863 | ns.disk = D_DISKLESS; | ||
864 | ns.pdsk = D_UNKNOWN; | ||
865 | } | ||
866 | put_ldev(mdev); | ||
867 | } | ||
868 | |||
869 | if (fp == FP_STONITH && | ||
870 | (ns.role == R_PRIMARY && | ||
871 | ns.conn < C_CONNECTED && | ||
872 | ns.pdsk > D_OUTDATED)) | ||
873 | ns.susp = 1; | ||
874 | |||
875 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
876 | if (ns.conn == C_SYNC_SOURCE) | ||
877 | ns.conn = C_PAUSED_SYNC_S; | ||
878 | if (ns.conn == C_SYNC_TARGET) | ||
879 | ns.conn = C_PAUSED_SYNC_T; | ||
880 | } else { | ||
881 | if (ns.conn == C_PAUSED_SYNC_S) | ||
882 | ns.conn = C_SYNC_SOURCE; | ||
883 | if (ns.conn == C_PAUSED_SYNC_T) | ||
884 | ns.conn = C_SYNC_TARGET; | ||
885 | } | ||
886 | |||
887 | return ns; | ||
888 | } | ||
889 | |||
890 | /* helper for __drbd_set_state */ | ||
891 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
892 | { | ||
893 | if (cs == C_VERIFY_T) { | ||
894 | /* starting online verify from an arbitrary position | ||
895 | * does not fit well into the existing protocol. | ||
896 | * on C_VERIFY_T, we initialize ov_left and friends | ||
897 | * implicitly in receive_DataRequest once the | ||
898 | * first P_OV_REQUEST is received */ | ||
899 | mdev->ov_start_sector = ~(sector_t)0; | ||
900 | } else { | ||
901 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
902 | if (bit >= mdev->rs_total) | ||
903 | mdev->ov_start_sector = | ||
904 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
905 | mdev->ov_position = mdev->ov_start_sector; | ||
906 | } | ||
907 | } | ||
908 | |||
909 | /** | ||
910 | * __drbd_set_state() - Set a new DRBD state | ||
911 | * @mdev: DRBD device. | ||
912 | * @ns: new state. | ||
913 | * @flags: Flags | ||
914 | * @done: Optional completion, that will get completed after the after_state_ch() finished | ||
915 | * | ||
916 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
917 | */ | ||
918 | int __drbd_set_state(struct drbd_conf *mdev, | ||
919 | union drbd_state ns, enum chg_state_flags flags, | ||
920 | struct completion *done) | ||
921 | { | ||
922 | union drbd_state os; | ||
923 | int rv = SS_SUCCESS; | ||
924 | int warn_sync_abort = 0; | ||
925 | struct after_state_chg_work *ascw; | ||
926 | |||
927 | os = mdev->state; | ||
928 | |||
929 | ns = sanitize_state(mdev, os, ns, &warn_sync_abort); | ||
930 | |||
931 | if (ns.i == os.i) | ||
932 | return SS_NOTHING_TO_DO; | ||
933 | |||
934 | if (!(flags & CS_HARD)) { | ||
935 | /* pre-state-change checks ; only look at ns */ | ||
936 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
937 | |||
938 | rv = is_valid_state(mdev, ns); | ||
939 | if (rv < SS_SUCCESS) { | ||
940 | /* If the old state was illegal as well, then let | ||
941 | this happen...*/ | ||
942 | |||
943 | if (is_valid_state(mdev, os) == rv) { | ||
944 | dev_err(DEV, "Considering state change from bad state. " | ||
945 | "Error would be: '%s'\n", | ||
946 | drbd_set_st_err_str(rv)); | ||
947 | print_st(mdev, "old", os); | ||
948 | print_st(mdev, "new", ns); | ||
949 | rv = is_valid_state_transition(mdev, ns, os); | ||
950 | } | ||
951 | } else | ||
952 | rv = is_valid_state_transition(mdev, ns, os); | ||
953 | } | ||
954 | |||
955 | if (rv < SS_SUCCESS) { | ||
956 | if (flags & CS_VERBOSE) | ||
957 | print_st_err(mdev, os, ns, rv); | ||
958 | return rv; | ||
959 | } | ||
960 | |||
961 | if (warn_sync_abort) | ||
962 | dev_warn(DEV, "Resync aborted.\n"); | ||
963 | |||
964 | { | ||
965 | char *pbp, pb[300]; | ||
966 | pbp = pb; | ||
967 | *pbp = 0; | ||
968 | PSC(role); | ||
969 | PSC(peer); | ||
970 | PSC(conn); | ||
971 | PSC(disk); | ||
972 | PSC(pdsk); | ||
973 | PSC(susp); | ||
974 | PSC(aftr_isp); | ||
975 | PSC(peer_isp); | ||
976 | PSC(user_isp); | ||
977 | dev_info(DEV, "%s\n", pb); | ||
978 | } | ||
979 | |||
980 | /* solve the race between becoming unconfigured, | ||
981 | * worker doing the cleanup, and | ||
982 | * admin reconfiguring us: | ||
983 | * on (re)configure, first set CONFIG_PENDING, | ||
984 | * then wait for a potentially exiting worker, | ||
985 | * start the worker, and schedule one no_op. | ||
986 | * then proceed with configuration. | ||
987 | */ | ||
988 | if (ns.disk == D_DISKLESS && | ||
989 | ns.conn == C_STANDALONE && | ||
990 | ns.role == R_SECONDARY && | ||
991 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) | ||
992 | set_bit(DEVICE_DYING, &mdev->flags); | ||
993 | |||
994 | mdev->state.i = ns.i; | ||
995 | wake_up(&mdev->misc_wait); | ||
996 | wake_up(&mdev->state_wait); | ||
997 | |||
998 | /* post-state-change actions */ | ||
999 | if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { | ||
1000 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
1001 | mod_timer(&mdev->resync_timer, jiffies); | ||
1002 | } | ||
1003 | |||
1004 | /* aborted verify run. log the last position */ | ||
1005 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1006 | ns.conn < C_CONNECTED) { | ||
1007 | mdev->ov_start_sector = | ||
1008 | BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); | ||
1009 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1010 | (unsigned long long)mdev->ov_start_sector); | ||
1011 | } | ||
1012 | |||
1013 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1014 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1015 | dev_info(DEV, "Syncer continues.\n"); | ||
1016 | mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; | ||
1017 | if (ns.conn == C_SYNC_TARGET) { | ||
1018 | if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) | ||
1019 | mod_timer(&mdev->resync_timer, jiffies); | ||
1020 | /* This if (!test_bit) is only needed for the case | ||
1021 | that a device that has ceased to used its timer, | ||
1022 | i.e. it is already in drbd_resync_finished() gets | ||
1023 | paused and resumed. */ | ||
1024 | } | ||
1025 | } | ||
1026 | |||
1027 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1028 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1029 | dev_info(DEV, "Resync suspended\n"); | ||
1030 | mdev->rs_mark_time = jiffies; | ||
1031 | if (ns.conn == C_PAUSED_SYNC_T) | ||
1032 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
1033 | } | ||
1034 | |||
1035 | if (os.conn == C_CONNECTED && | ||
1036 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1037 | mdev->ov_position = 0; | ||
1038 | mdev->rs_total = | ||
1039 | mdev->rs_mark_left = drbd_bm_bits(mdev); | ||
1040 | if (mdev->agreed_pro_version >= 90) | ||
1041 | set_ov_position(mdev, ns.conn); | ||
1042 | else | ||
1043 | mdev->ov_start_sector = 0; | ||
1044 | mdev->ov_left = mdev->rs_total | ||
1045 | - BM_SECT_TO_BIT(mdev->ov_position); | ||
1046 | mdev->rs_start = | ||
1047 | mdev->rs_mark_time = jiffies; | ||
1048 | mdev->ov_last_oos_size = 0; | ||
1049 | mdev->ov_last_oos_start = 0; | ||
1050 | |||
1051 | if (ns.conn == C_VERIFY_S) { | ||
1052 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1053 | (unsigned long long)mdev->ov_position); | ||
1054 | mod_timer(&mdev->resync_timer, jiffies); | ||
1055 | } | ||
1056 | } | ||
1057 | |||
1058 | if (get_ldev(mdev)) { | ||
1059 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1060 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1061 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1062 | |||
1063 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1064 | mdf |= MDF_CRASHED_PRIMARY; | ||
1065 | if (mdev->state.role == R_PRIMARY || | ||
1066 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1067 | mdf |= MDF_PRIMARY_IND; | ||
1068 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1069 | mdf |= MDF_CONNECTED_IND; | ||
1070 | if (mdev->state.disk > D_INCONSISTENT) | ||
1071 | mdf |= MDF_CONSISTENT; | ||
1072 | if (mdev->state.disk > D_OUTDATED) | ||
1073 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1074 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1075 | mdf |= MDF_PEER_OUT_DATED; | ||
1076 | if (mdf != mdev->ldev->md.flags) { | ||
1077 | mdev->ldev->md.flags = mdf; | ||
1078 | drbd_md_mark_dirty(mdev); | ||
1079 | } | ||
1080 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1081 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1082 | put_ldev(mdev); | ||
1083 | } | ||
1084 | |||
1085 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1086 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1087 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1088 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1089 | |||
1090 | /* Receiver should clean up itself */ | ||
1091 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1092 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1093 | |||
1094 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1095 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1096 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1097 | |||
1098 | /* Upon network failure, we need to restart the receiver. */ | ||
1099 | if (os.conn > C_TEAR_DOWN && | ||
1100 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1101 | drbd_thread_restart_nowait(&mdev->receiver); | ||
1102 | |||
1103 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1104 | if (ascw) { | ||
1105 | ascw->os = os; | ||
1106 | ascw->ns = ns; | ||
1107 | ascw->flags = flags; | ||
1108 | ascw->w.cb = w_after_state_ch; | ||
1109 | ascw->done = done; | ||
1110 | drbd_queue_work(&mdev->data.work, &ascw->w); | ||
1111 | } else { | ||
1112 | dev_warn(DEV, "Could not kmalloc an ascw\n"); | ||
1113 | } | ||
1114 | |||
1115 | return rv; | ||
1116 | } | ||
1117 | |||
1118 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1119 | { | ||
1120 | struct after_state_chg_work *ascw = | ||
1121 | container_of(w, struct after_state_chg_work, w); | ||
1122 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1123 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1124 | D_ASSERT(ascw->done != NULL); | ||
1125 | complete(ascw->done); | ||
1126 | } | ||
1127 | kfree(ascw); | ||
1128 | |||
1129 | return 1; | ||
1130 | } | ||
1131 | |||
1132 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1133 | { | ||
1134 | if (rv) { | ||
1135 | dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); | ||
1136 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1137 | return; | ||
1138 | } | ||
1139 | |||
1140 | switch (mdev->state.conn) { | ||
1141 | case C_STARTING_SYNC_T: | ||
1142 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1143 | break; | ||
1144 | case C_STARTING_SYNC_S: | ||
1145 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1146 | break; | ||
1147 | } | ||
1148 | } | ||
1149 | |||
1150 | /** | ||
1151 | * after_state_ch() - Perform after state change actions that may sleep | ||
1152 | * @mdev: DRBD device. | ||
1153 | * @os: old state. | ||
1154 | * @ns: new state. | ||
1155 | * @flags: Flags | ||
1156 | */ | ||
1157 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1158 | union drbd_state ns, enum chg_state_flags flags) | ||
1159 | { | ||
1160 | enum drbd_fencing_p fp; | ||
1161 | |||
1162 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1163 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1164 | if (mdev->p_uuid) | ||
1165 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1166 | } | ||
1167 | |||
1168 | fp = FP_DONT_CARE; | ||
1169 | if (get_ldev(mdev)) { | ||
1170 | fp = mdev->ldev->dc.fencing; | ||
1171 | put_ldev(mdev); | ||
1172 | } | ||
1173 | |||
1174 | /* Inform userspace about the change... */ | ||
1175 | drbd_bcast_state(mdev, ns); | ||
1176 | |||
1177 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1178 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1179 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1180 | |||
1181 | /* Here we have the actions that are performed after a | ||
1182 | state change. This function might sleep */ | ||
1183 | |||
1184 | if (fp == FP_STONITH && ns.susp) { | ||
1185 | /* case1: The outdate peer handler is successful: | ||
1186 | * case2: The connection was established again: */ | ||
1187 | if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || | ||
1188 | (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { | ||
1189 | tl_clear(mdev); | ||
1190 | spin_lock_irq(&mdev->req_lock); | ||
1191 | _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); | ||
1192 | spin_unlock_irq(&mdev->req_lock); | ||
1193 | } | ||
1194 | } | ||
1195 | /* Do not change the order of the if above and the two below... */ | ||
1196 | if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ | ||
1197 | drbd_send_uuids(mdev); | ||
1198 | drbd_send_state(mdev); | ||
1199 | } | ||
1200 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) | ||
1201 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); | ||
1202 | |||
1203 | /* Lost contact to peer's copy of the data */ | ||
1204 | if ((os.pdsk >= D_INCONSISTENT && | ||
1205 | os.pdsk != D_UNKNOWN && | ||
1206 | os.pdsk != D_OUTDATED) | ||
1207 | && (ns.pdsk < D_INCONSISTENT || | ||
1208 | ns.pdsk == D_UNKNOWN || | ||
1209 | ns.pdsk == D_OUTDATED)) { | ||
1210 | kfree(mdev->p_uuid); | ||
1211 | mdev->p_uuid = NULL; | ||
1212 | if (get_ldev(mdev)) { | ||
1213 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1214 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1215 | drbd_uuid_new_current(mdev); | ||
1216 | drbd_send_uuids(mdev); | ||
1217 | } | ||
1218 | put_ldev(mdev); | ||
1219 | } | ||
1220 | } | ||
1221 | |||
1222 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1223 | if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) | ||
1224 | drbd_uuid_new_current(mdev); | ||
1225 | |||
1226 | /* D_DISKLESS Peer becomes secondary */ | ||
1227 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1228 | drbd_al_to_on_disk_bm(mdev); | ||
1229 | put_ldev(mdev); | ||
1230 | } | ||
1231 | |||
1232 | /* Last part of the attaching process ... */ | ||
1233 | if (ns.conn >= C_CONNECTED && | ||
1234 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1235 | kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ | ||
1236 | mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ | ||
1237 | drbd_send_sizes(mdev, 0); /* to start sync... */ | ||
1238 | drbd_send_uuids(mdev); | ||
1239 | drbd_send_state(mdev); | ||
1240 | } | ||
1241 | |||
1242 | /* We want to pause/continue resync, tell peer. */ | ||
1243 | if (ns.conn >= C_CONNECTED && | ||
1244 | ((os.aftr_isp != ns.aftr_isp) || | ||
1245 | (os.user_isp != ns.user_isp))) | ||
1246 | drbd_send_state(mdev); | ||
1247 | |||
1248 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1249 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1250 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1251 | suspend_other_sg(mdev); | ||
1252 | |||
1253 | /* Make sure the peer gets informed about eventual state | ||
1254 | changes (ISP bits) while we were in WFReportParams. */ | ||
1255 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1256 | drbd_send_state(mdev); | ||
1257 | |||
1258 | /* We are in the progress to start a full sync... */ | ||
1259 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1260 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1261 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); | ||
1262 | |||
1263 | /* We are invalidating our self... */ | ||
1264 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1265 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1266 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); | ||
1267 | |||
1268 | if (os.disk > D_FAILED && ns.disk == D_FAILED) { | ||
1269 | enum drbd_io_error_p eh; | ||
1270 | |||
1271 | eh = EP_PASS_ON; | ||
1272 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
1273 | eh = mdev->ldev->dc.on_io_error; | ||
1274 | put_ldev(mdev); | ||
1275 | } | ||
1276 | |||
1277 | drbd_rs_cancel_all(mdev); | ||
1278 | /* since get_ldev() only works as long as disk>=D_INCONSISTENT, | ||
1279 | and it is D_DISKLESS here, local_cnt can only go down, it can | ||
1280 | not increase... It will reach zero */ | ||
1281 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | ||
1282 | mdev->rs_total = 0; | ||
1283 | mdev->rs_failed = 0; | ||
1284 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1285 | |||
1286 | spin_lock_irq(&mdev->req_lock); | ||
1287 | _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); | ||
1288 | spin_unlock_irq(&mdev->req_lock); | ||
1289 | |||
1290 | if (eh == EP_CALL_HELPER) | ||
1291 | drbd_khelper(mdev, "local-io-error"); | ||
1292 | } | ||
1293 | |||
1294 | if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1295 | |||
1296 | if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { | ||
1297 | if (drbd_send_state(mdev)) | ||
1298 | dev_warn(DEV, "Notified peer that my disk is broken.\n"); | ||
1299 | else | ||
1300 | dev_err(DEV, "Sending state in drbd_io_error() failed\n"); | ||
1301 | } | ||
1302 | |||
1303 | lc_destroy(mdev->resync); | ||
1304 | mdev->resync = NULL; | ||
1305 | lc_destroy(mdev->act_log); | ||
1306 | mdev->act_log = NULL; | ||
1307 | __no_warn(local, | ||
1308 | drbd_free_bc(mdev->ldev); | ||
1309 | mdev->ldev = NULL;); | ||
1310 | |||
1311 | if (mdev->md_io_tmpp) | ||
1312 | __free_page(mdev->md_io_tmpp); | ||
1313 | } | ||
1314 | |||
1315 | /* Disks got bigger while they were detached */ | ||
1316 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1317 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1318 | if (ns.conn == C_CONNECTED) | ||
1319 | resync_after_online_grow(mdev); | ||
1320 | } | ||
1321 | |||
1322 | /* A resync finished or aborted, wake paused devices... */ | ||
1323 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1324 | (os.peer_isp && !ns.peer_isp) || | ||
1325 | (os.user_isp && !ns.user_isp)) | ||
1326 | resume_next_sg(mdev); | ||
1327 | |||
1328 | /* Upon network connection, we need to start the receiver */ | ||
1329 | if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) | ||
1330 | drbd_thread_start(&mdev->receiver); | ||
1331 | |||
1332 | /* Terminate worker thread if we are unconfigured - it will be | ||
1333 | restarted as needed... */ | ||
1334 | if (ns.disk == D_DISKLESS && | ||
1335 | ns.conn == C_STANDALONE && | ||
1336 | ns.role == R_SECONDARY) { | ||
1337 | if (os.aftr_isp != ns.aftr_isp) | ||
1338 | resume_next_sg(mdev); | ||
1339 | /* set in __drbd_set_state, unless CONFIG_PENDING was set */ | ||
1340 | if (test_bit(DEVICE_DYING, &mdev->flags)) | ||
1341 | drbd_thread_stop_nowait(&mdev->worker); | ||
1342 | } | ||
1343 | |||
1344 | drbd_md_sync(mdev); | ||
1345 | } | ||
1346 | |||
1347 | |||
1348 | static int drbd_thread_setup(void *arg) | ||
1349 | { | ||
1350 | struct drbd_thread *thi = (struct drbd_thread *) arg; | ||
1351 | struct drbd_conf *mdev = thi->mdev; | ||
1352 | unsigned long flags; | ||
1353 | int retval; | ||
1354 | |||
1355 | restart: | ||
1356 | retval = thi->function(thi); | ||
1357 | |||
1358 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1359 | |||
1360 | /* if the receiver has been "Exiting", the last thing it did | ||
1361 | * was set the conn state to "StandAlone", | ||
1362 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, | ||
1363 | * and receiver thread will be "started". | ||
1364 | * drbd_thread_start needs to set "Restarting" in that case. | ||
1365 | * t_state check and assignment needs to be within the same spinlock, | ||
1366 | * so either thread_start sees Exiting, and can remap to Restarting, | ||
1367 | * or thread_start see None, and can proceed as normal. | ||
1368 | */ | ||
1369 | |||
1370 | if (thi->t_state == Restarting) { | ||
1371 | dev_info(DEV, "Restarting %s\n", current->comm); | ||
1372 | thi->t_state = Running; | ||
1373 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1374 | goto restart; | ||
1375 | } | ||
1376 | |||
1377 | thi->task = NULL; | ||
1378 | thi->t_state = None; | ||
1379 | smp_mb(); | ||
1380 | complete(&thi->stop); | ||
1381 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1382 | |||
1383 | dev_info(DEV, "Terminating %s\n", current->comm); | ||
1384 | |||
1385 | /* Release mod reference taken when thread was started */ | ||
1386 | module_put(THIS_MODULE); | ||
1387 | return retval; | ||
1388 | } | ||
1389 | |||
1390 | static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, | ||
1391 | int (*func) (struct drbd_thread *)) | ||
1392 | { | ||
1393 | spin_lock_init(&thi->t_lock); | ||
1394 | thi->task = NULL; | ||
1395 | thi->t_state = None; | ||
1396 | thi->function = func; | ||
1397 | thi->mdev = mdev; | ||
1398 | } | ||
1399 | |||
1400 | int drbd_thread_start(struct drbd_thread *thi) | ||
1401 | { | ||
1402 | struct drbd_conf *mdev = thi->mdev; | ||
1403 | struct task_struct *nt; | ||
1404 | unsigned long flags; | ||
1405 | |||
1406 | const char *me = | ||
1407 | thi == &mdev->receiver ? "receiver" : | ||
1408 | thi == &mdev->asender ? "asender" : | ||
1409 | thi == &mdev->worker ? "worker" : "NONSENSE"; | ||
1410 | |||
1411 | /* is used from state engine doing drbd_thread_stop_nowait, | ||
1412 | * while holding the req lock irqsave */ | ||
1413 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1414 | |||
1415 | switch (thi->t_state) { | ||
1416 | case None: | ||
1417 | dev_info(DEV, "Starting %s thread (from %s [%d])\n", | ||
1418 | me, current->comm, current->pid); | ||
1419 | |||
1420 | /* Get ref on module for thread - this is released when thread exits */ | ||
1421 | if (!try_module_get(THIS_MODULE)) { | ||
1422 | dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); | ||
1423 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1424 | return FALSE; | ||
1425 | } | ||
1426 | |||
1427 | init_completion(&thi->stop); | ||
1428 | D_ASSERT(thi->task == NULL); | ||
1429 | thi->reset_cpu_mask = 1; | ||
1430 | thi->t_state = Running; | ||
1431 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1432 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ | ||
1433 | |||
1434 | nt = kthread_create(drbd_thread_setup, (void *) thi, | ||
1435 | "drbd%d_%s", mdev_to_minor(mdev), me); | ||
1436 | |||
1437 | if (IS_ERR(nt)) { | ||
1438 | dev_err(DEV, "Couldn't start thread\n"); | ||
1439 | |||
1440 | module_put(THIS_MODULE); | ||
1441 | return FALSE; | ||
1442 | } | ||
1443 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1444 | thi->task = nt; | ||
1445 | thi->t_state = Running; | ||
1446 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1447 | wake_up_process(nt); | ||
1448 | break; | ||
1449 | case Exiting: | ||
1450 | thi->t_state = Restarting; | ||
1451 | dev_info(DEV, "Restarting %s thread (from %s [%d])\n", | ||
1452 | me, current->comm, current->pid); | ||
1453 | /* fall through */ | ||
1454 | case Running: | ||
1455 | case Restarting: | ||
1456 | default: | ||
1457 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1458 | break; | ||
1459 | } | ||
1460 | |||
1461 | return TRUE; | ||
1462 | } | ||
1463 | |||
1464 | |||
1465 | void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | ||
1466 | { | ||
1467 | unsigned long flags; | ||
1468 | |||
1469 | enum drbd_thread_state ns = restart ? Restarting : Exiting; | ||
1470 | |||
1471 | /* may be called from state engine, holding the req lock irqsave */ | ||
1472 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1473 | |||
1474 | if (thi->t_state == None) { | ||
1475 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1476 | if (restart) | ||
1477 | drbd_thread_start(thi); | ||
1478 | return; | ||
1479 | } | ||
1480 | |||
1481 | if (thi->t_state != ns) { | ||
1482 | if (thi->task == NULL) { | ||
1483 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1484 | return; | ||
1485 | } | ||
1486 | |||
1487 | thi->t_state = ns; | ||
1488 | smp_mb(); | ||
1489 | init_completion(&thi->stop); | ||
1490 | if (thi->task != current) | ||
1491 | force_sig(DRBD_SIGKILL, thi->task); | ||
1492 | |||
1493 | } | ||
1494 | |||
1495 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1496 | |||
1497 | if (wait) | ||
1498 | wait_for_completion(&thi->stop); | ||
1499 | } | ||
1500 | |||
1501 | #ifdef CONFIG_SMP | ||
1502 | /** | ||
1503 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs | ||
1504 | * @mdev: DRBD device. | ||
1505 | * | ||
1506 | * Forces all threads of a device onto the same CPU. This is beneficial for | ||
1507 | * DRBD's performance. May be overwritten by user's configuration. | ||
1508 | */ | ||
1509 | void drbd_calc_cpu_mask(struct drbd_conf *mdev) | ||
1510 | { | ||
1511 | int ord, cpu; | ||
1512 | |||
1513 | /* user override. */ | ||
1514 | if (cpumask_weight(mdev->cpu_mask)) | ||
1515 | return; | ||
1516 | |||
1517 | ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); | ||
1518 | for_each_online_cpu(cpu) { | ||
1519 | if (ord-- == 0) { | ||
1520 | cpumask_set_cpu(cpu, mdev->cpu_mask); | ||
1521 | return; | ||
1522 | } | ||
1523 | } | ||
1524 | /* should not be reached */ | ||
1525 | cpumask_setall(mdev->cpu_mask); | ||
1526 | } | ||
1527 | |||
1528 | /** | ||
1529 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread | ||
1530 | * @mdev: DRBD device. | ||
1531 | * | ||
1532 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die | ||
1533 | * prematurely. | ||
1534 | */ | ||
1535 | void drbd_thread_current_set_cpu(struct drbd_conf *mdev) | ||
1536 | { | ||
1537 | struct task_struct *p = current; | ||
1538 | struct drbd_thread *thi = | ||
1539 | p == mdev->asender.task ? &mdev->asender : | ||
1540 | p == mdev->receiver.task ? &mdev->receiver : | ||
1541 | p == mdev->worker.task ? &mdev->worker : | ||
1542 | NULL; | ||
1543 | ERR_IF(thi == NULL) | ||
1544 | return; | ||
1545 | if (!thi->reset_cpu_mask) | ||
1546 | return; | ||
1547 | thi->reset_cpu_mask = 0; | ||
1548 | set_cpus_allowed_ptr(p, mdev->cpu_mask); | ||
1549 | } | ||
1550 | #endif | ||
1551 | |||
1552 | /* the appropriate socket mutex must be held already */ | ||
1553 | int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | ||
1554 | enum drbd_packets cmd, struct p_header *h, | ||
1555 | size_t size, unsigned msg_flags) | ||
1556 | { | ||
1557 | int sent, ok; | ||
1558 | |||
1559 | ERR_IF(!h) return FALSE; | ||
1560 | ERR_IF(!size) return FALSE; | ||
1561 | |||
1562 | h->magic = BE_DRBD_MAGIC; | ||
1563 | h->command = cpu_to_be16(cmd); | ||
1564 | h->length = cpu_to_be16(size-sizeof(struct p_header)); | ||
1565 | |||
1566 | sent = drbd_send(mdev, sock, h, size, msg_flags); | ||
1567 | |||
1568 | ok = (sent == size); | ||
1569 | if (!ok) | ||
1570 | dev_err(DEV, "short sent %s size=%d sent=%d\n", | ||
1571 | cmdname(cmd), (int)size, sent); | ||
1572 | return ok; | ||
1573 | } | ||
1574 | |||
1575 | /* don't pass the socket. we may only look at it | ||
1576 | * when we hold the appropriate socket mutex. | ||
1577 | */ | ||
1578 | int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1579 | enum drbd_packets cmd, struct p_header *h, size_t size) | ||
1580 | { | ||
1581 | int ok = 0; | ||
1582 | struct socket *sock; | ||
1583 | |||
1584 | if (use_data_socket) { | ||
1585 | mutex_lock(&mdev->data.mutex); | ||
1586 | sock = mdev->data.socket; | ||
1587 | } else { | ||
1588 | mutex_lock(&mdev->meta.mutex); | ||
1589 | sock = mdev->meta.socket; | ||
1590 | } | ||
1591 | |||
1592 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1593 | * while we were waiting in down()... */ | ||
1594 | if (likely(sock != NULL)) | ||
1595 | ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); | ||
1596 | |||
1597 | if (use_data_socket) | ||
1598 | mutex_unlock(&mdev->data.mutex); | ||
1599 | else | ||
1600 | mutex_unlock(&mdev->meta.mutex); | ||
1601 | return ok; | ||
1602 | } | ||
1603 | |||
1604 | int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, | ||
1605 | size_t size) | ||
1606 | { | ||
1607 | struct p_header h; | ||
1608 | int ok; | ||
1609 | |||
1610 | h.magic = BE_DRBD_MAGIC; | ||
1611 | h.command = cpu_to_be16(cmd); | ||
1612 | h.length = cpu_to_be16(size); | ||
1613 | |||
1614 | if (!drbd_get_data_sock(mdev)) | ||
1615 | return 0; | ||
1616 | |||
1617 | ok = (sizeof(h) == | ||
1618 | drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); | ||
1619 | ok = ok && (size == | ||
1620 | drbd_send(mdev, mdev->data.socket, data, size, 0)); | ||
1621 | |||
1622 | drbd_put_data_sock(mdev); | ||
1623 | |||
1624 | return ok; | ||
1625 | } | ||
1626 | |||
1627 | int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) | ||
1628 | { | ||
1629 | struct p_rs_param_89 *p; | ||
1630 | struct socket *sock; | ||
1631 | int size, rv; | ||
1632 | const int apv = mdev->agreed_pro_version; | ||
1633 | |||
1634 | size = apv <= 87 ? sizeof(struct p_rs_param) | ||
1635 | : apv == 88 ? sizeof(struct p_rs_param) | ||
1636 | + strlen(mdev->sync_conf.verify_alg) + 1 | ||
1637 | : /* 89 */ sizeof(struct p_rs_param_89); | ||
1638 | |||
1639 | /* used from admin command context and receiver/worker context. | ||
1640 | * to avoid kmalloc, grab the socket right here, | ||
1641 | * then use the pre-allocated sbuf there */ | ||
1642 | mutex_lock(&mdev->data.mutex); | ||
1643 | sock = mdev->data.socket; | ||
1644 | |||
1645 | if (likely(sock != NULL)) { | ||
1646 | enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; | ||
1647 | |||
1648 | p = &mdev->data.sbuf.rs_param_89; | ||
1649 | |||
1650 | /* initialize verify_alg and csums_alg */ | ||
1651 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
1652 | |||
1653 | p->rate = cpu_to_be32(sc->rate); | ||
1654 | |||
1655 | if (apv >= 88) | ||
1656 | strcpy(p->verify_alg, mdev->sync_conf.verify_alg); | ||
1657 | if (apv >= 89) | ||
1658 | strcpy(p->csums_alg, mdev->sync_conf.csums_alg); | ||
1659 | |||
1660 | rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); | ||
1661 | } else | ||
1662 | rv = 0; /* not ok */ | ||
1663 | |||
1664 | mutex_unlock(&mdev->data.mutex); | ||
1665 | |||
1666 | return rv; | ||
1667 | } | ||
1668 | |||
1669 | int drbd_send_protocol(struct drbd_conf *mdev) | ||
1670 | { | ||
1671 | struct p_protocol *p; | ||
1672 | int size, rv; | ||
1673 | |||
1674 | size = sizeof(struct p_protocol); | ||
1675 | |||
1676 | if (mdev->agreed_pro_version >= 87) | ||
1677 | size += strlen(mdev->net_conf->integrity_alg) + 1; | ||
1678 | |||
1679 | /* we must not recurse into our own queue, | ||
1680 | * as that is blocked during handshake */ | ||
1681 | p = kmalloc(size, GFP_NOIO); | ||
1682 | if (p == NULL) | ||
1683 | return 0; | ||
1684 | |||
1685 | p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); | ||
1686 | p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); | ||
1687 | p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); | ||
1688 | p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); | ||
1689 | p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); | ||
1690 | p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); | ||
1691 | |||
1692 | if (mdev->agreed_pro_version >= 87) | ||
1693 | strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); | ||
1694 | |||
1695 | rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, | ||
1696 | (struct p_header *)p, size); | ||
1697 | kfree(p); | ||
1698 | return rv; | ||
1699 | } | ||
1700 | |||
1701 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) | ||
1702 | { | ||
1703 | struct p_uuids p; | ||
1704 | int i; | ||
1705 | |||
1706 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | ||
1707 | return 1; | ||
1708 | |||
1709 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
1710 | p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; | ||
1711 | |||
1712 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); | ||
1713 | p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); | ||
1714 | uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; | ||
1715 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; | ||
1716 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; | ||
1717 | p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); | ||
1718 | |||
1719 | put_ldev(mdev); | ||
1720 | |||
1721 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, | ||
1722 | (struct p_header *)&p, sizeof(p)); | ||
1723 | } | ||
1724 | |||
1725 | int drbd_send_uuids(struct drbd_conf *mdev) | ||
1726 | { | ||
1727 | return _drbd_send_uuids(mdev, 0); | ||
1728 | } | ||
1729 | |||
1730 | int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev) | ||
1731 | { | ||
1732 | return _drbd_send_uuids(mdev, 8); | ||
1733 | } | ||
1734 | |||
1735 | |||
1736 | int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) | ||
1737 | { | ||
1738 | struct p_rs_uuid p; | ||
1739 | |||
1740 | p.uuid = cpu_to_be64(val); | ||
1741 | |||
1742 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, | ||
1743 | (struct p_header *)&p, sizeof(p)); | ||
1744 | } | ||
1745 | |||
1746 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) | ||
1747 | { | ||
1748 | struct p_sizes p; | ||
1749 | sector_t d_size, u_size; | ||
1750 | int q_order_type; | ||
1751 | int ok; | ||
1752 | |||
1753 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1754 | D_ASSERT(mdev->ldev->backing_bdev); | ||
1755 | d_size = drbd_get_max_capacity(mdev->ldev); | ||
1756 | u_size = mdev->ldev->dc.disk_size; | ||
1757 | q_order_type = drbd_queue_order_type(mdev); | ||
1758 | p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); | ||
1759 | put_ldev(mdev); | ||
1760 | } else { | ||
1761 | d_size = 0; | ||
1762 | u_size = 0; | ||
1763 | q_order_type = QUEUE_ORDERED_NONE; | ||
1764 | } | ||
1765 | |||
1766 | p.d_size = cpu_to_be64(d_size); | ||
1767 | p.u_size = cpu_to_be64(u_size); | ||
1768 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | ||
1769 | p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); | ||
1770 | p.queue_order_type = cpu_to_be32(q_order_type); | ||
1771 | |||
1772 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | ||
1773 | (struct p_header *)&p, sizeof(p)); | ||
1774 | return ok; | ||
1775 | } | ||
1776 | |||
1777 | /** | ||
1778 | * drbd_send_state() - Sends the drbd state to the peer | ||
1779 | * @mdev: DRBD device. | ||
1780 | */ | ||
1781 | int drbd_send_state(struct drbd_conf *mdev) | ||
1782 | { | ||
1783 | struct socket *sock; | ||
1784 | struct p_state p; | ||
1785 | int ok = 0; | ||
1786 | |||
1787 | /* Grab state lock so we wont send state if we're in the middle | ||
1788 | * of a cluster wide state change on another thread */ | ||
1789 | drbd_state_lock(mdev); | ||
1790 | |||
1791 | mutex_lock(&mdev->data.mutex); | ||
1792 | |||
1793 | p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | ||
1794 | sock = mdev->data.socket; | ||
1795 | |||
1796 | if (likely(sock != NULL)) { | ||
1797 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | ||
1798 | (struct p_header *)&p, sizeof(p), 0); | ||
1799 | } | ||
1800 | |||
1801 | mutex_unlock(&mdev->data.mutex); | ||
1802 | |||
1803 | drbd_state_unlock(mdev); | ||
1804 | return ok; | ||
1805 | } | ||
1806 | |||
1807 | int drbd_send_state_req(struct drbd_conf *mdev, | ||
1808 | union drbd_state mask, union drbd_state val) | ||
1809 | { | ||
1810 | struct p_req_state p; | ||
1811 | |||
1812 | p.mask = cpu_to_be32(mask.i); | ||
1813 | p.val = cpu_to_be32(val.i); | ||
1814 | |||
1815 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, | ||
1816 | (struct p_header *)&p, sizeof(p)); | ||
1817 | } | ||
1818 | |||
1819 | int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) | ||
1820 | { | ||
1821 | struct p_req_state_reply p; | ||
1822 | |||
1823 | p.retcode = cpu_to_be32(retcode); | ||
1824 | |||
1825 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, | ||
1826 | (struct p_header *)&p, sizeof(p)); | ||
1827 | } | ||
1828 | |||
1829 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, | ||
1830 | struct p_compressed_bm *p, | ||
1831 | struct bm_xfer_ctx *c) | ||
1832 | { | ||
1833 | struct bitstream bs; | ||
1834 | unsigned long plain_bits; | ||
1835 | unsigned long tmp; | ||
1836 | unsigned long rl; | ||
1837 | unsigned len; | ||
1838 | unsigned toggle; | ||
1839 | int bits; | ||
1840 | |||
1841 | /* may we use this feature? */ | ||
1842 | if ((mdev->sync_conf.use_rle == 0) || | ||
1843 | (mdev->agreed_pro_version < 90)) | ||
1844 | return 0; | ||
1845 | |||
1846 | if (c->bit_offset >= c->bm_bits) | ||
1847 | return 0; /* nothing to do. */ | ||
1848 | |||
1849 | /* use at most thus many bytes */ | ||
1850 | bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); | ||
1851 | memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); | ||
1852 | /* plain bits covered in this code string */ | ||
1853 | plain_bits = 0; | ||
1854 | |||
1855 | /* p->encoding & 0x80 stores whether the first run length is set. | ||
1856 | * bit offset is implicit. | ||
1857 | * start with toggle == 2 to be able to tell the first iteration */ | ||
1858 | toggle = 2; | ||
1859 | |||
1860 | /* see how much plain bits we can stuff into one packet | ||
1861 | * using RLE and VLI. */ | ||
1862 | do { | ||
1863 | tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset) | ||
1864 | : _drbd_bm_find_next(mdev, c->bit_offset); | ||
1865 | if (tmp == -1UL) | ||
1866 | tmp = c->bm_bits; | ||
1867 | rl = tmp - c->bit_offset; | ||
1868 | |||
1869 | if (toggle == 2) { /* first iteration */ | ||
1870 | if (rl == 0) { | ||
1871 | /* the first checked bit was set, | ||
1872 | * store start value, */ | ||
1873 | DCBP_set_start(p, 1); | ||
1874 | /* but skip encoding of zero run length */ | ||
1875 | toggle = !toggle; | ||
1876 | continue; | ||
1877 | } | ||
1878 | DCBP_set_start(p, 0); | ||
1879 | } | ||
1880 | |||
1881 | /* paranoia: catch zero runlength. | ||
1882 | * can only happen if bitmap is modified while we scan it. */ | ||
1883 | if (rl == 0) { | ||
1884 | dev_err(DEV, "unexpected zero runlength while encoding bitmap " | ||
1885 | "t:%u bo:%lu\n", toggle, c->bit_offset); | ||
1886 | return -1; | ||
1887 | } | ||
1888 | |||
1889 | bits = vli_encode_bits(&bs, rl); | ||
1890 | if (bits == -ENOBUFS) /* buffer full */ | ||
1891 | break; | ||
1892 | if (bits <= 0) { | ||
1893 | dev_err(DEV, "error while encoding bitmap: %d\n", bits); | ||
1894 | return 0; | ||
1895 | } | ||
1896 | |||
1897 | toggle = !toggle; | ||
1898 | plain_bits += rl; | ||
1899 | c->bit_offset = tmp; | ||
1900 | } while (c->bit_offset < c->bm_bits); | ||
1901 | |||
1902 | len = bs.cur.b - p->code + !!bs.cur.bit; | ||
1903 | |||
1904 | if (plain_bits < (len << 3)) { | ||
1905 | /* incompressible with this method. | ||
1906 | * we need to rewind both word and bit position. */ | ||
1907 | c->bit_offset -= plain_bits; | ||
1908 | bm_xfer_ctx_bit_to_word_offset(c); | ||
1909 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
1910 | return 0; | ||
1911 | } | ||
1912 | |||
1913 | /* RLE + VLI was able to compress it just fine. | ||
1914 | * update c->word_offset. */ | ||
1915 | bm_xfer_ctx_bit_to_word_offset(c); | ||
1916 | |||
1917 | /* store pad_bits */ | ||
1918 | DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); | ||
1919 | |||
1920 | return len; | ||
1921 | } | ||
1922 | |||
1923 | enum { OK, FAILED, DONE } | ||
1924 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, | ||
1925 | struct p_header *h, struct bm_xfer_ctx *c) | ||
1926 | { | ||
1927 | struct p_compressed_bm *p = (void*)h; | ||
1928 | unsigned long num_words; | ||
1929 | int len; | ||
1930 | int ok; | ||
1931 | |||
1932 | len = fill_bitmap_rle_bits(mdev, p, c); | ||
1933 | |||
1934 | if (len < 0) | ||
1935 | return FAILED; | ||
1936 | |||
1937 | if (len) { | ||
1938 | DCBP_set_code(p, RLE_VLI_Bits); | ||
1939 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, | ||
1940 | sizeof(*p) + len, 0); | ||
1941 | |||
1942 | c->packets[0]++; | ||
1943 | c->bytes[0] += sizeof(*p) + len; | ||
1944 | |||
1945 | if (c->bit_offset >= c->bm_bits) | ||
1946 | len = 0; /* DONE */ | ||
1947 | } else { | ||
1948 | /* was not compressible. | ||
1949 | * send a buffer full of plain text bits instead. */ | ||
1950 | num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | ||
1951 | len = num_words * sizeof(long); | ||
1952 | if (len) | ||
1953 | drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); | ||
1954 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, | ||
1955 | h, sizeof(struct p_header) + len, 0); | ||
1956 | c->word_offset += num_words; | ||
1957 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
1958 | |||
1959 | c->packets[1]++; | ||
1960 | c->bytes[1] += sizeof(struct p_header) + len; | ||
1961 | |||
1962 | if (c->bit_offset > c->bm_bits) | ||
1963 | c->bit_offset = c->bm_bits; | ||
1964 | } | ||
1965 | ok = ok ? ((len == 0) ? DONE : OK) : FAILED; | ||
1966 | |||
1967 | if (ok == DONE) | ||
1968 | INFO_bm_xfer_stats(mdev, "send", c); | ||
1969 | return ok; | ||
1970 | } | ||
1971 | |||
1972 | /* See the comment at receive_bitmap() */ | ||
1973 | int _drbd_send_bitmap(struct drbd_conf *mdev) | ||
1974 | { | ||
1975 | struct bm_xfer_ctx c; | ||
1976 | struct p_header *p; | ||
1977 | int ret; | ||
1978 | |||
1979 | ERR_IF(!mdev->bitmap) return FALSE; | ||
1980 | |||
1981 | /* maybe we should use some per thread scratch page, | ||
1982 | * and allocate that during initial device creation? */ | ||
1983 | p = (struct p_header *) __get_free_page(GFP_NOIO); | ||
1984 | if (!p) { | ||
1985 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
1986 | return FALSE; | ||
1987 | } | ||
1988 | |||
1989 | if (get_ldev(mdev)) { | ||
1990 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | ||
1991 | dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n"); | ||
1992 | drbd_bm_set_all(mdev); | ||
1993 | if (drbd_bm_write(mdev)) { | ||
1994 | /* write_bm did fail! Leave full sync flag set in Meta P_DATA | ||
1995 | * but otherwise process as per normal - need to tell other | ||
1996 | * side that a full resync is required! */ | ||
1997 | dev_err(DEV, "Failed to write bitmap to disk!\n"); | ||
1998 | } else { | ||
1999 | drbd_md_clear_flag(mdev, MDF_FULL_SYNC); | ||
2000 | drbd_md_sync(mdev); | ||
2001 | } | ||
2002 | } | ||
2003 | put_ldev(mdev); | ||
2004 | } | ||
2005 | |||
2006 | c = (struct bm_xfer_ctx) { | ||
2007 | .bm_bits = drbd_bm_bits(mdev), | ||
2008 | .bm_words = drbd_bm_words(mdev), | ||
2009 | }; | ||
2010 | |||
2011 | do { | ||
2012 | ret = send_bitmap_rle_or_plain(mdev, p, &c); | ||
2013 | } while (ret == OK); | ||
2014 | |||
2015 | free_page((unsigned long) p); | ||
2016 | return (ret == DONE); | ||
2017 | } | ||
2018 | |||
/*
 * Send the bitmap between drbd_get_data_sock() and drbd_put_data_sock().
 *
 * Returns -1 if the data socket could not be obtained, 0 if the bitmap was
 * sent completely, and 1 if _drbd_send_bitmap() failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int sent_ok;

	if (!drbd_get_data_sock(mdev))
		return -1;

	sent_ok = _drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return !sent_ok;
}
2029 | |||
2030 | int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | ||
2031 | { | ||
2032 | int ok; | ||
2033 | struct p_barrier_ack p; | ||
2034 | |||
2035 | p.barrier = barrier_nr; | ||
2036 | p.set_size = cpu_to_be32(set_size); | ||
2037 | |||
2038 | if (mdev->state.conn < C_CONNECTED) | ||
2039 | return FALSE; | ||
2040 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, | ||
2041 | (struct p_header *)&p, sizeof(p)); | ||
2042 | return ok; | ||
2043 | } | ||
2044 | |||
2045 | /** | ||
2046 | * _drbd_send_ack() - Sends an ack packet | ||
2047 | * @mdev: DRBD device. | ||
2048 | * @cmd: Packet command code. | ||
2049 | * @sector: sector, needs to be in big endian byte order | ||
2050 | * @blksize: size in byte, needs to be in big endian byte order | ||
2051 | * @block_id: Id, big endian byte order | ||
2052 | */ | ||
2053 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2054 | u64 sector, | ||
2055 | u32 blksize, | ||
2056 | u64 block_id) | ||
2057 | { | ||
2058 | int ok; | ||
2059 | struct p_block_ack p; | ||
2060 | |||
2061 | p.sector = sector; | ||
2062 | p.block_id = block_id; | ||
2063 | p.blksize = blksize; | ||
2064 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2065 | |||
2066 | if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) | ||
2067 | return FALSE; | ||
2068 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, | ||
2069 | (struct p_header *)&p, sizeof(p)); | ||
2070 | return ok; | ||
2071 | } | ||
2072 | |||
2073 | int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2074 | struct p_data *dp) | ||
2075 | { | ||
2076 | const int header_size = sizeof(struct p_data) | ||
2077 | - sizeof(struct p_header); | ||
2078 | int data_size = ((struct p_header *)dp)->length - header_size; | ||
2079 | |||
2080 | return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), | ||
2081 | dp->block_id); | ||
2082 | } | ||
2083 | |||
2084 | int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2085 | struct p_block_req *rp) | ||
2086 | { | ||
2087 | return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); | ||
2088 | } | ||
2089 | |||
2090 | /** | ||
2091 | * drbd_send_ack() - Sends an ack packet | ||
2092 | * @mdev: DRBD device. | ||
2093 | * @cmd: Packet command code. | ||
2094 | * @e: Epoch entry. | ||
2095 | */ | ||
2096 | int drbd_send_ack(struct drbd_conf *mdev, | ||
2097 | enum drbd_packets cmd, struct drbd_epoch_entry *e) | ||
2098 | { | ||
2099 | return _drbd_send_ack(mdev, cmd, | ||
2100 | cpu_to_be64(e->sector), | ||
2101 | cpu_to_be32(e->size), | ||
2102 | e->block_id); | ||
2103 | } | ||
2104 | |||
2105 | /* This function misuses the block_id field to signal if the blocks | ||
2106 | * are is sync or not. */ | ||
2107 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2108 | sector_t sector, int blksize, u64 block_id) | ||
2109 | { | ||
2110 | return _drbd_send_ack(mdev, cmd, | ||
2111 | cpu_to_be64(sector), | ||
2112 | cpu_to_be32(blksize), | ||
2113 | cpu_to_be64(block_id)); | ||
2114 | } | ||
2115 | |||
2116 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | ||
2117 | sector_t sector, int size, u64 block_id) | ||
2118 | { | ||
2119 | int ok; | ||
2120 | struct p_block_req p; | ||
2121 | |||
2122 | p.sector = cpu_to_be64(sector); | ||
2123 | p.block_id = block_id; | ||
2124 | p.blksize = cpu_to_be32(size); | ||
2125 | |||
2126 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, | ||
2127 | (struct p_header *)&p, sizeof(p)); | ||
2128 | return ok; | ||
2129 | } | ||
2130 | |||
2131 | int drbd_send_drequest_csum(struct drbd_conf *mdev, | ||
2132 | sector_t sector, int size, | ||
2133 | void *digest, int digest_size, | ||
2134 | enum drbd_packets cmd) | ||
2135 | { | ||
2136 | int ok; | ||
2137 | struct p_block_req p; | ||
2138 | |||
2139 | p.sector = cpu_to_be64(sector); | ||
2140 | p.block_id = BE_DRBD_MAGIC + 0xbeef; | ||
2141 | p.blksize = cpu_to_be32(size); | ||
2142 | |||
2143 | p.head.magic = BE_DRBD_MAGIC; | ||
2144 | p.head.command = cpu_to_be16(cmd); | ||
2145 | p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); | ||
2146 | |||
2147 | mutex_lock(&mdev->data.mutex); | ||
2148 | |||
2149 | ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); | ||
2150 | ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); | ||
2151 | |||
2152 | mutex_unlock(&mdev->data.mutex); | ||
2153 | |||
2154 | return ok; | ||
2155 | } | ||
2156 | |||
2157 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | ||
2158 | { | ||
2159 | int ok; | ||
2160 | struct p_block_req p; | ||
2161 | |||
2162 | p.sector = cpu_to_be64(sector); | ||
2163 | p.block_id = BE_DRBD_MAGIC + 0xbabe; | ||
2164 | p.blksize = cpu_to_be32(size); | ||
2165 | |||
2166 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, | ||
2167 | (struct p_header *)&p, sizeof(p)); | ||
2168 | return ok; | ||
2169 | } | ||
2170 | |||
2171 | /* called on sndtimeo | ||
2172 | * returns FALSE if we should retry, | ||
2173 | * TRUE if we think connection is dead | ||
2174 | */ | ||
2175 | static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) | ||
2176 | { | ||
2177 | int drop_it; | ||
2178 | /* long elapsed = (long)(jiffies - mdev->last_received); */ | ||
2179 | |||
2180 | drop_it = mdev->meta.socket == sock | ||
2181 | || !mdev->asender.task | ||
2182 | || get_t_state(&mdev->asender) != Running | ||
2183 | || mdev->state.conn < C_CONNECTED; | ||
2184 | |||
2185 | if (drop_it) | ||
2186 | return TRUE; | ||
2187 | |||
2188 | drop_it = !--mdev->ko_count; | ||
2189 | if (!drop_it) { | ||
2190 | dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", | ||
2191 | current->comm, current->pid, mdev->ko_count); | ||
2192 | request_ping(mdev); | ||
2193 | } | ||
2194 | |||
2195 | return drop_it; /* && (mdev->state == R_PRIMARY) */; | ||
2196 | } | ||
2197 | |||
2198 | /* The idea of sendpage seems to be to put some kind of reference | ||
2199 | * to the page into the skb, and to hand it over to the NIC. In | ||
2200 | * this process get_page() gets called. | ||
2201 | * | ||
2202 | * As soon as the page was really sent over the network put_page() | ||
2203 | * gets called by some part of the network layer. [ NIC driver? ] | ||
2204 | * | ||
2205 | * [ get_page() / put_page() increment/decrement the count. If count | ||
2206 | * reaches 0 the page will be freed. ] | ||
2207 | * | ||
2208 | * This works nicely with pages from FSs. | ||
2209 | * But this means that in protocol A we might signal IO completion too early! | ||
2210 | * | ||
2211 | * In order not to corrupt data during a resync we must make sure | ||
2212 | * that we do not reuse our own buffer pages (EEs) to early, therefore | ||
2213 | * we have the net_ee list. | ||
2214 | * | ||
2215 | * XFS seems to have problems, still, it submits pages with page_count == 0! | ||
2216 | * As a workaround, we disable sendpage on pages | ||
2217 | * with page_count == 0 or PageSlab. | ||
2218 | */ | ||
2219 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, | ||
2220 | int offset, size_t size) | ||
2221 | { | ||
2222 | int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); | ||
2223 | kunmap(page); | ||
2224 | if (sent == size) | ||
2225 | mdev->send_cnt += size>>9; | ||
2226 | return sent == size; | ||
2227 | } | ||
2228 | |||
2229 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | ||
2230 | int offset, size_t size) | ||
2231 | { | ||
2232 | mm_segment_t oldfs = get_fs(); | ||
2233 | int sent, ok; | ||
2234 | int len = size; | ||
2235 | |||
2236 | /* e.g. XFS meta- & log-data is in slab pages, which have a | ||
2237 | * page_count of 0 and/or have PageSlab() set. | ||
2238 | * we cannot use send_page for those, as that does get_page(); | ||
2239 | * put_page(); and would cause either a VM_BUG directly, or | ||
2240 | * __page_cache_release a page that would actually still be referenced | ||
2241 | * by someone, leading to some obscure delayed Oops somewhere else. */ | ||
2242 | if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) | ||
2243 | return _drbd_no_send_page(mdev, page, offset, size); | ||
2244 | |||
2245 | drbd_update_congested(mdev); | ||
2246 | set_fs(KERNEL_DS); | ||
2247 | do { | ||
2248 | sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, | ||
2249 | offset, len, | ||
2250 | MSG_NOSIGNAL); | ||
2251 | if (sent == -EAGAIN) { | ||
2252 | if (we_should_drop_the_connection(mdev, | ||
2253 | mdev->data.socket)) | ||
2254 | break; | ||
2255 | else | ||
2256 | continue; | ||
2257 | } | ||
2258 | if (sent <= 0) { | ||
2259 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", | ||
2260 | __func__, (int)size, len, sent); | ||
2261 | break; | ||
2262 | } | ||
2263 | len -= sent; | ||
2264 | offset += sent; | ||
2265 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); | ||
2266 | set_fs(oldfs); | ||
2267 | clear_bit(NET_CONGESTED, &mdev->flags); | ||
2268 | |||
2269 | ok = (len == 0); | ||
2270 | if (likely(ok)) | ||
2271 | mdev->send_cnt += size>>9; | ||
2272 | return ok; | ||
2273 | } | ||
2274 | |||
2275 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | ||
2276 | { | ||
2277 | struct bio_vec *bvec; | ||
2278 | int i; | ||
2279 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
2280 | if (!_drbd_no_send_page(mdev, bvec->bv_page, | ||
2281 | bvec->bv_offset, bvec->bv_len)) | ||
2282 | return 0; | ||
2283 | } | ||
2284 | return 1; | ||
2285 | } | ||
2286 | |||
2287 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | ||
2288 | { | ||
2289 | struct bio_vec *bvec; | ||
2290 | int i; | ||
2291 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
2292 | if (!_drbd_send_page(mdev, bvec->bv_page, | ||
2293 | bvec->bv_offset, bvec->bv_len)) | ||
2294 | return 0; | ||
2295 | } | ||
2296 | |||
2297 | return 1; | ||
2298 | } | ||
2299 | |||
2300 | /* Used to send write requests | ||
2301 | * R_PRIMARY -> Peer (P_DATA) | ||
2302 | */ | ||
2303 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | ||
2304 | { | ||
2305 | int ok = 1; | ||
2306 | struct p_data p; | ||
2307 | unsigned int dp_flags = 0; | ||
2308 | void *dgb; | ||
2309 | int dgs; | ||
2310 | |||
2311 | if (!drbd_get_data_sock(mdev)) | ||
2312 | return 0; | ||
2313 | |||
2314 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2315 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2316 | |||
2317 | p.head.magic = BE_DRBD_MAGIC; | ||
2318 | p.head.command = cpu_to_be16(P_DATA); | ||
2319 | p.head.length = | ||
2320 | cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); | ||
2321 | |||
2322 | p.sector = cpu_to_be64(req->sector); | ||
2323 | p.block_id = (unsigned long)req; | ||
2324 | p.seq_num = cpu_to_be32(req->seq_num = | ||
2325 | atomic_add_return(1, &mdev->packet_seq)); | ||
2326 | dp_flags = 0; | ||
2327 | |||
2328 | /* NOTE: no need to check if barriers supported here as we would | ||
2329 | * not pass the test in make_request_common in that case | ||
2330 | */ | ||
2331 | if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { | ||
2332 | dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); | ||
2333 | /* dp_flags |= DP_HARDBARRIER; */ | ||
2334 | } | ||
2335 | if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) | ||
2336 | dp_flags |= DP_RW_SYNC; | ||
2337 | /* for now handle SYNCIO and UNPLUG | ||
2338 | * as if they still were one and the same flag */ | ||
2339 | if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) | ||
2340 | dp_flags |= DP_RW_SYNC; | ||
2341 | if (mdev->state.conn >= C_SYNC_SOURCE && | ||
2342 | mdev->state.conn <= C_PAUSED_SYNC_T) | ||
2343 | dp_flags |= DP_MAY_SET_IN_SYNC; | ||
2344 | |||
2345 | p.dp_flags = cpu_to_be32(dp_flags); | ||
2346 | set_bit(UNPLUG_REMOTE, &mdev->flags); | ||
2347 | ok = (sizeof(p) == | ||
2348 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); | ||
2349 | if (ok && dgs) { | ||
2350 | dgb = mdev->int_dig_out; | ||
2351 | drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | ||
2352 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | ||
2353 | } | ||
2354 | if (ok) { | ||
2355 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A) | ||
2356 | ok = _drbd_send_bio(mdev, req->master_bio); | ||
2357 | else | ||
2358 | ok = _drbd_send_zc_bio(mdev, req->master_bio); | ||
2359 | } | ||
2360 | |||
2361 | drbd_put_data_sock(mdev); | ||
2362 | return ok; | ||
2363 | } | ||
2364 | |||
2365 | /* answer packet, used to send data back for read requests: | ||
2366 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) | ||
2367 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) | ||
2368 | */ | ||
2369 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2370 | struct drbd_epoch_entry *e) | ||
2371 | { | ||
2372 | int ok; | ||
2373 | struct p_data p; | ||
2374 | void *dgb; | ||
2375 | int dgs; | ||
2376 | |||
2377 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2378 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2379 | |||
2380 | p.head.magic = BE_DRBD_MAGIC; | ||
2381 | p.head.command = cpu_to_be16(cmd); | ||
2382 | p.head.length = | ||
2383 | cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); | ||
2384 | |||
2385 | p.sector = cpu_to_be64(e->sector); | ||
2386 | p.block_id = e->block_id; | ||
2387 | /* p.seq_num = 0; No sequence numbers here.. */ | ||
2388 | |||
2389 | /* Only called by our kernel thread. | ||
2390 | * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL | ||
2391 | * in response to admin command or module unload. | ||
2392 | */ | ||
2393 | if (!drbd_get_data_sock(mdev)) | ||
2394 | return 0; | ||
2395 | |||
2396 | ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, | ||
2397 | sizeof(p), MSG_MORE); | ||
2398 | if (ok && dgs) { | ||
2399 | dgb = mdev->int_dig_out; | ||
2400 | drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); | ||
2401 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | ||
2402 | } | ||
2403 | if (ok) | ||
2404 | ok = _drbd_send_zc_bio(mdev, e->private_bio); | ||
2405 | |||
2406 | drbd_put_data_sock(mdev); | ||
2407 | return ok; | ||
2408 | } | ||
2409 | |||
/*
  drbd_send distinguishes two cases:

  Packets sent via the data socket "sock"
  and packets sent via the meta data socket "msock"

		    sock                      msock
  -----------------+-------------------------+------------------------------
  timeout           conf.timeout / 2          conf.timeout / 2
  timeout action    send a ping via msock     Abort communication
					      and close all sockets
*/

/*
 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
 *
 * Sends @size bytes from @buf on @sock, retrying until everything is sent
 * or an error occurs.  MSG_NOSIGNAL is always added to @msg_flags.
 *
 * Returns the number of bytes actually sent (which is less than @size if
 * an error occurred), or -1000 if @sock is NULL.  If the last send attempt
 * returned a value <= 0, the connection state is forced to C_TIMEOUT
 * (for -EAGAIN) or C_BROKEN_PIPE (anything else).
 */
int drbd_send(struct drbd_conf *mdev, struct socket *sock,
	      void *buf, size_t size, unsigned msg_flags)
{
	struct kvec iov;
	struct msghdr msg;
	int rv, sent = 0;

	if (!sock)
		return -1000;

	/* THINK  if (signal_pending) return ... ? */

	iov.iov_base = buf;
	iov.iov_len  = size;

	msg.msg_name       = NULL;
	msg.msg_namelen    = 0;
	msg.msg_control    = NULL;
	msg.msg_controllen = 0;
	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;

	/* on the data socket: re-arm the retry counter consumed by
	 * we_should_drop_the_connection() and update the congestion state */
	if (sock == mdev->data.socket) {
		mdev->ko_count = mdev->net_conf->ko_count;
		drbd_update_congested(mdev);
	}
	do {
		/* STRANGE
		 * tcp_sendmsg does _not_ use its size parameter at all ?
		 *
		 * -EAGAIN on timeout, -EINTR on signal.
		 */
/* THINK
 * do we need to block DRBD_SIG if sock == &meta.socket ??
 * otherwise wake_asender() might interrupt some send_*Ack !
 */
		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
		if (rv == -EAGAIN) {
			/* timeout: retry unless the connection is considered dead */
			if (we_should_drop_the_connection(mdev, sock))
				break;
			else
				continue;
		}
		D_ASSERT(rv != 0);
		if (rv == -EINTR) {
			/* interrupted by a signal: discard it and try again */
			flush_signals(current);
			rv = 0;
		}
		if (rv < 0)
			break;
		/* account for partial progress and advance within the buffer */
		sent += rv;
		iov.iov_base += rv;
		iov.iov_len  -= rv;
	} while (sent < size);

	if (sock == mdev->data.socket)
		clear_bit(NET_CONGESTED, &mdev->flags);

	if (rv <= 0) {
		if (rv != -EAGAIN) {
			dev_err(DEV, "%s_sendmsg returned %d\n",
			    sock == mdev->meta.socket ? "msock" : "sock",
			    rv);
			drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
		} else
			drbd_force_state(mdev, NS(conn, C_TIMEOUT));
	}

	return sent;
}
2495 | |||
2496 | static int drbd_open(struct block_device *bdev, fmode_t mode) | ||
2497 | { | ||
2498 | struct drbd_conf *mdev = bdev->bd_disk->private_data; | ||
2499 | unsigned long flags; | ||
2500 | int rv = 0; | ||
2501 | |||
2502 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
2503 | /* to have a stable mdev->state.role | ||
2504 | * and no race with updating open_cnt */ | ||
2505 | |||
2506 | if (mdev->state.role != R_PRIMARY) { | ||
2507 | if (mode & FMODE_WRITE) | ||
2508 | rv = -EROFS; | ||
2509 | else if (!allow_oos) | ||
2510 | rv = -EMEDIUMTYPE; | ||
2511 | } | ||
2512 | |||
2513 | if (!rv) | ||
2514 | mdev->open_cnt++; | ||
2515 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
2516 | |||
2517 | return rv; | ||
2518 | } | ||
2519 | |||
2520 | static int drbd_release(struct gendisk *gd, fmode_t mode) | ||
2521 | { | ||
2522 | struct drbd_conf *mdev = gd->private_data; | ||
2523 | mdev->open_cnt--; | ||
2524 | return 0; | ||
2525 | } | ||
2526 | |||
2527 | static void drbd_unplug_fn(struct request_queue *q) | ||
2528 | { | ||
2529 | struct drbd_conf *mdev = q->queuedata; | ||
2530 | |||
2531 | /* unplug FIRST */ | ||
2532 | spin_lock_irq(q->queue_lock); | ||
2533 | blk_remove_plug(q); | ||
2534 | spin_unlock_irq(q->queue_lock); | ||
2535 | |||
2536 | /* only if connected */ | ||
2537 | spin_lock_irq(&mdev->req_lock); | ||
2538 | if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { | ||
2539 | D_ASSERT(mdev->state.role == R_PRIMARY); | ||
2540 | if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { | ||
2541 | /* add to the data.work queue, | ||
2542 | * unless already queued. | ||
2543 | * XXX this might be a good addition to drbd_queue_work | ||
2544 | * anyways, to detect "double queuing" ... */ | ||
2545 | if (list_empty(&mdev->unplug_work.list)) | ||
2546 | drbd_queue_work(&mdev->data.work, | ||
2547 | &mdev->unplug_work); | ||
2548 | } | ||
2549 | } | ||
2550 | spin_unlock_irq(&mdev->req_lock); | ||
2551 | |||
2552 | if (mdev->state.disk >= D_INCONSISTENT) | ||
2553 | drbd_kick_lo(mdev); | ||
2554 | } | ||
2555 | |||
/*
 * Reset the syncer configuration of @mdev to the compiled-in defaults and
 * set the device state to: secondary, peer role unknown, standalone,
 * diskless, peer disk unknown, not suspended.
 */
static void drbd_set_defaults(struct drbd_conf *mdev)
{
	mdev->sync_conf.after      = DRBD_AFTER_DEF;
	mdev->sync_conf.rate       = DRBD_RATE_DEF;
	mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
	/* assigning a compound literal also zeroes every member of the
	 * state that is not named in the initializer */
	mdev->state = (union drbd_state) {
		{ .role = R_SECONDARY,
		  .peer = R_UNKNOWN,
		  .conn = C_STANDALONE,
		  .disk = D_DISKLESS,
		  .pdsk = D_UNKNOWN,
		  .susp = 0
		} };
}
2570 | |||
/*
 * Initialize the in-memory state of a freshly allocated @mdev:
 * default configuration, counters, locks, list heads, work items,
 * timers, wait queues and thread descriptors.
 */
void drbd_init_set_defaults(struct drbd_conf *mdev)
{
	/* the memset(,0,) did most of this.
	 * note: only assignments, no allocation in here */

	drbd_set_defaults(mdev);

	/* for now, we do NOT yet support it,
	 * even though we start some framework
	 * to eventually support barriers */
	set_bit(NO_BARRIER_SUPP, &mdev->flags);

	/* reference and statistics counters all start at zero */
	atomic_set(&mdev->ap_bio_cnt, 0);
	atomic_set(&mdev->ap_pending_cnt, 0);
	atomic_set(&mdev->rs_pending_cnt, 0);
	atomic_set(&mdev->unacked_cnt, 0);
	atomic_set(&mdev->local_cnt, 0);
	atomic_set(&mdev->net_cnt, 0);
	atomic_set(&mdev->packet_seq, 0);
	atomic_set(&mdev->pp_in_use, 0);

	/* sleeping locks; the work queue semaphores start out at 0 */
	mutex_init(&mdev->md_io_mutex);
	mutex_init(&mdev->data.mutex);
	mutex_init(&mdev->meta.mutex);
	sema_init(&mdev->data.work.s, 0);
	sema_init(&mdev->meta.work.s, 0);
	mutex_init(&mdev->state_mutex);

	/* spinlocks */
	spin_lock_init(&mdev->data.work.q_lock);
	spin_lock_init(&mdev->meta.work.q_lock);

	spin_lock_init(&mdev->al_lock);
	spin_lock_init(&mdev->req_lock);
	spin_lock_init(&mdev->peer_seq_lock);
	spin_lock_init(&mdev->epoch_lock);

	/* epoch entry lists, work queues and the embedded work items */
	INIT_LIST_HEAD(&mdev->active_ee);
	INIT_LIST_HEAD(&mdev->sync_ee);
	INIT_LIST_HEAD(&mdev->done_ee);
	INIT_LIST_HEAD(&mdev->read_ee);
	INIT_LIST_HEAD(&mdev->net_ee);
	INIT_LIST_HEAD(&mdev->resync_reads);
	INIT_LIST_HEAD(&mdev->data.work.q);
	INIT_LIST_HEAD(&mdev->meta.work.q);
	INIT_LIST_HEAD(&mdev->resync_work.list);
	INIT_LIST_HEAD(&mdev->unplug_work.list);
	INIT_LIST_HEAD(&mdev->md_sync_work.list);
	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
	/* callbacks of the embedded work items */
	mdev->resync_work.cb  = w_resync_inactive;
	mdev->unplug_work.cb  = w_send_write_hint;
	mdev->md_sync_work.cb = w_md_sync;
	mdev->bm_io_work.w.cb = w_bitmap_io;
	/* timers; both get the mdev itself as their data argument */
	init_timer(&mdev->resync_timer);
	init_timer(&mdev->md_sync_timer);
	mdev->resync_timer.function = resync_timer_fn;
	mdev->resync_timer.data = (unsigned long) mdev;
	mdev->md_sync_timer.function = md_sync_timer_fn;
	mdev->md_sync_timer.data = (unsigned long) mdev;

	init_waitqueue_head(&mdev->misc_wait);
	init_waitqueue_head(&mdev->state_wait);
	init_waitqueue_head(&mdev->ee_wait);
	init_waitqueue_head(&mdev->al_wait);
	init_waitqueue_head(&mdev->seq_wait);

	/* thread descriptors with their main functions */
	drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
	drbd_thread_init(mdev, &mdev->worker, drbd_worker);
	drbd_thread_init(mdev, &mdev->asender, drbd_asender);

	mdev->agreed_pro_version = PRO_VERSION_MAX;
	mdev->write_ordering = WO_bio_barrier;
	mdev->resync_wenr = LC_FREE;
}
2644 | |||
2645 | void drbd_mdev_cleanup(struct drbd_conf *mdev) | ||
2646 | { | ||
2647 | if (mdev->receiver.t_state != None) | ||
2648 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", | ||
2649 | mdev->receiver.t_state); | ||
2650 | |||
2651 | /* no need to lock it, I'm the only thread alive */ | ||
2652 | if (atomic_read(&mdev->current_epoch->epoch_size) != 0) | ||
2653 | dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); | ||
2654 | mdev->al_writ_cnt = | ||
2655 | mdev->bm_writ_cnt = | ||
2656 | mdev->read_cnt = | ||
2657 | mdev->recv_cnt = | ||
2658 | mdev->send_cnt = | ||
2659 | mdev->writ_cnt = | ||
2660 | mdev->p_size = | ||
2661 | mdev->rs_start = | ||
2662 | mdev->rs_total = | ||
2663 | mdev->rs_failed = | ||
2664 | mdev->rs_mark_left = | ||
2665 | mdev->rs_mark_time = 0; | ||
2666 | D_ASSERT(mdev->net_conf == NULL); | ||
2667 | |||
2668 | drbd_set_my_capacity(mdev, 0); | ||
2669 | if (mdev->bitmap) { | ||
2670 | /* maybe never allocated. */ | ||
2671 | drbd_bm_resize(mdev, 0); | ||
2672 | drbd_bm_cleanup(mdev); | ||
2673 | } | ||
2674 | |||
2675 | drbd_free_resources(mdev); | ||
2676 | |||
2677 | /* | ||
2678 | * currently we drbd_init_ee only on module load, so | ||
2679 | * we may do drbd_release_ee only on module unload! | ||
2680 | */ | ||
2681 | D_ASSERT(list_empty(&mdev->active_ee)); | ||
2682 | D_ASSERT(list_empty(&mdev->sync_ee)); | ||
2683 | D_ASSERT(list_empty(&mdev->done_ee)); | ||
2684 | D_ASSERT(list_empty(&mdev->read_ee)); | ||
2685 | D_ASSERT(list_empty(&mdev->net_ee)); | ||
2686 | D_ASSERT(list_empty(&mdev->resync_reads)); | ||
2687 | D_ASSERT(list_empty(&mdev->data.work.q)); | ||
2688 | D_ASSERT(list_empty(&mdev->meta.work.q)); | ||
2689 | D_ASSERT(list_empty(&mdev->resync_work.list)); | ||
2690 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | ||
2691 | |||
2692 | } | ||
2693 | |||
2694 | |||
2695 | static void drbd_destroy_mempools(void) | ||
2696 | { | ||
2697 | struct page *page; | ||
2698 | |||
2699 | while (drbd_pp_pool) { | ||
2700 | page = drbd_pp_pool; | ||
2701 | drbd_pp_pool = (struct page *)page_private(page); | ||
2702 | __free_page(page); | ||
2703 | drbd_pp_vacant--; | ||
2704 | } | ||
2705 | |||
2706 | /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ | ||
2707 | |||
2708 | if (drbd_ee_mempool) | ||
2709 | mempool_destroy(drbd_ee_mempool); | ||
2710 | if (drbd_request_mempool) | ||
2711 | mempool_destroy(drbd_request_mempool); | ||
2712 | if (drbd_ee_cache) | ||
2713 | kmem_cache_destroy(drbd_ee_cache); | ||
2714 | if (drbd_request_cache) | ||
2715 | kmem_cache_destroy(drbd_request_cache); | ||
2716 | if (drbd_bm_ext_cache) | ||
2717 | kmem_cache_destroy(drbd_bm_ext_cache); | ||
2718 | if (drbd_al_ext_cache) | ||
2719 | kmem_cache_destroy(drbd_al_ext_cache); | ||
2720 | |||
2721 | drbd_ee_mempool = NULL; | ||
2722 | drbd_request_mempool = NULL; | ||
2723 | drbd_ee_cache = NULL; | ||
2724 | drbd_request_cache = NULL; | ||
2725 | drbd_bm_ext_cache = NULL; | ||
2726 | drbd_al_ext_cache = NULL; | ||
2727 | |||
2728 | return; | ||
2729 | } | ||
2730 | |||
2731 | static int drbd_create_mempools(void) | ||
2732 | { | ||
2733 | struct page *page; | ||
2734 | const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; | ||
2735 | int i; | ||
2736 | |||
2737 | /* prepare our caches and mempools */ | ||
2738 | drbd_request_mempool = NULL; | ||
2739 | drbd_ee_cache = NULL; | ||
2740 | drbd_request_cache = NULL; | ||
2741 | drbd_bm_ext_cache = NULL; | ||
2742 | drbd_al_ext_cache = NULL; | ||
2743 | drbd_pp_pool = NULL; | ||
2744 | |||
2745 | /* caches */ | ||
2746 | drbd_request_cache = kmem_cache_create( | ||
2747 | "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); | ||
2748 | if (drbd_request_cache == NULL) | ||
2749 | goto Enomem; | ||
2750 | |||
2751 | drbd_ee_cache = kmem_cache_create( | ||
2752 | "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); | ||
2753 | if (drbd_ee_cache == NULL) | ||
2754 | goto Enomem; | ||
2755 | |||
2756 | drbd_bm_ext_cache = kmem_cache_create( | ||
2757 | "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); | ||
2758 | if (drbd_bm_ext_cache == NULL) | ||
2759 | goto Enomem; | ||
2760 | |||
2761 | drbd_al_ext_cache = kmem_cache_create( | ||
2762 | "drbd_al", sizeof(struct lc_element), 0, 0, NULL); | ||
2763 | if (drbd_al_ext_cache == NULL) | ||
2764 | goto Enomem; | ||
2765 | |||
2766 | /* mempools */ | ||
2767 | drbd_request_mempool = mempool_create(number, | ||
2768 | mempool_alloc_slab, mempool_free_slab, drbd_request_cache); | ||
2769 | if (drbd_request_mempool == NULL) | ||
2770 | goto Enomem; | ||
2771 | |||
2772 | drbd_ee_mempool = mempool_create(number, | ||
2773 | mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); | ||
2774 | if (drbd_request_mempool == NULL) | ||
2775 | goto Enomem; | ||
2776 | |||
2777 | /* drbd's page pool */ | ||
2778 | spin_lock_init(&drbd_pp_lock); | ||
2779 | |||
2780 | for (i = 0; i < number; i++) { | ||
2781 | page = alloc_page(GFP_HIGHUSER); | ||
2782 | if (!page) | ||
2783 | goto Enomem; | ||
2784 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
2785 | drbd_pp_pool = page; | ||
2786 | } | ||
2787 | drbd_pp_vacant = number; | ||
2788 | |||
2789 | return 0; | ||
2790 | |||
2791 | Enomem: | ||
2792 | drbd_destroy_mempools(); /* in case we allocated some */ | ||
2793 | return -ENOMEM; | ||
2794 | } | ||
2795 | |||
/*
 * Notifier callback (registered through drbd_notifier).
 * Currently a placeholder: all arguments are ignored and NOTIFY_DONE
 * is returned.
 */
static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
	void *unused)
{
	/* just so we have it.  you never know what interesting things we
	 * might want to do here some day...
	 */

	return NOTIFY_DONE;
}
2805 | |||
/* notifier block with drbd_notify_sys() as its callback;
 * drbd_cleanup() unregisters it from the reboot notifier chain */
static struct notifier_block drbd_notifier = {
	.notifier_call = drbd_notify_sys,
};
2809 | |||
2810 | static void drbd_release_ee_lists(struct drbd_conf *mdev) | ||
2811 | { | ||
2812 | int rr; | ||
2813 | |||
2814 | rr = drbd_release_ee(mdev, &mdev->active_ee); | ||
2815 | if (rr) | ||
2816 | dev_err(DEV, "%d EEs in active list found!\n", rr); | ||
2817 | |||
2818 | rr = drbd_release_ee(mdev, &mdev->sync_ee); | ||
2819 | if (rr) | ||
2820 | dev_err(DEV, "%d EEs in sync list found!\n", rr); | ||
2821 | |||
2822 | rr = drbd_release_ee(mdev, &mdev->read_ee); | ||
2823 | if (rr) | ||
2824 | dev_err(DEV, "%d EEs in read list found!\n", rr); | ||
2825 | |||
2826 | rr = drbd_release_ee(mdev, &mdev->done_ee); | ||
2827 | if (rr) | ||
2828 | dev_err(DEV, "%d EEs in done list found!\n", rr); | ||
2829 | |||
2830 | rr = drbd_release_ee(mdev, &mdev->net_ee); | ||
2831 | if (rr) | ||
2832 | dev_err(DEV, "%d EEs in net list found!\n", rr); | ||
2833 | } | ||
2834 | |||
2835 | /* caution. no locking. | ||
2836 | * currently only used from module cleanup code. */ | ||
2837 | static void drbd_delete_device(unsigned int minor) | ||
2838 | { | ||
2839 | struct drbd_conf *mdev = minor_to_mdev(minor); | ||
2840 | |||
2841 | if (!mdev) | ||
2842 | return; | ||
2843 | |||
2844 | /* paranoia asserts */ | ||
2845 | if (mdev->open_cnt != 0) | ||
2846 | dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, | ||
2847 | __FILE__ , __LINE__); | ||
2848 | |||
2849 | ERR_IF (!list_empty(&mdev->data.work.q)) { | ||
2850 | struct list_head *lp; | ||
2851 | list_for_each(lp, &mdev->data.work.q) { | ||
2852 | dev_err(DEV, "lp = %p\n", lp); | ||
2853 | } | ||
2854 | }; | ||
2855 | /* end paranoia asserts */ | ||
2856 | |||
2857 | del_gendisk(mdev->vdisk); | ||
2858 | |||
2859 | /* cleanup stuff that may have been allocated during | ||
2860 | * device (re-)configuration or state changes */ | ||
2861 | |||
2862 | if (mdev->this_bdev) | ||
2863 | bdput(mdev->this_bdev); | ||
2864 | |||
2865 | drbd_free_resources(mdev); | ||
2866 | |||
2867 | drbd_release_ee_lists(mdev); | ||
2868 | |||
2869 | /* should be free'd on disconnect? */ | ||
2870 | kfree(mdev->ee_hash); | ||
2871 | /* | ||
2872 | mdev->ee_hash_s = 0; | ||
2873 | mdev->ee_hash = NULL; | ||
2874 | */ | ||
2875 | |||
2876 | lc_destroy(mdev->act_log); | ||
2877 | lc_destroy(mdev->resync); | ||
2878 | |||
2879 | kfree(mdev->p_uuid); | ||
2880 | /* mdev->p_uuid = NULL; */ | ||
2881 | |||
2882 | kfree(mdev->int_dig_out); | ||
2883 | kfree(mdev->int_dig_in); | ||
2884 | kfree(mdev->int_dig_vv); | ||
2885 | |||
2886 | /* cleanup the rest that has been | ||
2887 | * allocated from drbd_new_device | ||
2888 | * and actually free the mdev itself */ | ||
2889 | drbd_free_mdev(mdev); | ||
2890 | } | ||
2891 | |||
2892 | static void drbd_cleanup(void) | ||
2893 | { | ||
2894 | unsigned int i; | ||
2895 | |||
2896 | unregister_reboot_notifier(&drbd_notifier); | ||
2897 | |||
2898 | drbd_nl_cleanup(); | ||
2899 | |||
2900 | if (minor_table) { | ||
2901 | if (drbd_proc) | ||
2902 | remove_proc_entry("drbd", NULL); | ||
2903 | i = minor_count; | ||
2904 | while (i--) | ||
2905 | drbd_delete_device(i); | ||
2906 | drbd_destroy_mempools(); | ||
2907 | } | ||
2908 | |||
2909 | kfree(minor_table); | ||
2910 | |||
2911 | unregister_blkdev(DRBD_MAJOR, "drbd"); | ||
2912 | |||
2913 | printk(KERN_INFO "drbd: module cleanup done.\n"); | ||
2914 | } | ||
2915 | |||
2916 | /** | ||
2917 | * drbd_congested() - Callback for pdflush | ||
2918 | * @congested_data: User data | ||
2919 | * @bdi_bits: Bits pdflush is currently interested in | ||
2920 | * | ||
2921 | * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. | ||
2922 | */ | ||
2923 | static int drbd_congested(void *congested_data, int bdi_bits) | ||
2924 | { | ||
2925 | struct drbd_conf *mdev = congested_data; | ||
2926 | struct request_queue *q; | ||
2927 | char reason = '-'; | ||
2928 | int r = 0; | ||
2929 | |||
2930 | if (!__inc_ap_bio_cond(mdev)) { | ||
2931 | /* DRBD has frozen IO */ | ||
2932 | r = bdi_bits; | ||
2933 | reason = 'd'; | ||
2934 | goto out; | ||
2935 | } | ||
2936 | |||
2937 | if (get_ldev(mdev)) { | ||
2938 | q = bdev_get_queue(mdev->ldev->backing_bdev); | ||
2939 | r = bdi_congested(&q->backing_dev_info, bdi_bits); | ||
2940 | put_ldev(mdev); | ||
2941 | if (r) | ||
2942 | reason = 'b'; | ||
2943 | } | ||
2944 | |||
2945 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { | ||
2946 | r |= (1 << BDI_async_congested); | ||
2947 | reason = reason == 'b' ? 'a' : 'n'; | ||
2948 | } | ||
2949 | |||
2950 | out: | ||
2951 | mdev->congestion_reason = reason; | ||
2952 | return r; | ||
2953 | } | ||
2954 | |||
2955 | struct drbd_conf *drbd_new_device(unsigned int minor) | ||
2956 | { | ||
2957 | struct drbd_conf *mdev; | ||
2958 | struct gendisk *disk; | ||
2959 | struct request_queue *q; | ||
2960 | |||
2961 | /* GFP_KERNEL, we are outside of all write-out paths */ | ||
2962 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); | ||
2963 | if (!mdev) | ||
2964 | return NULL; | ||
2965 | if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) | ||
2966 | goto out_no_cpumask; | ||
2967 | |||
2968 | mdev->minor = minor; | ||
2969 | |||
2970 | drbd_init_set_defaults(mdev); | ||
2971 | |||
2972 | q = blk_alloc_queue(GFP_KERNEL); | ||
2973 | if (!q) | ||
2974 | goto out_no_q; | ||
2975 | mdev->rq_queue = q; | ||
2976 | q->queuedata = mdev; | ||
2977 | blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); | ||
2978 | |||
2979 | disk = alloc_disk(1); | ||
2980 | if (!disk) | ||
2981 | goto out_no_disk; | ||
2982 | mdev->vdisk = disk; | ||
2983 | |||
2984 | set_disk_ro(disk, TRUE); | ||
2985 | |||
2986 | disk->queue = q; | ||
2987 | disk->major = DRBD_MAJOR; | ||
2988 | disk->first_minor = minor; | ||
2989 | disk->fops = &drbd_ops; | ||
2990 | sprintf(disk->disk_name, "drbd%d", minor); | ||
2991 | disk->private_data = mdev; | ||
2992 | |||
2993 | mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); | ||
2994 | /* we have no partitions. we contain only ourselves. */ | ||
2995 | mdev->this_bdev->bd_contains = mdev->this_bdev; | ||
2996 | |||
2997 | q->backing_dev_info.congested_fn = drbd_congested; | ||
2998 | q->backing_dev_info.congested_data = mdev; | ||
2999 | |||
3000 | blk_queue_make_request(q, drbd_make_request_26); | ||
3001 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); | ||
3002 | blk_queue_merge_bvec(q, drbd_merge_bvec); | ||
3003 | q->queue_lock = &mdev->req_lock; /* needed since we use */ | ||
3004 | /* plugging on a queue, that actually has no requests! */ | ||
3005 | q->unplug_fn = drbd_unplug_fn; | ||
3006 | |||
3007 | mdev->md_io_page = alloc_page(GFP_KERNEL); | ||
3008 | if (!mdev->md_io_page) | ||
3009 | goto out_no_io_page; | ||
3010 | |||
3011 | if (drbd_bm_init(mdev)) | ||
3012 | goto out_no_bitmap; | ||
3013 | /* no need to lock access, we are still initializing this minor device. */ | ||
3014 | if (!tl_init(mdev)) | ||
3015 | goto out_no_tl; | ||
3016 | |||
3017 | mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); | ||
3018 | if (!mdev->app_reads_hash) | ||
3019 | goto out_no_app_reads; | ||
3020 | |||
3021 | mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | ||
3022 | if (!mdev->current_epoch) | ||
3023 | goto out_no_epoch; | ||
3024 | |||
3025 | INIT_LIST_HEAD(&mdev->current_epoch->list); | ||
3026 | mdev->epochs = 1; | ||
3027 | |||
3028 | return mdev; | ||
3029 | |||
3030 | /* out_whatever_else: | ||
3031 | kfree(mdev->current_epoch); */ | ||
3032 | out_no_epoch: | ||
3033 | kfree(mdev->app_reads_hash); | ||
3034 | out_no_app_reads: | ||
3035 | tl_cleanup(mdev); | ||
3036 | out_no_tl: | ||
3037 | drbd_bm_cleanup(mdev); | ||
3038 | out_no_bitmap: | ||
3039 | __free_page(mdev->md_io_page); | ||
3040 | out_no_io_page: | ||
3041 | put_disk(disk); | ||
3042 | out_no_disk: | ||
3043 | blk_cleanup_queue(q); | ||
3044 | out_no_q: | ||
3045 | free_cpumask_var(mdev->cpu_mask); | ||
3046 | out_no_cpumask: | ||
3047 | kfree(mdev); | ||
3048 | return NULL; | ||
3049 | } | ||
3050 | |||
3051 | /* counterpart of drbd_new_device. | ||
3052 | * last part of drbd_delete_device. */ | ||
3053 | void drbd_free_mdev(struct drbd_conf *mdev) | ||
3054 | { | ||
3055 | kfree(mdev->current_epoch); | ||
3056 | kfree(mdev->app_reads_hash); | ||
3057 | tl_cleanup(mdev); | ||
3058 | if (mdev->bitmap) /* should no longer be there. */ | ||
3059 | drbd_bm_cleanup(mdev); | ||
3060 | __free_page(mdev->md_io_page); | ||
3061 | put_disk(mdev->vdisk); | ||
3062 | blk_cleanup_queue(mdev->rq_queue); | ||
3063 | free_cpumask_var(mdev->cpu_mask); | ||
3064 | kfree(mdev); | ||
3065 | } | ||
3066 | |||
3067 | |||
3068 | int __init drbd_init(void) | ||
3069 | { | ||
3070 | int err; | ||
3071 | |||
3072 | if (sizeof(struct p_handshake) != 80) { | ||
3073 | printk(KERN_ERR | ||
3074 | "drbd: never change the size or layout " | ||
3075 | "of the HandShake packet.\n"); | ||
3076 | return -EINVAL; | ||
3077 | } | ||
3078 | |||
3079 | if (1 > minor_count || minor_count > 255) { | ||
3080 | printk(KERN_ERR | ||
3081 | "drbd: invalid minor_count (%d)\n", minor_count); | ||
3082 | #ifdef MODULE | ||
3083 | return -EINVAL; | ||
3084 | #else | ||
3085 | minor_count = 8; | ||
3086 | #endif | ||
3087 | } | ||
3088 | |||
3089 | err = drbd_nl_init(); | ||
3090 | if (err) | ||
3091 | return err; | ||
3092 | |||
3093 | err = register_blkdev(DRBD_MAJOR, "drbd"); | ||
3094 | if (err) { | ||
3095 | printk(KERN_ERR | ||
3096 | "drbd: unable to register block device major %d\n", | ||
3097 | DRBD_MAJOR); | ||
3098 | return err; | ||
3099 | } | ||
3100 | |||
3101 | register_reboot_notifier(&drbd_notifier); | ||
3102 | |||
3103 | /* | ||
3104 | * allocate all necessary structs | ||
3105 | */ | ||
3106 | err = -ENOMEM; | ||
3107 | |||
3108 | init_waitqueue_head(&drbd_pp_wait); | ||
3109 | |||
3110 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | ||
3111 | minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, | ||
3112 | GFP_KERNEL); | ||
3113 | if (!minor_table) | ||
3114 | goto Enomem; | ||
3115 | |||
3116 | err = drbd_create_mempools(); | ||
3117 | if (err) | ||
3118 | goto Enomem; | ||
3119 | |||
3120 | drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); | ||
3121 | if (!drbd_proc) { | ||
3122 | printk(KERN_ERR "drbd: unable to register proc file\n"); | ||
3123 | goto Enomem; | ||
3124 | } | ||
3125 | |||
3126 | rwlock_init(&global_state_lock); | ||
3127 | |||
3128 | printk(KERN_INFO "drbd: initialized. " | ||
3129 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", | ||
3130 | API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); | ||
3131 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); | ||
3132 | printk(KERN_INFO "drbd: registered as block device major %d\n", | ||
3133 | DRBD_MAJOR); | ||
3134 | printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); | ||
3135 | |||
3136 | return 0; /* Success! */ | ||
3137 | |||
3138 | Enomem: | ||
3139 | drbd_cleanup(); | ||
3140 | if (err == -ENOMEM) | ||
3141 | /* currently always the case */ | ||
3142 | printk(KERN_ERR "drbd: ran out of memory\n"); | ||
3143 | else | ||
3144 | printk(KERN_ERR "drbd: initialization failure\n"); | ||
3145 | return err; | ||
3146 | } | ||
3147 | |||
3148 | void drbd_free_bc(struct drbd_backing_dev *ldev) | ||
3149 | { | ||
3150 | if (ldev == NULL) | ||
3151 | return; | ||
3152 | |||
3153 | bd_release(ldev->backing_bdev); | ||
3154 | bd_release(ldev->md_bdev); | ||
3155 | |||
3156 | fput(ldev->lo_file); | ||
3157 | fput(ldev->md_file); | ||
3158 | |||
3159 | kfree(ldev); | ||
3160 | } | ||
3161 | |||
3162 | void drbd_free_sock(struct drbd_conf *mdev) | ||
3163 | { | ||
3164 | if (mdev->data.socket) { | ||
3165 | kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); | ||
3166 | sock_release(mdev->data.socket); | ||
3167 | mdev->data.socket = NULL; | ||
3168 | } | ||
3169 | if (mdev->meta.socket) { | ||
3170 | kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); | ||
3171 | sock_release(mdev->meta.socket); | ||
3172 | mdev->meta.socket = NULL; | ||
3173 | } | ||
3174 | } | ||
3175 | |||
3176 | |||
3177 | void drbd_free_resources(struct drbd_conf *mdev) | ||
3178 | { | ||
3179 | crypto_free_hash(mdev->csums_tfm); | ||
3180 | mdev->csums_tfm = NULL; | ||
3181 | crypto_free_hash(mdev->verify_tfm); | ||
3182 | mdev->verify_tfm = NULL; | ||
3183 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3184 | mdev->cram_hmac_tfm = NULL; | ||
3185 | crypto_free_hash(mdev->integrity_w_tfm); | ||
3186 | mdev->integrity_w_tfm = NULL; | ||
3187 | crypto_free_hash(mdev->integrity_r_tfm); | ||
3188 | mdev->integrity_r_tfm = NULL; | ||
3189 | |||
3190 | drbd_free_sock(mdev); | ||
3191 | |||
3192 | __no_warn(local, | ||
3193 | drbd_free_bc(mdev->ldev); | ||
3194 | mdev->ldev = NULL;); | ||
3195 | } | ||
3196 | |||
3197 | /* meta data management */ | ||
3198 | |||
/*
 * Layout of the meta data super block as it is stored on disk.
 *
 * drbd_md_sync() fills this structure inside a zeroed 512 byte buffer and
 * converts every field it writes to big endian; drbd_md_read() converts
 * back.  The structure is packed: neither the order nor the size of the
 * members may change without breaking existing meta data.
 */
struct meta_data_on_disk {
	u64 la_size;           /* last agreed size. */
	u64 uuid[UI_SIZE];   /* UUIDs. */
	u64 device_uuid;
	u64 reserved_u64_1;    /* not written by drbd_md_sync(), stays zero */
	u32 flags;             /* MDF */
	u32 magic;             /* DRBD_MD_MAGIC, verified by drbd_md_read() */
	u32 md_size_sect;
	u32 al_offset;         /* offset to this block */
	u32 al_nr_extents;     /* important for restoring the AL */
	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
	u32 bm_offset;         /* offset to the bitmap, from here */
	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
	u32 reserved_u32[4];   /* not written by drbd_md_sync(), stays zero */

} __packed;
3215 | |||
3216 | /** | ||
3217 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | ||
3218 | * @mdev: DRBD device. | ||
3219 | */ | ||
3220 | void drbd_md_sync(struct drbd_conf *mdev) | ||
3221 | { | ||
3222 | struct meta_data_on_disk *buffer; | ||
3223 | sector_t sector; | ||
3224 | int i; | ||
3225 | |||
3226 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
3227 | return; | ||
3228 | del_timer(&mdev->md_sync_timer); | ||
3229 | |||
3230 | /* We use here D_FAILED and not D_ATTACHING because we try to write | ||
3231 | * metadata even if we detach due to a disk failure! */ | ||
3232 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
3233 | return; | ||
3234 | |||
3235 | mutex_lock(&mdev->md_io_mutex); | ||
3236 | buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); | ||
3237 | memset(buffer, 0, 512); | ||
3238 | |||
3239 | buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | ||
3240 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3241 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | ||
3242 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | ||
3243 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); | ||
3244 | |||
3245 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); | ||
3246 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); | ||
3247 | buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); | ||
3248 | buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); | ||
3249 | buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); | ||
3250 | |||
3251 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); | ||
3252 | |||
3253 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | ||
3254 | sector = mdev->ldev->md.md_offset; | ||
3255 | |||
3256 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | ||
3257 | clear_bit(MD_DIRTY, &mdev->flags); | ||
3258 | } else { | ||
3259 | /* this was a try anyways ... */ | ||
3260 | dev_err(DEV, "meta data update failed!\n"); | ||
3261 | |||
3262 | drbd_chk_io_error(mdev, 1, TRUE); | ||
3263 | } | ||
3264 | |||
3265 | /* Update mdev->ldev->md.la_size_sect, | ||
3266 | * since we updated it on metadata. */ | ||
3267 | mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); | ||
3268 | |||
3269 | mutex_unlock(&mdev->md_io_mutex); | ||
3270 | put_ldev(mdev); | ||
3271 | } | ||
3272 | |||
3273 | /** | ||
3274 | * drbd_md_read() - Reads in the meta data super block | ||
3275 | * @mdev: DRBD device. | ||
3276 | * @bdev: Device from which the meta data should be read in. | ||
3277 | * | ||
3278 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case | ||
3279 | * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. | ||
3280 | */ | ||
3281 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
3282 | { | ||
3283 | struct meta_data_on_disk *buffer; | ||
3284 | int i, rv = NO_ERROR; | ||
3285 | |||
3286 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
3287 | return ERR_IO_MD_DISK; | ||
3288 | |||
3289 | mutex_lock(&mdev->md_io_mutex); | ||
3290 | buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); | ||
3291 | |||
3292 | if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | ||
3293 | /* NOTE: cant do normal error processing here as this is | ||
3294 | called BEFORE disk is attached */ | ||
3295 | dev_err(DEV, "Error while reading metadata.\n"); | ||
3296 | rv = ERR_IO_MD_DISK; | ||
3297 | goto err; | ||
3298 | } | ||
3299 | |||
3300 | if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { | ||
3301 | dev_err(DEV, "Error while reading metadata, magic not found.\n"); | ||
3302 | rv = ERR_MD_INVALID; | ||
3303 | goto err; | ||
3304 | } | ||
3305 | if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { | ||
3306 | dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", | ||
3307 | be32_to_cpu(buffer->al_offset), bdev->md.al_offset); | ||
3308 | rv = ERR_MD_INVALID; | ||
3309 | goto err; | ||
3310 | } | ||
3311 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { | ||
3312 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", | ||
3313 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); | ||
3314 | rv = ERR_MD_INVALID; | ||
3315 | goto err; | ||
3316 | } | ||
3317 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { | ||
3318 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", | ||
3319 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); | ||
3320 | rv = ERR_MD_INVALID; | ||
3321 | goto err; | ||
3322 | } | ||
3323 | |||
3324 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { | ||
3325 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", | ||
3326 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); | ||
3327 | rv = ERR_MD_INVALID; | ||
3328 | goto err; | ||
3329 | } | ||
3330 | |||
3331 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); | ||
3332 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3333 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
3334 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
3335 | mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); | ||
3336 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
3337 | |||
3338 | if (mdev->sync_conf.al_extents < 7) | ||
3339 | mdev->sync_conf.al_extents = 127; | ||
3340 | |||
3341 | err: | ||
3342 | mutex_unlock(&mdev->md_io_mutex); | ||
3343 | put_ldev(mdev); | ||
3344 | |||
3345 | return rv; | ||
3346 | } | ||
3347 | |||
3348 | /** | ||
3349 | * drbd_md_mark_dirty() - Mark meta data super block as dirty | ||
3350 | * @mdev: DRBD device. | ||
3351 | * | ||
3352 | * Call this function if you change anything that should be written to | ||
3353 | * the meta-data super block. This function sets MD_DIRTY, and starts a | ||
3354 | * timer that ensures that within five seconds you have to call drbd_md_sync(). | ||
3355 | */ | ||
3356 | void drbd_md_mark_dirty(struct drbd_conf *mdev) | ||
3357 | { | ||
3358 | set_bit(MD_DIRTY, &mdev->flags); | ||
3359 | mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); | ||
3360 | } | ||
3361 | |||
3362 | |||
3363 | static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | ||
3364 | { | ||
3365 | int i; | ||
3366 | |||
3367 | for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) | ||
3368 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; | ||
3369 | } | ||
3370 | |||
3371 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3372 | { | ||
3373 | if (idx == UI_CURRENT) { | ||
3374 | if (mdev->state.role == R_PRIMARY) | ||
3375 | val |= 1; | ||
3376 | else | ||
3377 | val &= ~((u64)1); | ||
3378 | |||
3379 | drbd_set_ed_uuid(mdev, val); | ||
3380 | } | ||
3381 | |||
3382 | mdev->ldev->md.uuid[idx] = val; | ||
3383 | drbd_md_mark_dirty(mdev); | ||
3384 | } | ||
3385 | |||
3386 | |||
3387 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3388 | { | ||
3389 | if (mdev->ldev->md.uuid[idx]) { | ||
3390 | drbd_uuid_move_history(mdev); | ||
3391 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; | ||
3392 | } | ||
3393 | _drbd_uuid_set(mdev, idx, val); | ||
3394 | } | ||
3395 | |||
3396 | /** | ||
3397 | * drbd_uuid_new_current() - Creates a new current UUID | ||
3398 | * @mdev: DRBD device. | ||
3399 | * | ||
3400 | * Creates a new current UUID, and rotates the old current UUID into | ||
3401 | * the bitmap slot. Causes an incremental resync upon next connect. | ||
3402 | */ | ||
3403 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | ||
3404 | { | ||
3405 | u64 val; | ||
3406 | |||
3407 | dev_info(DEV, "Creating new current UUID\n"); | ||
3408 | D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); | ||
3409 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; | ||
3410 | |||
3411 | get_random_bytes(&val, sizeof(u64)); | ||
3412 | _drbd_uuid_set(mdev, UI_CURRENT, val); | ||
3413 | } | ||
3414 | |||
3415 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | ||
3416 | { | ||
3417 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) | ||
3418 | return; | ||
3419 | |||
3420 | if (val == 0) { | ||
3421 | drbd_uuid_move_history(mdev); | ||
3422 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | ||
3423 | mdev->ldev->md.uuid[UI_BITMAP] = 0; | ||
3424 | } else { | ||
3425 | if (mdev->ldev->md.uuid[UI_BITMAP]) | ||
3426 | dev_warn(DEV, "bm UUID already set"); | ||
3427 | |||
3428 | mdev->ldev->md.uuid[UI_BITMAP] = val; | ||
3429 | mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); | ||
3430 | |||
3431 | } | ||
3432 | drbd_md_mark_dirty(mdev); | ||
3433 | } | ||
3434 | |||
3435 | /** | ||
3436 | * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() | ||
3437 | * @mdev: DRBD device. | ||
3438 | * | ||
3439 | * Sets all bits in the bitmap and writes the whole bitmap to stable storage. | ||
3440 | */ | ||
3441 | int drbd_bmio_set_n_write(struct drbd_conf *mdev) | ||
3442 | { | ||
3443 | int rv = -EIO; | ||
3444 | |||
3445 | if (get_ldev_if_state(mdev, D_ATTACHING)) { | ||
3446 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | ||
3447 | drbd_md_sync(mdev); | ||
3448 | drbd_bm_set_all(mdev); | ||
3449 | |||
3450 | rv = drbd_bm_write(mdev); | ||
3451 | |||
3452 | if (!rv) { | ||
3453 | drbd_md_clear_flag(mdev, MDF_FULL_SYNC); | ||
3454 | drbd_md_sync(mdev); | ||
3455 | } | ||
3456 | |||
3457 | put_ldev(mdev); | ||
3458 | } | ||
3459 | |||
3460 | return rv; | ||
3461 | } | ||
3462 | |||
3463 | /** | ||
3464 | * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() | ||
3465 | * @mdev: DRBD device. | ||
3466 | * | ||
3467 | * Clears all bits in the bitmap and writes the whole bitmap to stable storage. | ||
3468 | */ | ||
3469 | int drbd_bmio_clear_n_write(struct drbd_conf *mdev) | ||
3470 | { | ||
3471 | int rv = -EIO; | ||
3472 | |||
3473 | if (get_ldev_if_state(mdev, D_ATTACHING)) { | ||
3474 | drbd_bm_clear_all(mdev); | ||
3475 | rv = drbd_bm_write(mdev); | ||
3476 | put_ldev(mdev); | ||
3477 | } | ||
3478 | |||
3479 | return rv; | ||
3480 | } | ||
3481 | |||
3482 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
3483 | { | ||
3484 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); | ||
3485 | int rv; | ||
3486 | |||
3487 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); | ||
3488 | |||
3489 | drbd_bm_lock(mdev, work->why); | ||
3490 | rv = work->io_fn(mdev); | ||
3491 | drbd_bm_unlock(mdev); | ||
3492 | |||
3493 | clear_bit(BITMAP_IO, &mdev->flags); | ||
3494 | wake_up(&mdev->misc_wait); | ||
3495 | |||
3496 | if (work->done) | ||
3497 | work->done(mdev, rv); | ||
3498 | |||
3499 | clear_bit(BITMAP_IO_QUEUED, &mdev->flags); | ||
3500 | work->why = NULL; | ||
3501 | |||
3502 | return 1; | ||
3503 | } | ||
3504 | |||
3505 | /** | ||
3506 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap | ||
3507 | * @mdev: DRBD device. | ||
3508 | * @io_fn: IO callback to be called when bitmap IO is possible | ||
3509 | * @done: callback to be called after the bitmap IO was performed | ||
3510 | * @why: Descriptive text of the reason for doing the IO | ||
3511 | * | ||
3512 | * While IO on the bitmap happens we freeze application IO thus we ensure | ||
3513 | * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be | ||
3514 | * called from worker context. It MUST NOT be used while a previous such | ||
3515 | * work is still pending! | ||
3516 | */ | ||
3517 | void drbd_queue_bitmap_io(struct drbd_conf *mdev, | ||
3518 | int (*io_fn)(struct drbd_conf *), | ||
3519 | void (*done)(struct drbd_conf *, int), | ||
3520 | char *why) | ||
3521 | { | ||
3522 | D_ASSERT(current == mdev->worker.task); | ||
3523 | |||
3524 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); | ||
3525 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); | ||
3526 | D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); | ||
3527 | if (mdev->bm_io_work.why) | ||
3528 | dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", | ||
3529 | why, mdev->bm_io_work.why); | ||
3530 | |||
3531 | mdev->bm_io_work.io_fn = io_fn; | ||
3532 | mdev->bm_io_work.done = done; | ||
3533 | mdev->bm_io_work.why = why; | ||
3534 | |||
3535 | set_bit(BITMAP_IO, &mdev->flags); | ||
3536 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { | ||
3537 | if (list_empty(&mdev->bm_io_work.w.list)) { | ||
3538 | set_bit(BITMAP_IO_QUEUED, &mdev->flags); | ||
3539 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | ||
3540 | } else | ||
3541 | dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); | ||
3542 | } | ||
3543 | } | ||
3544 | |||
3545 | /** | ||
3546 | * drbd_bitmap_io() - Does an IO operation on the whole bitmap | ||
3547 | * @mdev: DRBD device. | ||
3548 | * @io_fn: IO callback to be called when bitmap IO is possible | ||
3549 | * @why: Descriptive text of the reason for doing the IO | ||
3550 | * | ||
3551 | * freezes application IO while that the actual IO operations runs. This | ||
3552 | * functions MAY NOT be called from worker context. | ||
3553 | */ | ||
3554 | int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) | ||
3555 | { | ||
3556 | int rv; | ||
3557 | |||
3558 | D_ASSERT(current != mdev->worker.task); | ||
3559 | |||
3560 | drbd_suspend_io(mdev); | ||
3561 | |||
3562 | drbd_bm_lock(mdev, why); | ||
3563 | rv = io_fn(mdev); | ||
3564 | drbd_bm_unlock(mdev); | ||
3565 | |||
3566 | drbd_resume_io(mdev); | ||
3567 | |||
3568 | return rv; | ||
3569 | } | ||
3570 | |||
3571 | void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local) | ||
3572 | { | ||
3573 | if ((mdev->ldev->md.flags & flag) != flag) { | ||
3574 | drbd_md_mark_dirty(mdev); | ||
3575 | mdev->ldev->md.flags |= flag; | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3579 | void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local) | ||
3580 | { | ||
3581 | if ((mdev->ldev->md.flags & flag) != 0) { | ||
3582 | drbd_md_mark_dirty(mdev); | ||
3583 | mdev->ldev->md.flags &= ~flag; | ||
3584 | } | ||
3585 | } | ||
3586 | int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) | ||
3587 | { | ||
3588 | return (bdev->md.flags & flag) != 0; | ||
3589 | } | ||
3590 | |||
/*
 * md_sync_timer_fn() - timer callback; @data carries the struct drbd_conf
 * pointer cast to unsigned long.
 *
 * Does not write anything itself: it only puts md_sync_work at the front
 * of the data work queue.
 * NOTE(review): presumably installed as the handler of md_sync_timer and
 * paired with w_md_sync() -- the wiring is not visible here, confirm.
 */
static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}
3597 | |||
/*
 * w_md_sync() - worker callback that writes out dirty meta data
 * @mdev:	DRBD device.
 * @w:		the work item (not used).
 * @unused:	not used.
 *
 * Logs a warning, because reaching this point means the sync timer
 * expired before anybody called drbd_md_sync(), and then performs the
 * sync.  Always returns 1.
 */
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
	drbd_md_sync(mdev);

	return 1;
}
3605 | |||
3606 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
3607 | /* Fault insertion support including random number generator shamelessly | ||
3608 | * stolen from kernel/rcutorture.c */ | ||
/* State of the pseudo random number generator used for fault injection. */
struct fault_random_state {
	unsigned long state;	/* current generator value */
	unsigned long count;	/* consulted by _drbd_fault_random() to decide
				 * when to mix in get_random_bytes() again */
};

/* Multiplier and increment of the linear congruential generator. */
#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD	479001701 /* prime */
/* Value the count member is reset to after each re-seed. */
#define FAULT_RANDOM_REFRESH 10000
3617 | |||
3618 | /* | ||
3619 | * Crude but fast random-number generator. Uses a linear congruential | ||
3620 | * generator, with occasional help from get_random_bytes(). | ||
3621 | */ | ||
3622 | static unsigned long | ||
3623 | _drbd_fault_random(struct fault_random_state *rsp) | ||
3624 | { | ||
3625 | long refresh; | ||
3626 | |||
3627 | if (--rsp->count < 0) { | ||
3628 | get_random_bytes(&refresh, sizeof(refresh)); | ||
3629 | rsp->state += refresh; | ||
3630 | rsp->count = FAULT_RANDOM_REFRESH; | ||
3631 | } | ||
3632 | rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; | ||
3633 | return swahw32(rsp->state); | ||
3634 | } | ||
3635 | |||
3636 | static char * | ||
3637 | _drbd_fault_str(unsigned int type) { | ||
3638 | static char *_faults[] = { | ||
3639 | [DRBD_FAULT_MD_WR] = "Meta-data write", | ||
3640 | [DRBD_FAULT_MD_RD] = "Meta-data read", | ||
3641 | [DRBD_FAULT_RS_WR] = "Resync write", | ||
3642 | [DRBD_FAULT_RS_RD] = "Resync read", | ||
3643 | [DRBD_FAULT_DT_WR] = "Data write", | ||
3644 | [DRBD_FAULT_DT_RD] = "Data read", | ||
3645 | [DRBD_FAULT_DT_RA] = "Data read ahead", | ||
3646 | [DRBD_FAULT_BM_ALLOC] = "BM allocation", | ||
3647 | [DRBD_FAULT_AL_EE] = "EE allocation" | ||
3648 | }; | ||
3649 | |||
3650 | return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; | ||
3651 | } | ||
3652 | |||
3653 | unsigned int | ||
3654 | _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) | ||
3655 | { | ||
3656 | static struct fault_random_state rrs = {0, 0}; | ||
3657 | |||
3658 | unsigned int ret = ( | ||
3659 | (fault_devs == 0 || | ||
3660 | ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && | ||
3661 | (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); | ||
3662 | |||
3663 | if (ret) { | ||
3664 | fault_count++; | ||
3665 | |||
3666 | if (printk_ratelimit()) | ||
3667 | dev_warn(DEV, "***Simulating %s failure\n", | ||
3668 | _drbd_fault_str(type)); | ||
3669 | } | ||
3670 | |||
3671 | return ret; | ||
3672 | } | ||
3673 | #endif | ||
3674 | |||
/*
 * drbd_buildtag() - string identifying this build
 *
 * DRBD built from external sources has here a reference to the
 * git hash of the source code.
 *
 * The tag is generated on the first call and cached in a static buffer:
 * "srcversion: ..." when built as a module, "built-in" otherwise.
 */
const char *drbd_buildtag(void)
{
	/* The leading NUL marks the buffer as "not generated yet"; it is
	 * either overwritten by the srcversion text or turned into the 'b'
	 * of "built-in". */
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: never write past the static buffer,
			 * whatever the length of srcversion is */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3693 | |||
/* Module entry and exit points. */
module_init(drbd_init)
module_exit(drbd_cleanup)

/* Symbols made available to other kernel code.
 * NOTE(review): presumably the state-to-string helpers -- their
 * definitions are not part of this file section, confirm. */
EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 000000000000..22538d9628f1 --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -0,0 +1,2360 @@ | |||
1 | /* | ||
2 | drbd_nl.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include <linux/in.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/file.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/connector.h> | ||
33 | #include <linux/blkpg.h> | ||
34 | #include <linux/cpumask.h> | ||
35 | #include "drbd_int.h" | ||
36 | #include "drbd_wrappers.h" | ||
37 | #include <asm/unaligned.h> | ||
38 | #include <linux/drbd_tag_magic.h> | ||
39 | #include <linux/drbd_limits.h> | ||
40 | |||
/* Forward declarations of the helpers that append a single tag to a tag
 * list.  NOTE(review): presumably each returns the advanced write
 * position -- confirm against the definitions. */
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);

/* see get_sb_bdev and bd_claim */
/* holder string identifying DRBD as the owner of a meta data device */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
47 | |||
/* Generate the tag_list to struct functions */
/*
 * Including linux/drbd_nl.h with the macros below defined produces one
 * static <name>_from_tags() function per NL_PACKET() found there.
 *
 * Each generated function walks a list of unaligned 16 bit words:
 * a tag word, a length word (dlen), then dlen bytes of payload, repeated
 * until a TT_END tag is read.  Known tags are copied into the matching
 * member of @arg by the NL_INTEGER/NL_INT64/NL_BIT/NL_STRING cases.
 * Unknown tags are skipped unless they carry T_MANDATORY, in which case
 * an error is logged and 0 is returned.  Returns 1 on success.
 *
 * NL_STRING refuses payloads longer than the destination member and
 * records the received length in <member>_len.
 *
 * NOTE(review): nothing in these macros bounds the walk by the size of
 * the received message, and the integer cases do not look at dlen;
 * presumably the caller validates the list -- confirm.
 */
#define NL_PACKET(name, number, fields) \
static int name ## _from_tags(struct drbd_conf *mdev, \
	unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
static int name ## _from_tags(struct drbd_conf *mdev, \
	unsigned short *tags, struct name *arg) \
{ \
	int tag; \
	int dlen; \
	\
	while ((tag = get_unaligned(tags++)) != TT_END) { \
		dlen = get_unaligned(tags++); \
		switch (tag_number(tag)) { \
		fields \
		default: \
			if (tag & T_MANDATORY) { \
				dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
				return 0; \
			} \
		} \
		tags = (unsigned short *)((char *)tags + dlen); \
	} \
	return 1; \
}
#define NL_INTEGER(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
		arg->member = get_unaligned((int *)(tags)); \
		break;
#define NL_INT64(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
		arg->member = get_unaligned((u64 *)(tags)); \
		break;
#define NL_BIT(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
		arg->member = *(char *)(tags) ? 1 : 0; \
		break;
#define NL_STRING(pn, pr, member, len) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
		if (dlen > len) { \
			dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
				#member, dlen, (unsigned int)len); \
			return 0; \
		} \
		 arg->member ## _len = dlen; \
		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
		 break;
#include "linux/drbd_nl.h"
95 | |||
/* Generate the struct to tag_list functions */
/*
 * Second inclusion of linux/drbd_nl.h, this time producing one static
 * <name>_to_tags() function per NL_PACKET().
 * NOTE(review): redefining the NL_* macros here presumably relies on
 * linux/drbd_nl.h undefining them at its end -- confirm.
 *
 * Each generated function serializes every member of @arg at the write
 * position @tags as: tag word (number | flags | type), length word,
 * payload.  All stores go through put_unaligned(), except for the single
 * byte of NL_BIT and the memcpy() of NL_STRING.  The function returns the
 * write position behind the last member.  It does not append TT_END and
 * does not check the remaining space in the destination buffer; both are
 * left to the caller.
 */
#define NL_PACKET(name, number, fields) \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
	struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
	struct name *arg, unsigned short *tags) \
{ \
	fields \
	return tags; \
}

#define NL_INTEGER(pn, pr, member) \
	put_unaligned(pn | pr | TT_INTEGER, tags++);	\
	put_unaligned(sizeof(int), tags++);		\
	put_unaligned(arg->member, (int *)tags);	\
	tags = (unsigned short *)((char *)tags+sizeof(int));
#define NL_INT64(pn, pr, member) \
	put_unaligned(pn | pr | TT_INT64, tags++);	\
	put_unaligned(sizeof(u64), tags++);		\
	put_unaligned(arg->member, (u64 *)tags);	\
	tags = (unsigned short *)((char *)tags+sizeof(u64));
#define NL_BIT(pn, pr, member) \
	put_unaligned(pn | pr | TT_BIT, tags++);	\
	put_unaligned(sizeof(char), tags++);		\
	*(char *)tags = arg->member;			\
	tags = (unsigned short *)((char *)tags+sizeof(char));
#define NL_STRING(pn, pr, member, len) \
	put_unaligned(pn | pr | TT_STRING, tags++);	\
	put_unaligned(arg->member ## _len, tags++);	\
	memcpy(tags, arg->member, arg->member ## _len);	\
	tags = (unsigned short *)((char *)tags + arg->member ## _len);
#include "linux/drbd_nl.h"
130 | |||
131 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); | ||
132 | void drbd_nl_send_reply(struct cn_msg *, int); | ||
133 | |||
134 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) | ||
135 | { | ||
136 | char *envp[] = { "HOME=/", | ||
137 | "TERM=linux", | ||
138 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
139 | NULL, /* Will be set to address family */ | ||
140 | NULL, /* Will be set to address */ | ||
141 | NULL }; | ||
142 | |||
143 | char mb[12], af[20], ad[60], *afs; | ||
144 | char *argv[] = {usermode_helper, cmd, mb, NULL }; | ||
145 | int ret; | ||
146 | |||
147 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); | ||
148 | |||
149 | if (get_net_conf(mdev)) { | ||
150 | switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { | ||
151 | case AF_INET6: | ||
152 | afs = "ipv6"; | ||
153 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", | ||
154 | &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); | ||
155 | break; | ||
156 | case AF_INET: | ||
157 | afs = "ipv4"; | ||
158 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
159 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
160 | break; | ||
161 | default: | ||
162 | afs = "ssocks"; | ||
163 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
164 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
165 | } | ||
166 | snprintf(af, 20, "DRBD_PEER_AF=%s", afs); | ||
167 | envp[3]=af; | ||
168 | envp[4]=ad; | ||
169 | put_net_conf(mdev); | ||
170 | } | ||
171 | |||
172 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); | ||
173 | |||
174 | drbd_bcast_ev_helper(mdev, cmd); | ||
175 | ret = call_usermodehelper(usermode_helper, argv, envp, 1); | ||
176 | if (ret) | ||
177 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
178 | usermode_helper, cmd, mb, | ||
179 | (ret >> 8) & 0xff, ret); | ||
180 | else | ||
181 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
182 | usermode_helper, cmd, mb, | ||
183 | (ret >> 8) & 0xff, ret); | ||
184 | |||
185 | if (ret < 0) /* Ignore any ERRNOs we got. */ | ||
186 | ret = 0; | ||
187 | |||
188 | return ret; | ||
189 | } | ||
190 | |||
191 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) | ||
192 | { | ||
193 | char *ex_to_string; | ||
194 | int r; | ||
195 | enum drbd_disk_state nps; | ||
196 | enum drbd_fencing_p fp; | ||
197 | |||
198 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | ||
199 | |||
200 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | ||
201 | fp = mdev->ldev->dc.fencing; | ||
202 | put_ldev(mdev); | ||
203 | } else { | ||
204 | dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); | ||
205 | return mdev->state.pdsk; | ||
206 | } | ||
207 | |||
208 | if (fp == FP_STONITH) | ||
209 | _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); | ||
210 | |||
211 | r = drbd_khelper(mdev, "fence-peer"); | ||
212 | |||
213 | switch ((r>>8) & 0xff) { | ||
214 | case 3: /* peer is inconsistent */ | ||
215 | ex_to_string = "peer is inconsistent or worse"; | ||
216 | nps = D_INCONSISTENT; | ||
217 | break; | ||
218 | case 4: /* peer got outdated, or was already outdated */ | ||
219 | ex_to_string = "peer was fenced"; | ||
220 | nps = D_OUTDATED; | ||
221 | break; | ||
222 | case 5: /* peer was down */ | ||
223 | if (mdev->state.disk == D_UP_TO_DATE) { | ||
224 | /* we will(have) create(d) a new UUID anyways... */ | ||
225 | ex_to_string = "peer is unreachable, assumed to be dead"; | ||
226 | nps = D_OUTDATED; | ||
227 | } else { | ||
228 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; | ||
229 | nps = mdev->state.pdsk; | ||
230 | } | ||
231 | break; | ||
232 | case 6: /* Peer is primary, voluntarily outdate myself. | ||
233 | * This is useful when an unconnected R_SECONDARY is asked to | ||
234 | * become R_PRIMARY, but finds the other peer being active. */ | ||
235 | ex_to_string = "peer is active"; | ||
236 | dev_warn(DEV, "Peer is primary, outdating myself.\n"); | ||
237 | nps = D_UNKNOWN; | ||
238 | _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); | ||
239 | break; | ||
240 | case 7: | ||
241 | if (fp != FP_STONITH) | ||
242 | dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); | ||
243 | ex_to_string = "peer was stonithed"; | ||
244 | nps = D_OUTDATED; | ||
245 | break; | ||
246 | default: | ||
247 | /* The script is broken ... */ | ||
248 | nps = D_UNKNOWN; | ||
249 | dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); | ||
250 | return nps; | ||
251 | } | ||
252 | |||
253 | dev_info(DEV, "fence-peer helper returned %d (%s)\n", | ||
254 | (r>>8) & 0xff, ex_to_string); | ||
255 | return nps; | ||
256 | } | ||
257 | |||
258 | |||
259 | int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | ||
260 | { | ||
261 | const int max_tries = 4; | ||
262 | int r = 0; | ||
263 | int try = 0; | ||
264 | int forced = 0; | ||
265 | union drbd_state mask, val; | ||
266 | enum drbd_disk_state nps; | ||
267 | |||
268 | if (new_role == R_PRIMARY) | ||
269 | request_ping(mdev); /* Detect a dead peer ASAP */ | ||
270 | |||
271 | mutex_lock(&mdev->state_mutex); | ||
272 | |||
273 | mask.i = 0; mask.role = R_MASK; | ||
274 | val.i = 0; val.role = new_role; | ||
275 | |||
276 | while (try++ < max_tries) { | ||
277 | r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); | ||
278 | |||
279 | /* in case we first succeeded to outdate, | ||
280 | * but now suddenly could establish a connection */ | ||
281 | if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { | ||
282 | val.pdsk = 0; | ||
283 | mask.pdsk = 0; | ||
284 | continue; | ||
285 | } | ||
286 | |||
287 | if (r == SS_NO_UP_TO_DATE_DISK && force && | ||
288 | (mdev->state.disk == D_INCONSISTENT || | ||
289 | mdev->state.disk == D_OUTDATED)) { | ||
290 | mask.disk = D_MASK; | ||
291 | val.disk = D_UP_TO_DATE; | ||
292 | forced = 1; | ||
293 | continue; | ||
294 | } | ||
295 | |||
296 | if (r == SS_NO_UP_TO_DATE_DISK && | ||
297 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { | ||
298 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | ||
299 | nps = drbd_try_outdate_peer(mdev); | ||
300 | |||
301 | if (nps == D_OUTDATED || nps == D_INCONSISTENT) { | ||
302 | val.disk = D_UP_TO_DATE; | ||
303 | mask.disk = D_MASK; | ||
304 | } | ||
305 | |||
306 | val.pdsk = nps; | ||
307 | mask.pdsk = D_MASK; | ||
308 | |||
309 | continue; | ||
310 | } | ||
311 | |||
312 | if (r == SS_NOTHING_TO_DO) | ||
313 | goto fail; | ||
314 | if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { | ||
315 | nps = drbd_try_outdate_peer(mdev); | ||
316 | |||
317 | if (force && nps > D_OUTDATED) { | ||
318 | dev_warn(DEV, "Forced into split brain situation!\n"); | ||
319 | nps = D_OUTDATED; | ||
320 | } | ||
321 | |||
322 | mask.pdsk = D_MASK; | ||
323 | val.pdsk = nps; | ||
324 | |||
325 | continue; | ||
326 | } | ||
327 | if (r == SS_TWO_PRIMARIES) { | ||
328 | /* Maybe the peer is detected as dead very soon... | ||
329 | retry at most once more in this case. */ | ||
330 | __set_current_state(TASK_INTERRUPTIBLE); | ||
331 | schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); | ||
332 | if (try < max_tries) | ||
333 | try = max_tries - 1; | ||
334 | continue; | ||
335 | } | ||
336 | if (r < SS_SUCCESS) { | ||
337 | r = _drbd_request_state(mdev, mask, val, | ||
338 | CS_VERBOSE + CS_WAIT_COMPLETE); | ||
339 | if (r < SS_SUCCESS) | ||
340 | goto fail; | ||
341 | } | ||
342 | break; | ||
343 | } | ||
344 | |||
345 | if (r < SS_SUCCESS) | ||
346 | goto fail; | ||
347 | |||
348 | if (forced) | ||
349 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); | ||
350 | |||
351 | /* Wait until nothing is on the fly :) */ | ||
352 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); | ||
353 | |||
354 | if (new_role == R_SECONDARY) { | ||
355 | set_disk_ro(mdev->vdisk, TRUE); | ||
356 | if (get_ldev(mdev)) { | ||
357 | mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; | ||
358 | put_ldev(mdev); | ||
359 | } | ||
360 | } else { | ||
361 | if (get_net_conf(mdev)) { | ||
362 | mdev->net_conf->want_lose = 0; | ||
363 | put_net_conf(mdev); | ||
364 | } | ||
365 | set_disk_ro(mdev->vdisk, FALSE); | ||
366 | if (get_ldev(mdev)) { | ||
367 | if (((mdev->state.conn < C_CONNECTED || | ||
368 | mdev->state.pdsk <= D_FAILED) | ||
369 | && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced) | ||
370 | drbd_uuid_new_current(mdev); | ||
371 | |||
372 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | ||
373 | put_ldev(mdev); | ||
374 | } | ||
375 | } | ||
376 | |||
377 | if ((new_role == R_SECONDARY) && get_ldev(mdev)) { | ||
378 | drbd_al_to_on_disk_bm(mdev); | ||
379 | put_ldev(mdev); | ||
380 | } | ||
381 | |||
382 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) { | ||
383 | /* if this was forced, we should consider sync */ | ||
384 | if (forced) | ||
385 | drbd_send_uuids(mdev); | ||
386 | drbd_send_state(mdev); | ||
387 | } | ||
388 | |||
389 | drbd_md_sync(mdev); | ||
390 | |||
391 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
392 | fail: | ||
393 | mutex_unlock(&mdev->state_mutex); | ||
394 | return r; | ||
395 | } | ||
396 | |||
397 | |||
398 | static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
399 | struct drbd_nl_cfg_reply *reply) | ||
400 | { | ||
401 | struct primary primary_args; | ||
402 | |||
403 | memset(&primary_args, 0, sizeof(struct primary)); | ||
404 | if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { | ||
405 | reply->ret_code = ERR_MANDATORY_TAG; | ||
406 | return 0; | ||
407 | } | ||
408 | |||
409 | reply->ret_code = | ||
410 | drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); | ||
411 | |||
412 | return 0; | ||
413 | } | ||
414 | |||
415 | static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
416 | struct drbd_nl_cfg_reply *reply) | ||
417 | { | ||
418 | reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); | ||
419 | |||
420 | return 0; | ||
421 | } | ||
422 | |||
423 | /* initializes the md.*_offset members, so we are able to find | ||
424 | * the on disk meta data */ | ||
425 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | ||
426 | struct drbd_backing_dev *bdev) | ||
427 | { | ||
428 | sector_t md_size_sect = 0; | ||
429 | switch (bdev->dc.meta_dev_idx) { | ||
430 | default: | ||
431 | /* v07 style fixed size indexed meta data */ | ||
432 | bdev->md.md_size_sect = MD_RESERVED_SECT; | ||
433 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
434 | bdev->md.al_offset = MD_AL_OFFSET; | ||
435 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
436 | break; | ||
437 | case DRBD_MD_INDEX_FLEX_EXT: | ||
438 | /* just occupy the full device; unit: sectors */ | ||
439 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); | ||
440 | bdev->md.md_offset = 0; | ||
441 | bdev->md.al_offset = MD_AL_OFFSET; | ||
442 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
443 | break; | ||
444 | case DRBD_MD_INDEX_INTERNAL: | ||
445 | case DRBD_MD_INDEX_FLEX_INT: | ||
446 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
447 | /* al size is still fixed */ | ||
448 | bdev->md.al_offset = -MD_AL_MAX_SIZE; | ||
449 | /* we need (slightly less than) ~ this much bitmap sectors: */ | ||
450 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | ||
451 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | ||
452 | md_size_sect = BM_SECT_TO_EXT(md_size_sect); | ||
453 | md_size_sect = ALIGN(md_size_sect, 8); | ||
454 | |||
455 | /* plus the "drbd meta data super block", | ||
456 | * and the activity log; */ | ||
457 | md_size_sect += MD_BM_OFFSET; | ||
458 | |||
459 | bdev->md.md_size_sect = md_size_sect; | ||
460 | /* bitmap offset is adjusted by 'super' block size */ | ||
461 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | ||
462 | break; | ||
463 | } | ||
464 | } | ||
465 | |||
/*
 * ppsize() - pretty-print a size with a binary unit suffix
 * @buf:  destination buffer; needs 9 bytes at most ("16384 EB" plus NUL)
 * @size: value to print; the smallest unit emitted is "KB"
 *
 * Repeatedly divides @size by 1024 (rounding on the highest dropped bit)
 * until it is below 10000 or the largest known unit has been reached.
 * Returns @buf.
 */
char *ppsize(char *buf, unsigned long long size)
{
	static const char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	unsigned int base = 0;

	/* stop at the last unit, so units[base] never runs off the table */
	while (size >= 10000 && base < sizeof(units) - 1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	/* size is at most 16384 here; the cast matches the %lu conversion */
	sprintf(buf, "%lu %cB", (unsigned long)size, units[base]);

	return buf;
}
480 | |||
481 | /* there is still a theoretical deadlock when called from receiver | ||
482 | * on an D_INCONSISTENT R_PRIMARY: | ||
483 | * remote READ does inc_ap_bio, receiver would need to receive answer | ||
484 | * packet from remote to dec_ap_bio again. | ||
485 | * receiver receive_sizes(), comes here, | ||
486 | * waits for ap_bio_cnt == 0. -> deadlock. | ||
487 | * but this cannot happen, actually, because: | ||
488 | * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable | ||
489 | * (not connected, or bad/no disk on peer): | ||
490 | * see drbd_fail_request_early, ap_bio_cnt is zero. | ||
491 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: | ||
492 | * peer may not initiate a resize. | ||
493 | */ | ||
494 | void drbd_suspend_io(struct drbd_conf *mdev) | ||
495 | { | ||
496 | set_bit(SUSPEND_IO, &mdev->flags); | ||
497 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | ||
498 | } | ||
499 | |||
500 | void drbd_resume_io(struct drbd_conf *mdev) | ||
501 | { | ||
502 | clear_bit(SUSPEND_IO, &mdev->flags); | ||
503 | wake_up(&mdev->misc_wait); | ||
504 | } | ||
505 | |||
506 | /** | ||
507 | * drbd_determine_dev_size() - Sets the right device size obeying all constraints | ||
508 | * @mdev: DRBD device. | ||
509 | * | ||
510 | * Returns 0 on success, negative return values indicate errors. | ||
511 | * You should call drbd_md_sync() after calling this function. | ||
512 | */ | ||
513 | enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local) | ||
514 | { | ||
515 | sector_t prev_first_sect, prev_size; /* previous meta location */ | ||
516 | sector_t la_size; | ||
517 | sector_t size; | ||
518 | char ppb[10]; | ||
519 | |||
520 | int md_moved, la_size_changed; | ||
521 | enum determine_dev_size rv = unchanged; | ||
522 | |||
523 | /* race: | ||
524 | * application request passes inc_ap_bio, | ||
525 | * but then cannot get an AL-reference. | ||
526 | * this function later may wait on ap_bio_cnt == 0. -> deadlock. | ||
527 | * | ||
528 | * to avoid that: | ||
529 | * Suspend IO right here. | ||
530 | * still lock the act_log to not trigger ASSERTs there. | ||
531 | */ | ||
532 | drbd_suspend_io(mdev); | ||
533 | |||
534 | /* no wait necessary anymore, actually we could assert that */ | ||
535 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
536 | |||
537 | prev_first_sect = drbd_md_first_sector(mdev->ldev); | ||
538 | prev_size = mdev->ldev->md.md_size_sect; | ||
539 | la_size = mdev->ldev->md.la_size_sect; | ||
540 | |||
541 | /* TODO: should only be some assert here, not (re)init... */ | ||
542 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | ||
543 | |||
544 | size = drbd_new_dev_size(mdev, mdev->ldev); | ||
545 | |||
546 | if (drbd_get_capacity(mdev->this_bdev) != size || | ||
547 | drbd_bm_capacity(mdev) != size) { | ||
548 | int err; | ||
549 | err = drbd_bm_resize(mdev, size); | ||
550 | if (unlikely(err)) { | ||
551 | /* currently there is only one error: ENOMEM! */ | ||
552 | size = drbd_bm_capacity(mdev)>>1; | ||
553 | if (size == 0) { | ||
554 | dev_err(DEV, "OUT OF MEMORY! " | ||
555 | "Could not allocate bitmap!\n"); | ||
556 | } else { | ||
557 | dev_err(DEV, "BM resizing failed. " | ||
558 | "Leaving size unchanged at size = %lu KB\n", | ||
559 | (unsigned long)size); | ||
560 | } | ||
561 | rv = dev_size_error; | ||
562 | } | ||
563 | /* racy, see comments above. */ | ||
564 | drbd_set_my_capacity(mdev, size); | ||
565 | mdev->ldev->md.la_size_sect = size; | ||
566 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), | ||
567 | (unsigned long long)size>>1); | ||
568 | } | ||
569 | if (rv == dev_size_error) | ||
570 | goto out; | ||
571 | |||
572 | la_size_changed = (la_size != mdev->ldev->md.la_size_sect); | ||
573 | |||
574 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | ||
575 | || prev_size != mdev->ldev->md.md_size_sect; | ||
576 | |||
577 | if (la_size_changed || md_moved) { | ||
578 | drbd_al_shrink(mdev); /* All extents inactive. */ | ||
579 | dev_info(DEV, "Writing the whole bitmap, %s\n", | ||
580 | la_size_changed && md_moved ? "size changed and md moved" : | ||
581 | la_size_changed ? "size changed" : "md moved"); | ||
582 | rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ | ||
583 | drbd_md_mark_dirty(mdev); | ||
584 | } | ||
585 | |||
586 | if (size > la_size) | ||
587 | rv = grew; | ||
588 | if (size < la_size) | ||
589 | rv = shrunk; | ||
590 | out: | ||
591 | lc_unlock(mdev->act_log); | ||
592 | wake_up(&mdev->al_wait); | ||
593 | drbd_resume_io(mdev); | ||
594 | |||
595 | return rv; | ||
596 | } | ||
597 | |||
598 | sector_t | ||
599 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
600 | { | ||
601 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | ||
602 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | ||
603 | sector_t m_size; /* my size */ | ||
604 | sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ | ||
605 | sector_t size = 0; | ||
606 | |||
607 | m_size = drbd_get_max_capacity(bdev); | ||
608 | |||
609 | if (p_size && m_size) { | ||
610 | size = min_t(sector_t, p_size, m_size); | ||
611 | } else { | ||
612 | if (la_size) { | ||
613 | size = la_size; | ||
614 | if (m_size && m_size < size) | ||
615 | size = m_size; | ||
616 | if (p_size && p_size < size) | ||
617 | size = p_size; | ||
618 | } else { | ||
619 | if (m_size) | ||
620 | size = m_size; | ||
621 | if (p_size) | ||
622 | size = p_size; | ||
623 | } | ||
624 | } | ||
625 | |||
626 | if (size == 0) | ||
627 | dev_err(DEV, "Both nodes diskless!\n"); | ||
628 | |||
629 | if (u_size) { | ||
630 | if (u_size > size) | ||
631 | dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n", | ||
632 | (unsigned long)u_size>>1, (unsigned long)size>>1); | ||
633 | else | ||
634 | size = u_size; | ||
635 | } | ||
636 | |||
637 | return size; | ||
638 | } | ||
639 | |||
640 | /** | ||
641 | * drbd_check_al_size() - Ensures that the AL is of the right size | ||
642 | * @mdev: DRBD device. | ||
643 | * | ||
644 | * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation | ||
645 | * failed, and 0 on success. You should call drbd_md_sync() after you called | ||
646 | * this function. | ||
647 | */ | ||
648 | static int drbd_check_al_size(struct drbd_conf *mdev) | ||
649 | { | ||
650 | struct lru_cache *n, *t; | ||
651 | struct lc_element *e; | ||
652 | unsigned int in_use; | ||
653 | int i; | ||
654 | |||
655 | ERR_IF(mdev->sync_conf.al_extents < 7) | ||
656 | mdev->sync_conf.al_extents = 127; | ||
657 | |||
658 | if (mdev->act_log && | ||
659 | mdev->act_log->nr_elements == mdev->sync_conf.al_extents) | ||
660 | return 0; | ||
661 | |||
662 | in_use = 0; | ||
663 | t = mdev->act_log; | ||
664 | n = lc_create("act_log", drbd_al_ext_cache, | ||
665 | mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); | ||
666 | |||
667 | if (n == NULL) { | ||
668 | dev_err(DEV, "Cannot allocate act_log lru!\n"); | ||
669 | return -ENOMEM; | ||
670 | } | ||
671 | spin_lock_irq(&mdev->al_lock); | ||
672 | if (t) { | ||
673 | for (i = 0; i < t->nr_elements; i++) { | ||
674 | e = lc_element_by_index(t, i); | ||
675 | if (e->refcnt) | ||
676 | dev_err(DEV, "refcnt(%d)==%d\n", | ||
677 | e->lc_number, e->refcnt); | ||
678 | in_use += e->refcnt; | ||
679 | } | ||
680 | } | ||
681 | if (!in_use) | ||
682 | mdev->act_log = n; | ||
683 | spin_unlock_irq(&mdev->al_lock); | ||
684 | if (in_use) { | ||
685 | dev_err(DEV, "Activity log still in use!\n"); | ||
686 | lc_destroy(n); | ||
687 | return -EBUSY; | ||
688 | } else { | ||
689 | if (t) | ||
690 | lc_destroy(t); | ||
691 | } | ||
692 | drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */ | ||
693 | return 0; | ||
694 | } | ||
695 | |||
696 | void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) | ||
697 | { | ||
698 | struct request_queue * const q = mdev->rq_queue; | ||
699 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | ||
700 | int max_segments = mdev->ldev->dc.max_bio_bvecs; | ||
701 | |||
702 | if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) | ||
703 | max_seg_s = PAGE_SIZE; | ||
704 | |||
705 | max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); | ||
706 | |||
707 | blk_queue_max_sectors(q, max_seg_s >> 9); | ||
708 | blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); | ||
709 | blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); | ||
710 | blk_queue_max_segment_size(q, max_seg_s); | ||
711 | blk_queue_logical_block_size(q, 512); | ||
712 | blk_queue_segment_boundary(q, PAGE_SIZE-1); | ||
713 | blk_stack_limits(&q->limits, &b->limits, 0); | ||
714 | |||
715 | if (b->merge_bvec_fn) | ||
716 | dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", | ||
717 | b->merge_bvec_fn); | ||
718 | dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); | ||
719 | |||
720 | if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { | ||
721 | dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", | ||
722 | q->backing_dev_info.ra_pages, | ||
723 | b->backing_dev_info.ra_pages); | ||
724 | q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; | ||
725 | } | ||
726 | } | ||
727 | |||
728 | /* serialize deconfig (worker exiting, doing cleanup) | ||
729 | * and reconfig (drbdsetup disk, drbdsetup net) | ||
730 | * | ||
731 | * wait for a potentially exiting worker, then restart it, | ||
732 | * or start a new one. | ||
733 | */ | ||
734 | static void drbd_reconfig_start(struct drbd_conf *mdev) | ||
735 | { | ||
736 | wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags)); | ||
737 | wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); | ||
738 | drbd_thread_start(&mdev->worker); | ||
739 | } | ||
740 | |||
741 | /* if still unconfigured, stops worker again. | ||
742 | * if configured now, clears CONFIG_PENDING. | ||
743 | * wakes potential waiters */ | ||
744 | static void drbd_reconfig_done(struct drbd_conf *mdev) | ||
745 | { | ||
746 | spin_lock_irq(&mdev->req_lock); | ||
747 | if (mdev->state.disk == D_DISKLESS && | ||
748 | mdev->state.conn == C_STANDALONE && | ||
749 | mdev->state.role == R_SECONDARY) { | ||
750 | set_bit(DEVICE_DYING, &mdev->flags); | ||
751 | drbd_thread_stop_nowait(&mdev->worker); | ||
752 | } else | ||
753 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
754 | spin_unlock_irq(&mdev->req_lock); | ||
755 | wake_up(&mdev->state_wait); | ||
756 | } | ||
757 | |||
758 | /* does always return 0; | ||
759 | * interesting return code is in reply->ret_code */ | ||
760 | static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
761 | struct drbd_nl_cfg_reply *reply) | ||
762 | { | ||
763 | enum drbd_ret_codes retcode; | ||
764 | enum determine_dev_size dd; | ||
765 | sector_t max_possible_sectors; | ||
766 | sector_t min_md_device_sectors; | ||
767 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ | ||
768 | struct inode *inode, *inode2; | ||
769 | struct lru_cache *resync_lru = NULL; | ||
770 | union drbd_state ns, os; | ||
771 | int rv; | ||
772 | int cp_discovered = 0; | ||
773 | int logical_block_size; | ||
774 | |||
775 | drbd_reconfig_start(mdev); | ||
776 | |||
777 | /* if you want to reconfigure, please tear down first */ | ||
778 | if (mdev->state.disk > D_DISKLESS) { | ||
779 | retcode = ERR_DISK_CONFIGURED; | ||
780 | goto fail; | ||
781 | } | ||
782 | |||
783 | /* allocation not in the IO path, cqueue thread context */ | ||
784 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | ||
785 | if (!nbc) { | ||
786 | retcode = ERR_NOMEM; | ||
787 | goto fail; | ||
788 | } | ||
789 | |||
790 | nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; | ||
791 | nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; | ||
792 | nbc->dc.fencing = DRBD_FENCING_DEF; | ||
793 | nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; | ||
794 | |||
795 | if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { | ||
796 | retcode = ERR_MANDATORY_TAG; | ||
797 | goto fail; | ||
798 | } | ||
799 | |||
800 | if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | ||
801 | retcode = ERR_MD_IDX_INVALID; | ||
802 | goto fail; | ||
803 | } | ||
804 | |||
805 | nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); | ||
806 | if (IS_ERR(nbc->lo_file)) { | ||
807 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, | ||
808 | PTR_ERR(nbc->lo_file)); | ||
809 | nbc->lo_file = NULL; | ||
810 | retcode = ERR_OPEN_DISK; | ||
811 | goto fail; | ||
812 | } | ||
813 | |||
814 | inode = nbc->lo_file->f_dentry->d_inode; | ||
815 | |||
816 | if (!S_ISBLK(inode->i_mode)) { | ||
817 | retcode = ERR_DISK_NOT_BDEV; | ||
818 | goto fail; | ||
819 | } | ||
820 | |||
821 | nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); | ||
822 | if (IS_ERR(nbc->md_file)) { | ||
823 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, | ||
824 | PTR_ERR(nbc->md_file)); | ||
825 | nbc->md_file = NULL; | ||
826 | retcode = ERR_OPEN_MD_DISK; | ||
827 | goto fail; | ||
828 | } | ||
829 | |||
830 | inode2 = nbc->md_file->f_dentry->d_inode; | ||
831 | |||
832 | if (!S_ISBLK(inode2->i_mode)) { | ||
833 | retcode = ERR_MD_NOT_BDEV; | ||
834 | goto fail; | ||
835 | } | ||
836 | |||
837 | nbc->backing_bdev = inode->i_bdev; | ||
838 | if (bd_claim(nbc->backing_bdev, mdev)) { | ||
839 | printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", | ||
840 | nbc->backing_bdev, mdev, | ||
841 | nbc->backing_bdev->bd_holder, | ||
842 | nbc->backing_bdev->bd_contains->bd_holder, | ||
843 | nbc->backing_bdev->bd_holders); | ||
844 | retcode = ERR_BDCLAIM_DISK; | ||
845 | goto fail; | ||
846 | } | ||
847 | |||
848 | resync_lru = lc_create("resync", drbd_bm_ext_cache, | ||
849 | 61, sizeof(struct bm_extent), | ||
850 | offsetof(struct bm_extent, lce)); | ||
851 | if (!resync_lru) { | ||
852 | retcode = ERR_NOMEM; | ||
853 | goto release_bdev_fail; | ||
854 | } | ||
855 | |||
856 | /* meta_dev_idx >= 0: external fixed size, | ||
857 | * possibly multiple drbd sharing one meta device. | ||
858 | * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is | ||
859 | * not yet used by some other drbd minor! | ||
860 | * (if you use drbd.conf + drbdadm, | ||
861 | * that should check it for you already; but if you don't, or someone | ||
862 | * fooled it, we need to double check here) */ | ||
863 | nbc->md_bdev = inode2->i_bdev; | ||
864 | if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev | ||
865 | : (void *) drbd_m_holder)) { | ||
866 | retcode = ERR_BDCLAIM_MD_DISK; | ||
867 | goto release_bdev_fail; | ||
868 | } | ||
869 | |||
870 | if ((nbc->backing_bdev == nbc->md_bdev) != | ||
871 | (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | ||
872 | nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { | ||
873 | retcode = ERR_MD_IDX_INVALID; | ||
874 | goto release_bdev2_fail; | ||
875 | } | ||
876 | |||
877 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | ||
878 | drbd_md_set_sector_offsets(mdev, nbc); | ||
879 | |||
880 | if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { | ||
881 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | ||
882 | (unsigned long long) drbd_get_max_capacity(nbc), | ||
883 | (unsigned long long) nbc->dc.disk_size); | ||
884 | retcode = ERR_DISK_TO_SMALL; | ||
885 | goto release_bdev2_fail; | ||
886 | } | ||
887 | |||
888 | if (nbc->dc.meta_dev_idx < 0) { | ||
889 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; | ||
890 | /* at least one MB, otherwise it does not make sense */ | ||
891 | min_md_device_sectors = (2<<10); | ||
892 | } else { | ||
893 | max_possible_sectors = DRBD_MAX_SECTORS; | ||
894 | min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); | ||
895 | } | ||
896 | |||
897 | if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors) | ||
898 | dev_warn(DEV, "truncating very big lower level device " | ||
899 | "to currently maximum possible %llu sectors\n", | ||
900 | (unsigned long long) max_possible_sectors); | ||
901 | |||
902 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | ||
903 | retcode = ERR_MD_DISK_TO_SMALL; | ||
904 | dev_warn(DEV, "refusing attach: md-device too small, " | ||
905 | "at least %llu sectors needed for this meta-disk type\n", | ||
906 | (unsigned long long) min_md_device_sectors); | ||
907 | goto release_bdev2_fail; | ||
908 | } | ||
909 | |||
910 | /* Make sure the new disk is big enough | ||
911 | * (we may currently be R_PRIMARY with no local disk...) */ | ||
912 | if (drbd_get_max_capacity(nbc) < | ||
913 | drbd_get_capacity(mdev->this_bdev)) { | ||
914 | retcode = ERR_DISK_TO_SMALL; | ||
915 | goto release_bdev2_fail; | ||
916 | } | ||
917 | |||
918 | nbc->known_size = drbd_get_capacity(nbc->backing_bdev); | ||
919 | |||
920 | drbd_suspend_io(mdev); | ||
921 | /* also wait for the last barrier ack. */ | ||
922 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); | ||
923 | /* and for any other previously queued work */ | ||
924 | drbd_flush_workqueue(mdev); | ||
925 | |||
926 | retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); | ||
927 | drbd_resume_io(mdev); | ||
928 | if (retcode < SS_SUCCESS) | ||
929 | goto release_bdev2_fail; | ||
930 | |||
931 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
932 | goto force_diskless; | ||
933 | |||
934 | drbd_md_set_sector_offsets(mdev, nbc); | ||
935 | |||
936 | if (!mdev->bitmap) { | ||
937 | if (drbd_bm_init(mdev)) { | ||
938 | retcode = ERR_NOMEM; | ||
939 | goto force_diskless_dec; | ||
940 | } | ||
941 | } | ||
942 | |||
943 | retcode = drbd_md_read(mdev, nbc); | ||
944 | if (retcode != NO_ERROR) | ||
945 | goto force_diskless_dec; | ||
946 | |||
947 | if (mdev->state.conn < C_CONNECTED && | ||
948 | mdev->state.role == R_PRIMARY && | ||
949 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { | ||
950 | dev_err(DEV, "Can only attach to data with current UUID=%016llX\n", | ||
951 | (unsigned long long)mdev->ed_uuid); | ||
952 | retcode = ERR_DATA_NOT_CURRENT; | ||
953 | goto force_diskless_dec; | ||
954 | } | ||
955 | |||
956 | /* Since we are diskless, fix the activity log first... */ | ||
957 | if (drbd_check_al_size(mdev)) { | ||
958 | retcode = ERR_NOMEM; | ||
959 | goto force_diskless_dec; | ||
960 | } | ||
961 | |||
962 | /* Prevent shrinking of consistent devices ! */ | ||
963 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && | ||
964 | drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) { | ||
965 | dev_warn(DEV, "refusing to truncate a consistent device\n"); | ||
966 | retcode = ERR_DISK_TO_SMALL; | ||
967 | goto force_diskless_dec; | ||
968 | } | ||
969 | |||
970 | if (!drbd_al_read_log(mdev, nbc)) { | ||
971 | retcode = ERR_IO_MD_DISK; | ||
972 | goto force_diskless_dec; | ||
973 | } | ||
974 | |||
975 | /* allocate a second IO page if logical_block_size != 512 */ | ||
976 | logical_block_size = bdev_logical_block_size(nbc->md_bdev); | ||
977 | if (logical_block_size == 0) | ||
978 | logical_block_size = MD_SECTOR_SIZE; | ||
979 | |||
980 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
981 | if (!mdev->md_io_tmpp) { | ||
982 | struct page *page = alloc_page(GFP_NOIO); | ||
983 | if (!page) | ||
984 | goto force_diskless_dec; | ||
985 | |||
986 | dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", | ||
987 | logical_block_size, MD_SECTOR_SIZE); | ||
988 | dev_warn(DEV, "Workaround engaged (has performance impact).\n"); | ||
989 | |||
990 | mdev->md_io_tmpp = page; | ||
991 | } | ||
992 | } | ||
993 | |||
994 | /* Reset the "barriers don't work" bits here, then force meta data to | ||
995 | * be written, to ensure we determine if barriers are supported. */ | ||
996 | if (nbc->dc.no_md_flush) | ||
997 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
998 | else | ||
999 | clear_bit(MD_NO_BARRIER, &mdev->flags); | ||
1000 | |||
1001 | /* Point of no return reached. | ||
1002 | * Devices and memory are no longer released by error cleanup below. | ||
1003 | * now mdev takes over responsibility, and the state engine should | ||
1004 | * clean it up somewhere. */ | ||
1005 | D_ASSERT(mdev->ldev == NULL); | ||
1006 | mdev->ldev = nbc; | ||
1007 | mdev->resync = resync_lru; | ||
1008 | nbc = NULL; | ||
1009 | resync_lru = NULL; | ||
1010 | |||
1011 | mdev->write_ordering = WO_bio_barrier; | ||
1012 | drbd_bump_write_ordering(mdev, WO_bio_barrier); | ||
1013 | |||
1014 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) | ||
1015 | set_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1016 | else | ||
1017 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1018 | |||
1019 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { | ||
1020 | set_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1021 | cp_discovered = 1; | ||
1022 | } | ||
1023 | |||
1024 | mdev->send_cnt = 0; | ||
1025 | mdev->recv_cnt = 0; | ||
1026 | mdev->read_cnt = 0; | ||
1027 | mdev->writ_cnt = 0; | ||
1028 | |||
1029 | drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); | ||
1030 | |||
1031 | /* If I am currently not R_PRIMARY, | ||
1032 | * but meta data primary indicator is set, | ||
1033 | * I just now recover from a hard crash, | ||
1034 | * and have been R_PRIMARY before that crash. | ||
1035 | * | ||
1036 | * Now, if I had no connection before that crash | ||
1037 | * (have been degraded R_PRIMARY), chances are that | ||
1038 | * I won't find my peer now either. | ||
1039 | * | ||
1040 | * In that case, and _only_ in that case, | ||
1041 | * we use the degr-wfc-timeout instead of the default, | ||
1042 | * so we can automatically recover from a crash of a | ||
1043 | * degraded but active "cluster" after a certain timeout. | ||
1044 | */ | ||
1045 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
1046 | if (mdev->state.role != R_PRIMARY && | ||
1047 | drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | ||
1048 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) | ||
1049 | set_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
1050 | |||
1051 | dd = drbd_determin_dev_size(mdev); | ||
1052 | if (dd == dev_size_error) { | ||
1053 | retcode = ERR_NOMEM_BITMAP; | ||
1054 | goto force_diskless_dec; | ||
1055 | } else if (dd == grew) | ||
1056 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | ||
1057 | |||
1058 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | ||
1059 | dev_info(DEV, "Assuming that all blocks are out of sync " | ||
1060 | "(aka FullSync)\n"); | ||
1061 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { | ||
1062 | retcode = ERR_IO_MD_DISK; | ||
1063 | goto force_diskless_dec; | ||
1064 | } | ||
1065 | } else { | ||
1066 | if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { | ||
1067 | retcode = ERR_IO_MD_DISK; | ||
1068 | goto force_diskless_dec; | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | if (cp_discovered) { | ||
1073 | drbd_al_apply_to_bm(mdev); | ||
1074 | drbd_al_to_on_disk_bm(mdev); | ||
1075 | } | ||
1076 | |||
1077 | spin_lock_irq(&mdev->req_lock); | ||
1078 | os = mdev->state; | ||
1079 | ns.i = os.i; | ||
1080 | /* If MDF_CONSISTENT is not set go into inconsistent state, | ||
1081 | otherwise investigate MDF_WasUpToDate... | ||
1082 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, | ||
1083 | otherwise into D_CONSISTENT state. | ||
1084 | */ | ||
1085 | if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) { | ||
1086 | if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE)) | ||
1087 | ns.disk = D_CONSISTENT; | ||
1088 | else | ||
1089 | ns.disk = D_OUTDATED; | ||
1090 | } else { | ||
1091 | ns.disk = D_INCONSISTENT; | ||
1092 | } | ||
1093 | |||
1094 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) | ||
1095 | ns.pdsk = D_OUTDATED; | ||
1096 | |||
1097 | if ( ns.disk == D_CONSISTENT && | ||
1098 | (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) | ||
1099 | ns.disk = D_UP_TO_DATE; | ||
1100 | |||
1101 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, | ||
1102 | MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before | ||
1103 | this point, because drbd_request_state() modifies these | ||
1104 | flags. */ | ||
1105 | |||
1106 | /* In case we are C_CONNECTED postpone any decision on the new disk | ||
1107 | state after the negotiation phase. */ | ||
1108 | if (mdev->state.conn == C_CONNECTED) { | ||
1109 | mdev->new_state_tmp.i = ns.i; | ||
1110 | ns.i = os.i; | ||
1111 | ns.disk = D_NEGOTIATING; | ||
1112 | } | ||
1113 | |||
1114 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
1115 | ns = mdev->state; | ||
1116 | spin_unlock_irq(&mdev->req_lock); | ||
1117 | |||
1118 | if (rv < SS_SUCCESS) | ||
1119 | goto force_diskless_dec; | ||
1120 | |||
1121 | if (mdev->state.role == R_PRIMARY) | ||
1122 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | ||
1123 | else | ||
1124 | mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; | ||
1125 | |||
1126 | drbd_md_mark_dirty(mdev); | ||
1127 | drbd_md_sync(mdev); | ||
1128 | |||
1129 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1130 | put_ldev(mdev); | ||
1131 | reply->ret_code = retcode; | ||
1132 | drbd_reconfig_done(mdev); | ||
1133 | return 0; | ||
1134 | |||
1135 | force_diskless_dec: | ||
1136 | put_ldev(mdev); | ||
1137 | force_diskless: | ||
1138 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | ||
1139 | drbd_md_sync(mdev); | ||
1140 | release_bdev2_fail: | ||
1141 | if (nbc) | ||
1142 | bd_release(nbc->md_bdev); | ||
1143 | release_bdev_fail: | ||
1144 | if (nbc) | ||
1145 | bd_release(nbc->backing_bdev); | ||
1146 | fail: | ||
1147 | if (nbc) { | ||
1148 | if (nbc->lo_file) | ||
1149 | fput(nbc->lo_file); | ||
1150 | if (nbc->md_file) | ||
1151 | fput(nbc->md_file); | ||
1152 | kfree(nbc); | ||
1153 | } | ||
1154 | lc_destroy(resync_lru); | ||
1155 | |||
1156 | reply->ret_code = retcode; | ||
1157 | drbd_reconfig_done(mdev); | ||
1158 | return 0; | ||
1159 | } | ||
1160 | |||
1161 | static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1162 | struct drbd_nl_cfg_reply *reply) | ||
1163 | { | ||
1164 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); | ||
1165 | return 0; | ||
1166 | } | ||
1167 | |||
1168 | static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1169 | struct drbd_nl_cfg_reply *reply) | ||
1170 | { | ||
1171 | int i, ns; | ||
1172 | enum drbd_ret_codes retcode; | ||
1173 | struct net_conf *new_conf = NULL; | ||
1174 | struct crypto_hash *tfm = NULL; | ||
1175 | struct crypto_hash *integrity_w_tfm = NULL; | ||
1176 | struct crypto_hash *integrity_r_tfm = NULL; | ||
1177 | struct hlist_head *new_tl_hash = NULL; | ||
1178 | struct hlist_head *new_ee_hash = NULL; | ||
1179 | struct drbd_conf *odev; | ||
1180 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1181 | void *int_dig_out = NULL; | ||
1182 | void *int_dig_in = NULL; | ||
1183 | void *int_dig_vv = NULL; | ||
1184 | struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; | ||
1185 | |||
1186 | drbd_reconfig_start(mdev); | ||
1187 | |||
1188 | if (mdev->state.conn > C_STANDALONE) { | ||
1189 | retcode = ERR_NET_CONFIGURED; | ||
1190 | goto fail; | ||
1191 | } | ||
1192 | |||
1193 | /* allocation not in the IO path, cqueue thread context */ | ||
1194 | new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); | ||
1195 | if (!new_conf) { | ||
1196 | retcode = ERR_NOMEM; | ||
1197 | goto fail; | ||
1198 | } | ||
1199 | |||
1200 | memset(new_conf, 0, sizeof(struct net_conf)); | ||
1201 | new_conf->timeout = DRBD_TIMEOUT_DEF; | ||
1202 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | ||
1203 | new_conf->ping_int = DRBD_PING_INT_DEF; | ||
1204 | new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; | ||
1205 | new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; | ||
1206 | new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; | ||
1207 | new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; | ||
1208 | new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; | ||
1209 | new_conf->ko_count = DRBD_KO_COUNT_DEF; | ||
1210 | new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; | ||
1211 | new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; | ||
1212 | new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; | ||
1213 | new_conf->want_lose = 0; | ||
1214 | new_conf->two_primaries = 0; | ||
1215 | new_conf->wire_protocol = DRBD_PROT_C; | ||
1216 | new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; | ||
1217 | new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; | ||
1218 | |||
1219 | if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { | ||
1220 | retcode = ERR_MANDATORY_TAG; | ||
1221 | goto fail; | ||
1222 | } | ||
1223 | |||
1224 | if (new_conf->two_primaries | ||
1225 | && (new_conf->wire_protocol != DRBD_PROT_C)) { | ||
1226 | retcode = ERR_NOT_PROTO_C; | ||
1227 | goto fail; | ||
1228 | }; | ||
1229 | |||
1230 | if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { | ||
1231 | retcode = ERR_DISCARD; | ||
1232 | goto fail; | ||
1233 | } | ||
1234 | |||
1235 | retcode = NO_ERROR; | ||
1236 | |||
1237 | new_my_addr = (struct sockaddr *)&new_conf->my_addr; | ||
1238 | new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; | ||
1239 | for (i = 0; i < minor_count; i++) { | ||
1240 | odev = minor_to_mdev(i); | ||
1241 | if (!odev || odev == mdev) | ||
1242 | continue; | ||
1243 | if (get_net_conf(odev)) { | ||
1244 | taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; | ||
1245 | if (new_conf->my_addr_len == odev->net_conf->my_addr_len && | ||
1246 | !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) | ||
1247 | retcode = ERR_LOCAL_ADDR; | ||
1248 | |||
1249 | taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; | ||
1250 | if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && | ||
1251 | !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) | ||
1252 | retcode = ERR_PEER_ADDR; | ||
1253 | |||
1254 | put_net_conf(odev); | ||
1255 | if (retcode != NO_ERROR) | ||
1256 | goto fail; | ||
1257 | } | ||
1258 | } | ||
1259 | |||
1260 | if (new_conf->cram_hmac_alg[0] != 0) { | ||
1261 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | ||
1262 | new_conf->cram_hmac_alg); | ||
1263 | tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); | ||
1264 | if (IS_ERR(tfm)) { | ||
1265 | tfm = NULL; | ||
1266 | retcode = ERR_AUTH_ALG; | ||
1267 | goto fail; | ||
1268 | } | ||
1269 | |||
1270 | if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) | ||
1271 | != CRYPTO_ALG_TYPE_HASH) { | ||
1272 | retcode = ERR_AUTH_ALG_ND; | ||
1273 | goto fail; | ||
1274 | } | ||
1275 | } | ||
1276 | |||
1277 | if (new_conf->integrity_alg[0]) { | ||
1278 | integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | ||
1279 | if (IS_ERR(integrity_w_tfm)) { | ||
1280 | integrity_w_tfm = NULL; | ||
1281 | retcode=ERR_INTEGRITY_ALG; | ||
1282 | goto fail; | ||
1283 | } | ||
1284 | |||
1285 | if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { | ||
1286 | retcode=ERR_INTEGRITY_ALG_ND; | ||
1287 | goto fail; | ||
1288 | } | ||
1289 | |||
1290 | integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | ||
1291 | if (IS_ERR(integrity_r_tfm)) { | ||
1292 | integrity_r_tfm = NULL; | ||
1293 | retcode=ERR_INTEGRITY_ALG; | ||
1294 | goto fail; | ||
1295 | } | ||
1296 | } | ||
1297 | |||
1298 | ns = new_conf->max_epoch_size/8; | ||
1299 | if (mdev->tl_hash_s != ns) { | ||
1300 | new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | ||
1301 | if (!new_tl_hash) { | ||
1302 | retcode = ERR_NOMEM; | ||
1303 | goto fail; | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | ns = new_conf->max_buffers/8; | ||
1308 | if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { | ||
1309 | new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | ||
1310 | if (!new_ee_hash) { | ||
1311 | retcode = ERR_NOMEM; | ||
1312 | goto fail; | ||
1313 | } | ||
1314 | } | ||
1315 | |||
1316 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; | ||
1317 | |||
1318 | if (integrity_w_tfm) { | ||
1319 | i = crypto_hash_digestsize(integrity_w_tfm); | ||
1320 | int_dig_out = kmalloc(i, GFP_KERNEL); | ||
1321 | if (!int_dig_out) { | ||
1322 | retcode = ERR_NOMEM; | ||
1323 | goto fail; | ||
1324 | } | ||
1325 | int_dig_in = kmalloc(i, GFP_KERNEL); | ||
1326 | if (!int_dig_in) { | ||
1327 | retcode = ERR_NOMEM; | ||
1328 | goto fail; | ||
1329 | } | ||
1330 | int_dig_vv = kmalloc(i, GFP_KERNEL); | ||
1331 | if (!int_dig_vv) { | ||
1332 | retcode = ERR_NOMEM; | ||
1333 | goto fail; | ||
1334 | } | ||
1335 | } | ||
1336 | |||
1337 | if (!mdev->bitmap) { | ||
1338 | if(drbd_bm_init(mdev)) { | ||
1339 | retcode = ERR_NOMEM; | ||
1340 | goto fail; | ||
1341 | } | ||
1342 | } | ||
1343 | |||
1344 | spin_lock_irq(&mdev->req_lock); | ||
1345 | if (mdev->net_conf != NULL) { | ||
1346 | retcode = ERR_NET_CONFIGURED; | ||
1347 | spin_unlock_irq(&mdev->req_lock); | ||
1348 | goto fail; | ||
1349 | } | ||
1350 | mdev->net_conf = new_conf; | ||
1351 | |||
1352 | mdev->send_cnt = 0; | ||
1353 | mdev->recv_cnt = 0; | ||
1354 | |||
1355 | if (new_tl_hash) { | ||
1356 | kfree(mdev->tl_hash); | ||
1357 | mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; | ||
1358 | mdev->tl_hash = new_tl_hash; | ||
1359 | } | ||
1360 | |||
1361 | if (new_ee_hash) { | ||
1362 | kfree(mdev->ee_hash); | ||
1363 | mdev->ee_hash_s = mdev->net_conf->max_buffers/8; | ||
1364 | mdev->ee_hash = new_ee_hash; | ||
1365 | } | ||
1366 | |||
1367 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
1368 | mdev->cram_hmac_tfm = tfm; | ||
1369 | |||
1370 | crypto_free_hash(mdev->integrity_w_tfm); | ||
1371 | mdev->integrity_w_tfm = integrity_w_tfm; | ||
1372 | |||
1373 | crypto_free_hash(mdev->integrity_r_tfm); | ||
1374 | mdev->integrity_r_tfm = integrity_r_tfm; | ||
1375 | |||
1376 | kfree(mdev->int_dig_out); | ||
1377 | kfree(mdev->int_dig_in); | ||
1378 | kfree(mdev->int_dig_vv); | ||
1379 | mdev->int_dig_out=int_dig_out; | ||
1380 | mdev->int_dig_in=int_dig_in; | ||
1381 | mdev->int_dig_vv=int_dig_vv; | ||
1382 | spin_unlock_irq(&mdev->req_lock); | ||
1383 | |||
1384 | retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ||
1385 | |||
1386 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1387 | reply->ret_code = retcode; | ||
1388 | drbd_reconfig_done(mdev); | ||
1389 | return 0; | ||
1390 | |||
1391 | fail: | ||
1392 | kfree(int_dig_out); | ||
1393 | kfree(int_dig_in); | ||
1394 | kfree(int_dig_vv); | ||
1395 | crypto_free_hash(tfm); | ||
1396 | crypto_free_hash(integrity_w_tfm); | ||
1397 | crypto_free_hash(integrity_r_tfm); | ||
1398 | kfree(new_tl_hash); | ||
1399 | kfree(new_ee_hash); | ||
1400 | kfree(new_conf); | ||
1401 | |||
1402 | reply->ret_code = retcode; | ||
1403 | drbd_reconfig_done(mdev); | ||
1404 | return 0; | ||
1405 | } | ||
1406 | |||
1407 | static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1408 | struct drbd_nl_cfg_reply *reply) | ||
1409 | { | ||
1410 | int retcode; | ||
1411 | |||
1412 | retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); | ||
1413 | |||
1414 | if (retcode == SS_NOTHING_TO_DO) | ||
1415 | goto done; | ||
1416 | else if (retcode == SS_ALREADY_STANDALONE) | ||
1417 | goto done; | ||
1418 | else if (retcode == SS_PRIMARY_NOP) { | ||
1419 | /* Our statche checking code wants to see the peer outdated. */ | ||
1420 | retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | ||
1421 | pdsk, D_OUTDATED)); | ||
1422 | } else if (retcode == SS_CW_FAILED_BY_PEER) { | ||
1423 | /* The peer probably wants to see us outdated. */ | ||
1424 | retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | ||
1425 | disk, D_OUTDATED), | ||
1426 | CS_ORDERED); | ||
1427 | if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { | ||
1428 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1429 | retcode = SS_SUCCESS; | ||
1430 | } | ||
1431 | } | ||
1432 | |||
1433 | if (retcode < SS_SUCCESS) | ||
1434 | goto fail; | ||
1435 | |||
1436 | if (wait_event_interruptible(mdev->state_wait, | ||
1437 | mdev->state.conn != C_DISCONNECTING)) { | ||
1438 | /* Do not test for mdev->state.conn == C_STANDALONE, since | ||
1439 | someone else might connect us in the mean time! */ | ||
1440 | retcode = ERR_INTR; | ||
1441 | goto fail; | ||
1442 | } | ||
1443 | |||
1444 | done: | ||
1445 | retcode = NO_ERROR; | ||
1446 | fail: | ||
1447 | drbd_md_sync(mdev); | ||
1448 | reply->ret_code = retcode; | ||
1449 | return 0; | ||
1450 | } | ||
1451 | |||
1452 | void resync_after_online_grow(struct drbd_conf *mdev) | ||
1453 | { | ||
1454 | int iass; /* I am sync source */ | ||
1455 | |||
1456 | dev_info(DEV, "Resync of new storage after online grow\n"); | ||
1457 | if (mdev->state.role != mdev->state.peer) | ||
1458 | iass = (mdev->state.role == R_PRIMARY); | ||
1459 | else | ||
1460 | iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1461 | |||
1462 | if (iass) | ||
1463 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1464 | else | ||
1465 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); | ||
1466 | } | ||
1467 | |||
1468 | static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1469 | struct drbd_nl_cfg_reply *reply) | ||
1470 | { | ||
1471 | struct resize rs; | ||
1472 | int retcode = NO_ERROR; | ||
1473 | int ldsc = 0; /* local disk size changed */ | ||
1474 | enum determine_dev_size dd; | ||
1475 | |||
1476 | memset(&rs, 0, sizeof(struct resize)); | ||
1477 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | ||
1478 | retcode = ERR_MANDATORY_TAG; | ||
1479 | goto fail; | ||
1480 | } | ||
1481 | |||
1482 | if (mdev->state.conn > C_CONNECTED) { | ||
1483 | retcode = ERR_RESIZE_RESYNC; | ||
1484 | goto fail; | ||
1485 | } | ||
1486 | |||
1487 | if (mdev->state.role == R_SECONDARY && | ||
1488 | mdev->state.peer == R_SECONDARY) { | ||
1489 | retcode = ERR_NO_PRIMARY; | ||
1490 | goto fail; | ||
1491 | } | ||
1492 | |||
1493 | if (!get_ldev(mdev)) { | ||
1494 | retcode = ERR_NO_DISK; | ||
1495 | goto fail; | ||
1496 | } | ||
1497 | |||
1498 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | ||
1499 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
1500 | ldsc = 1; | ||
1501 | } | ||
1502 | |||
1503 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | ||
1504 | dd = drbd_determin_dev_size(mdev); | ||
1505 | drbd_md_sync(mdev); | ||
1506 | put_ldev(mdev); | ||
1507 | if (dd == dev_size_error) { | ||
1508 | retcode = ERR_NOMEM_BITMAP; | ||
1509 | goto fail; | ||
1510 | } | ||
1511 | |||
1512 | if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { | ||
1513 | if (dd == grew) | ||
1514 | set_bit(RESIZE_PENDING, &mdev->flags); | ||
1515 | |||
1516 | drbd_send_uuids(mdev); | ||
1517 | drbd_send_sizes(mdev, 1); | ||
1518 | } | ||
1519 | |||
1520 | fail: | ||
1521 | reply->ret_code = retcode; | ||
1522 | return 0; | ||
1523 | } | ||
1524 | |||
1525 | static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1526 | struct drbd_nl_cfg_reply *reply) | ||
1527 | { | ||
1528 | int retcode = NO_ERROR; | ||
1529 | int err; | ||
1530 | int ovr; /* online verify running */ | ||
1531 | int rsr; /* re-sync running */ | ||
1532 | struct crypto_hash *verify_tfm = NULL; | ||
1533 | struct crypto_hash *csums_tfm = NULL; | ||
1534 | struct syncer_conf sc; | ||
1535 | cpumask_var_t new_cpu_mask; | ||
1536 | |||
1537 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { | ||
1538 | retcode = ERR_NOMEM; | ||
1539 | goto fail; | ||
1540 | } | ||
1541 | |||
1542 | if (nlp->flags & DRBD_NL_SET_DEFAULTS) { | ||
1543 | memset(&sc, 0, sizeof(struct syncer_conf)); | ||
1544 | sc.rate = DRBD_RATE_DEF; | ||
1545 | sc.after = DRBD_AFTER_DEF; | ||
1546 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | ||
1547 | } else | ||
1548 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | ||
1549 | |||
1550 | if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { | ||
1551 | retcode = ERR_MANDATORY_TAG; | ||
1552 | goto fail; | ||
1553 | } | ||
1554 | |||
1555 | /* re-sync running */ | ||
1556 | rsr = ( mdev->state.conn == C_SYNC_SOURCE || | ||
1557 | mdev->state.conn == C_SYNC_TARGET || | ||
1558 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1559 | mdev->state.conn == C_PAUSED_SYNC_T ); | ||
1560 | |||
1561 | if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { | ||
1562 | retcode = ERR_CSUMS_RESYNC_RUNNING; | ||
1563 | goto fail; | ||
1564 | } | ||
1565 | |||
1566 | if (!rsr && sc.csums_alg[0]) { | ||
1567 | csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); | ||
1568 | if (IS_ERR(csums_tfm)) { | ||
1569 | csums_tfm = NULL; | ||
1570 | retcode = ERR_CSUMS_ALG; | ||
1571 | goto fail; | ||
1572 | } | ||
1573 | |||
1574 | if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { | ||
1575 | retcode = ERR_CSUMS_ALG_ND; | ||
1576 | goto fail; | ||
1577 | } | ||
1578 | } | ||
1579 | |||
1580 | /* online verify running */ | ||
1581 | ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); | ||
1582 | |||
1583 | if (ovr) { | ||
1584 | if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { | ||
1585 | retcode = ERR_VERIFY_RUNNING; | ||
1586 | goto fail; | ||
1587 | } | ||
1588 | } | ||
1589 | |||
1590 | if (!ovr && sc.verify_alg[0]) { | ||
1591 | verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); | ||
1592 | if (IS_ERR(verify_tfm)) { | ||
1593 | verify_tfm = NULL; | ||
1594 | retcode = ERR_VERIFY_ALG; | ||
1595 | goto fail; | ||
1596 | } | ||
1597 | |||
1598 | if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { | ||
1599 | retcode = ERR_VERIFY_ALG_ND; | ||
1600 | goto fail; | ||
1601 | } | ||
1602 | } | ||
1603 | |||
1604 | /* silently ignore cpu mask on UP kernel */ | ||
1605 | if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { | ||
1606 | err = __bitmap_parse(sc.cpu_mask, 32, 0, | ||
1607 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
1608 | if (err) { | ||
1609 | dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); | ||
1610 | retcode = ERR_CPU_MASK_PARSE; | ||
1611 | goto fail; | ||
1612 | } | ||
1613 | } | ||
1614 | |||
1615 | ERR_IF (sc.rate < 1) sc.rate = 1; | ||
1616 | ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ | ||
1617 | #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) | ||
1618 | if (sc.al_extents > AL_MAX) { | ||
1619 | dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); | ||
1620 | sc.al_extents = AL_MAX; | ||
1621 | } | ||
1622 | #undef AL_MAX | ||
1623 | |||
1624 | /* most sanity checks done, try to assign the new sync-after | ||
1625 | * dependency. need to hold the global lock in there, | ||
1626 | * to avoid a race in the dependency loop check. */ | ||
1627 | retcode = drbd_alter_sa(mdev, sc.after); | ||
1628 | if (retcode != NO_ERROR) | ||
1629 | goto fail; | ||
1630 | |||
1631 | /* ok, assign the rest of it as well. | ||
1632 | * lock against receive_SyncParam() */ | ||
1633 | spin_lock(&mdev->peer_seq_lock); | ||
1634 | mdev->sync_conf = sc; | ||
1635 | |||
1636 | if (!rsr) { | ||
1637 | crypto_free_hash(mdev->csums_tfm); | ||
1638 | mdev->csums_tfm = csums_tfm; | ||
1639 | csums_tfm = NULL; | ||
1640 | } | ||
1641 | |||
1642 | if (!ovr) { | ||
1643 | crypto_free_hash(mdev->verify_tfm); | ||
1644 | mdev->verify_tfm = verify_tfm; | ||
1645 | verify_tfm = NULL; | ||
1646 | } | ||
1647 | spin_unlock(&mdev->peer_seq_lock); | ||
1648 | |||
1649 | if (get_ldev(mdev)) { | ||
1650 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
1651 | drbd_al_shrink(mdev); | ||
1652 | err = drbd_check_al_size(mdev); | ||
1653 | lc_unlock(mdev->act_log); | ||
1654 | wake_up(&mdev->al_wait); | ||
1655 | |||
1656 | put_ldev(mdev); | ||
1657 | drbd_md_sync(mdev); | ||
1658 | |||
1659 | if (err) { | ||
1660 | retcode = ERR_NOMEM; | ||
1661 | goto fail; | ||
1662 | } | ||
1663 | } | ||
1664 | |||
1665 | if (mdev->state.conn >= C_CONNECTED) | ||
1666 | drbd_send_sync_param(mdev, &sc); | ||
1667 | |||
1668 | if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { | ||
1669 | cpumask_copy(mdev->cpu_mask, new_cpu_mask); | ||
1670 | drbd_calc_cpu_mask(mdev); | ||
1671 | mdev->receiver.reset_cpu_mask = 1; | ||
1672 | mdev->asender.reset_cpu_mask = 1; | ||
1673 | mdev->worker.reset_cpu_mask = 1; | ||
1674 | } | ||
1675 | |||
1676 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1677 | fail: | ||
1678 | free_cpumask_var(new_cpu_mask); | ||
1679 | crypto_free_hash(csums_tfm); | ||
1680 | crypto_free_hash(verify_tfm); | ||
1681 | reply->ret_code = retcode; | ||
1682 | return 0; | ||
1683 | } | ||
1684 | |||
1685 | static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1686 | struct drbd_nl_cfg_reply *reply) | ||
1687 | { | ||
1688 | int retcode; | ||
1689 | |||
1690 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); | ||
1691 | |||
1692 | if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) | ||
1693 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | ||
1694 | |||
1695 | while (retcode == SS_NEED_CONNECTION) { | ||
1696 | spin_lock_irq(&mdev->req_lock); | ||
1697 | if (mdev->state.conn < C_CONNECTED) | ||
1698 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | ||
1699 | spin_unlock_irq(&mdev->req_lock); | ||
1700 | |||
1701 | if (retcode != SS_NEED_CONNECTION) | ||
1702 | break; | ||
1703 | |||
1704 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | ||
1705 | } | ||
1706 | |||
1707 | reply->ret_code = retcode; | ||
1708 | return 0; | ||
1709 | } | ||
1710 | |||
1711 | static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1712 | struct drbd_nl_cfg_reply *reply) | ||
1713 | { | ||
1714 | |||
1715 | reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | ||
1716 | |||
1717 | return 0; | ||
1718 | } | ||
1719 | |||
1720 | static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1721 | struct drbd_nl_cfg_reply *reply) | ||
1722 | { | ||
1723 | int retcode = NO_ERROR; | ||
1724 | |||
1725 | if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) | ||
1726 | retcode = ERR_PAUSE_IS_SET; | ||
1727 | |||
1728 | reply->ret_code = retcode; | ||
1729 | return 0; | ||
1730 | } | ||
1731 | |||
1732 | static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1733 | struct drbd_nl_cfg_reply *reply) | ||
1734 | { | ||
1735 | int retcode = NO_ERROR; | ||
1736 | |||
1737 | if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) | ||
1738 | retcode = ERR_PAUSE_IS_CLEAR; | ||
1739 | |||
1740 | reply->ret_code = retcode; | ||
1741 | return 0; | ||
1742 | } | ||
1743 | |||
1744 | static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1745 | struct drbd_nl_cfg_reply *reply) | ||
1746 | { | ||
1747 | reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); | ||
1748 | |||
1749 | return 0; | ||
1750 | } | ||
1751 | |||
1752 | static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1753 | struct drbd_nl_cfg_reply *reply) | ||
1754 | { | ||
1755 | reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); | ||
1756 | return 0; | ||
1757 | } | ||
1758 | |||
1759 | static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1760 | struct drbd_nl_cfg_reply *reply) | ||
1761 | { | ||
1762 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); | ||
1763 | return 0; | ||
1764 | } | ||
1765 | |||
1766 | static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1767 | struct drbd_nl_cfg_reply *reply) | ||
1768 | { | ||
1769 | unsigned short *tl; | ||
1770 | |||
1771 | tl = reply->tag_list; | ||
1772 | |||
1773 | if (get_ldev(mdev)) { | ||
1774 | tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); | ||
1775 | put_ldev(mdev); | ||
1776 | } | ||
1777 | |||
1778 | if (get_net_conf(mdev)) { | ||
1779 | tl = net_conf_to_tags(mdev, mdev->net_conf, tl); | ||
1780 | put_net_conf(mdev); | ||
1781 | } | ||
1782 | tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); | ||
1783 | |||
1784 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1785 | |||
1786 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1787 | } | ||
1788 | |||
1789 | static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1790 | struct drbd_nl_cfg_reply *reply) | ||
1791 | { | ||
1792 | unsigned short *tl = reply->tag_list; | ||
1793 | union drbd_state s = mdev->state; | ||
1794 | unsigned long rs_left; | ||
1795 | unsigned int res; | ||
1796 | |||
1797 | tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); | ||
1798 | |||
1799 | /* no local ref, no bitmap, no syncer progress. */ | ||
1800 | if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { | ||
1801 | if (get_ldev(mdev)) { | ||
1802 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
1803 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
1804 | put_ldev(mdev); | ||
1805 | } | ||
1806 | } | ||
1807 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1808 | |||
1809 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1810 | } | ||
1811 | |||
1812 | static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1813 | struct drbd_nl_cfg_reply *reply) | ||
1814 | { | ||
1815 | unsigned short *tl; | ||
1816 | |||
1817 | tl = reply->tag_list; | ||
1818 | |||
1819 | if (get_ldev(mdev)) { | ||
1820 | tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); | ||
1821 | tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); | ||
1822 | put_ldev(mdev); | ||
1823 | } | ||
1824 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1825 | |||
1826 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1827 | } | ||
1828 | |||
1829 | /** | ||
1830 | * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use | ||
1831 | * @mdev: DRBD device. | ||
1832 | * @nlp: Netlink/connector packet from drbdsetup | ||
1833 | * @reply: Reply packet for drbdsetup | ||
1834 | */ | ||
1835 | static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1836 | struct drbd_nl_cfg_reply *reply) | ||
1837 | { | ||
1838 | unsigned short *tl; | ||
1839 | char rv; | ||
1840 | |||
1841 | tl = reply->tag_list; | ||
1842 | |||
1843 | rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : | ||
1844 | test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; | ||
1845 | |||
1846 | tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); | ||
1847 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1848 | |||
1849 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1850 | } | ||
1851 | |||
1852 | static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1853 | struct drbd_nl_cfg_reply *reply) | ||
1854 | { | ||
1855 | /* default to resume from last known position, if possible */ | ||
1856 | struct start_ov args = | ||
1857 | { .start_sector = mdev->ov_start_sector }; | ||
1858 | |||
1859 | if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { | ||
1860 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1861 | return 0; | ||
1862 | } | ||
1863 | /* w_make_ov_request expects position to be aligned */ | ||
1864 | mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; | ||
1865 | reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | ||
1866 | return 0; | ||
1867 | } | ||
1868 | |||
1869 | |||
1870 | static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1871 | struct drbd_nl_cfg_reply *reply) | ||
1872 | { | ||
1873 | int retcode = NO_ERROR; | ||
1874 | int skip_initial_sync = 0; | ||
1875 | int err; | ||
1876 | |||
1877 | struct new_c_uuid args; | ||
1878 | |||
1879 | memset(&args, 0, sizeof(struct new_c_uuid)); | ||
1880 | if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { | ||
1881 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1882 | return 0; | ||
1883 | } | ||
1884 | |||
1885 | mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ | ||
1886 | |||
1887 | if (!get_ldev(mdev)) { | ||
1888 | retcode = ERR_NO_DISK; | ||
1889 | goto out; | ||
1890 | } | ||
1891 | |||
1892 | /* this is "skip initial sync", assume to be clean */ | ||
1893 | if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && | ||
1894 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { | ||
1895 | dev_info(DEV, "Preparing to skip initial sync\n"); | ||
1896 | skip_initial_sync = 1; | ||
1897 | } else if (mdev->state.conn != C_STANDALONE) { | ||
1898 | retcode = ERR_CONNECTED; | ||
1899 | goto out_dec; | ||
1900 | } | ||
1901 | |||
1902 | drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ | ||
1903 | drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ | ||
1904 | |||
1905 | if (args.clear_bm) { | ||
1906 | err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); | ||
1907 | if (err) { | ||
1908 | dev_err(DEV, "Writing bitmap failed with %d\n",err); | ||
1909 | retcode = ERR_IO_MD_DISK; | ||
1910 | } | ||
1911 | if (skip_initial_sync) { | ||
1912 | drbd_send_uuids_skip_initial_sync(mdev); | ||
1913 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | ||
1914 | spin_lock_irq(&mdev->req_lock); | ||
1915 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | ||
1916 | CS_VERBOSE, NULL); | ||
1917 | spin_unlock_irq(&mdev->req_lock); | ||
1918 | } | ||
1919 | } | ||
1920 | |||
1921 | drbd_md_sync(mdev); | ||
1922 | out_dec: | ||
1923 | put_ldev(mdev); | ||
1924 | out: | ||
1925 | mutex_unlock(&mdev->state_mutex); | ||
1926 | |||
1927 | reply->ret_code = retcode; | ||
1928 | return 0; | ||
1929 | } | ||
1930 | |||
1931 | static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) | ||
1932 | { | ||
1933 | struct drbd_conf *mdev; | ||
1934 | |||
1935 | if (nlp->drbd_minor >= minor_count) | ||
1936 | return NULL; | ||
1937 | |||
1938 | mdev = minor_to_mdev(nlp->drbd_minor); | ||
1939 | |||
1940 | if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { | ||
1941 | struct gendisk *disk = NULL; | ||
1942 | mdev = drbd_new_device(nlp->drbd_minor); | ||
1943 | |||
1944 | spin_lock_irq(&drbd_pp_lock); | ||
1945 | if (minor_table[nlp->drbd_minor] == NULL) { | ||
1946 | minor_table[nlp->drbd_minor] = mdev; | ||
1947 | disk = mdev->vdisk; | ||
1948 | mdev = NULL; | ||
1949 | } /* else: we lost the race */ | ||
1950 | spin_unlock_irq(&drbd_pp_lock); | ||
1951 | |||
1952 | if (disk) /* we won the race above */ | ||
1953 | /* in case we ever add a drbd_delete_device(), | ||
1954 | * don't forget the del_gendisk! */ | ||
1955 | add_disk(disk); | ||
1956 | else /* we lost the race above */ | ||
1957 | drbd_free_mdev(mdev); | ||
1958 | |||
1959 | mdev = minor_to_mdev(nlp->drbd_minor); | ||
1960 | } | ||
1961 | |||
1962 | return mdev; | ||
1963 | } | ||
1964 | |||
1965 | struct cn_handler_struct { | ||
1966 | int (*function)(struct drbd_conf *, | ||
1967 | struct drbd_nl_cfg_req *, | ||
1968 | struct drbd_nl_cfg_reply *); | ||
1969 | int reply_body_size; | ||
1970 | }; | ||
1971 | |||
1972 | static struct cn_handler_struct cnd_table[] = { | ||
1973 | [ P_primary ] = { &drbd_nl_primary, 0 }, | ||
1974 | [ P_secondary ] = { &drbd_nl_secondary, 0 }, | ||
1975 | [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, | ||
1976 | [ P_detach ] = { &drbd_nl_detach, 0 }, | ||
1977 | [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, | ||
1978 | [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, | ||
1979 | [ P_resize ] = { &drbd_nl_resize, 0 }, | ||
1980 | [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, | ||
1981 | [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, | ||
1982 | [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, | ||
1983 | [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, | ||
1984 | [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, | ||
1985 | [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, | ||
1986 | [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, | ||
1987 | [ P_outdate ] = { &drbd_nl_outdate, 0 }, | ||
1988 | [ P_get_config ] = { &drbd_nl_get_config, | ||
1989 | sizeof(struct syncer_conf_tag_len_struct) + | ||
1990 | sizeof(struct disk_conf_tag_len_struct) + | ||
1991 | sizeof(struct net_conf_tag_len_struct) }, | ||
1992 | [ P_get_state ] = { &drbd_nl_get_state, | ||
1993 | sizeof(struct get_state_tag_len_struct) + | ||
1994 | sizeof(struct sync_progress_tag_len_struct) }, | ||
1995 | [ P_get_uuids ] = { &drbd_nl_get_uuids, | ||
1996 | sizeof(struct get_uuids_tag_len_struct) }, | ||
1997 | [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, | ||
1998 | sizeof(struct get_timeout_flag_tag_len_struct)}, | ||
1999 | [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, | ||
2000 | [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, | ||
2001 | }; | ||
2002 | |||
2003 | static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) | ||
2004 | { | ||
2005 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; | ||
2006 | struct cn_handler_struct *cm; | ||
2007 | struct cn_msg *cn_reply; | ||
2008 | struct drbd_nl_cfg_reply *reply; | ||
2009 | struct drbd_conf *mdev; | ||
2010 | int retcode, rr; | ||
2011 | int reply_size = sizeof(struct cn_msg) | ||
2012 | + sizeof(struct drbd_nl_cfg_reply) | ||
2013 | + sizeof(short int); | ||
2014 | |||
2015 | if (!try_module_get(THIS_MODULE)) { | ||
2016 | printk(KERN_ERR "drbd: try_module_get() failed!\n"); | ||
2017 | return; | ||
2018 | } | ||
2019 | |||
2020 | if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { | ||
2021 | retcode = ERR_PERM; | ||
2022 | goto fail; | ||
2023 | } | ||
2024 | |||
2025 | mdev = ensure_mdev(nlp); | ||
2026 | if (!mdev) { | ||
2027 | retcode = ERR_MINOR_INVALID; | ||
2028 | goto fail; | ||
2029 | } | ||
2030 | |||
2031 | if (nlp->packet_type >= P_nl_after_last_packet) { | ||
2032 | retcode = ERR_PACKET_NR; | ||
2033 | goto fail; | ||
2034 | } | ||
2035 | |||
2036 | cm = cnd_table + nlp->packet_type; | ||
2037 | |||
2038 | /* This may happen if packet number is 0: */ | ||
2039 | if (cm->function == NULL) { | ||
2040 | retcode = ERR_PACKET_NR; | ||
2041 | goto fail; | ||
2042 | } | ||
2043 | |||
2044 | reply_size += cm->reply_body_size; | ||
2045 | |||
2046 | /* allocation not in the IO path, cqueue thread context */ | ||
2047 | cn_reply = kmalloc(reply_size, GFP_KERNEL); | ||
2048 | if (!cn_reply) { | ||
2049 | retcode = ERR_NOMEM; | ||
2050 | goto fail; | ||
2051 | } | ||
2052 | reply = (struct drbd_nl_cfg_reply *) cn_reply->data; | ||
2053 | |||
2054 | reply->packet_type = | ||
2055 | cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; | ||
2056 | reply->minor = nlp->drbd_minor; | ||
2057 | reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ | ||
2058 | /* reply->tag_list; might be modified by cm->function. */ | ||
2059 | |||
2060 | rr = cm->function(mdev, nlp, reply); | ||
2061 | |||
2062 | cn_reply->id = req->id; | ||
2063 | cn_reply->seq = req->seq; | ||
2064 | cn_reply->ack = req->ack + 1; | ||
2065 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; | ||
2066 | cn_reply->flags = 0; | ||
2067 | |||
2068 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); | ||
2069 | if (rr && rr != -ESRCH) | ||
2070 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2071 | |||
2072 | kfree(cn_reply); | ||
2073 | module_put(THIS_MODULE); | ||
2074 | return; | ||
2075 | fail: | ||
2076 | drbd_nl_send_reply(req, retcode); | ||
2077 | module_put(THIS_MODULE); | ||
2078 | } | ||
2079 | |||
2080 | static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ | ||
2081 | |||
2082 | static unsigned short * | ||
2083 | __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, | ||
2084 | unsigned short len, int nul_terminated) | ||
2085 | { | ||
2086 | unsigned short l = tag_descriptions[tag_number(tag)].max_len; | ||
2087 | len = (len < l) ? len : l; | ||
2088 | put_unaligned(tag, tl++); | ||
2089 | put_unaligned(len, tl++); | ||
2090 | memcpy(tl, data, len); | ||
2091 | tl = (unsigned short*)((char*)tl + len); | ||
2092 | if (nul_terminated) | ||
2093 | *((char*)tl - 1) = 0; | ||
2094 | return tl; | ||
2095 | } | ||
2096 | |||
2097 | static unsigned short * | ||
2098 | tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) | ||
2099 | { | ||
2100 | return __tl_add_blob(tl, tag, data, len, 0); | ||
2101 | } | ||
2102 | |||
2103 | static unsigned short * | ||
2104 | tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) | ||
2105 | { | ||
2106 | return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); | ||
2107 | } | ||
2108 | |||
2109 | static unsigned short * | ||
2110 | tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) | ||
2111 | { | ||
2112 | put_unaligned(tag, tl++); | ||
2113 | switch(tag_type(tag)) { | ||
2114 | case TT_INTEGER: | ||
2115 | put_unaligned(sizeof(int), tl++); | ||
2116 | put_unaligned(*(int *)val, (int *)tl); | ||
2117 | tl = (unsigned short*)((char*)tl+sizeof(int)); | ||
2118 | break; | ||
2119 | case TT_INT64: | ||
2120 | put_unaligned(sizeof(u64), tl++); | ||
2121 | put_unaligned(*(u64 *)val, (u64 *)tl); | ||
2122 | tl = (unsigned short*)((char*)tl+sizeof(u64)); | ||
2123 | break; | ||
2124 | default: | ||
2125 | /* someone did something stupid. */ | ||
2126 | ; | ||
2127 | } | ||
2128 | return tl; | ||
2129 | } | ||
2130 | |||
2131 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) | ||
2132 | { | ||
2133 | char buffer[sizeof(struct cn_msg)+ | ||
2134 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2135 | sizeof(struct get_state_tag_len_struct)+ | ||
2136 | sizeof(short int)]; | ||
2137 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2138 | struct drbd_nl_cfg_reply *reply = | ||
2139 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2140 | unsigned short *tl = reply->tag_list; | ||
2141 | |||
2142 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2143 | |||
2144 | tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); | ||
2145 | |||
2146 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2147 | |||
2148 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2149 | cn_reply->id.val = CN_VAL_DRBD; | ||
2150 | |||
2151 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2152 | cn_reply->ack = 0; /* not used here. */ | ||
2153 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2154 | (int)((char *)tl - (char *)reply->tag_list); | ||
2155 | cn_reply->flags = 0; | ||
2156 | |||
2157 | reply->packet_type = P_get_state; | ||
2158 | reply->minor = mdev_to_minor(mdev); | ||
2159 | reply->ret_code = NO_ERROR; | ||
2160 | |||
2161 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2162 | } | ||
2163 | |||
2164 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) | ||
2165 | { | ||
2166 | char buffer[sizeof(struct cn_msg)+ | ||
2167 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2168 | sizeof(struct call_helper_tag_len_struct)+ | ||
2169 | sizeof(short int)]; | ||
2170 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2171 | struct drbd_nl_cfg_reply *reply = | ||
2172 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2173 | unsigned short *tl = reply->tag_list; | ||
2174 | |||
2175 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2176 | |||
2177 | tl = tl_add_str(tl, T_helper, helper_name); | ||
2178 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2179 | |||
2180 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2181 | cn_reply->id.val = CN_VAL_DRBD; | ||
2182 | |||
2183 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2184 | cn_reply->ack = 0; /* not used here. */ | ||
2185 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2186 | (int)((char *)tl - (char *)reply->tag_list); | ||
2187 | cn_reply->flags = 0; | ||
2188 | |||
2189 | reply->packet_type = P_call_helper; | ||
2190 | reply->minor = mdev_to_minor(mdev); | ||
2191 | reply->ret_code = NO_ERROR; | ||
2192 | |||
2193 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2194 | } | ||
2195 | |||
2196 | void drbd_bcast_ee(struct drbd_conf *mdev, | ||
2197 | const char *reason, const int dgs, | ||
2198 | const char* seen_hash, const char* calc_hash, | ||
2199 | const struct drbd_epoch_entry* e) | ||
2200 | { | ||
2201 | struct cn_msg *cn_reply; | ||
2202 | struct drbd_nl_cfg_reply *reply; | ||
2203 | struct bio_vec *bvec; | ||
2204 | unsigned short *tl; | ||
2205 | int i; | ||
2206 | |||
2207 | if (!e) | ||
2208 | return; | ||
2209 | if (!reason || !reason[0]) | ||
2210 | return; | ||
2211 | |||
2212 | /* apparently we have to memcpy twice, first to prepare the data for the | ||
2213 | * struct cn_msg, then within cn_netlink_send from the cn_msg to the | ||
2214 | * netlink skb. */ | ||
2215 | /* receiver thread context, which is not in the writeout path (of this node), | ||
2216 | * but may be in the writeout path of the _other_ node. | ||
2217 | * GFP_NOIO to avoid potential "distributed deadlock". */ | ||
2218 | cn_reply = kmalloc( | ||
2219 | sizeof(struct cn_msg)+ | ||
2220 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2221 | sizeof(struct dump_ee_tag_len_struct)+ | ||
2222 | sizeof(short int), | ||
2223 | GFP_NOIO); | ||
2224 | |||
2225 | if (!cn_reply) { | ||
2226 | dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", | ||
2227 | (unsigned long long)e->sector, e->size); | ||
2228 | return; | ||
2229 | } | ||
2230 | |||
2231 | reply = (struct drbd_nl_cfg_reply*)cn_reply->data; | ||
2232 | tl = reply->tag_list; | ||
2233 | |||
2234 | tl = tl_add_str(tl, T_dump_ee_reason, reason); | ||
2235 | tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); | ||
2236 | tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); | ||
2237 | tl = tl_add_int(tl, T_ee_sector, &e->sector); | ||
2238 | tl = tl_add_int(tl, T_ee_block_id, &e->block_id); | ||
2239 | |||
2240 | put_unaligned(T_ee_data, tl++); | ||
2241 | put_unaligned(e->size, tl++); | ||
2242 | |||
2243 | __bio_for_each_segment(bvec, e->private_bio, i, 0) { | ||
2244 | void *d = kmap(bvec->bv_page); | ||
2245 | memcpy(tl, d + bvec->bv_offset, bvec->bv_len); | ||
2246 | kunmap(bvec->bv_page); | ||
2247 | tl=(unsigned short*)((char*)tl + bvec->bv_len); | ||
2248 | } | ||
2249 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2250 | |||
2251 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2252 | cn_reply->id.val = CN_VAL_DRBD; | ||
2253 | |||
2254 | cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); | ||
2255 | cn_reply->ack = 0; // not used here. | ||
2256 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2257 | (int)((char*)tl - (char*)reply->tag_list); | ||
2258 | cn_reply->flags = 0; | ||
2259 | |||
2260 | reply->packet_type = P_dump_ee; | ||
2261 | reply->minor = mdev_to_minor(mdev); | ||
2262 | reply->ret_code = NO_ERROR; | ||
2263 | |||
2264 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2265 | kfree(cn_reply); | ||
2266 | } | ||
2267 | |||
2268 | void drbd_bcast_sync_progress(struct drbd_conf *mdev) | ||
2269 | { | ||
2270 | char buffer[sizeof(struct cn_msg)+ | ||
2271 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2272 | sizeof(struct sync_progress_tag_len_struct)+ | ||
2273 | sizeof(short int)]; | ||
2274 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2275 | struct drbd_nl_cfg_reply *reply = | ||
2276 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2277 | unsigned short *tl = reply->tag_list; | ||
2278 | unsigned long rs_left; | ||
2279 | unsigned int res; | ||
2280 | |||
2281 | /* no local ref, no bitmap, no syncer progress, no broadcast. */ | ||
2282 | if (!get_ldev(mdev)) | ||
2283 | return; | ||
2284 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
2285 | put_ldev(mdev); | ||
2286 | |||
2287 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
2288 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2289 | |||
2290 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2291 | cn_reply->id.val = CN_VAL_DRBD; | ||
2292 | |||
2293 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2294 | cn_reply->ack = 0; /* not used here. */ | ||
2295 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2296 | (int)((char *)tl - (char *)reply->tag_list); | ||
2297 | cn_reply->flags = 0; | ||
2298 | |||
2299 | reply->packet_type = P_sync_progress; | ||
2300 | reply->minor = mdev_to_minor(mdev); | ||
2301 | reply->ret_code = NO_ERROR; | ||
2302 | |||
2303 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2304 | } | ||
2305 | |||
2306 | int __init drbd_nl_init(void) | ||
2307 | { | ||
2308 | static struct cb_id cn_id_drbd; | ||
2309 | int err, try=10; | ||
2310 | |||
2311 | cn_id_drbd.val = CN_VAL_DRBD; | ||
2312 | do { | ||
2313 | cn_id_drbd.idx = cn_idx; | ||
2314 | err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); | ||
2315 | if (!err) | ||
2316 | break; | ||
2317 | cn_idx = (cn_idx + CN_IDX_STEP); | ||
2318 | } while (try--); | ||
2319 | |||
2320 | if (err) { | ||
2321 | printk(KERN_ERR "drbd: cn_drbd failed to register\n"); | ||
2322 | return err; | ||
2323 | } | ||
2324 | |||
2325 | return 0; | ||
2326 | } | ||
2327 | |||
2328 | void drbd_nl_cleanup(void) | ||
2329 | { | ||
2330 | static struct cb_id cn_id_drbd; | ||
2331 | |||
2332 | cn_id_drbd.idx = cn_idx; | ||
2333 | cn_id_drbd.val = CN_VAL_DRBD; | ||
2334 | |||
2335 | cn_del_callback(&cn_id_drbd); | ||
2336 | } | ||
2337 | |||
2338 | void drbd_nl_send_reply(struct cn_msg *req, int ret_code) | ||
2339 | { | ||
2340 | char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; | ||
2341 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2342 | struct drbd_nl_cfg_reply *reply = | ||
2343 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2344 | int rr; | ||
2345 | |||
2346 | cn_reply->id = req->id; | ||
2347 | |||
2348 | cn_reply->seq = req->seq; | ||
2349 | cn_reply->ack = req->ack + 1; | ||
2350 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply); | ||
2351 | cn_reply->flags = 0; | ||
2352 | |||
2353 | reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; | ||
2354 | reply->ret_code = ret_code; | ||
2355 | |||
2356 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2357 | if (rr && rr != -ESRCH) | ||
2358 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2359 | } | ||
2360 | |||
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 000000000000..bdd0b4943b10 --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -0,0 +1,265 @@ | |||
1 | /* | ||
2 | drbd_proc.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/file.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/proc_fs.h> | ||
33 | #include <linux/seq_file.h> | ||
34 | #include <linux/drbd.h> | ||
35 | #include "drbd_int.h" | ||
36 | |||
37 | static int drbd_proc_open(struct inode *inode, struct file *file); | ||
38 | |||
39 | |||
40 | struct proc_dir_entry *drbd_proc; | ||
41 | struct file_operations drbd_proc_fops = { | ||
42 | .owner = THIS_MODULE, | ||
43 | .open = drbd_proc_open, | ||
44 | .read = seq_read, | ||
45 | .llseek = seq_lseek, | ||
46 | .release = single_release, | ||
47 | }; | ||
48 | |||
49 | |||
50 | /*lge | ||
51 | * progress bars shamelessly adapted from driver/md/md.c | ||
52 | * output looks like | ||
53 | * [=====>..............] 33.5% (23456/123456) | ||
54 | * finish: 2:20:20 speed: 6,345 (6,456) K/sec | ||
55 | */ | ||
56 | static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | ||
57 | { | ||
58 | unsigned long db, dt, dbdt, rt, rs_left; | ||
59 | unsigned int res; | ||
60 | int i, x, y; | ||
61 | |||
62 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
63 | |||
64 | x = res/50; | ||
65 | y = 20-x; | ||
66 | seq_printf(seq, "\t["); | ||
67 | for (i = 1; i < x; i++) | ||
68 | seq_printf(seq, "="); | ||
69 | seq_printf(seq, ">"); | ||
70 | for (i = 0; i < y; i++) | ||
71 | seq_printf(seq, "."); | ||
72 | seq_printf(seq, "] "); | ||
73 | |||
74 | seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); | ||
75 | /* if more than 1 GB display in MB */ | ||
76 | if (mdev->rs_total > 0x100000L) | ||
77 | seq_printf(seq, "(%lu/%lu)M\n\t", | ||
78 | (unsigned long) Bit2KB(rs_left >> 10), | ||
79 | (unsigned long) Bit2KB(mdev->rs_total >> 10)); | ||
80 | else | ||
81 | seq_printf(seq, "(%lu/%lu)K\n\t", | ||
82 | (unsigned long) Bit2KB(rs_left), | ||
83 | (unsigned long) Bit2KB(mdev->rs_total)); | ||
84 | |||
85 | /* see drivers/md/md.c | ||
86 | * We do not want to overflow, so the order of operands and | ||
87 | * the * 100 / 100 trick are important. We do a +1 to be | ||
88 | * safe against division by zero. We only estimate anyway. | ||
89 | * | ||
90 | * dt: time from mark until now | ||
91 | * db: blocks written from mark until now | ||
92 | * rt: remaining time | ||
93 | */ | ||
94 | dt = (jiffies - mdev->rs_mark_time) / HZ; | ||
95 | |||
96 | if (dt > 20) { | ||
97 | /* if we made no update to rs_mark_time for too long, | ||
98 | * we are stalled. show that. */ | ||
99 | seq_printf(seq, "stalled\n"); | ||
100 | return; | ||
101 | } | ||
102 | |||
103 | if (!dt) | ||
104 | dt++; | ||
105 | db = mdev->rs_mark_left - rs_left; | ||
106 | rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ | ||
107 | |||
108 | seq_printf(seq, "finish: %lu:%02lu:%02lu", | ||
109 | rt / 3600, (rt % 3600) / 60, rt % 60); | ||
110 | |||
111 | /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ | ||
112 | dbdt = Bit2KB(db/dt); | ||
113 | if (dbdt > 1000) | ||
114 | seq_printf(seq, " speed: %ld,%03ld", | ||
115 | dbdt/1000, dbdt % 1000); | ||
116 | else | ||
117 | seq_printf(seq, " speed: %ld", dbdt); | ||
118 | |||
119 | /* mean speed since syncer started | ||
120 | * we do account for PausedSync periods */ | ||
121 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | ||
122 | if (dt <= 0) | ||
123 | dt = 1; | ||
124 | db = mdev->rs_total - rs_left; | ||
125 | dbdt = Bit2KB(db/dt); | ||
126 | if (dbdt > 1000) | ||
127 | seq_printf(seq, " (%ld,%03ld)", | ||
128 | dbdt/1000, dbdt % 1000); | ||
129 | else | ||
130 | seq_printf(seq, " (%ld)", dbdt); | ||
131 | |||
132 | seq_printf(seq, " K/sec\n"); | ||
133 | } | ||
134 | |||
135 | static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) | ||
136 | { | ||
137 | struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); | ||
138 | |||
139 | seq_printf(seq, "%5d %s %s\n", bme->rs_left, | ||
140 | bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", | ||
141 | bme->flags & BME_LOCKED ? "LOCKED" : "------" | ||
142 | ); | ||
143 | } | ||
144 | |||
145 | static int drbd_seq_show(struct seq_file *seq, void *v) | ||
146 | { | ||
147 | int i, hole = 0; | ||
148 | const char *sn; | ||
149 | struct drbd_conf *mdev; | ||
150 | |||
151 | static char write_ordering_chars[] = { | ||
152 | [WO_none] = 'n', | ||
153 | [WO_drain_io] = 'd', | ||
154 | [WO_bdev_flush] = 'f', | ||
155 | [WO_bio_barrier] = 'b', | ||
156 | }; | ||
157 | |||
158 | seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", | ||
159 | API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); | ||
160 | |||
161 | /* | ||
162 | cs .. connection state | ||
163 | ro .. node role (local/remote) | ||
164 | ds .. disk state (local/remote) | ||
165 | protocol | ||
166 | various flags | ||
167 | ns .. network send | ||
168 | nr .. network receive | ||
169 | dw .. disk write | ||
170 | dr .. disk read | ||
171 | al .. activity log write count | ||
172 | bm .. bitmap update write count | ||
173 | pe .. pending (waiting for ack or data reply) | ||
174 | ua .. unack'd (still need to send ack or data reply) | ||
175 | ap .. application requests accepted, but not yet completed | ||
176 | ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending | ||
177 | wo .. write ordering mode currently in use | ||
178 | oos .. known out-of-sync kB | ||
179 | */ | ||
180 | |||
181 | for (i = 0; i < minor_count; i++) { | ||
182 | mdev = minor_to_mdev(i); | ||
183 | if (!mdev) { | ||
184 | hole = 1; | ||
185 | continue; | ||
186 | } | ||
187 | if (hole) { | ||
188 | hole = 0; | ||
189 | seq_printf(seq, "\n"); | ||
190 | } | ||
191 | |||
192 | sn = drbd_conn_str(mdev->state.conn); | ||
193 | |||
194 | if (mdev->state.conn == C_STANDALONE && | ||
195 | mdev->state.disk == D_DISKLESS && | ||
196 | mdev->state.role == R_SECONDARY) { | ||
197 | seq_printf(seq, "%2d: cs:Unconfigured\n", i); | ||
198 | } else { | ||
199 | seq_printf(seq, | ||
200 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" | ||
201 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " | ||
202 | "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", | ||
203 | i, sn, | ||
204 | drbd_role_str(mdev->state.role), | ||
205 | drbd_role_str(mdev->state.peer), | ||
206 | drbd_disk_str(mdev->state.disk), | ||
207 | drbd_disk_str(mdev->state.pdsk), | ||
208 | (mdev->net_conf == NULL ? ' ' : | ||
209 | (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), | ||
210 | mdev->state.susp ? 's' : 'r', | ||
211 | mdev->state.aftr_isp ? 'a' : '-', | ||
212 | mdev->state.peer_isp ? 'p' : '-', | ||
213 | mdev->state.user_isp ? 'u' : '-', | ||
214 | mdev->congestion_reason ?: '-', | ||
215 | mdev->send_cnt/2, | ||
216 | mdev->recv_cnt/2, | ||
217 | mdev->writ_cnt/2, | ||
218 | mdev->read_cnt/2, | ||
219 | mdev->al_writ_cnt, | ||
220 | mdev->bm_writ_cnt, | ||
221 | atomic_read(&mdev->local_cnt), | ||
222 | atomic_read(&mdev->ap_pending_cnt) + | ||
223 | atomic_read(&mdev->rs_pending_cnt), | ||
224 | atomic_read(&mdev->unacked_cnt), | ||
225 | atomic_read(&mdev->ap_bio_cnt), | ||
226 | mdev->epochs, | ||
227 | write_ordering_chars[mdev->write_ordering] | ||
228 | ); | ||
229 | seq_printf(seq, " oos:%lu\n", | ||
230 | Bit2KB(drbd_bm_total_weight(mdev))); | ||
231 | } | ||
232 | if (mdev->state.conn == C_SYNC_SOURCE || | ||
233 | mdev->state.conn == C_SYNC_TARGET) | ||
234 | drbd_syncer_progress(mdev, seq); | ||
235 | |||
236 | if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) | ||
237 | seq_printf(seq, "\t%3d%% %lu/%lu\n", | ||
238 | (int)((mdev->rs_total-mdev->ov_left) / | ||
239 | (mdev->rs_total/100+1)), | ||
240 | mdev->rs_total - mdev->ov_left, | ||
241 | mdev->rs_total); | ||
242 | |||
243 | if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { | ||
244 | lc_seq_printf_stats(seq, mdev->resync); | ||
245 | lc_seq_printf_stats(seq, mdev->act_log); | ||
246 | put_ldev(mdev); | ||
247 | } | ||
248 | |||
249 | if (proc_details >= 2) { | ||
250 | if (mdev->resync) { | ||
251 | lc_seq_dump_details(seq, mdev->resync, "rs_left", | ||
252 | resync_dump_detail); | ||
253 | } | ||
254 | } | ||
255 | } | ||
256 | |||
257 | return 0; | ||
258 | } | ||
259 | |||
260 | static int drbd_proc_open(struct inode *inode, struct file *file) | ||
261 | { | ||
262 | return single_open(file, drbd_seq_show, PDE(inode)->data); | ||
263 | } | ||
264 | |||
265 | /* PROC FS stuff end */ | ||
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 000000000000..360baf60f574 --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -0,0 +1,4427 @@ | |||
1 | /* | ||
2 | drbd_receiver.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <net/sock.h> | ||
30 | |||
31 | #include <linux/version.h> | ||
32 | #include <linux/drbd.h> | ||
33 | #include <linux/fs.h> | ||
34 | #include <linux/file.h> | ||
35 | #include <linux/in.h> | ||
36 | #include <linux/mm.h> | ||
37 | #include <linux/memcontrol.h> | ||
38 | #include <linux/mm_inline.h> | ||
39 | #include <linux/slab.h> | ||
40 | #include <linux/smp_lock.h> | ||
41 | #include <linux/pkt_sched.h> | ||
42 | #define __KERNEL_SYSCALLS__ | ||
43 | #include <linux/unistd.h> | ||
44 | #include <linux/vmalloc.h> | ||
45 | #include <linux/random.h> | ||
46 | #include <linux/mm.h> | ||
47 | #include <linux/string.h> | ||
48 | #include <linux/scatterlist.h> | ||
49 | #include "drbd_int.h" | ||
50 | #include "drbd_req.h" | ||
51 | |||
52 | #include "drbd_vli.h" | ||
53 | |||
54 | struct flush_work { | ||
55 | struct drbd_work w; | ||
56 | struct drbd_epoch *epoch; | ||
57 | }; | ||
58 | |||
/* Result type of drbd_may_finish_epoch() and drbd_flush_after_epoch().
 * NOTE(review): the per-value notes are inferred from the names only;
 * confirm against the body of drbd_may_finish_epoch(). */
enum finish_epoch {
	FE_STILL_LIVE,	/* initial/default result in drbd_may_finish_epoch() */
	FE_DESTROYED,	/* presumably: the epoch object was freed */
	FE_RECYCLED,	/* presumably: the epoch object was reset for reuse */
};
64 | |||
/* Forward declarations for functions used before their definition. */
static int drbd_do_handshake(struct drbd_conf *mdev);
static int drbd_do_auth(struct drbd_conf *mdev);

static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
					       struct drbd_epoch *epoch,
					       enum epoch_event ev);
static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int unused);
70 | |||
71 | static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) | ||
72 | { | ||
73 | struct drbd_epoch *prev; | ||
74 | spin_lock(&mdev->epoch_lock); | ||
75 | prev = list_entry(epoch->list.prev, struct drbd_epoch, list); | ||
76 | if (prev == epoch || prev == mdev->current_epoch) | ||
77 | prev = NULL; | ||
78 | spin_unlock(&mdev->epoch_lock); | ||
79 | return prev; | ||
80 | } | ||
81 | |||
82 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | ||
83 | |||
84 | static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) | ||
85 | { | ||
86 | struct page *page = NULL; | ||
87 | |||
88 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | ||
89 | * So what. It saves a spin_lock. */ | ||
90 | if (drbd_pp_vacant > 0) { | ||
91 | spin_lock(&drbd_pp_lock); | ||
92 | page = drbd_pp_pool; | ||
93 | if (page) { | ||
94 | drbd_pp_pool = (struct page *)page_private(page); | ||
95 | set_page_private(page, 0); /* just to be polite */ | ||
96 | drbd_pp_vacant--; | ||
97 | } | ||
98 | spin_unlock(&drbd_pp_lock); | ||
99 | } | ||
100 | /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD | ||
101 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
102 | * which in turn might block on the other node at this very place. */ | ||
103 | if (!page) | ||
104 | page = alloc_page(GFP_TRY); | ||
105 | if (page) | ||
106 | atomic_inc(&mdev->pp_in_use); | ||
107 | return page; | ||
108 | } | ||
109 | |||
110 | /* kick lower level device, if we have more than (arbitrary number) | ||
111 | * reference counts on it, which typically are locally submitted io | ||
112 | * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ | ||
113 | static void maybe_kick_lo(struct drbd_conf *mdev) | ||
114 | { | ||
115 | if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) | ||
116 | drbd_kick_lo(mdev); | ||
117 | } | ||
118 | |||
119 | static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) | ||
120 | { | ||
121 | struct drbd_epoch_entry *e; | ||
122 | struct list_head *le, *tle; | ||
123 | |||
124 | /* The EEs are always appended to the end of the list. Since | ||
125 | they are sent in order over the wire, they have to finish | ||
126 | in order. As soon as we see the first not finished we can | ||
127 | stop to examine the list... */ | ||
128 | |||
129 | list_for_each_safe(le, tle, &mdev->net_ee) { | ||
130 | e = list_entry(le, struct drbd_epoch_entry, w.list); | ||
131 | if (drbd_bio_has_active_page(e->private_bio)) | ||
132 | break; | ||
133 | list_move(le, to_be_freed); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | ||
138 | { | ||
139 | LIST_HEAD(reclaimed); | ||
140 | struct drbd_epoch_entry *e, *t; | ||
141 | |||
142 | maybe_kick_lo(mdev); | ||
143 | spin_lock_irq(&mdev->req_lock); | ||
144 | reclaim_net_ee(mdev, &reclaimed); | ||
145 | spin_unlock_irq(&mdev->req_lock); | ||
146 | |||
147 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | ||
148 | drbd_free_ee(mdev, e); | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * drbd_pp_alloc() - Returns a page, fails only if a signal comes in | ||
153 | * @mdev: DRBD device. | ||
154 | * @retry: whether or not to retry allocation forever (or until signalled) | ||
155 | * | ||
156 | * Tries to allocate a page, first from our own page pool, then from the | ||
157 | * kernel, unless this allocation would exceed the max_buffers setting. | ||
158 | * If @retry is non-zero, retry until DRBD frees a page somewhere else. | ||
159 | */ | ||
160 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) | ||
161 | { | ||
162 | struct page *page = NULL; | ||
163 | DEFINE_WAIT(wait); | ||
164 | |||
165 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | ||
166 | page = drbd_pp_first_page_or_try_alloc(mdev); | ||
167 | if (page) | ||
168 | return page; | ||
169 | } | ||
170 | |||
171 | for (;;) { | ||
172 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | ||
173 | |||
174 | drbd_kick_lo_and_reclaim_net(mdev); | ||
175 | |||
176 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | ||
177 | page = drbd_pp_first_page_or_try_alloc(mdev); | ||
178 | if (page) | ||
179 | break; | ||
180 | } | ||
181 | |||
182 | if (!retry) | ||
183 | break; | ||
184 | |||
185 | if (signal_pending(current)) { | ||
186 | dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); | ||
187 | break; | ||
188 | } | ||
189 | |||
190 | schedule(); | ||
191 | } | ||
192 | finish_wait(&drbd_pp_wait, &wait); | ||
193 | |||
194 | return page; | ||
195 | } | ||
196 | |||
197 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | ||
198 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ | ||
199 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) | ||
200 | { | ||
201 | int free_it; | ||
202 | |||
203 | spin_lock(&drbd_pp_lock); | ||
204 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
205 | free_it = 1; | ||
206 | } else { | ||
207 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
208 | drbd_pp_pool = page; | ||
209 | drbd_pp_vacant++; | ||
210 | free_it = 0; | ||
211 | } | ||
212 | spin_unlock(&drbd_pp_lock); | ||
213 | |||
214 | atomic_dec(&mdev->pp_in_use); | ||
215 | |||
216 | if (free_it) | ||
217 | __free_page(page); | ||
218 | |||
219 | wake_up(&drbd_pp_wait); | ||
220 | } | ||
221 | |||
222 | static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) | ||
223 | { | ||
224 | struct page *p_to_be_freed = NULL; | ||
225 | struct page *page; | ||
226 | struct bio_vec *bvec; | ||
227 | int i; | ||
228 | |||
229 | spin_lock(&drbd_pp_lock); | ||
230 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
231 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
232 | set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); | ||
233 | p_to_be_freed = bvec->bv_page; | ||
234 | } else { | ||
235 | set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); | ||
236 | drbd_pp_pool = bvec->bv_page; | ||
237 | drbd_pp_vacant++; | ||
238 | } | ||
239 | } | ||
240 | spin_unlock(&drbd_pp_lock); | ||
241 | atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); | ||
242 | |||
243 | while (p_to_be_freed) { | ||
244 | page = p_to_be_freed; | ||
245 | p_to_be_freed = (struct page *)page_private(page); | ||
246 | set_page_private(page, 0); /* just to be polite */ | ||
247 | put_page(page); | ||
248 | } | ||
249 | |||
250 | wake_up(&drbd_pp_wait); | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | You need to hold the req_lock: | ||
255 | _drbd_wait_ee_list_empty() | ||
256 | |||
257 | You must not have the req_lock: | ||
258 | drbd_free_ee() | ||
259 | drbd_alloc_ee() | ||
260 | drbd_init_ee() | ||
261 | drbd_release_ee() | ||
262 | drbd_ee_fix_bhs() | ||
263 | drbd_process_done_ee() | ||
264 | drbd_clear_done_ee() | ||
265 | drbd_wait_ee_list_empty() | ||
266 | */ | ||
267 | |||
268 | struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | ||
269 | u64 id, | ||
270 | sector_t sector, | ||
271 | unsigned int data_size, | ||
272 | gfp_t gfp_mask) __must_hold(local) | ||
273 | { | ||
274 | struct request_queue *q; | ||
275 | struct drbd_epoch_entry *e; | ||
276 | struct page *page; | ||
277 | struct bio *bio; | ||
278 | unsigned int ds; | ||
279 | |||
280 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) | ||
281 | return NULL; | ||
282 | |||
283 | e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); | ||
284 | if (!e) { | ||
285 | if (!(gfp_mask & __GFP_NOWARN)) | ||
286 | dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); | ||
287 | return NULL; | ||
288 | } | ||
289 | |||
290 | bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); | ||
291 | if (!bio) { | ||
292 | if (!(gfp_mask & __GFP_NOWARN)) | ||
293 | dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); | ||
294 | goto fail1; | ||
295 | } | ||
296 | |||
297 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
298 | bio->bi_sector = sector; | ||
299 | |||
300 | ds = data_size; | ||
301 | while (ds) { | ||
302 | page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); | ||
303 | if (!page) { | ||
304 | if (!(gfp_mask & __GFP_NOWARN)) | ||
305 | dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); | ||
306 | goto fail2; | ||
307 | } | ||
308 | if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { | ||
309 | drbd_pp_free(mdev, page); | ||
310 | dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," | ||
311 | "data_size=%u,ds=%u) failed\n", | ||
312 | (unsigned long long)sector, data_size, ds); | ||
313 | |||
314 | q = bdev_get_queue(bio->bi_bdev); | ||
315 | if (q->merge_bvec_fn) { | ||
316 | struct bvec_merge_data bvm = { | ||
317 | .bi_bdev = bio->bi_bdev, | ||
318 | .bi_sector = bio->bi_sector, | ||
319 | .bi_size = bio->bi_size, | ||
320 | .bi_rw = bio->bi_rw, | ||
321 | }; | ||
322 | int l = q->merge_bvec_fn(q, &bvm, | ||
323 | &bio->bi_io_vec[bio->bi_vcnt]); | ||
324 | dev_err(DEV, "merge_bvec_fn() = %d\n", l); | ||
325 | } | ||
326 | |||
327 | /* dump more of the bio. */ | ||
328 | dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); | ||
329 | dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); | ||
330 | dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); | ||
331 | dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); | ||
332 | |||
333 | goto fail2; | ||
334 | break; | ||
335 | } | ||
336 | ds -= min_t(int, ds, PAGE_SIZE); | ||
337 | } | ||
338 | |||
339 | D_ASSERT(data_size == bio->bi_size); | ||
340 | |||
341 | bio->bi_private = e; | ||
342 | e->mdev = mdev; | ||
343 | e->sector = sector; | ||
344 | e->size = bio->bi_size; | ||
345 | |||
346 | e->private_bio = bio; | ||
347 | e->block_id = id; | ||
348 | INIT_HLIST_NODE(&e->colision); | ||
349 | e->epoch = NULL; | ||
350 | e->flags = 0; | ||
351 | |||
352 | return e; | ||
353 | |||
354 | fail2: | ||
355 | drbd_pp_free_bio_pages(mdev, bio); | ||
356 | bio_put(bio); | ||
357 | fail1: | ||
358 | mempool_free(e, drbd_ee_mempool); | ||
359 | |||
360 | return NULL; | ||
361 | } | ||
362 | |||
363 | void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
364 | { | ||
365 | struct bio *bio = e->private_bio; | ||
366 | drbd_pp_free_bio_pages(mdev, bio); | ||
367 | bio_put(bio); | ||
368 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
369 | mempool_free(e, drbd_ee_mempool); | ||
370 | } | ||
371 | |||
372 | int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) | ||
373 | { | ||
374 | LIST_HEAD(work_list); | ||
375 | struct drbd_epoch_entry *e, *t; | ||
376 | int count = 0; | ||
377 | |||
378 | spin_lock_irq(&mdev->req_lock); | ||
379 | list_splice_init(list, &work_list); | ||
380 | spin_unlock_irq(&mdev->req_lock); | ||
381 | |||
382 | list_for_each_entry_safe(e, t, &work_list, w.list) { | ||
383 | drbd_free_ee(mdev, e); | ||
384 | count++; | ||
385 | } | ||
386 | return count; | ||
387 | } | ||
388 | |||
389 | |||
390 | /* | ||
391 | * This function is called from _asender only_ | ||
392 | * but see also comments in _req_mod(,barrier_acked) | ||
393 | * and receive_Barrier. | ||
394 | * | ||
395 | * Move entries from net_ee to done_ee, if ready. | ||
396 | * Grab done_ee, call all callbacks, free the entries. | ||
397 | * The callbacks typically send out ACKs. | ||
398 | */ | ||
399 | static int drbd_process_done_ee(struct drbd_conf *mdev) | ||
400 | { | ||
401 | LIST_HEAD(work_list); | ||
402 | LIST_HEAD(reclaimed); | ||
403 | struct drbd_epoch_entry *e, *t; | ||
404 | int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); | ||
405 | |||
406 | spin_lock_irq(&mdev->req_lock); | ||
407 | reclaim_net_ee(mdev, &reclaimed); | ||
408 | list_splice_init(&mdev->done_ee, &work_list); | ||
409 | spin_unlock_irq(&mdev->req_lock); | ||
410 | |||
411 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | ||
412 | drbd_free_ee(mdev, e); | ||
413 | |||
414 | /* possible callbacks here: | ||
415 | * e_end_block, and e_end_resync_block, e_send_discard_ack. | ||
416 | * all ignore the last argument. | ||
417 | */ | ||
418 | list_for_each_entry_safe(e, t, &work_list, w.list) { | ||
419 | /* list_del not necessary, next/prev members not touched */ | ||
420 | ok = e->w.cb(mdev, &e->w, !ok) && ok; | ||
421 | drbd_free_ee(mdev, e); | ||
422 | } | ||
423 | wake_up(&mdev->ee_wait); | ||
424 | |||
425 | return ok; | ||
426 | } | ||
427 | |||
428 | void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | ||
429 | { | ||
430 | DEFINE_WAIT(wait); | ||
431 | |||
432 | /* avoids spin_lock/unlock | ||
433 | * and calling prepare_to_wait in the fast path */ | ||
434 | while (!list_empty(head)) { | ||
435 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
436 | spin_unlock_irq(&mdev->req_lock); | ||
437 | drbd_kick_lo(mdev); | ||
438 | schedule(); | ||
439 | finish_wait(&mdev->ee_wait, &wait); | ||
440 | spin_lock_irq(&mdev->req_lock); | ||
441 | } | ||
442 | } | ||
443 | |||
444 | void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | ||
445 | { | ||
446 | spin_lock_irq(&mdev->req_lock); | ||
447 | _drbd_wait_ee_list_empty(mdev, head); | ||
448 | spin_unlock_irq(&mdev->req_lock); | ||
449 | } | ||
450 | |||
451 | /* see also kernel_accept; which is only present since 2.6.18. | ||
452 | * also we want to log which part of it failed, exactly */ | ||
453 | static int drbd_accept(struct drbd_conf *mdev, const char **what, | ||
454 | struct socket *sock, struct socket **newsock) | ||
455 | { | ||
456 | struct sock *sk = sock->sk; | ||
457 | int err = 0; | ||
458 | |||
459 | *what = "listen"; | ||
460 | err = sock->ops->listen(sock, 5); | ||
461 | if (err < 0) | ||
462 | goto out; | ||
463 | |||
464 | *what = "sock_create_lite"; | ||
465 | err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, | ||
466 | newsock); | ||
467 | if (err < 0) | ||
468 | goto out; | ||
469 | |||
470 | *what = "accept"; | ||
471 | err = sock->ops->accept(sock, *newsock, 0); | ||
472 | if (err < 0) { | ||
473 | sock_release(*newsock); | ||
474 | *newsock = NULL; | ||
475 | goto out; | ||
476 | } | ||
477 | (*newsock)->ops = sock->ops; | ||
478 | |||
479 | out: | ||
480 | return err; | ||
481 | } | ||
482 | |||
483 | static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | ||
484 | void *buf, size_t size, int flags) | ||
485 | { | ||
486 | mm_segment_t oldfs; | ||
487 | struct kvec iov = { | ||
488 | .iov_base = buf, | ||
489 | .iov_len = size, | ||
490 | }; | ||
491 | struct msghdr msg = { | ||
492 | .msg_iovlen = 1, | ||
493 | .msg_iov = (struct iovec *)&iov, | ||
494 | .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) | ||
495 | }; | ||
496 | int rv; | ||
497 | |||
498 | oldfs = get_fs(); | ||
499 | set_fs(KERNEL_DS); | ||
500 | rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); | ||
501 | set_fs(oldfs); | ||
502 | |||
503 | return rv; | ||
504 | } | ||
505 | |||
506 | static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) | ||
507 | { | ||
508 | mm_segment_t oldfs; | ||
509 | struct kvec iov = { | ||
510 | .iov_base = buf, | ||
511 | .iov_len = size, | ||
512 | }; | ||
513 | struct msghdr msg = { | ||
514 | .msg_iovlen = 1, | ||
515 | .msg_iov = (struct iovec *)&iov, | ||
516 | .msg_flags = MSG_WAITALL | MSG_NOSIGNAL | ||
517 | }; | ||
518 | int rv; | ||
519 | |||
520 | oldfs = get_fs(); | ||
521 | set_fs(KERNEL_DS); | ||
522 | |||
523 | for (;;) { | ||
524 | rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); | ||
525 | if (rv == size) | ||
526 | break; | ||
527 | |||
528 | /* Note: | ||
529 | * ECONNRESET other side closed the connection | ||
530 | * ERESTARTSYS (on sock) we got a signal | ||
531 | */ | ||
532 | |||
533 | if (rv < 0) { | ||
534 | if (rv == -ECONNRESET) | ||
535 | dev_info(DEV, "sock was reset by peer\n"); | ||
536 | else if (rv != -ERESTARTSYS) | ||
537 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
538 | break; | ||
539 | } else if (rv == 0) { | ||
540 | dev_info(DEV, "sock was shut down by peer\n"); | ||
541 | break; | ||
542 | } else { | ||
543 | /* signal came in, or peer/link went down, | ||
544 | * after we read a partial message | ||
545 | */ | ||
546 | /* D_ASSERT(signal_pending(current)); */ | ||
547 | break; | ||
548 | } | ||
549 | }; | ||
550 | |||
551 | set_fs(oldfs); | ||
552 | |||
553 | if (rv != size) | ||
554 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | ||
555 | |||
556 | return rv; | ||
557 | } | ||
558 | |||
/* Actively try to connect to the peer once.
 *
 * Creates a TCP socket, binds it to the configured local address (with port
 * 0) and connects to the configured peer address.  Returns the connected
 * socket, or NULL if there is no net config or any step failed.
 *
 * Failures during socket creation or bind force the connection state to
 * C_DISCONNECTING, unless the error is one of the "try again later" kind
 * listed in the switch below.  A failing connect() never does. */
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	struct sockaddr_in6 bind_addr;
	struct socket *sock;
	const char *step;
	int family;
	int fatal = 1;
	int err;

	if (!get_net_conf(mdev))
		return NULL;

	family = ((struct sockaddr *)mdev->net_conf->my_addr)->sa_family;

	step = "sock_create_kern";
	err = sock_create_kern(family, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
	sock->sk->sk_rcvtimeo = sock->sk->sk_sndtimeo;

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	memcpy(&bind_addr, mdev->net_conf->my_addr,
	       min_t(int, mdev->net_conf->my_addr_len, sizeof(bind_addr)));
	if (family != AF_INET6)
		((struct sockaddr_in *)&bind_addr)->sin_port = 0; /* AF_INET & AF_SCI */
	else
		bind_addr.sin6_port = 0;

	step = "bind before connect";
	err = sock->ops->bind(sock,
			      (struct sockaddr *) &bind_addr,
			      mdev->net_conf->my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	fatal = 0;
	step = "connect";
	err = sock->ops->connect(sock,
				 (struct sockaddr *)mdev->net_conf->peer_addr,
				 mdev->net_conf->peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT:
		case EAGAIN:
		case EINPROGRESS:
		case EINTR:
		case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED:
		case ENETUNREACH:
		case EHOSTDOWN:
		case EHOSTUNREACH:
			fatal = 0;
			break;
		default:
			dev_err(DEV, "%s failed, err = %d\n", step, err);
		}
		if (fatal)
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}
	put_net_conf(mdev);
	return sock;
}
634 | |||
635 | static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) | ||
636 | { | ||
637 | int timeo, err; | ||
638 | struct socket *s_estab = NULL, *s_listen; | ||
639 | const char *what; | ||
640 | |||
641 | if (!get_net_conf(mdev)) | ||
642 | return NULL; | ||
643 | |||
644 | what = "sock_create_kern"; | ||
645 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | ||
646 | SOCK_STREAM, IPPROTO_TCP, &s_listen); | ||
647 | if (err) { | ||
648 | s_listen = NULL; | ||
649 | goto out; | ||
650 | } | ||
651 | |||
652 | timeo = mdev->net_conf->try_connect_int * HZ; | ||
653 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | ||
654 | |||
655 | s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
656 | s_listen->sk->sk_rcvtimeo = timeo; | ||
657 | s_listen->sk->sk_sndtimeo = timeo; | ||
658 | |||
659 | what = "bind before listen"; | ||
660 | err = s_listen->ops->bind(s_listen, | ||
661 | (struct sockaddr *) mdev->net_conf->my_addr, | ||
662 | mdev->net_conf->my_addr_len); | ||
663 | if (err < 0) | ||
664 | goto out; | ||
665 | |||
666 | err = drbd_accept(mdev, &what, s_listen, &s_estab); | ||
667 | |||
668 | out: | ||
669 | if (s_listen) | ||
670 | sock_release(s_listen); | ||
671 | if (err < 0) { | ||
672 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | ||
673 | dev_err(DEV, "%s failed, err = %d\n", what, err); | ||
674 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
675 | } | ||
676 | } | ||
677 | put_net_conf(mdev); | ||
678 | |||
679 | return s_estab; | ||
680 | } | ||
681 | |||
682 | static int drbd_send_fp(struct drbd_conf *mdev, | ||
683 | struct socket *sock, enum drbd_packets cmd) | ||
684 | { | ||
685 | struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; | ||
686 | |||
687 | return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); | ||
688 | } | ||
689 | |||
690 | static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) | ||
691 | { | ||
692 | struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; | ||
693 | int rr; | ||
694 | |||
695 | rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); | ||
696 | |||
697 | if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) | ||
698 | return be16_to_cpu(h->command); | ||
699 | |||
700 | return 0xffff; | ||
701 | } | ||
702 | |||
703 | /** | ||
704 | * drbd_socket_okay() - Free the socket if its connection is not okay | ||
705 | * @mdev: DRBD device. | ||
706 | * @sock: pointer to the pointer to the socket. | ||
707 | */ | ||
708 | static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | ||
709 | { | ||
710 | int rr; | ||
711 | char tb[4]; | ||
712 | |||
713 | if (!*sock) | ||
714 | return FALSE; | ||
715 | |||
716 | rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); | ||
717 | |||
718 | if (rr > 0 || rr == -EAGAIN) { | ||
719 | return TRUE; | ||
720 | } else { | ||
721 | sock_release(*sock); | ||
722 | *sock = NULL; | ||
723 | return FALSE; | ||
724 | } | ||
725 | } | ||
726 | |||
727 | /* | ||
728 | * return values: | ||
729 | * 1 yes, we have a valid connection | ||
730 | * 0 oops, did not work out, please try again | ||
731 | * -1 peer talks different language, | ||
732 | * no point in trying again, please go standalone. | ||
733 | * -2 We do not have a network config... | ||
734 | */ | ||
735 | static int drbd_connect(struct drbd_conf *mdev) | ||
736 | { | ||
737 | struct socket *s, *sock, *msock; | ||
738 | int try, h, ok; | ||
739 | |||
740 | D_ASSERT(!mdev->data.socket); | ||
741 | |||
742 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) | ||
743 | dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); | ||
744 | |||
745 | if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) | ||
746 | return -2; | ||
747 | |||
748 | clear_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
749 | |||
750 | sock = NULL; | ||
751 | msock = NULL; | ||
752 | |||
753 | do { | ||
754 | for (try = 0;;) { | ||
755 | /* 3 tries, this should take less than a second! */ | ||
756 | s = drbd_try_connect(mdev); | ||
757 | if (s || ++try >= 3) | ||
758 | break; | ||
759 | /* give the other side time to call bind() & listen() */ | ||
760 | __set_current_state(TASK_INTERRUPTIBLE); | ||
761 | schedule_timeout(HZ / 10); | ||
762 | } | ||
763 | |||
764 | if (s) { | ||
765 | if (!sock) { | ||
766 | drbd_send_fp(mdev, s, P_HAND_SHAKE_S); | ||
767 | sock = s; | ||
768 | s = NULL; | ||
769 | } else if (!msock) { | ||
770 | drbd_send_fp(mdev, s, P_HAND_SHAKE_M); | ||
771 | msock = s; | ||
772 | s = NULL; | ||
773 | } else { | ||
774 | dev_err(DEV, "Logic error in drbd_connect()\n"); | ||
775 | goto out_release_sockets; | ||
776 | } | ||
777 | } | ||
778 | |||
779 | if (sock && msock) { | ||
780 | __set_current_state(TASK_INTERRUPTIBLE); | ||
781 | schedule_timeout(HZ / 10); | ||
782 | ok = drbd_socket_okay(mdev, &sock); | ||
783 | ok = drbd_socket_okay(mdev, &msock) && ok; | ||
784 | if (ok) | ||
785 | break; | ||
786 | } | ||
787 | |||
788 | retry: | ||
789 | s = drbd_wait_for_connect(mdev); | ||
790 | if (s) { | ||
791 | try = drbd_recv_fp(mdev, s); | ||
792 | drbd_socket_okay(mdev, &sock); | ||
793 | drbd_socket_okay(mdev, &msock); | ||
794 | switch (try) { | ||
795 | case P_HAND_SHAKE_S: | ||
796 | if (sock) { | ||
797 | dev_warn(DEV, "initial packet S crossed\n"); | ||
798 | sock_release(sock); | ||
799 | } | ||
800 | sock = s; | ||
801 | break; | ||
802 | case P_HAND_SHAKE_M: | ||
803 | if (msock) { | ||
804 | dev_warn(DEV, "initial packet M crossed\n"); | ||
805 | sock_release(msock); | ||
806 | } | ||
807 | msock = s; | ||
808 | set_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
809 | break; | ||
810 | default: | ||
811 | dev_warn(DEV, "Error receiving initial packet\n"); | ||
812 | sock_release(s); | ||
813 | if (random32() & 1) | ||
814 | goto retry; | ||
815 | } | ||
816 | } | ||
817 | |||
818 | if (mdev->state.conn <= C_DISCONNECTING) | ||
819 | goto out_release_sockets; | ||
820 | if (signal_pending(current)) { | ||
821 | flush_signals(current); | ||
822 | smp_rmb(); | ||
823 | if (get_t_state(&mdev->receiver) == Exiting) | ||
824 | goto out_release_sockets; | ||
825 | } | ||
826 | |||
827 | if (sock && msock) { | ||
828 | ok = drbd_socket_okay(mdev, &sock); | ||
829 | ok = drbd_socket_okay(mdev, &msock) && ok; | ||
830 | if (ok) | ||
831 | break; | ||
832 | } | ||
833 | } while (1); | ||
834 | |||
835 | msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
836 | sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
837 | |||
838 | sock->sk->sk_allocation = GFP_NOIO; | ||
839 | msock->sk->sk_allocation = GFP_NOIO; | ||
840 | |||
841 | sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | ||
842 | msock->sk->sk_priority = TC_PRIO_INTERACTIVE; | ||
843 | |||
844 | if (mdev->net_conf->sndbuf_size) { | ||
845 | sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; | ||
846 | sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; | ||
847 | } | ||
848 | |||
849 | if (mdev->net_conf->rcvbuf_size) { | ||
850 | sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; | ||
851 | sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; | ||
852 | } | ||
853 | |||
854 | /* NOT YET ... | ||
855 | * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
856 | * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | ||
857 | * first set it to the P_HAND_SHAKE timeout, | ||
858 | * which we set to 4x the configured ping_timeout. */ | ||
859 | sock->sk->sk_sndtimeo = | ||
860 | sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; | ||
861 | |||
862 | msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
863 | msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
864 | |||
865 | /* we don't want delays. | ||
866 | * we use TCP_CORK where apropriate, though */ | ||
867 | drbd_tcp_nodelay(sock); | ||
868 | drbd_tcp_nodelay(msock); | ||
869 | |||
870 | mdev->data.socket = sock; | ||
871 | mdev->meta.socket = msock; | ||
872 | mdev->last_received = jiffies; | ||
873 | |||
874 | D_ASSERT(mdev->asender.task == NULL); | ||
875 | |||
876 | h = drbd_do_handshake(mdev); | ||
877 | if (h <= 0) | ||
878 | return h; | ||
879 | |||
880 | if (mdev->cram_hmac_tfm) { | ||
881 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ | ||
882 | if (!drbd_do_auth(mdev)) { | ||
883 | dev_err(DEV, "Authentication of peer failed\n"); | ||
884 | return -1; | ||
885 | } | ||
886 | } | ||
887 | |||
888 | if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) | ||
889 | return 0; | ||
890 | |||
891 | sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
892 | sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | ||
893 | |||
894 | atomic_set(&mdev->packet_seq, 0); | ||
895 | mdev->peer_seq = 0; | ||
896 | |||
897 | drbd_thread_start(&mdev->asender); | ||
898 | |||
899 | drbd_send_protocol(mdev); | ||
900 | drbd_send_sync_param(mdev, &mdev->sync_conf); | ||
901 | drbd_send_sizes(mdev, 0); | ||
902 | drbd_send_uuids(mdev); | ||
903 | drbd_send_state(mdev); | ||
904 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
905 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
906 | |||
907 | return 1; | ||
908 | |||
909 | out_release_sockets: | ||
910 | if (sock) | ||
911 | sock_release(sock); | ||
912 | if (msock) | ||
913 | sock_release(msock); | ||
914 | return -1; | ||
915 | } | ||
916 | |||
917 | static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) | ||
918 | { | ||
919 | int r; | ||
920 | |||
921 | r = drbd_recv(mdev, h, sizeof(*h)); | ||
922 | |||
923 | if (unlikely(r != sizeof(*h))) { | ||
924 | dev_err(DEV, "short read expecting header on sock: r=%d\n", r); | ||
925 | return FALSE; | ||
926 | }; | ||
927 | h->command = be16_to_cpu(h->command); | ||
928 | h->length = be16_to_cpu(h->length); | ||
929 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | ||
930 | dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", | ||
931 | (long)be32_to_cpu(h->magic), | ||
932 | h->command, h->length); | ||
933 | return FALSE; | ||
934 | } | ||
935 | mdev->last_received = jiffies; | ||
936 | |||
937 | return TRUE; | ||
938 | } | ||
939 | |||
940 | static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) | ||
941 | { | ||
942 | int rv; | ||
943 | |||
944 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | ||
945 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); | ||
946 | if (rv) { | ||
947 | dev_err(DEV, "local disk flush failed with status %d\n", rv); | ||
948 | /* would rather check on EOPNOTSUPP, but that is not reliable. | ||
949 | * don't try again for ANY return value != 0 | ||
950 | * if (rv == -EOPNOTSUPP) */ | ||
951 | drbd_bump_write_ordering(mdev, WO_drain_io); | ||
952 | } | ||
953 | put_ldev(mdev); | ||
954 | } | ||
955 | |||
956 | return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); | ||
957 | } | ||
958 | |||
959 | static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
960 | { | ||
961 | struct flush_work *fw = (struct flush_work *)w; | ||
962 | struct drbd_epoch *epoch = fw->epoch; | ||
963 | |||
964 | kfree(w); | ||
965 | |||
966 | if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) | ||
967 | drbd_flush_after_epoch(mdev, epoch); | ||
968 | |||
969 | drbd_may_finish_epoch(mdev, epoch, EV_PUT | | ||
970 | (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); | ||
971 | |||
972 | return 1; | ||
973 | } | ||
974 | |||
975 | /** | ||
976 | * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. | ||
977 | * @mdev: DRBD device. | ||
978 | * @epoch: Epoch object. | ||
979 | * @ev: Epoch event. | ||
980 | */ | ||
981 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | ||
982 | struct drbd_epoch *epoch, | ||
983 | enum epoch_event ev) | ||
984 | { | ||
985 | int finish, epoch_size; | ||
986 | struct drbd_epoch *next_epoch; | ||
987 | int schedule_flush = 0; | ||
988 | enum finish_epoch rv = FE_STILL_LIVE; | ||
989 | |||
990 | spin_lock(&mdev->epoch_lock); | ||
991 | do { | ||
992 | next_epoch = NULL; | ||
993 | finish = 0; | ||
994 | |||
995 | epoch_size = atomic_read(&epoch->epoch_size); | ||
996 | |||
997 | switch (ev & ~EV_CLEANUP) { | ||
998 | case EV_PUT: | ||
999 | atomic_dec(&epoch->active); | ||
1000 | break; | ||
1001 | case EV_GOT_BARRIER_NR: | ||
1002 | set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); | ||
1003 | |||
1004 | /* Special case: If we just switched from WO_bio_barrier to | ||
1005 | WO_bdev_flush we should not finish the current epoch */ | ||
1006 | if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && | ||
1007 | mdev->write_ordering != WO_bio_barrier && | ||
1008 | epoch == mdev->current_epoch) | ||
1009 | clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); | ||
1010 | break; | ||
1011 | case EV_BARRIER_DONE: | ||
1012 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); | ||
1013 | break; | ||
1014 | case EV_BECAME_LAST: | ||
1015 | /* nothing to do*/ | ||
1016 | break; | ||
1017 | } | ||
1018 | |||
1019 | if (epoch_size != 0 && | ||
1020 | atomic_read(&epoch->active) == 0 && | ||
1021 | test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && | ||
1022 | epoch->list.prev == &mdev->current_epoch->list && | ||
1023 | !test_bit(DE_IS_FINISHING, &epoch->flags)) { | ||
1024 | /* Nearly all conditions are met to finish that epoch... */ | ||
1025 | if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || | ||
1026 | mdev->write_ordering == WO_none || | ||
1027 | (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || | ||
1028 | ev & EV_CLEANUP) { | ||
1029 | finish = 1; | ||
1030 | set_bit(DE_IS_FINISHING, &epoch->flags); | ||
1031 | } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && | ||
1032 | mdev->write_ordering == WO_bio_barrier) { | ||
1033 | atomic_inc(&epoch->active); | ||
1034 | schedule_flush = 1; | ||
1035 | } | ||
1036 | } | ||
1037 | if (finish) { | ||
1038 | if (!(ev & EV_CLEANUP)) { | ||
1039 | spin_unlock(&mdev->epoch_lock); | ||
1040 | drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); | ||
1041 | spin_lock(&mdev->epoch_lock); | ||
1042 | } | ||
1043 | dec_unacked(mdev); | ||
1044 | |||
1045 | if (mdev->current_epoch != epoch) { | ||
1046 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); | ||
1047 | list_del(&epoch->list); | ||
1048 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); | ||
1049 | mdev->epochs--; | ||
1050 | kfree(epoch); | ||
1051 | |||
1052 | if (rv == FE_STILL_LIVE) | ||
1053 | rv = FE_DESTROYED; | ||
1054 | } else { | ||
1055 | epoch->flags = 0; | ||
1056 | atomic_set(&epoch->epoch_size, 0); | ||
1057 | /* atomic_set(&epoch->active, 0); is alrady zero */ | ||
1058 | if (rv == FE_STILL_LIVE) | ||
1059 | rv = FE_RECYCLED; | ||
1060 | } | ||
1061 | } | ||
1062 | |||
1063 | if (!next_epoch) | ||
1064 | break; | ||
1065 | |||
1066 | epoch = next_epoch; | ||
1067 | } while (1); | ||
1068 | |||
1069 | spin_unlock(&mdev->epoch_lock); | ||
1070 | |||
1071 | if (schedule_flush) { | ||
1072 | struct flush_work *fw; | ||
1073 | fw = kmalloc(sizeof(*fw), GFP_ATOMIC); | ||
1074 | if (fw) { | ||
1075 | fw->w.cb = w_flush; | ||
1076 | fw->epoch = epoch; | ||
1077 | drbd_queue_work(&mdev->data.work, &fw->w); | ||
1078 | } else { | ||
1079 | dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); | ||
1080 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1081 | /* That is not a recursion, only one level */ | ||
1082 | drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); | ||
1083 | drbd_may_finish_epoch(mdev, epoch, EV_PUT); | ||
1084 | } | ||
1085 | } | ||
1086 | |||
1087 | return rv; | ||
1088 | } | ||
1089 | |||
1090 | /** | ||
1091 | * drbd_bump_write_ordering() - Fall back to an other write ordering method | ||
1092 | * @mdev: DRBD device. | ||
1093 | * @wo: Write ordering method to try. | ||
1094 | */ | ||
1095 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) | ||
1096 | { | ||
1097 | enum write_ordering_e pwo; | ||
1098 | static char *write_ordering_str[] = { | ||
1099 | [WO_none] = "none", | ||
1100 | [WO_drain_io] = "drain", | ||
1101 | [WO_bdev_flush] = "flush", | ||
1102 | [WO_bio_barrier] = "barrier", | ||
1103 | }; | ||
1104 | |||
1105 | pwo = mdev->write_ordering; | ||
1106 | wo = min(pwo, wo); | ||
1107 | if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) | ||
1108 | wo = WO_bdev_flush; | ||
1109 | if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) | ||
1110 | wo = WO_drain_io; | ||
1111 | if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) | ||
1112 | wo = WO_none; | ||
1113 | mdev->write_ordering = wo; | ||
1114 | if (pwo != mdev->write_ordering || wo == WO_bio_barrier) | ||
1115 | dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); | ||
1116 | } | ||
1117 | |||
1118 | /** | ||
1119 | * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set | ||
1120 | * @mdev: DRBD device. | ||
1121 | * @w: work object. | ||
1122 | * @cancel: The connection will be closed anyways (unused in this callback) | ||
1123 | */ | ||
1124 | int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) | ||
1125 | { | ||
1126 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1127 | struct bio *bio = e->private_bio; | ||
1128 | |||
1129 | /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, | ||
1130 | (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) | ||
1131 | so that we can finish that epoch in drbd_may_finish_epoch(). | ||
1132 | That is necessary if we already have a long chain of Epochs, before | ||
1133 | we realize that BIO_RW_BARRIER is actually not supported */ | ||
1134 | |||
1135 | /* As long as the -ENOTSUPP on the barrier is reported immediately | ||
1136 | that will never trigger. If it is reported late, we will just | ||
1137 | print that warning and continue correctly for all future requests | ||
1138 | with WO_bdev_flush */ | ||
1139 | if (previous_epoch(mdev, e->epoch)) | ||
1140 | dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); | ||
1141 | |||
1142 | /* prepare bio for re-submit, | ||
1143 | * re-init volatile members */ | ||
1144 | /* we still have a local reference, | ||
1145 | * get_ldev was done in receive_Data. */ | ||
1146 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1147 | bio->bi_sector = e->sector; | ||
1148 | bio->bi_size = e->size; | ||
1149 | bio->bi_idx = 0; | ||
1150 | |||
1151 | bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1152 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1153 | |||
1154 | /* don't know whether this is necessary: */ | ||
1155 | bio->bi_phys_segments = 0; | ||
1156 | bio->bi_next = NULL; | ||
1157 | |||
1158 | /* these should be unchanged: */ | ||
1159 | /* bio->bi_end_io = drbd_endio_write_sec; */ | ||
1160 | /* bio->bi_vcnt = whatever; */ | ||
1161 | |||
1162 | e->w.cb = e_end_block; | ||
1163 | |||
1164 | /* This is no longer a barrier request. */ | ||
1165 | bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); | ||
1166 | |||
1167 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); | ||
1168 | |||
1169 | return 1; | ||
1170 | } | ||
1171 | |||
1172 | static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) | ||
1173 | { | ||
1174 | int rv, issue_flush; | ||
1175 | struct p_barrier *p = (struct p_barrier *)h; | ||
1176 | struct drbd_epoch *epoch; | ||
1177 | |||
1178 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
1179 | |||
1180 | rv = drbd_recv(mdev, h->payload, h->length); | ||
1181 | ERR_IF(rv != h->length) return FALSE; | ||
1182 | |||
1183 | inc_unacked(mdev); | ||
1184 | |||
1185 | if (mdev->net_conf->wire_protocol != DRBD_PROT_C) | ||
1186 | drbd_kick_lo(mdev); | ||
1187 | |||
1188 | mdev->current_epoch->barrier_nr = p->barrier; | ||
1189 | rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); | ||
1190 | |||
1191 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from | ||
1192 | * the activity log, which means it would not be resynced in case the | ||
1193 | * R_PRIMARY crashes now. | ||
1194 | * Therefore we must send the barrier_ack after the barrier request was | ||
1195 | * completed. */ | ||
1196 | switch (mdev->write_ordering) { | ||
1197 | case WO_bio_barrier: | ||
1198 | case WO_none: | ||
1199 | if (rv == FE_RECYCLED) | ||
1200 | return TRUE; | ||
1201 | break; | ||
1202 | |||
1203 | case WO_bdev_flush: | ||
1204 | case WO_drain_io: | ||
1205 | D_ASSERT(rv == FE_STILL_LIVE); | ||
1206 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); | ||
1207 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1208 | rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); | ||
1209 | if (rv == FE_RECYCLED) | ||
1210 | return TRUE; | ||
1211 | |||
1212 | /* The asender will send all the ACKs and barrier ACKs out, since | ||
1213 | all EEs moved from the active_ee to the done_ee. We need to | ||
1214 | provide a new epoch object for the EEs that come in soon */ | ||
1215 | break; | ||
1216 | } | ||
1217 | |||
1218 | /* receiver context, in the writeout path of the other node. | ||
1219 | * avoid potential distributed deadlock */ | ||
1220 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); | ||
1221 | if (!epoch) { | ||
1222 | dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); | ||
1223 | issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1224 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1225 | if (issue_flush) { | ||
1226 | rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); | ||
1227 | if (rv == FE_RECYCLED) | ||
1228 | return TRUE; | ||
1229 | } | ||
1230 | |||
1231 | drbd_wait_ee_list_empty(mdev, &mdev->done_ee); | ||
1232 | |||
1233 | return TRUE; | ||
1234 | } | ||
1235 | |||
1236 | epoch->flags = 0; | ||
1237 | atomic_set(&epoch->epoch_size, 0); | ||
1238 | atomic_set(&epoch->active, 0); | ||
1239 | |||
1240 | spin_lock(&mdev->epoch_lock); | ||
1241 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | ||
1242 | list_add(&epoch->list, &mdev->current_epoch->list); | ||
1243 | mdev->current_epoch = epoch; | ||
1244 | mdev->epochs++; | ||
1245 | } else { | ||
1246 | /* The current_epoch got recycled while we allocated this one... */ | ||
1247 | kfree(epoch); | ||
1248 | } | ||
1249 | spin_unlock(&mdev->epoch_lock); | ||
1250 | |||
1251 | return TRUE; | ||
1252 | } | ||
1253 | |||
1254 | /* used from receive_RSDataReply (recv_resync_read) | ||
1255 | * and from receive_Data */ | ||
1256 | static struct drbd_epoch_entry * | ||
1257 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | ||
1258 | { | ||
1259 | struct drbd_epoch_entry *e; | ||
1260 | struct bio_vec *bvec; | ||
1261 | struct page *page; | ||
1262 | struct bio *bio; | ||
1263 | int dgs, ds, i, rr; | ||
1264 | void *dig_in = mdev->int_dig_in; | ||
1265 | void *dig_vv = mdev->int_dig_vv; | ||
1266 | |||
1267 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1268 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1269 | |||
1270 | if (dgs) { | ||
1271 | rr = drbd_recv(mdev, dig_in, dgs); | ||
1272 | if (rr != dgs) { | ||
1273 | dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", | ||
1274 | rr, dgs); | ||
1275 | return NULL; | ||
1276 | } | ||
1277 | } | ||
1278 | |||
1279 | data_size -= dgs; | ||
1280 | |||
1281 | ERR_IF(data_size & 0x1ff) return NULL; | ||
1282 | ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; | ||
1283 | |||
1284 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | ||
1285 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
1286 | * which in turn might block on the other node at this very place. */ | ||
1287 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | ||
1288 | if (!e) | ||
1289 | return NULL; | ||
1290 | bio = e->private_bio; | ||
1291 | ds = data_size; | ||
1292 | bio_for_each_segment(bvec, bio, i) { | ||
1293 | page = bvec->bv_page; | ||
1294 | rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); | ||
1295 | kunmap(page); | ||
1296 | if (rr != min_t(int, ds, PAGE_SIZE)) { | ||
1297 | drbd_free_ee(mdev, e); | ||
1298 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1299 | rr, min_t(int, ds, PAGE_SIZE)); | ||
1300 | return NULL; | ||
1301 | } | ||
1302 | ds -= rr; | ||
1303 | } | ||
1304 | |||
1305 | if (dgs) { | ||
1306 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | ||
1307 | if (memcmp(dig_in, dig_vv, dgs)) { | ||
1308 | dev_err(DEV, "Digest integrity check FAILED.\n"); | ||
1309 | drbd_bcast_ee(mdev, "digest failed", | ||
1310 | dgs, dig_in, dig_vv, e); | ||
1311 | drbd_free_ee(mdev, e); | ||
1312 | return NULL; | ||
1313 | } | ||
1314 | } | ||
1315 | mdev->recv_cnt += data_size>>9; | ||
1316 | return e; | ||
1317 | } | ||
1318 | |||
1319 | /* drbd_drain_block() just takes a data block | ||
1320 | * out of the socket input buffer, and discards it. | ||
1321 | */ | ||
1322 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | ||
1323 | { | ||
1324 | struct page *page; | ||
1325 | int rr, rv = 1; | ||
1326 | void *data; | ||
1327 | |||
1328 | page = drbd_pp_alloc(mdev, 1); | ||
1329 | |||
1330 | data = kmap(page); | ||
1331 | while (data_size) { | ||
1332 | rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); | ||
1333 | if (rr != min_t(int, data_size, PAGE_SIZE)) { | ||
1334 | rv = 0; | ||
1335 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1336 | rr, min_t(int, data_size, PAGE_SIZE)); | ||
1337 | break; | ||
1338 | } | ||
1339 | data_size -= rr; | ||
1340 | } | ||
1341 | kunmap(page); | ||
1342 | drbd_pp_free(mdev, page); | ||
1343 | return rv; | ||
1344 | } | ||
1345 | |||
1346 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | ||
1347 | sector_t sector, int data_size) | ||
1348 | { | ||
1349 | struct bio_vec *bvec; | ||
1350 | struct bio *bio; | ||
1351 | int dgs, rr, i, expect; | ||
1352 | void *dig_in = mdev->int_dig_in; | ||
1353 | void *dig_vv = mdev->int_dig_vv; | ||
1354 | |||
1355 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1356 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1357 | |||
1358 | if (dgs) { | ||
1359 | rr = drbd_recv(mdev, dig_in, dgs); | ||
1360 | if (rr != dgs) { | ||
1361 | dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", | ||
1362 | rr, dgs); | ||
1363 | return 0; | ||
1364 | } | ||
1365 | } | ||
1366 | |||
1367 | data_size -= dgs; | ||
1368 | |||
1369 | /* optimistically update recv_cnt. if receiving fails below, | ||
1370 | * we disconnect anyways, and counters will be reset. */ | ||
1371 | mdev->recv_cnt += data_size>>9; | ||
1372 | |||
1373 | bio = req->master_bio; | ||
1374 | D_ASSERT(sector == bio->bi_sector); | ||
1375 | |||
1376 | bio_for_each_segment(bvec, bio, i) { | ||
1377 | expect = min_t(int, data_size, bvec->bv_len); | ||
1378 | rr = drbd_recv(mdev, | ||
1379 | kmap(bvec->bv_page)+bvec->bv_offset, | ||
1380 | expect); | ||
1381 | kunmap(bvec->bv_page); | ||
1382 | if (rr != expect) { | ||
1383 | dev_warn(DEV, "short read receiving data reply: " | ||
1384 | "read %d expected %d\n", | ||
1385 | rr, expect); | ||
1386 | return 0; | ||
1387 | } | ||
1388 | data_size -= rr; | ||
1389 | } | ||
1390 | |||
1391 | if (dgs) { | ||
1392 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | ||
1393 | if (memcmp(dig_in, dig_vv, dgs)) { | ||
1394 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | ||
1395 | return 0; | ||
1396 | } | ||
1397 | } | ||
1398 | |||
1399 | D_ASSERT(data_size == 0); | ||
1400 | return 1; | ||
1401 | } | ||
1402 | |||
1403 | /* e_end_resync_block() is called via | ||
1404 | * drbd_process_done_ee() by asender only */ | ||
1405 | static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1406 | { | ||
1407 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1408 | sector_t sector = e->sector; | ||
1409 | int ok; | ||
1410 | |||
1411 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
1412 | |||
1413 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1414 | drbd_set_in_sync(mdev, sector, e->size); | ||
1415 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | ||
1416 | } else { | ||
1417 | /* Record failure to sync */ | ||
1418 | drbd_rs_failed_io(mdev, sector, e->size); | ||
1419 | |||
1420 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | ||
1421 | } | ||
1422 | dec_unacked(mdev); | ||
1423 | |||
1424 | return ok; | ||
1425 | } | ||
1426 | |||
1427 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) | ||
1428 | { | ||
1429 | struct drbd_epoch_entry *e; | ||
1430 | |||
1431 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | ||
1432 | if (!e) { | ||
1433 | put_ldev(mdev); | ||
1434 | return FALSE; | ||
1435 | } | ||
1436 | |||
1437 | dec_rs_pending(mdev); | ||
1438 | |||
1439 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1440 | e->private_bio->bi_rw = WRITE; | ||
1441 | e->w.cb = e_end_resync_block; | ||
1442 | |||
1443 | inc_unacked(mdev); | ||
1444 | /* corresponding dec_unacked() in e_end_resync_block() | ||
1445 | * respective _drbd_clear_done_ee */ | ||
1446 | |||
1447 | spin_lock_irq(&mdev->req_lock); | ||
1448 | list_add(&e->w.list, &mdev->sync_ee); | ||
1449 | spin_unlock_irq(&mdev->req_lock); | ||
1450 | |||
1451 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); | ||
1452 | /* accounting done in endio */ | ||
1453 | |||
1454 | maybe_kick_lo(mdev); | ||
1455 | return TRUE; | ||
1456 | } | ||
1457 | |||
1458 | static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) | ||
1459 | { | ||
1460 | struct drbd_request *req; | ||
1461 | sector_t sector; | ||
1462 | unsigned int header_size, data_size; | ||
1463 | int ok; | ||
1464 | struct p_data *p = (struct p_data *)h; | ||
1465 | |||
1466 | header_size = sizeof(*p) - sizeof(*h); | ||
1467 | data_size = h->length - header_size; | ||
1468 | |||
1469 | ERR_IF(data_size == 0) return FALSE; | ||
1470 | |||
1471 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1472 | return FALSE; | ||
1473 | |||
1474 | sector = be64_to_cpu(p->sector); | ||
1475 | |||
1476 | spin_lock_irq(&mdev->req_lock); | ||
1477 | req = _ar_id_to_req(mdev, p->block_id, sector); | ||
1478 | spin_unlock_irq(&mdev->req_lock); | ||
1479 | if (unlikely(!req)) { | ||
1480 | dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); | ||
1481 | return FALSE; | ||
1482 | } | ||
1483 | |||
1484 | /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid | ||
1485 | * special casing it there for the various failure cases. | ||
1486 | * still no race with drbd_fail_pending_reads */ | ||
1487 | ok = recv_dless_read(mdev, req, sector, data_size); | ||
1488 | |||
1489 | if (ok) | ||
1490 | req_mod(req, data_received); | ||
1491 | /* else: nothing. handled from drbd_disconnect... | ||
1492 | * I don't think we may complete this just yet | ||
1493 | * in case we are "on-disconnect: freeze" */ | ||
1494 | |||
1495 | return ok; | ||
1496 | } | ||
1497 | |||
1498 | static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) | ||
1499 | { | ||
1500 | sector_t sector; | ||
1501 | unsigned int header_size, data_size; | ||
1502 | int ok; | ||
1503 | struct p_data *p = (struct p_data *)h; | ||
1504 | |||
1505 | header_size = sizeof(*p) - sizeof(*h); | ||
1506 | data_size = h->length - header_size; | ||
1507 | |||
1508 | ERR_IF(data_size == 0) return FALSE; | ||
1509 | |||
1510 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1511 | return FALSE; | ||
1512 | |||
1513 | sector = be64_to_cpu(p->sector); | ||
1514 | D_ASSERT(p->block_id == ID_SYNCER); | ||
1515 | |||
1516 | if (get_ldev(mdev)) { | ||
1517 | /* data is submitted to disk within recv_resync_read. | ||
1518 | * corresponding put_ldev done below on error, | ||
1519 | * or in drbd_endio_write_sec. */ | ||
1520 | ok = recv_resync_read(mdev, sector, data_size); | ||
1521 | } else { | ||
1522 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1523 | dev_err(DEV, "Can not write resync data to local disk.\n"); | ||
1524 | |||
1525 | ok = drbd_drain_block(mdev, data_size); | ||
1526 | |||
1527 | drbd_send_ack_dp(mdev, P_NEG_ACK, p); | ||
1528 | } | ||
1529 | |||
1530 | return ok; | ||
1531 | } | ||
1532 | |||
1533 | /* e_end_block() is called via drbd_process_done_ee(). | ||
1534 | * this means this function only runs in the asender thread | ||
1535 | */ | ||
1536 | static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1537 | { | ||
1538 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1539 | sector_t sector = e->sector; | ||
1540 | struct drbd_epoch *epoch; | ||
1541 | int ok = 1, pcmd; | ||
1542 | |||
1543 | if (e->flags & EE_IS_BARRIER) { | ||
1544 | epoch = previous_epoch(mdev, e->epoch); | ||
1545 | if (epoch) | ||
1546 | drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); | ||
1547 | } | ||
1548 | |||
1549 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | ||
1550 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1551 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | ||
1552 | mdev->state.conn <= C_PAUSED_SYNC_T && | ||
1553 | e->flags & EE_MAY_SET_IN_SYNC) ? | ||
1554 | P_RS_WRITE_ACK : P_WRITE_ACK; | ||
1555 | ok &= drbd_send_ack(mdev, pcmd, e); | ||
1556 | if (pcmd == P_RS_WRITE_ACK) | ||
1557 | drbd_set_in_sync(mdev, sector, e->size); | ||
1558 | } else { | ||
1559 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | ||
1560 | /* we expect it to be marked out of sync anyways... | ||
1561 | * maybe assert this? */ | ||
1562 | } | ||
1563 | dec_unacked(mdev); | ||
1564 | } | ||
1565 | /* we delete from the conflict detection hash _after_ we sent out the | ||
1566 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ | ||
1567 | if (mdev->net_conf->two_primaries) { | ||
1568 | spin_lock_irq(&mdev->req_lock); | ||
1569 | D_ASSERT(!hlist_unhashed(&e->colision)); | ||
1570 | hlist_del_init(&e->colision); | ||
1571 | spin_unlock_irq(&mdev->req_lock); | ||
1572 | } else { | ||
1573 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
1574 | } | ||
1575 | |||
1576 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); | ||
1577 | |||
1578 | return ok; | ||
1579 | } | ||
1580 | |||
1581 | static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1582 | { | ||
1583 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1584 | int ok = 1; | ||
1585 | |||
1586 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1587 | ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); | ||
1588 | |||
1589 | spin_lock_irq(&mdev->req_lock); | ||
1590 | D_ASSERT(!hlist_unhashed(&e->colision)); | ||
1591 | hlist_del_init(&e->colision); | ||
1592 | spin_unlock_irq(&mdev->req_lock); | ||
1593 | |||
1594 | dec_unacked(mdev); | ||
1595 | |||
1596 | return ok; | ||
1597 | } | ||
1598 | |||
1599 | /* Called from receive_Data. | ||
1600 | * Synchronize packets on sock with packets on msock. | ||
1601 | * | ||
1602 | * This is here so even when a P_DATA packet traveling via sock overtook an Ack | ||
1603 | * packet traveling on msock, they are still processed in the order they have | ||
1604 | * been sent. | ||
1605 | * | ||
1606 | * Note: we don't care for Ack packets overtaking P_DATA packets. | ||
1607 | * | ||
1608 | * In case packet_seq is larger than mdev->peer_seq number, there are | ||
1609 | * outstanding packets on the msock. We wait for them to arrive. | ||
1610 | * In case we are the logically next packet, we update mdev->peer_seq | ||
1611 | * ourselves. Correctly handles 32bit wrap around. | ||
1612 | * | ||
1613 | * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, | ||
1614 | * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds | ||
1615 | * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have | ||
1616 | * 1<<9 == 512 seconds aka ages for the 32bit wrap around... | ||
1617 | * | ||
1618 | * returns 0 if we may process the packet, | ||
1619 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ | ||
1620 | static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) | ||
1621 | { | ||
1622 | DEFINE_WAIT(wait); | ||
1623 | unsigned int p_seq; | ||
1624 | long timeout; | ||
1625 | int ret = 0; | ||
1626 | spin_lock(&mdev->peer_seq_lock); | ||
1627 | for (;;) { | ||
1628 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | ||
1629 | if (seq_le(packet_seq, mdev->peer_seq+1)) | ||
1630 | break; | ||
1631 | if (signal_pending(current)) { | ||
1632 | ret = -ERESTARTSYS; | ||
1633 | break; | ||
1634 | } | ||
1635 | p_seq = mdev->peer_seq; | ||
1636 | spin_unlock(&mdev->peer_seq_lock); | ||
1637 | timeout = schedule_timeout(30*HZ); | ||
1638 | spin_lock(&mdev->peer_seq_lock); | ||
1639 | if (timeout == 0 && p_seq == mdev->peer_seq) { | ||
1640 | ret = -ETIMEDOUT; | ||
1641 | dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); | ||
1642 | break; | ||
1643 | } | ||
1644 | } | ||
1645 | finish_wait(&mdev->seq_wait, &wait); | ||
1646 | if (mdev->peer_seq+1 == packet_seq) | ||
1647 | mdev->peer_seq++; | ||
1648 | spin_unlock(&mdev->peer_seq_lock); | ||
1649 | return ret; | ||
1650 | } | ||
1651 | |||
1652 | /* mirrored write */ | ||
1653 | static int receive_Data(struct drbd_conf *mdev, struct p_header *h) | ||
1654 | { | ||
1655 | sector_t sector; | ||
1656 | struct drbd_epoch_entry *e; | ||
1657 | struct p_data *p = (struct p_data *)h; | ||
1658 | int header_size, data_size; | ||
1659 | int rw = WRITE; | ||
1660 | u32 dp_flags; | ||
1661 | |||
1662 | header_size = sizeof(*p) - sizeof(*h); | ||
1663 | data_size = h->length - header_size; | ||
1664 | |||
1665 | ERR_IF(data_size == 0) return FALSE; | ||
1666 | |||
1667 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1668 | return FALSE; | ||
1669 | |||
1670 | if (!get_ldev(mdev)) { | ||
1671 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1672 | dev_err(DEV, "Can not write mirrored data block " | ||
1673 | "to local disk.\n"); | ||
1674 | spin_lock(&mdev->peer_seq_lock); | ||
1675 | if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) | ||
1676 | mdev->peer_seq++; | ||
1677 | spin_unlock(&mdev->peer_seq_lock); | ||
1678 | |||
1679 | drbd_send_ack_dp(mdev, P_NEG_ACK, p); | ||
1680 | atomic_inc(&mdev->current_epoch->epoch_size); | ||
1681 | return drbd_drain_block(mdev, data_size); | ||
1682 | } | ||
1683 | |||
1684 | /* get_ldev(mdev) successful. | ||
1685 | * Corresponding put_ldev done either below (on various errors), | ||
1686 | * or in drbd_endio_write_sec, if we successfully submit the data at | ||
1687 | * the end of this function. */ | ||
1688 | |||
1689 | sector = be64_to_cpu(p->sector); | ||
1690 | e = read_in_block(mdev, p->block_id, sector, data_size); | ||
1691 | if (!e) { | ||
1692 | put_ldev(mdev); | ||
1693 | return FALSE; | ||
1694 | } | ||
1695 | |||
1696 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1697 | e->w.cb = e_end_block; | ||
1698 | |||
1699 | spin_lock(&mdev->epoch_lock); | ||
1700 | e->epoch = mdev->current_epoch; | ||
1701 | atomic_inc(&e->epoch->epoch_size); | ||
1702 | atomic_inc(&e->epoch->active); | ||
1703 | |||
1704 | if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { | ||
1705 | struct drbd_epoch *epoch; | ||
1706 | /* Issue a barrier if we start a new epoch, and the previous epoch | ||
1707 | was not a epoch containing a single request which already was | ||
1708 | a Barrier. */ | ||
1709 | epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); | ||
1710 | if (epoch == e->epoch) { | ||
1711 | set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); | ||
1712 | rw |= (1<<BIO_RW_BARRIER); | ||
1713 | e->flags |= EE_IS_BARRIER; | ||
1714 | } else { | ||
1715 | if (atomic_read(&epoch->epoch_size) > 1 || | ||
1716 | !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { | ||
1717 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1718 | set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); | ||
1719 | rw |= (1<<BIO_RW_BARRIER); | ||
1720 | e->flags |= EE_IS_BARRIER; | ||
1721 | } | ||
1722 | } | ||
1723 | } | ||
1724 | spin_unlock(&mdev->epoch_lock); | ||
1725 | |||
1726 | dp_flags = be32_to_cpu(p->dp_flags); | ||
1727 | if (dp_flags & DP_HARDBARRIER) { | ||
1728 | dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); | ||
1729 | /* rw |= (1<<BIO_RW_BARRIER); */ | ||
1730 | } | ||
1731 | if (dp_flags & DP_RW_SYNC) | ||
1732 | rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); | ||
1733 | if (dp_flags & DP_MAY_SET_IN_SYNC) | ||
1734 | e->flags |= EE_MAY_SET_IN_SYNC; | ||
1735 | |||
1736 | /* I'm the receiver, I do hold a net_cnt reference. */ | ||
1737 | if (!mdev->net_conf->two_primaries) { | ||
1738 | spin_lock_irq(&mdev->req_lock); | ||
1739 | } else { | ||
1740 | /* don't get the req_lock yet, | ||
1741 | * we may sleep in drbd_wait_peer_seq */ | ||
1742 | const int size = e->size; | ||
1743 | const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1744 | DEFINE_WAIT(wait); | ||
1745 | struct drbd_request *i; | ||
1746 | struct hlist_node *n; | ||
1747 | struct hlist_head *slot; | ||
1748 | int first; | ||
1749 | |||
1750 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1751 | BUG_ON(mdev->ee_hash == NULL); | ||
1752 | BUG_ON(mdev->tl_hash == NULL); | ||
1753 | |||
1754 | /* conflict detection and handling: | ||
1755 | * 1. wait on the sequence number, | ||
1756 | * in case this data packet overtook ACK packets. | ||
1757 | * 2. check our hash tables for conflicting requests. | ||
1758 | * we only need to walk the tl_hash, since an ee can not | ||
1759 | * have a conflict with an other ee: on the submitting | ||
1760 | * node, the corresponding req had already been conflicting, | ||
1761 | * and a conflicting req is never sent. | ||
1762 | * | ||
1763 | * Note: for two_primaries, we are protocol C, | ||
1764 | * so there cannot be any request that is DONE | ||
1765 | * but still on the transfer log. | ||
1766 | * | ||
1767 | * unconditionally add to the ee_hash. | ||
1768 | * | ||
1769 | * if no conflicting request is found: | ||
1770 | * submit. | ||
1771 | * | ||
1772 | * if any conflicting request is found | ||
1773 | * that has not yet been acked, | ||
1774 | * AND I have the "discard concurrent writes" flag: | ||
1775 | * queue (via done_ee) the P_DISCARD_ACK; OUT. | ||
1776 | * | ||
1777 | * if any conflicting request is found: | ||
1778 | * block the receiver, waiting on misc_wait | ||
1779 | * until no more conflicting requests are there, | ||
1780 | * or we get interrupted (disconnect). | ||
1781 | * | ||
1782 | * we do not just write after local io completion of those | ||
1783 | * requests, but only after req is done completely, i.e. | ||
1784 | * we wait for the P_DISCARD_ACK to arrive! | ||
1785 | * | ||
1786 | * then proceed normally, i.e. submit. | ||
1787 | */ | ||
1788 | if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) | ||
1789 | goto out_interrupted; | ||
1790 | |||
1791 | spin_lock_irq(&mdev->req_lock); | ||
1792 | |||
1793 | hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); | ||
1794 | |||
1795 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
1796 | slot = tl_hash_slot(mdev, sector); | ||
1797 | first = 1; | ||
1798 | for (;;) { | ||
1799 | int have_unacked = 0; | ||
1800 | int have_conflict = 0; | ||
1801 | prepare_to_wait(&mdev->misc_wait, &wait, | ||
1802 | TASK_INTERRUPTIBLE); | ||
1803 | hlist_for_each_entry(i, n, slot, colision) { | ||
1804 | if (OVERLAPS) { | ||
1805 | /* only ALERT on first iteration, | ||
1806 | * we may be woken up early... */ | ||
1807 | if (first) | ||
1808 | dev_alert(DEV, "%s[%u] Concurrent local write detected!" | ||
1809 | " new: %llus +%u; pending: %llus +%u\n", | ||
1810 | current->comm, current->pid, | ||
1811 | (unsigned long long)sector, size, | ||
1812 | (unsigned long long)i->sector, i->size); | ||
1813 | if (i->rq_state & RQ_NET_PENDING) | ||
1814 | ++have_unacked; | ||
1815 | ++have_conflict; | ||
1816 | } | ||
1817 | } | ||
1818 | #undef OVERLAPS | ||
1819 | if (!have_conflict) | ||
1820 | break; | ||
1821 | |||
1822 | /* Discard Ack only for the _first_ iteration */ | ||
1823 | if (first && discard && have_unacked) { | ||
1824 | dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", | ||
1825 | (unsigned long long)sector); | ||
1826 | inc_unacked(mdev); | ||
1827 | e->w.cb = e_send_discard_ack; | ||
1828 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
1829 | |||
1830 | spin_unlock_irq(&mdev->req_lock); | ||
1831 | |||
1832 | /* we could probably send that P_DISCARD_ACK ourselves, | ||
1833 | * but I don't like the receiver using the msock */ | ||
1834 | |||
1835 | put_ldev(mdev); | ||
1836 | wake_asender(mdev); | ||
1837 | finish_wait(&mdev->misc_wait, &wait); | ||
1838 | return TRUE; | ||
1839 | } | ||
1840 | |||
1841 | if (signal_pending(current)) { | ||
1842 | hlist_del_init(&e->colision); | ||
1843 | |||
1844 | spin_unlock_irq(&mdev->req_lock); | ||
1845 | |||
1846 | finish_wait(&mdev->misc_wait, &wait); | ||
1847 | goto out_interrupted; | ||
1848 | } | ||
1849 | |||
1850 | spin_unlock_irq(&mdev->req_lock); | ||
1851 | if (first) { | ||
1852 | first = 0; | ||
1853 | dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " | ||
1854 | "sec=%llus\n", (unsigned long long)sector); | ||
1855 | } else if (discard) { | ||
1856 | /* we had none on the first iteration. | ||
1857 | * there must be none now. */ | ||
1858 | D_ASSERT(have_unacked == 0); | ||
1859 | } | ||
1860 | schedule(); | ||
1861 | spin_lock_irq(&mdev->req_lock); | ||
1862 | } | ||
1863 | finish_wait(&mdev->misc_wait, &wait); | ||
1864 | } | ||
1865 | |||
1866 | list_add(&e->w.list, &mdev->active_ee); | ||
1867 | spin_unlock_irq(&mdev->req_lock); | ||
1868 | |||
1869 | switch (mdev->net_conf->wire_protocol) { | ||
1870 | case DRBD_PROT_C: | ||
1871 | inc_unacked(mdev); | ||
1872 | /* corresponding dec_unacked() in e_end_block() | ||
1873 | * respective _drbd_clear_done_ee */ | ||
1874 | break; | ||
1875 | case DRBD_PROT_B: | ||
1876 | /* I really don't like it that the receiver thread | ||
1877 | * sends on the msock, but anyways */ | ||
1878 | drbd_send_ack(mdev, P_RECV_ACK, e); | ||
1879 | break; | ||
1880 | case DRBD_PROT_A: | ||
1881 | /* nothing to do */ | ||
1882 | break; | ||
1883 | } | ||
1884 | |||
1885 | if (mdev->state.pdsk == D_DISKLESS) { | ||
1886 | /* In case we have the only disk of the cluster, */ | ||
1887 | drbd_set_out_of_sync(mdev, e->sector, e->size); | ||
1888 | e->flags |= EE_CALL_AL_COMPLETE_IO; | ||
1889 | drbd_al_begin_io(mdev, e->sector); | ||
1890 | } | ||
1891 | |||
1892 | e->private_bio->bi_rw = rw; | ||
1893 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); | ||
1894 | /* accounting done in endio */ | ||
1895 | |||
1896 | maybe_kick_lo(mdev); | ||
1897 | return TRUE; | ||
1898 | |||
1899 | out_interrupted: | ||
1900 | /* yes, the epoch_size now is imbalanced. | ||
1901 | * but we drop the connection anyways, so we don't have a chance to | ||
1902 | * receive a barrier... atomic_inc(&mdev->epoch_size); */ | ||
1903 | put_ldev(mdev); | ||
1904 | drbd_free_ee(mdev, e); | ||
1905 | return FALSE; | ||
1906 | } | ||
1907 | |||
1908 | static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) | ||
1909 | { | ||
1910 | sector_t sector; | ||
1911 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
1912 | struct drbd_epoch_entry *e; | ||
1913 | struct digest_info *di = NULL; | ||
1914 | int size, digest_size; | ||
1915 | unsigned int fault_type; | ||
1916 | struct p_block_req *p = | ||
1917 | (struct p_block_req *)h; | ||
1918 | const int brps = sizeof(*p)-sizeof(*h); | ||
1919 | |||
1920 | if (drbd_recv(mdev, h->payload, brps) != brps) | ||
1921 | return FALSE; | ||
1922 | |||
1923 | sector = be64_to_cpu(p->sector); | ||
1924 | size = be32_to_cpu(p->blksize); | ||
1925 | |||
1926 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1927 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | ||
1928 | (unsigned long long)sector, size); | ||
1929 | return FALSE; | ||
1930 | } | ||
1931 | if (sector + (size>>9) > capacity) { | ||
1932 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | ||
1933 | (unsigned long long)sector, size); | ||
1934 | return FALSE; | ||
1935 | } | ||
1936 | |||
1937 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { | ||
1938 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1939 | dev_err(DEV, "Can not satisfy peer's read request, " | ||
1940 | "no local data.\n"); | ||
1941 | drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : | ||
1942 | P_NEG_RS_DREPLY , p); | ||
1943 | return TRUE; | ||
1944 | } | ||
1945 | |||
1946 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | ||
1947 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
1948 | * which in turn might block on the other node at this very place. */ | ||
1949 | e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); | ||
1950 | if (!e) { | ||
1951 | put_ldev(mdev); | ||
1952 | return FALSE; | ||
1953 | } | ||
1954 | |||
1955 | e->private_bio->bi_rw = READ; | ||
1956 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
1957 | |||
1958 | switch (h->command) { | ||
1959 | case P_DATA_REQUEST: | ||
1960 | e->w.cb = w_e_end_data_req; | ||
1961 | fault_type = DRBD_FAULT_DT_RD; | ||
1962 | break; | ||
1963 | case P_RS_DATA_REQUEST: | ||
1964 | e->w.cb = w_e_end_rsdata_req; | ||
1965 | fault_type = DRBD_FAULT_RS_RD; | ||
1966 | /* Eventually this should become asynchronously. Currently it | ||
1967 | * blocks the whole receiver just to delay the reading of a | ||
1968 | * resync data block. | ||
1969 | * the drbd_work_queue mechanism is made for this... | ||
1970 | */ | ||
1971 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
1972 | /* we have been interrupted, | ||
1973 | * probably connection lost! */ | ||
1974 | D_ASSERT(signal_pending(current)); | ||
1975 | goto out_free_e; | ||
1976 | } | ||
1977 | break; | ||
1978 | |||
1979 | case P_OV_REPLY: | ||
1980 | case P_CSUM_RS_REQUEST: | ||
1981 | fault_type = DRBD_FAULT_RS_RD; | ||
1982 | digest_size = h->length - brps ; | ||
1983 | di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); | ||
1984 | if (!di) | ||
1985 | goto out_free_e; | ||
1986 | |||
1987 | di->digest_size = digest_size; | ||
1988 | di->digest = (((char *)di)+sizeof(struct digest_info)); | ||
1989 | |||
1990 | if (drbd_recv(mdev, di->digest, digest_size) != digest_size) | ||
1991 | goto out_free_e; | ||
1992 | |||
1993 | e->block_id = (u64)(unsigned long)di; | ||
1994 | if (h->command == P_CSUM_RS_REQUEST) { | ||
1995 | D_ASSERT(mdev->agreed_pro_version >= 89); | ||
1996 | e->w.cb = w_e_end_csum_rs_req; | ||
1997 | } else if (h->command == P_OV_REPLY) { | ||
1998 | e->w.cb = w_e_end_ov_reply; | ||
1999 | dec_rs_pending(mdev); | ||
2000 | break; | ||
2001 | } | ||
2002 | |||
2003 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
2004 | /* we have been interrupted, probably connection lost! */ | ||
2005 | D_ASSERT(signal_pending(current)); | ||
2006 | goto out_free_e; | ||
2007 | } | ||
2008 | break; | ||
2009 | |||
2010 | case P_OV_REQUEST: | ||
2011 | if (mdev->state.conn >= C_CONNECTED && | ||
2012 | mdev->state.conn != C_VERIFY_T) | ||
2013 | dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", | ||
2014 | drbd_conn_str(mdev->state.conn)); | ||
2015 | if (mdev->ov_start_sector == ~(sector_t)0 && | ||
2016 | mdev->agreed_pro_version >= 90) { | ||
2017 | mdev->ov_start_sector = sector; | ||
2018 | mdev->ov_position = sector; | ||
2019 | mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); | ||
2020 | dev_info(DEV, "Online Verify start sector: %llu\n", | ||
2021 | (unsigned long long)sector); | ||
2022 | } | ||
2023 | e->w.cb = w_e_end_ov_req; | ||
2024 | fault_type = DRBD_FAULT_RS_RD; | ||
2025 | /* Eventually this should become asynchronous. Currently it | ||
2026 | * blocks the whole receiver just to delay the reading of a | ||
2027 | * resync data block. | ||
2028 | * the drbd_work_queue mechanism is made for this... | ||
2029 | */ | ||
2030 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
2031 | /* we have been interrupted, | ||
2032 | * probably connection lost! */ | ||
2033 | D_ASSERT(signal_pending(current)); | ||
2034 | goto out_free_e; | ||
2035 | } | ||
2036 | break; | ||
2037 | |||
2038 | |||
2039 | default: | ||
2040 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | ||
2041 | cmdname(h->command)); | ||
2042 | fault_type = DRBD_FAULT_MAX; | ||
2043 | } | ||
2044 | |||
2045 | spin_lock_irq(&mdev->req_lock); | ||
2046 | list_add(&e->w.list, &mdev->read_ee); | ||
2047 | spin_unlock_irq(&mdev->req_lock); | ||
2048 | |||
2049 | inc_unacked(mdev); | ||
2050 | |||
2051 | drbd_generic_make_request(mdev, fault_type, e->private_bio); | ||
2052 | maybe_kick_lo(mdev); | ||
2053 | |||
2054 | return TRUE; | ||
2055 | |||
2056 | out_free_e: | ||
2057 | kfree(di); | ||
2058 | put_ldev(mdev); | ||
2059 | drbd_free_ee(mdev, e); | ||
2060 | return FALSE; | ||
2061 | } | ||
2062 | |||
2063 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | ||
2064 | { | ||
2065 | int self, peer, rv = -100; | ||
2066 | unsigned long ch_self, ch_peer; | ||
2067 | |||
2068 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2069 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2070 | |||
2071 | ch_peer = mdev->p_uuid[UI_SIZE]; | ||
2072 | ch_self = mdev->comm_bm_set; | ||
2073 | |||
2074 | switch (mdev->net_conf->after_sb_0p) { | ||
2075 | case ASB_CONSENSUS: | ||
2076 | case ASB_DISCARD_SECONDARY: | ||
2077 | case ASB_CALL_HELPER: | ||
2078 | dev_err(DEV, "Configuration error.\n"); | ||
2079 | break; | ||
2080 | case ASB_DISCONNECT: | ||
2081 | break; | ||
2082 | case ASB_DISCARD_YOUNGER_PRI: | ||
2083 | if (self == 0 && peer == 1) { | ||
2084 | rv = -1; | ||
2085 | break; | ||
2086 | } | ||
2087 | if (self == 1 && peer == 0) { | ||
2088 | rv = 1; | ||
2089 | break; | ||
2090 | } | ||
2091 | /* Else fall through to one of the other strategies... */ | ||
2092 | case ASB_DISCARD_OLDER_PRI: | ||
2093 | if (self == 0 && peer == 1) { | ||
2094 | rv = 1; | ||
2095 | break; | ||
2096 | } | ||
2097 | if (self == 1 && peer == 0) { | ||
2098 | rv = -1; | ||
2099 | break; | ||
2100 | } | ||
2101 | /* Else fall through to one of the other strategies... */ | ||
2102 | dev_warn(DEV, "Discard younger/older primary did not found a decision\n" | ||
2103 | "Using discard-least-changes instead\n"); | ||
2104 | case ASB_DISCARD_ZERO_CHG: | ||
2105 | if (ch_peer == 0 && ch_self == 0) { | ||
2106 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | ||
2107 | ? -1 : 1; | ||
2108 | break; | ||
2109 | } else { | ||
2110 | if (ch_peer == 0) { rv = 1; break; } | ||
2111 | if (ch_self == 0) { rv = -1; break; } | ||
2112 | } | ||
2113 | if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) | ||
2114 | break; | ||
2115 | case ASB_DISCARD_LEAST_CHG: | ||
2116 | if (ch_self < ch_peer) | ||
2117 | rv = -1; | ||
2118 | else if (ch_self > ch_peer) | ||
2119 | rv = 1; | ||
2120 | else /* ( ch_self == ch_peer ) */ | ||
2121 | /* Well, then use something else. */ | ||
2122 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | ||
2123 | ? -1 : 1; | ||
2124 | break; | ||
2125 | case ASB_DISCARD_LOCAL: | ||
2126 | rv = -1; | ||
2127 | break; | ||
2128 | case ASB_DISCARD_REMOTE: | ||
2129 | rv = 1; | ||
2130 | } | ||
2131 | |||
2132 | return rv; | ||
2133 | } | ||
2134 | |||
2135 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | ||
2136 | { | ||
2137 | int self, peer, hg, rv = -100; | ||
2138 | |||
2139 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2140 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2141 | |||
2142 | switch (mdev->net_conf->after_sb_1p) { | ||
2143 | case ASB_DISCARD_YOUNGER_PRI: | ||
2144 | case ASB_DISCARD_OLDER_PRI: | ||
2145 | case ASB_DISCARD_LEAST_CHG: | ||
2146 | case ASB_DISCARD_LOCAL: | ||
2147 | case ASB_DISCARD_REMOTE: | ||
2148 | dev_err(DEV, "Configuration error.\n"); | ||
2149 | break; | ||
2150 | case ASB_DISCONNECT: | ||
2151 | break; | ||
2152 | case ASB_CONSENSUS: | ||
2153 | hg = drbd_asb_recover_0p(mdev); | ||
2154 | if (hg == -1 && mdev->state.role == R_SECONDARY) | ||
2155 | rv = hg; | ||
2156 | if (hg == 1 && mdev->state.role == R_PRIMARY) | ||
2157 | rv = hg; | ||
2158 | break; | ||
2159 | case ASB_VIOLENTLY: | ||
2160 | rv = drbd_asb_recover_0p(mdev); | ||
2161 | break; | ||
2162 | case ASB_DISCARD_SECONDARY: | ||
2163 | return mdev->state.role == R_PRIMARY ? 1 : -1; | ||
2164 | case ASB_CALL_HELPER: | ||
2165 | hg = drbd_asb_recover_0p(mdev); | ||
2166 | if (hg == -1 && mdev->state.role == R_PRIMARY) { | ||
2167 | self = drbd_set_role(mdev, R_SECONDARY, 0); | ||
2168 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | ||
2169 | * we might be here in C_WF_REPORT_PARAMS which is transient. | ||
2170 | * we do not need to wait for the after state change work either. */ | ||
2171 | self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); | ||
2172 | if (self != SS_SUCCESS) { | ||
2173 | drbd_khelper(mdev, "pri-lost-after-sb"); | ||
2174 | } else { | ||
2175 | dev_warn(DEV, "Successfully gave up primary role.\n"); | ||
2176 | rv = hg; | ||
2177 | } | ||
2178 | } else | ||
2179 | rv = hg; | ||
2180 | } | ||
2181 | |||
2182 | return rv; | ||
2183 | } | ||
2184 | |||
2185 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | ||
2186 | { | ||
2187 | int self, peer, hg, rv = -100; | ||
2188 | |||
2189 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2190 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2191 | |||
2192 | switch (mdev->net_conf->after_sb_2p) { | ||
2193 | case ASB_DISCARD_YOUNGER_PRI: | ||
2194 | case ASB_DISCARD_OLDER_PRI: | ||
2195 | case ASB_DISCARD_LEAST_CHG: | ||
2196 | case ASB_DISCARD_LOCAL: | ||
2197 | case ASB_DISCARD_REMOTE: | ||
2198 | case ASB_CONSENSUS: | ||
2199 | case ASB_DISCARD_SECONDARY: | ||
2200 | dev_err(DEV, "Configuration error.\n"); | ||
2201 | break; | ||
2202 | case ASB_VIOLENTLY: | ||
2203 | rv = drbd_asb_recover_0p(mdev); | ||
2204 | break; | ||
2205 | case ASB_DISCONNECT: | ||
2206 | break; | ||
2207 | case ASB_CALL_HELPER: | ||
2208 | hg = drbd_asb_recover_0p(mdev); | ||
2209 | if (hg == -1) { | ||
2210 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | ||
2211 | * we might be here in C_WF_REPORT_PARAMS which is transient. | ||
2212 | * we do not need to wait for the after state change work either. */ | ||
2213 | self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); | ||
2214 | if (self != SS_SUCCESS) { | ||
2215 | drbd_khelper(mdev, "pri-lost-after-sb"); | ||
2216 | } else { | ||
2217 | dev_warn(DEV, "Successfully gave up primary role.\n"); | ||
2218 | rv = hg; | ||
2219 | } | ||
2220 | } else | ||
2221 | rv = hg; | ||
2222 | } | ||
2223 | |||
2224 | return rv; | ||
2225 | } | ||
2226 | |||
2227 | static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid, | ||
2228 | u64 bits, u64 flags) | ||
2229 | { | ||
2230 | if (!uuid) { | ||
2231 | dev_info(DEV, "%s uuid info vanished while I was looking!\n", text); | ||
2232 | return; | ||
2233 | } | ||
2234 | dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", | ||
2235 | text, | ||
2236 | (unsigned long long)uuid[UI_CURRENT], | ||
2237 | (unsigned long long)uuid[UI_BITMAP], | ||
2238 | (unsigned long long)uuid[UI_HISTORY_START], | ||
2239 | (unsigned long long)uuid[UI_HISTORY_END], | ||
2240 | (unsigned long long)bits, | ||
2241 | (unsigned long long)flags); | ||
2242 | } | ||
2243 | |||
2244 | /* | ||
2245 | 100 after split brain try auto recover | ||
2246 | 2 C_SYNC_SOURCE set BitMap | ||
2247 | 1 C_SYNC_SOURCE use BitMap | ||
2248 | 0 no Sync | ||
2249 | -1 C_SYNC_TARGET use BitMap | ||
2250 | -2 C_SYNC_TARGET set BitMap | ||
2251 | -100 after split brain, disconnect | ||
2252 | -1000 unrelated data | ||
2253 | */ | ||
2254 | static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) | ||
2255 | { | ||
2256 | u64 self, peer; | ||
2257 | int i, j; | ||
2258 | |||
2259 | self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); | ||
2260 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2261 | |||
2262 | *rule_nr = 10; | ||
2263 | if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) | ||
2264 | return 0; | ||
2265 | |||
2266 | *rule_nr = 20; | ||
2267 | if ((self == UUID_JUST_CREATED || self == (u64)0) && | ||
2268 | peer != UUID_JUST_CREATED) | ||
2269 | return -2; | ||
2270 | |||
2271 | *rule_nr = 30; | ||
2272 | if (self != UUID_JUST_CREATED && | ||
2273 | (peer == UUID_JUST_CREATED || peer == (u64)0)) | ||
2274 | return 2; | ||
2275 | |||
2276 | if (self == peer) { | ||
2277 | int rct, dc; /* roles at crash time */ | ||
2278 | |||
2279 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { | ||
2280 | |||
2281 | if (mdev->agreed_pro_version < 91) | ||
2282 | return -1001; | ||
2283 | |||
2284 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && | ||
2285 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { | ||
2286 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); | ||
2287 | drbd_uuid_set_bm(mdev, 0UL); | ||
2288 | |||
2289 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | ||
2290 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | ||
2291 | *rule_nr = 34; | ||
2292 | } else { | ||
2293 | dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n"); | ||
2294 | *rule_nr = 36; | ||
2295 | } | ||
2296 | |||
2297 | return 1; | ||
2298 | } | ||
2299 | |||
2300 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { | ||
2301 | |||
2302 | if (mdev->agreed_pro_version < 91) | ||
2303 | return -1001; | ||
2304 | |||
2305 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && | ||
2306 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { | ||
2307 | dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); | ||
2308 | |||
2309 | mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START]; | ||
2310 | mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP]; | ||
2311 | mdev->p_uuid[UI_BITMAP] = 0UL; | ||
2312 | |||
2313 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | ||
2314 | *rule_nr = 35; | ||
2315 | } else { | ||
2316 | dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n"); | ||
2317 | *rule_nr = 37; | ||
2318 | } | ||
2319 | |||
2320 | return -1; | ||
2321 | } | ||
2322 | |||
2323 | /* Common power [off|failure] */ | ||
2324 | rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + | ||
2325 | (mdev->p_uuid[UI_FLAGS] & 2); | ||
2326 | /* lowest bit is set when we were primary, | ||
2327 | * next bit (weight 2) is set when peer was primary */ | ||
2328 | *rule_nr = 40; | ||
2329 | |||
2330 | switch (rct) { | ||
2331 | case 0: /* !self_pri && !peer_pri */ return 0; | ||
2332 | case 1: /* self_pri && !peer_pri */ return 1; | ||
2333 | case 2: /* !self_pri && peer_pri */ return -1; | ||
2334 | case 3: /* self_pri && peer_pri */ | ||
2335 | dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
2336 | return dc ? -1 : 1; | ||
2337 | } | ||
2338 | } | ||
2339 | |||
2340 | *rule_nr = 50; | ||
2341 | peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); | ||
2342 | if (self == peer) | ||
2343 | return -1; | ||
2344 | |||
2345 | *rule_nr = 51; | ||
2346 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | ||
2347 | if (self == peer) { | ||
2348 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | ||
2349 | peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); | ||
2350 | if (self == peer) { | ||
2351 | /* The last P_SYNC_UUID did not get though. Undo the last start of | ||
2352 | resync as sync source modifications of the peer's UUIDs. */ | ||
2353 | |||
2354 | if (mdev->agreed_pro_version < 91) | ||
2355 | return -1001; | ||
2356 | |||
2357 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; | ||
2358 | mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; | ||
2359 | return -1; | ||
2360 | } | ||
2361 | } | ||
2362 | |||
2363 | *rule_nr = 60; | ||
2364 | self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); | ||
2365 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2366 | peer = mdev->p_uuid[i] & ~((u64)1); | ||
2367 | if (self == peer) | ||
2368 | return -2; | ||
2369 | } | ||
2370 | |||
2371 | *rule_nr = 70; | ||
2372 | self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); | ||
2373 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2374 | if (self == peer) | ||
2375 | return 1; | ||
2376 | |||
2377 | *rule_nr = 71; | ||
2378 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | ||
2379 | if (self == peer) { | ||
2380 | self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); | ||
2381 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | ||
2382 | if (self == peer) { | ||
2383 | /* The last P_SYNC_UUID did not get though. Undo the last start of | ||
2384 | resync as sync source modifications of our UUIDs. */ | ||
2385 | |||
2386 | if (mdev->agreed_pro_version < 91) | ||
2387 | return -1001; | ||
2388 | |||
2389 | _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); | ||
2390 | _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); | ||
2391 | |||
2392 | dev_info(DEV, "Undid last start of resync:\n"); | ||
2393 | |||
2394 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | ||
2395 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | ||
2396 | |||
2397 | return 1; | ||
2398 | } | ||
2399 | } | ||
2400 | |||
2401 | |||
2402 | *rule_nr = 80; | ||
2403 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2404 | self = mdev->ldev->md.uuid[i] & ~((u64)1); | ||
2405 | if (self == peer) | ||
2406 | return 2; | ||
2407 | } | ||
2408 | |||
2409 | *rule_nr = 90; | ||
2410 | self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); | ||
2411 | peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); | ||
2412 | if (self == peer && self != ((u64)0)) | ||
2413 | return 100; | ||
2414 | |||
2415 | *rule_nr = 100; | ||
2416 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2417 | self = mdev->ldev->md.uuid[i] & ~((u64)1); | ||
2418 | for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { | ||
2419 | peer = mdev->p_uuid[j] & ~((u64)1); | ||
2420 | if (self == peer) | ||
2421 | return -100; | ||
2422 | } | ||
2423 | } | ||
2424 | |||
2425 | return -1000; | ||
2426 | } | ||
2427 | |||
2428 | /* drbd_sync_handshake() returns the new conn state on success, or | ||
2429 | CONN_MASK (-1) on failure. | ||
2430 | */ | ||
2431 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, | ||
2432 | enum drbd_disk_state peer_disk) __must_hold(local) | ||
2433 | { | ||
2434 | int hg, rule_nr; | ||
2435 | enum drbd_conns rv = C_MASK; | ||
2436 | enum drbd_disk_state mydisk; | ||
2437 | |||
2438 | mydisk = mdev->state.disk; | ||
2439 | if (mydisk == D_NEGOTIATING) | ||
2440 | mydisk = mdev->new_state_tmp.disk; | ||
2441 | |||
2442 | dev_info(DEV, "drbd_sync_handshake:\n"); | ||
2443 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); | ||
2444 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, | ||
2445 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | ||
2446 | |||
2447 | hg = drbd_uuid_compare(mdev, &rule_nr); | ||
2448 | |||
2449 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); | ||
2450 | |||
2451 | if (hg == -1000) { | ||
2452 | dev_alert(DEV, "Unrelated data, aborting!\n"); | ||
2453 | return C_MASK; | ||
2454 | } | ||
2455 | if (hg == -1001) { | ||
2456 | dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); | ||
2457 | return C_MASK; | ||
2458 | } | ||
2459 | |||
2460 | if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || | ||
2461 | (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { | ||
2462 | int f = (hg == -100) || abs(hg) == 2; | ||
2463 | hg = mydisk > D_INCONSISTENT ? 1 : -1; | ||
2464 | if (f) | ||
2465 | hg = hg*2; | ||
2466 | dev_info(DEV, "Becoming sync %s due to disk states.\n", | ||
2467 | hg > 0 ? "source" : "target"); | ||
2468 | } | ||
2469 | |||
2470 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | ||
2471 | int pcount = (mdev->state.role == R_PRIMARY) | ||
2472 | + (peer_role == R_PRIMARY); | ||
2473 | int forced = (hg == -100); | ||
2474 | |||
2475 | switch (pcount) { | ||
2476 | case 0: | ||
2477 | hg = drbd_asb_recover_0p(mdev); | ||
2478 | break; | ||
2479 | case 1: | ||
2480 | hg = drbd_asb_recover_1p(mdev); | ||
2481 | break; | ||
2482 | case 2: | ||
2483 | hg = drbd_asb_recover_2p(mdev); | ||
2484 | break; | ||
2485 | } | ||
2486 | if (abs(hg) < 100) { | ||
2487 | dev_warn(DEV, "Split-Brain detected, %d primaries, " | ||
2488 | "automatically solved. Sync from %s node\n", | ||
2489 | pcount, (hg < 0) ? "peer" : "this"); | ||
2490 | if (forced) { | ||
2491 | dev_warn(DEV, "Doing a full sync, since" | ||
2492 | " UUIDs where ambiguous.\n"); | ||
2493 | hg = hg*2; | ||
2494 | } | ||
2495 | } | ||
2496 | } | ||
2497 | |||
2498 | if (hg == -100) { | ||
2499 | if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) | ||
2500 | hg = -1; | ||
2501 | if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) | ||
2502 | hg = 1; | ||
2503 | |||
2504 | if (abs(hg) < 100) | ||
2505 | dev_warn(DEV, "Split-Brain detected, manually solved. " | ||
2506 | "Sync from %s node\n", | ||
2507 | (hg < 0) ? "peer" : "this"); | ||
2508 | } | ||
2509 | |||
2510 | if (hg == -100) { | ||
2511 | dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); | ||
2512 | drbd_khelper(mdev, "split-brain"); | ||
2513 | return C_MASK; | ||
2514 | } | ||
2515 | |||
2516 | if (hg > 0 && mydisk <= D_INCONSISTENT) { | ||
2517 | dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n"); | ||
2518 | return C_MASK; | ||
2519 | } | ||
2520 | |||
2521 | if (hg < 0 && /* by intention we do not use mydisk here. */ | ||
2522 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { | ||
2523 | switch (mdev->net_conf->rr_conflict) { | ||
2524 | case ASB_CALL_HELPER: | ||
2525 | drbd_khelper(mdev, "pri-lost"); | ||
2526 | /* fall through */ | ||
2527 | case ASB_DISCONNECT: | ||
2528 | dev_err(DEV, "I shall become SyncTarget, but I am primary!\n"); | ||
2529 | return C_MASK; | ||
2530 | case ASB_VIOLENTLY: | ||
2531 | dev_warn(DEV, "Becoming SyncTarget, violating the stable-data" | ||
2532 | "assumption\n"); | ||
2533 | } | ||
2534 | } | ||
2535 | |||
2536 | if (abs(hg) >= 2) { | ||
2537 | dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); | ||
2538 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) | ||
2539 | return C_MASK; | ||
2540 | } | ||
2541 | |||
2542 | if (hg > 0) { /* become sync source. */ | ||
2543 | rv = C_WF_BITMAP_S; | ||
2544 | } else if (hg < 0) { /* become sync target */ | ||
2545 | rv = C_WF_BITMAP_T; | ||
2546 | } else { | ||
2547 | rv = C_CONNECTED; | ||
2548 | if (drbd_bm_total_weight(mdev)) { | ||
2549 | dev_info(DEV, "No resync, but %lu bits in bitmap!\n", | ||
2550 | drbd_bm_total_weight(mdev)); | ||
2551 | } | ||
2552 | } | ||
2553 | |||
2554 | return rv; | ||
2555 | } | ||
2556 | |||
2557 | /* returns 1 if invalid */ | ||
2558 | static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) | ||
2559 | { | ||
2560 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ | ||
2561 | if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || | ||
2562 | (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) | ||
2563 | return 0; | ||
2564 | |||
2565 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ | ||
2566 | if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || | ||
2567 | self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) | ||
2568 | return 1; | ||
2569 | |||
2570 | /* everything else is valid if they are equal on both sides. */ | ||
2571 | if (peer == self) | ||
2572 | return 0; | ||
2573 | |||
2574 | /* everything es is invalid. */ | ||
2575 | return 1; | ||
2576 | } | ||
2577 | |||
2578 | static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) | ||
2579 | { | ||
2580 | struct p_protocol *p = (struct p_protocol *)h; | ||
2581 | int header_size, data_size; | ||
2582 | int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; | ||
2583 | int p_want_lose, p_two_primaries; | ||
2584 | char p_integrity_alg[SHARED_SECRET_MAX] = ""; | ||
2585 | |||
2586 | header_size = sizeof(*p) - sizeof(*h); | ||
2587 | data_size = h->length - header_size; | ||
2588 | |||
2589 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
2590 | return FALSE; | ||
2591 | |||
2592 | p_proto = be32_to_cpu(p->protocol); | ||
2593 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); | ||
2594 | p_after_sb_1p = be32_to_cpu(p->after_sb_1p); | ||
2595 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); | ||
2596 | p_want_lose = be32_to_cpu(p->want_lose); | ||
2597 | p_two_primaries = be32_to_cpu(p->two_primaries); | ||
2598 | |||
2599 | if (p_proto != mdev->net_conf->wire_protocol) { | ||
2600 | dev_err(DEV, "incompatible communication protocols\n"); | ||
2601 | goto disconnect; | ||
2602 | } | ||
2603 | |||
2604 | if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { | ||
2605 | dev_err(DEV, "incompatible after-sb-0pri settings\n"); | ||
2606 | goto disconnect; | ||
2607 | } | ||
2608 | |||
2609 | if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { | ||
2610 | dev_err(DEV, "incompatible after-sb-1pri settings\n"); | ||
2611 | goto disconnect; | ||
2612 | } | ||
2613 | |||
2614 | if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { | ||
2615 | dev_err(DEV, "incompatible after-sb-2pri settings\n"); | ||
2616 | goto disconnect; | ||
2617 | } | ||
2618 | |||
2619 | if (p_want_lose && mdev->net_conf->want_lose) { | ||
2620 | dev_err(DEV, "both sides have the 'want_lose' flag set\n"); | ||
2621 | goto disconnect; | ||
2622 | } | ||
2623 | |||
2624 | if (p_two_primaries != mdev->net_conf->two_primaries) { | ||
2625 | dev_err(DEV, "incompatible setting of the two-primaries options\n"); | ||
2626 | goto disconnect; | ||
2627 | } | ||
2628 | |||
2629 | if (mdev->agreed_pro_version >= 87) { | ||
2630 | unsigned char *my_alg = mdev->net_conf->integrity_alg; | ||
2631 | |||
2632 | if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) | ||
2633 | return FALSE; | ||
2634 | |||
2635 | p_integrity_alg[SHARED_SECRET_MAX-1] = 0; | ||
2636 | if (strcmp(p_integrity_alg, my_alg)) { | ||
2637 | dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); | ||
2638 | goto disconnect; | ||
2639 | } | ||
2640 | dev_info(DEV, "data-integrity-alg: %s\n", | ||
2641 | my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); | ||
2642 | } | ||
2643 | |||
2644 | return TRUE; | ||
2645 | |||
2646 | disconnect: | ||
2647 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2648 | return FALSE; | ||
2649 | } | ||
2650 | |||
2651 | /* helper function | ||
2652 | * input: alg name, feature name | ||
2653 | * return: NULL (alg name was "") | ||
2654 | * ERR_PTR(error) if something goes wrong | ||
2655 | * or the crypto hash ptr, if it worked out ok. */ | ||
2656 | struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, | ||
2657 | const char *alg, const char *name) | ||
2658 | { | ||
2659 | struct crypto_hash *tfm; | ||
2660 | |||
2661 | if (!alg[0]) | ||
2662 | return NULL; | ||
2663 | |||
2664 | tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); | ||
2665 | if (IS_ERR(tfm)) { | ||
2666 | dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n", | ||
2667 | alg, name, PTR_ERR(tfm)); | ||
2668 | return tfm; | ||
2669 | } | ||
2670 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
2671 | crypto_free_hash(tfm); | ||
2672 | dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); | ||
2673 | return ERR_PTR(-EINVAL); | ||
2674 | } | ||
2675 | return tfm; | ||
2676 | } | ||
2677 | |||
2678 | static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) | ||
2679 | { | ||
2680 | int ok = TRUE; | ||
2681 | struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; | ||
2682 | unsigned int header_size, data_size, exp_max_sz; | ||
2683 | struct crypto_hash *verify_tfm = NULL; | ||
2684 | struct crypto_hash *csums_tfm = NULL; | ||
2685 | const int apv = mdev->agreed_pro_version; | ||
2686 | |||
2687 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) | ||
2688 | : apv == 88 ? sizeof(struct p_rs_param) | ||
2689 | + SHARED_SECRET_MAX | ||
2690 | : /* 89 */ sizeof(struct p_rs_param_89); | ||
2691 | |||
2692 | if (h->length > exp_max_sz) { | ||
2693 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", | ||
2694 | h->length, exp_max_sz); | ||
2695 | return FALSE; | ||
2696 | } | ||
2697 | |||
2698 | if (apv <= 88) { | ||
2699 | header_size = sizeof(struct p_rs_param) - sizeof(*h); | ||
2700 | data_size = h->length - header_size; | ||
2701 | } else /* apv >= 89 */ { | ||
2702 | header_size = sizeof(struct p_rs_param_89) - sizeof(*h); | ||
2703 | data_size = h->length - header_size; | ||
2704 | D_ASSERT(data_size == 0); | ||
2705 | } | ||
2706 | |||
2707 | /* initialize verify_alg and csums_alg */ | ||
2708 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
2709 | |||
2710 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
2711 | return FALSE; | ||
2712 | |||
2713 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | ||
2714 | |||
2715 | if (apv >= 88) { | ||
2716 | if (apv == 88) { | ||
2717 | if (data_size > SHARED_SECRET_MAX) { | ||
2718 | dev_err(DEV, "verify-alg too long, " | ||
2719 | "peer wants %u, accepting only %u byte\n", | ||
2720 | data_size, SHARED_SECRET_MAX); | ||
2721 | return FALSE; | ||
2722 | } | ||
2723 | |||
2724 | if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) | ||
2725 | return FALSE; | ||
2726 | |||
2727 | /* we expect NUL terminated string */ | ||
2728 | /* but just in case someone tries to be evil */ | ||
2729 | D_ASSERT(p->verify_alg[data_size-1] == 0); | ||
2730 | p->verify_alg[data_size-1] = 0; | ||
2731 | |||
2732 | } else /* apv >= 89 */ { | ||
2733 | /* we still expect NUL terminated strings */ | ||
2734 | /* but just in case someone tries to be evil */ | ||
2735 | D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0); | ||
2736 | D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0); | ||
2737 | p->verify_alg[SHARED_SECRET_MAX-1] = 0; | ||
2738 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; | ||
2739 | } | ||
2740 | |||
2741 | if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { | ||
2742 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
2743 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", | ||
2744 | mdev->sync_conf.verify_alg, p->verify_alg); | ||
2745 | goto disconnect; | ||
2746 | } | ||
2747 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, | ||
2748 | p->verify_alg, "verify-alg"); | ||
2749 | if (IS_ERR(verify_tfm)) { | ||
2750 | verify_tfm = NULL; | ||
2751 | goto disconnect; | ||
2752 | } | ||
2753 | } | ||
2754 | |||
2755 | if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { | ||
2756 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
2757 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", | ||
2758 | mdev->sync_conf.csums_alg, p->csums_alg); | ||
2759 | goto disconnect; | ||
2760 | } | ||
2761 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, | ||
2762 | p->csums_alg, "csums-alg"); | ||
2763 | if (IS_ERR(csums_tfm)) { | ||
2764 | csums_tfm = NULL; | ||
2765 | goto disconnect; | ||
2766 | } | ||
2767 | } | ||
2768 | |||
2769 | |||
2770 | spin_lock(&mdev->peer_seq_lock); | ||
2771 | /* lock against drbd_nl_syncer_conf() */ | ||
2772 | if (verify_tfm) { | ||
2773 | strcpy(mdev->sync_conf.verify_alg, p->verify_alg); | ||
2774 | mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; | ||
2775 | crypto_free_hash(mdev->verify_tfm); | ||
2776 | mdev->verify_tfm = verify_tfm; | ||
2777 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); | ||
2778 | } | ||
2779 | if (csums_tfm) { | ||
2780 | strcpy(mdev->sync_conf.csums_alg, p->csums_alg); | ||
2781 | mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; | ||
2782 | crypto_free_hash(mdev->csums_tfm); | ||
2783 | mdev->csums_tfm = csums_tfm; | ||
2784 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | ||
2785 | } | ||
2786 | spin_unlock(&mdev->peer_seq_lock); | ||
2787 | } | ||
2788 | |||
2789 | return ok; | ||
2790 | disconnect: | ||
2791 | /* just for completeness: actually not needed, | ||
2792 | * as this is not reached if csums_tfm was ok. */ | ||
2793 | crypto_free_hash(csums_tfm); | ||
2794 | /* but free the verify_tfm again, if csums_tfm did not work out */ | ||
2795 | crypto_free_hash(verify_tfm); | ||
2796 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2797 | return FALSE; | ||
2798 | } | ||
2799 | |||
/*
 * drbd_setup_order_type() - placeholder for negotiating the queue order type
 * @mdev: DRBD device (unused)
 * @peer: queue order type reported by the peer (unused)
 *
 * Intentionally empty: there is currently no working implementation of
 * distributed TCQ.
 */
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
}
2805 | |||
2806 | /* warn if the arguments differ by more than 12.5% */ | ||
2807 | static void warn_if_differ_considerably(struct drbd_conf *mdev, | ||
2808 | const char *s, sector_t a, sector_t b) | ||
2809 | { | ||
2810 | sector_t d; | ||
2811 | if (a == 0 || b == 0) | ||
2812 | return; | ||
2813 | d = (a > b) ? (a - b) : (b - a); | ||
2814 | if (d > (a>>3) || d > (b>>3)) | ||
2815 | dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s, | ||
2816 | (unsigned long long)a, (unsigned long long)b); | ||
2817 | } | ||
2818 | |||
2819 | static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | ||
2820 | { | ||
2821 | struct p_sizes *p = (struct p_sizes *)h; | ||
2822 | enum determine_dev_size dd = unchanged; | ||
2823 | unsigned int max_seg_s; | ||
2824 | sector_t p_size, p_usize, my_usize; | ||
2825 | int ldsc = 0; /* local disk size changed */ | ||
2826 | enum drbd_conns nconn; | ||
2827 | |||
2828 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
2829 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
2830 | return FALSE; | ||
2831 | |||
2832 | p_size = be64_to_cpu(p->d_size); | ||
2833 | p_usize = be64_to_cpu(p->u_size); | ||
2834 | |||
2835 | if (p_size == 0 && mdev->state.disk == D_DISKLESS) { | ||
2836 | dev_err(DEV, "some backing storage is needed\n"); | ||
2837 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2838 | return FALSE; | ||
2839 | } | ||
2840 | |||
2841 | /* just store the peer's disk size for now. | ||
2842 | * we still need to figure out whether we accept that. */ | ||
2843 | mdev->p_size = p_size; | ||
2844 | |||
2845 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
2846 | if (get_ldev(mdev)) { | ||
2847 | warn_if_differ_considerably(mdev, "lower level device sizes", | ||
2848 | p_size, drbd_get_max_capacity(mdev->ldev)); | ||
2849 | warn_if_differ_considerably(mdev, "user requested size", | ||
2850 | p_usize, mdev->ldev->dc.disk_size); | ||
2851 | |||
2852 | /* if this is the first connect, or an otherwise expected | ||
2853 | * param exchange, choose the minimum */ | ||
2854 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | ||
2855 | p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, | ||
2856 | p_usize); | ||
2857 | |||
2858 | my_usize = mdev->ldev->dc.disk_size; | ||
2859 | |||
2860 | if (mdev->ldev->dc.disk_size != p_usize) { | ||
2861 | mdev->ldev->dc.disk_size = p_usize; | ||
2862 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
2863 | (unsigned long)mdev->ldev->dc.disk_size); | ||
2864 | } | ||
2865 | |||
2866 | /* Never shrink a device with usable data during connect. | ||
2867 | But allow online shrinking if we are connected. */ | ||
2868 | if (drbd_new_dev_size(mdev, mdev->ldev) < | ||
2869 | drbd_get_capacity(mdev->this_bdev) && | ||
2870 | mdev->state.disk >= D_OUTDATED && | ||
2871 | mdev->state.conn < C_CONNECTED) { | ||
2872 | dev_err(DEV, "The peer's disk size is too small!\n"); | ||
2873 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2874 | mdev->ldev->dc.disk_size = my_usize; | ||
2875 | put_ldev(mdev); | ||
2876 | return FALSE; | ||
2877 | } | ||
2878 | put_ldev(mdev); | ||
2879 | } | ||
2880 | #undef min_not_zero | ||
2881 | |||
2882 | if (get_ldev(mdev)) { | ||
2883 | dd = drbd_determin_dev_size(mdev); | ||
2884 | put_ldev(mdev); | ||
2885 | if (dd == dev_size_error) | ||
2886 | return FALSE; | ||
2887 | drbd_md_sync(mdev); | ||
2888 | } else { | ||
2889 | /* I am diskless, need to accept the peer's size. */ | ||
2890 | drbd_set_my_capacity(mdev, p_size); | ||
2891 | } | ||
2892 | |||
2893 | if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
2894 | nconn = drbd_sync_handshake(mdev, | ||
2895 | mdev->state.peer, mdev->state.pdsk); | ||
2896 | put_ldev(mdev); | ||
2897 | |||
2898 | if (nconn == C_MASK) { | ||
2899 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2900 | return FALSE; | ||
2901 | } | ||
2902 | |||
2903 | if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { | ||
2904 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2905 | return FALSE; | ||
2906 | } | ||
2907 | } | ||
2908 | |||
2909 | if (get_ldev(mdev)) { | ||
2910 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | ||
2911 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
2912 | ldsc = 1; | ||
2913 | } | ||
2914 | |||
2915 | max_seg_s = be32_to_cpu(p->max_segment_size); | ||
2916 | if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) | ||
2917 | drbd_setup_queue_param(mdev, max_seg_s); | ||
2918 | |||
2919 | drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); | ||
2920 | put_ldev(mdev); | ||
2921 | } | ||
2922 | |||
2923 | if (mdev->state.conn > C_WF_REPORT_PARAMS) { | ||
2924 | if (be64_to_cpu(p->c_size) != | ||
2925 | drbd_get_capacity(mdev->this_bdev) || ldsc) { | ||
2926 | /* we have different sizes, probably peer | ||
2927 | * needs to know my new size... */ | ||
2928 | drbd_send_sizes(mdev, 0); | ||
2929 | } | ||
2930 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || | ||
2931 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | ||
2932 | if (mdev->state.pdsk >= D_INCONSISTENT && | ||
2933 | mdev->state.disk >= D_INCONSISTENT) | ||
2934 | resync_after_online_grow(mdev); | ||
2935 | else | ||
2936 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | ||
2937 | } | ||
2938 | } | ||
2939 | |||
2940 | return TRUE; | ||
2941 | } | ||
2942 | |||
2943 | static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) | ||
2944 | { | ||
2945 | struct p_uuids *p = (struct p_uuids *)h; | ||
2946 | u64 *p_uuid; | ||
2947 | int i; | ||
2948 | |||
2949 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
2950 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
2951 | return FALSE; | ||
2952 | |||
2953 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); | ||
2954 | |||
2955 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) | ||
2956 | p_uuid[i] = be64_to_cpu(p->uuid[i]); | ||
2957 | |||
2958 | kfree(mdev->p_uuid); | ||
2959 | mdev->p_uuid = p_uuid; | ||
2960 | |||
2961 | if (mdev->state.conn < C_CONNECTED && | ||
2962 | mdev->state.disk < D_INCONSISTENT && | ||
2963 | mdev->state.role == R_PRIMARY && | ||
2964 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { | ||
2965 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", | ||
2966 | (unsigned long long)mdev->ed_uuid); | ||
2967 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2968 | return FALSE; | ||
2969 | } | ||
2970 | |||
2971 | if (get_ldev(mdev)) { | ||
2972 | int skip_initial_sync = | ||
2973 | mdev->state.conn == C_CONNECTED && | ||
2974 | mdev->agreed_pro_version >= 90 && | ||
2975 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && | ||
2976 | (p_uuid[UI_FLAGS] & 8); | ||
2977 | if (skip_initial_sync) { | ||
2978 | dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); | ||
2979 | drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, | ||
2980 | "clear_n_write from receive_uuids"); | ||
2981 | _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); | ||
2982 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | ||
2983 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | ||
2984 | CS_VERBOSE, NULL); | ||
2985 | drbd_md_sync(mdev); | ||
2986 | } | ||
2987 | put_ldev(mdev); | ||
2988 | } | ||
2989 | |||
2990 | /* Before we test for the disk state, we should wait until an eventually | ||
2991 | ongoing cluster wide state change is finished. That is important if | ||
2992 | we are primary and are detaching from our disk. We need to see the | ||
2993 | new disk state... */ | ||
2994 | wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
2995 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) | ||
2996 | drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); | ||
2997 | |||
2998 | return TRUE; | ||
2999 | } | ||
3000 | |||
3001 | /** | ||
3002 | * convert_state() - Converts the peer's view of the cluster state to our point of view | ||
3003 | * @ps: The state as seen by the peer. | ||
3004 | */ | ||
3005 | static union drbd_state convert_state(union drbd_state ps) | ||
3006 | { | ||
3007 | union drbd_state ms; | ||
3008 | |||
3009 | static enum drbd_conns c_tab[] = { | ||
3010 | [C_CONNECTED] = C_CONNECTED, | ||
3011 | |||
3012 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, | ||
3013 | [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, | ||
3014 | [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ | ||
3015 | [C_VERIFY_S] = C_VERIFY_T, | ||
3016 | [C_MASK] = C_MASK, | ||
3017 | }; | ||
3018 | |||
3019 | ms.i = ps.i; | ||
3020 | |||
3021 | ms.conn = c_tab[ps.conn]; | ||
3022 | ms.peer = ps.role; | ||
3023 | ms.role = ps.peer; | ||
3024 | ms.pdsk = ps.disk; | ||
3025 | ms.disk = ps.pdsk; | ||
3026 | ms.peer_isp = (ps.aftr_isp | ps.user_isp); | ||
3027 | |||
3028 | return ms; | ||
3029 | } | ||
3030 | |||
3031 | static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) | ||
3032 | { | ||
3033 | struct p_req_state *p = (struct p_req_state *)h; | ||
3034 | union drbd_state mask, val; | ||
3035 | int rv; | ||
3036 | |||
3037 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3038 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3039 | return FALSE; | ||
3040 | |||
3041 | mask.i = be32_to_cpu(p->mask); | ||
3042 | val.i = be32_to_cpu(p->val); | ||
3043 | |||
3044 | if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && | ||
3045 | test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { | ||
3046 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); | ||
3047 | return TRUE; | ||
3048 | } | ||
3049 | |||
3050 | mask = convert_state(mask); | ||
3051 | val = convert_state(val); | ||
3052 | |||
3053 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); | ||
3054 | |||
3055 | drbd_send_sr_reply(mdev, rv); | ||
3056 | drbd_md_sync(mdev); | ||
3057 | |||
3058 | return TRUE; | ||
3059 | } | ||
3060 | |||
/*
 * receive_state() - process the state the peer reports about itself
 * @mdev: DRBD device
 * @h:    already received packet header; the payload is received here
 *
 * Derives the new connection state (running the sync handshake where a
 * resync has to be considered) and stores the peer's role, disk state and
 * sync pause flags in our state.
 *
 * Returns TRUE on success, FALSE if the packet could not be received or
 * the resulting state change failed (C_DISCONNECTING is forced then).
 */
static int receive_state(struct drbd_conf *mdev, struct p_header *h)
{
	struct p_state *p = (struct p_state *)h;
	enum drbd_conns nconn, oconn;
	union drbd_state ns, peer_state;
	enum drbd_disk_state real_peer_disk;
	int rv;

	ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
		return FALSE;

	if (drbd_recv(mdev, h->payload, h->length) != h->length)
		return FALSE;

	peer_state.i = be32_to_cpu(p->state);

	real_peer_disk = peer_state.disk;
	if (peer_state.disk == D_NEGOTIATING) {
		/* NOTE(review): mdev->p_uuid is dereferenced here without a
		 * NULL check, while the condition further down does test it.
		 * Presumably the peer always sends its UUIDs before its
		 * state - confirm. */
		real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
		dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
	}

	spin_lock_irq(&mdev->req_lock);
 retry:
	/* sample the connection state, then evaluate it without the lock;
	 * if it changed in the meantime, we start over (see below) */
	oconn = nconn = mdev->state.conn;
	spin_unlock_irq(&mdev->req_lock);

	if (nconn == C_WF_REPORT_PARAMS)
		nconn = C_CONNECTED;

	if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		int cr; /* consider resync */

		/* if we established a new connection */
		cr  = (oconn < C_CONNECTED);
		/* if we had an established connection
		 * and one of the nodes newly attaches a disk */
		cr |= (oconn == C_CONNECTED &&
		       (peer_state.disk == D_NEGOTIATING ||
			mdev->state.disk == D_NEGOTIATING));
		/* if we have both been inconsistent, and the peer has been
		 * forced to be UpToDate with --overwrite-data */
		cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
		/* if we had been plain connected, and the admin requested to
		 * start a sync by "invalidate" or "invalidate-remote" */
		cr |= (oconn == C_CONNECTED &&
				(peer_state.conn >= C_STARTING_SYNC_S &&
				 peer_state.conn <= C_WF_BITMAP_T));

		if (cr)
			nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);

		put_ldev(mdev);
		/* nconn == C_MASK is handled as "the handshake failed" */
		if (nconn == C_MASK) {
			if (mdev->state.disk == D_NEGOTIATING) {
				drbd_force_state(mdev, NS(disk, D_DISKLESS));
				nconn = C_CONNECTED;
			} else if (peer_state.disk == D_NEGOTIATING) {
				dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
				peer_state.disk = D_DISKLESS;
			} else {
				D_ASSERT(oconn == C_WF_REPORT_PARAMS);
				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
				return FALSE;
			}
		}
	}

	spin_lock_irq(&mdev->req_lock);
	/* the connection state changed while we did not hold the lock:
	 * redo the evaluation based on the new state */
	if (mdev->state.conn != oconn)
		goto retry;
	clear_bit(CONSIDER_RESYNC, &mdev->flags);
	ns.i = mdev->state.i;
	ns.conn = nconn;
	ns.peer = peer_state.role;
	ns.pdsk = real_peer_disk;
	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
		ns.disk = mdev->new_state_tmp.disk;

	rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
	/* from here on, ns is the state that is actually in effect */
	ns = mdev->state;
	spin_unlock_irq(&mdev->req_lock);

	if (rv < SS_SUCCESS) {
		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
		return FALSE;
	}

	if (oconn > C_WF_REPORT_PARAMS) {
		if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
		    peer_state.disk != D_NEGOTIATING ) {
			/* we want resync, peer has not yet decided to sync... */
			/* Nowadays only used when forcing a node into primary role and
			   setting its disk to UpToDate with that */
			drbd_send_uuids(mdev);
			drbd_send_state(mdev);
		}
	}

	mdev->net_conf->want_lose = 0;

	drbd_md_sync(mdev); /* update connected indicator, la_size, ... */

	return TRUE;
}
3168 | |||
3169 | static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) | ||
3170 | { | ||
3171 | struct p_rs_uuid *p = (struct p_rs_uuid *)h; | ||
3172 | |||
3173 | wait_event(mdev->misc_wait, | ||
3174 | mdev->state.conn == C_WF_SYNC_UUID || | ||
3175 | mdev->state.conn < C_CONNECTED || | ||
3176 | mdev->state.disk < D_NEGOTIATING); | ||
3177 | |||
3178 | /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ | ||
3179 | |||
3180 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3181 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3182 | return FALSE; | ||
3183 | |||
3184 | /* Here the _drbd_uuid_ functions are right, current should | ||
3185 | _not_ be rotated into the history */ | ||
3186 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
3187 | _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); | ||
3188 | _drbd_uuid_set(mdev, UI_BITMAP, 0UL); | ||
3189 | |||
3190 | drbd_start_resync(mdev, C_SYNC_TARGET); | ||
3191 | |||
3192 | put_ldev(mdev); | ||
3193 | } else | ||
3194 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); | ||
3195 | |||
3196 | return TRUE; | ||
3197 | } | ||
3198 | |||
/* Result of processing a single bitmap packet. */
enum receive_bitmap_ret {
	OK,	/* packet processed, more bitmap data is expected */
	DONE,	/* the bitmap transfer is complete */
	FAILED	/* receive or decoding error */
};
3200 | |||
3201 | static enum receive_bitmap_ret | ||
3202 | receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, | ||
3203 | unsigned long *buffer, struct bm_xfer_ctx *c) | ||
3204 | { | ||
3205 | unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | ||
3206 | unsigned want = num_words * sizeof(long); | ||
3207 | |||
3208 | if (want != h->length) { | ||
3209 | dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); | ||
3210 | return FAILED; | ||
3211 | } | ||
3212 | if (want == 0) | ||
3213 | return DONE; | ||
3214 | if (drbd_recv(mdev, buffer, want) != want) | ||
3215 | return FAILED; | ||
3216 | |||
3217 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); | ||
3218 | |||
3219 | c->word_offset += num_words; | ||
3220 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
3221 | if (c->bit_offset > c->bm_bits) | ||
3222 | c->bit_offset = c->bm_bits; | ||
3223 | |||
3224 | return OK; | ||
3225 | } | ||
3226 | |||
/*
 * recv_bm_rle_bits() - decode one run length encoded bitmap packet
 * @mdev: DRBD device
 * @p:    the compressed bitmap packet, including its payload
 * @c:    bitmap transfer context; c->bit_offset is where decoding starts
 *
 * The payload is a bit stream of VLI encoded run lengths.  Every second
 * run, starting with the first one if DCBP_get_start(p) is non-zero, is
 * applied to the bitmap with _drbd_bm_set_bits(); the runs in between are
 * only skipped.
 *
 * Returns DONE if the end of the bitmap was reached exactly, OK if more
 * packets are expected, FAILED on decoding errors or if a run to be set
 * exceeds the bitmap.
 */
static enum receive_bitmap_ret
recv_bm_rle_bits(struct drbd_conf *mdev,
		struct p_compressed_bm *p,
		struct bm_xfer_ctx *c)
{
	struct bitstream bs;
	u64 look_ahead;		/* not yet decoded bits of the stream */
	u64 rl;			/* length of the current run */
	u64 tmp;
	unsigned long s = c->bit_offset;	/* first bit of the current run */
	unsigned long e;			/* last bit of the current run */
	int len = p->head.length - (sizeof(*p) - sizeof(p->head));
	int toggle = DCBP_get_start(p);
	int have;		/* number of valid bits in look_ahead */
	int bits;

	bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));

	bits = bitstream_get_bits(&bs, &look_ahead, 64);
	if (bits < 0)
		return FAILED;

	/* one run per iteration; runs alternate between skipped and set */
	for (have = bits; have > 0; s += rl, toggle = !toggle) {
		bits = vli_decode_bits(&rl, look_ahead);
		if (bits <= 0)
			return FAILED;

		if (toggle) {
			e = s + rl -1;
			if (e >= c->bm_bits) {
				dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
				return FAILED;
			}
			_drbd_bm_set_bits(mdev, s, e);
		}

		/* the code word claims more bits than we have left */
		if (have < bits) {
			dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
				have, bits, look_ahead,
				(unsigned int)(bs.cur.b - p->code),
				(unsigned int)bs.buf_len);
			return FAILED;
		}
		/* drop the bits just consumed ... */
		look_ahead >>= bits;
		have -= bits;

		/* ... and refill look_ahead from the stream */
		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
		if (bits < 0)
			return FAILED;
		look_ahead |= tmp << have;
		have += bits;
	}

	c->bit_offset = s;
	bm_xfer_ctx_bit_to_word_offset(c);

	return (s == c->bm_bits) ? DONE : OK;
}
3285 | |||
3286 | static enum receive_bitmap_ret | ||
3287 | decode_bitmap_c(struct drbd_conf *mdev, | ||
3288 | struct p_compressed_bm *p, | ||
3289 | struct bm_xfer_ctx *c) | ||
3290 | { | ||
3291 | if (DCBP_get_code(p) == RLE_VLI_Bits) | ||
3292 | return recv_bm_rle_bits(mdev, p, c); | ||
3293 | |||
3294 | /* other variants had been implemented for evaluation, | ||
3295 | * but have been dropped as this one turned out to be "best" | ||
3296 | * during all our tests. */ | ||
3297 | |||
3298 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); | ||
3299 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3300 | return FAILED; | ||
3301 | } | ||
3302 | |||
3303 | void INFO_bm_xfer_stats(struct drbd_conf *mdev, | ||
3304 | const char *direction, struct bm_xfer_ctx *c) | ||
3305 | { | ||
3306 | /* what would it take to transfer it "plaintext" */ | ||
3307 | unsigned plain = sizeof(struct p_header) * | ||
3308 | ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) | ||
3309 | + c->bm_words * sizeof(long); | ||
3310 | unsigned total = c->bytes[0] + c->bytes[1]; | ||
3311 | unsigned r; | ||
3312 | |||
3313 | /* total can not be zero. but just in case: */ | ||
3314 | if (total == 0) | ||
3315 | return; | ||
3316 | |||
3317 | /* don't report if not compressed */ | ||
3318 | if (total >= plain) | ||
3319 | return; | ||
3320 | |||
3321 | /* total < plain. check for overflow, still */ | ||
3322 | r = (total > UINT_MAX/1000) ? (total / (plain/1000)) | ||
3323 | : (1000 * total / plain); | ||
3324 | |||
3325 | if (r > 1000) | ||
3326 | r = 1000; | ||
3327 | |||
3328 | r = 1000 - r; | ||
3329 | dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " | ||
3330 | "total %u; compression: %u.%u%%\n", | ||
3331 | direction, | ||
3332 | c->bytes[1], c->packets[1], | ||
3333 | c->bytes[0], c->packets[0], | ||
3334 | total, r/10, r % 10); | ||
3335 | } | ||
3336 | |||
3337 | /* Since we are processing the bitfield from lower addresses to higher, | ||
3338 | it does not matter if the process it in 32 bit chunks or 64 bit | ||
3339 | chunks as long as it is little endian. (Understand it as byte stream, | ||
3340 | beginning with the lowest byte...) If we would use big endian | ||
3341 | we would need to process it from the highest address to the lowest, | ||
3342 | in order to be agnostic to the 32 vs 64 bits issue. | ||
3343 | |||
3344 | returns 0 on failure, 1 if we successfully received it. */ | ||
3345 | static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) | ||
3346 | { | ||
3347 | struct bm_xfer_ctx c; | ||
3348 | void *buffer; | ||
3349 | enum receive_bitmap_ret ret; | ||
3350 | int ok = FALSE; | ||
3351 | |||
3352 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | ||
3353 | |||
3354 | drbd_bm_lock(mdev, "receive bitmap"); | ||
3355 | |||
3356 | /* maybe we should use some per thread scratch page, | ||
3357 | * and allocate that during initial device creation? */ | ||
3358 | buffer = (unsigned long *) __get_free_page(GFP_NOIO); | ||
3359 | if (!buffer) { | ||
3360 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
3361 | goto out; | ||
3362 | } | ||
3363 | |||
3364 | c = (struct bm_xfer_ctx) { | ||
3365 | .bm_bits = drbd_bm_bits(mdev), | ||
3366 | .bm_words = drbd_bm_words(mdev), | ||
3367 | }; | ||
3368 | |||
3369 | do { | ||
3370 | if (h->command == P_BITMAP) { | ||
3371 | ret = receive_bitmap_plain(mdev, h, buffer, &c); | ||
3372 | } else if (h->command == P_COMPRESSED_BITMAP) { | ||
3373 | /* MAYBE: sanity check that we speak proto >= 90, | ||
3374 | * and the feature is enabled! */ | ||
3375 | struct p_compressed_bm *p; | ||
3376 | |||
3377 | if (h->length > BM_PACKET_PAYLOAD_BYTES) { | ||
3378 | dev_err(DEV, "ReportCBitmap packet too large\n"); | ||
3379 | goto out; | ||
3380 | } | ||
3381 | /* use the page buff */ | ||
3382 | p = buffer; | ||
3383 | memcpy(p, h, sizeof(*h)); | ||
3384 | if (drbd_recv(mdev, p->head.payload, h->length) != h->length) | ||
3385 | goto out; | ||
3386 | if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { | ||
3387 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); | ||
3388 | return FAILED; | ||
3389 | } | ||
3390 | ret = decode_bitmap_c(mdev, p, &c); | ||
3391 | } else { | ||
3392 | dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); | ||
3393 | goto out; | ||
3394 | } | ||
3395 | |||
3396 | c.packets[h->command == P_BITMAP]++; | ||
3397 | c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; | ||
3398 | |||
3399 | if (ret != OK) | ||
3400 | break; | ||
3401 | |||
3402 | if (!drbd_recv_header(mdev, h)) | ||
3403 | goto out; | ||
3404 | } while (ret == OK); | ||
3405 | if (ret == FAILED) | ||
3406 | goto out; | ||
3407 | |||
3408 | INFO_bm_xfer_stats(mdev, "receive", &c); | ||
3409 | |||
3410 | if (mdev->state.conn == C_WF_BITMAP_T) { | ||
3411 | ok = !drbd_send_bitmap(mdev); | ||
3412 | if (!ok) | ||
3413 | goto out; | ||
3414 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ | ||
3415 | ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
3416 | D_ASSERT(ok == SS_SUCCESS); | ||
3417 | } else if (mdev->state.conn != C_WF_BITMAP_S) { | ||
3418 | /* admin may have requested C_DISCONNECTING, | ||
3419 | * other threads may have noticed network errors */ | ||
3420 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", | ||
3421 | drbd_conn_str(mdev->state.conn)); | ||
3422 | } | ||
3423 | |||
3424 | ok = TRUE; | ||
3425 | out: | ||
3426 | drbd_bm_unlock(mdev); | ||
3427 | if (ok && mdev->state.conn == C_WF_BITMAP_S) | ||
3428 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
3429 | free_page((unsigned long) buffer); | ||
3430 | return ok; | ||
3431 | } | ||
3432 | |||
3433 | static int receive_skip(struct drbd_conf *mdev, struct p_header *h) | ||
3434 | { | ||
3435 | /* TODO zero copy sink :) */ | ||
3436 | static char sink[128]; | ||
3437 | int size, want, r; | ||
3438 | |||
3439 | dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", | ||
3440 | h->command, h->length); | ||
3441 | |||
3442 | size = h->length; | ||
3443 | while (size > 0) { | ||
3444 | want = min_t(int, size, sizeof(sink)); | ||
3445 | r = drbd_recv(mdev, sink, want); | ||
3446 | ERR_IF(r <= 0) break; | ||
3447 | size -= r; | ||
3448 | } | ||
3449 | return size == 0; | ||
3450 | } | ||
3451 | |||
3452 | static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) | ||
3453 | { | ||
3454 | if (mdev->state.disk >= D_INCONSISTENT) | ||
3455 | drbd_kick_lo(mdev); | ||
3456 | |||
3457 | /* Make sure we've acked all the TCP data associated | ||
3458 | * with the data requests being unplugged */ | ||
3459 | drbd_tcp_quickack(mdev->data.socket); | ||
3460 | |||
3461 | return TRUE; | ||
3462 | } | ||
3463 | |||
3464 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); | ||
3465 | |||
3466 | static drbd_cmd_handler_f drbd_default_handler[] = { | ||
3467 | [P_DATA] = receive_Data, | ||
3468 | [P_DATA_REPLY] = receive_DataReply, | ||
3469 | [P_RS_DATA_REPLY] = receive_RSDataReply, | ||
3470 | [P_BARRIER] = receive_Barrier, | ||
3471 | [P_BITMAP] = receive_bitmap, | ||
3472 | [P_COMPRESSED_BITMAP] = receive_bitmap, | ||
3473 | [P_UNPLUG_REMOTE] = receive_UnplugRemote, | ||
3474 | [P_DATA_REQUEST] = receive_DataRequest, | ||
3475 | [P_RS_DATA_REQUEST] = receive_DataRequest, | ||
3476 | [P_SYNC_PARAM] = receive_SyncParam, | ||
3477 | [P_SYNC_PARAM89] = receive_SyncParam, | ||
3478 | [P_PROTOCOL] = receive_protocol, | ||
3479 | [P_UUIDS] = receive_uuids, | ||
3480 | [P_SIZES] = receive_sizes, | ||
3481 | [P_STATE] = receive_state, | ||
3482 | [P_STATE_CHG_REQ] = receive_req_state, | ||
3483 | [P_SYNC_UUID] = receive_sync_uuid, | ||
3484 | [P_OV_REQUEST] = receive_DataRequest, | ||
3485 | [P_OV_REPLY] = receive_DataRequest, | ||
3486 | [P_CSUM_RS_REQUEST] = receive_DataRequest, | ||
3487 | /* anything missing from this table is in | ||
3488 | * the asender_tbl, see get_asender_cmd */ | ||
3489 | [P_MAX_CMD] = NULL, | ||
3490 | }; | ||
3491 | |||
3492 | static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; | ||
3493 | static drbd_cmd_handler_f *drbd_opt_cmd_handler; | ||
3494 | |||
3495 | static void drbdd(struct drbd_conf *mdev) | ||
3496 | { | ||
3497 | drbd_cmd_handler_f handler; | ||
3498 | struct p_header *header = &mdev->data.rbuf.header; | ||
3499 | |||
3500 | while (get_t_state(&mdev->receiver) == Running) { | ||
3501 | drbd_thread_current_set_cpu(mdev); | ||
3502 | if (!drbd_recv_header(mdev, header)) | ||
3503 | break; | ||
3504 | |||
3505 | if (header->command < P_MAX_CMD) | ||
3506 | handler = drbd_cmd_handler[header->command]; | ||
3507 | else if (P_MAY_IGNORE < header->command | ||
3508 | && header->command < P_MAX_OPT_CMD) | ||
3509 | handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; | ||
3510 | else if (header->command > P_MAX_OPT_CMD) | ||
3511 | handler = receive_skip; | ||
3512 | else | ||
3513 | handler = NULL; | ||
3514 | |||
3515 | if (unlikely(!handler)) { | ||
3516 | dev_err(DEV, "unknown packet type %d, l: %d!\n", | ||
3517 | header->command, header->length); | ||
3518 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3519 | break; | ||
3520 | } | ||
3521 | if (unlikely(!handler(mdev, header))) { | ||
3522 | dev_err(DEV, "error receiving %s, l: %d!\n", | ||
3523 | cmdname(header->command), header->length); | ||
3524 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3525 | break; | ||
3526 | } | ||
3527 | } | ||
3528 | } | ||
3529 | |||
3530 | static void drbd_fail_pending_reads(struct drbd_conf *mdev) | ||
3531 | { | ||
3532 | struct hlist_head *slot; | ||
3533 | struct hlist_node *pos; | ||
3534 | struct hlist_node *tmp; | ||
3535 | struct drbd_request *req; | ||
3536 | int i; | ||
3537 | |||
3538 | /* | ||
3539 | * Application READ requests | ||
3540 | */ | ||
3541 | spin_lock_irq(&mdev->req_lock); | ||
3542 | for (i = 0; i < APP_R_HSIZE; i++) { | ||
3543 | slot = mdev->app_reads_hash+i; | ||
3544 | hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { | ||
3545 | /* it may (but should not any longer!) | ||
3546 | * be on the work queue; if that assert triggers, | ||
3547 | * we need to also grab the | ||
3548 | * spin_lock_irq(&mdev->data.work.q_lock); | ||
3549 | * and list_del_init here. */ | ||
3550 | D_ASSERT(list_empty(&req->w.list)); | ||
3551 | /* It would be nice to complete outside of spinlock. | ||
3552 | * But this is easier for now. */ | ||
3553 | _req_mod(req, connection_lost_while_pending); | ||
3554 | } | ||
3555 | } | ||
3556 | for (i = 0; i < APP_R_HSIZE; i++) | ||
3557 | if (!hlist_empty(mdev->app_reads_hash+i)) | ||
3558 | dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " | ||
3559 | "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); | ||
3560 | |||
3561 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | ||
3562 | spin_unlock_irq(&mdev->req_lock); | ||
3563 | } | ||
3564 | |||
3565 | void drbd_flush_workqueue(struct drbd_conf *mdev) | ||
3566 | { | ||
3567 | struct drbd_wq_barrier barr; | ||
3568 | |||
3569 | barr.w.cb = w_prev_work_done; | ||
3570 | init_completion(&barr.done); | ||
3571 | drbd_queue_work(&mdev->data.work, &barr.w); | ||
3572 | wait_for_completion(&barr.done); | ||
3573 | } | ||
3574 | |||
3575 | static void drbd_disconnect(struct drbd_conf *mdev) | ||
3576 | { | ||
3577 | enum drbd_fencing_p fp; | ||
3578 | union drbd_state os, ns; | ||
3579 | int rv = SS_UNKNOWN_ERROR; | ||
3580 | unsigned int i; | ||
3581 | |||
3582 | if (mdev->state.conn == C_STANDALONE) | ||
3583 | return; | ||
3584 | if (mdev->state.conn >= C_WF_CONNECTION) | ||
3585 | dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n", | ||
3586 | drbd_conn_str(mdev->state.conn)); | ||
3587 | |||
3588 | /* asender does not clean up anything. it must not interfere, either */ | ||
3589 | drbd_thread_stop(&mdev->asender); | ||
3590 | |||
3591 | mutex_lock(&mdev->data.mutex); | ||
3592 | drbd_free_sock(mdev); | ||
3593 | mutex_unlock(&mdev->data.mutex); | ||
3594 | |||
3595 | spin_lock_irq(&mdev->req_lock); | ||
3596 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
3597 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); | ||
3598 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); | ||
3599 | spin_unlock_irq(&mdev->req_lock); | ||
3600 | |||
3601 | /* We do not have data structures that would allow us to | ||
3602 | * get the rs_pending_cnt down to 0 again. | ||
3603 | * * On C_SYNC_TARGET we do not have any data structures describing | ||
3604 | * the pending RSDataRequest's we have sent. | ||
3605 | * * On C_SYNC_SOURCE there is no data structure that tracks | ||
3606 | * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. | ||
3607 | * And no, it is not the sum of the reference counts in the | ||
3608 | * resync_LRU. The resync_LRU tracks the whole operation including | ||
3609 | * the disk-IO, while the rs_pending_cnt only tracks the blocks | ||
3610 | * on the fly. */ | ||
3611 | drbd_rs_cancel_all(mdev); | ||
3612 | mdev->rs_total = 0; | ||
3613 | mdev->rs_failed = 0; | ||
3614 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
3615 | wake_up(&mdev->misc_wait); | ||
3616 | |||
3617 | /* make sure syncer is stopped and w_resume_next_sg queued */ | ||
3618 | del_timer_sync(&mdev->resync_timer); | ||
3619 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
3620 | resync_timer_fn((unsigned long)mdev); | ||
3621 | |||
3622 | /* so we can be sure that all remote or resync reads | ||
3623 | * made it at least to net_ee */ | ||
3624 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | ||
3625 | |||
3626 | /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, | ||
3627 | * w_make_resync_request etc. which may still be on the worker queue | ||
3628 | * to be "canceled" */ | ||
3629 | drbd_flush_workqueue(mdev); | ||
3630 | |||
3631 | /* This also does reclaim_net_ee(). If we do this too early, we might | ||
3632 | * miss some resync ee and pages.*/ | ||
3633 | drbd_process_done_ee(mdev); | ||
3634 | |||
3635 | kfree(mdev->p_uuid); | ||
3636 | mdev->p_uuid = NULL; | ||
3637 | |||
3638 | if (!mdev->state.susp) | ||
3639 | tl_clear(mdev); | ||
3640 | |||
3641 | drbd_fail_pending_reads(mdev); | ||
3642 | |||
3643 | dev_info(DEV, "Connection closed\n"); | ||
3644 | |||
3645 | drbd_md_sync(mdev); | ||
3646 | |||
3647 | fp = FP_DONT_CARE; | ||
3648 | if (get_ldev(mdev)) { | ||
3649 | fp = mdev->ldev->dc.fencing; | ||
3650 | put_ldev(mdev); | ||
3651 | } | ||
3652 | |||
3653 | if (mdev->state.role == R_PRIMARY) { | ||
3654 | if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { | ||
3655 | enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); | ||
3656 | drbd_request_state(mdev, NS(pdsk, nps)); | ||
3657 | } | ||
3658 | } | ||
3659 | |||
3660 | spin_lock_irq(&mdev->req_lock); | ||
3661 | os = mdev->state; | ||
3662 | if (os.conn >= C_UNCONNECTED) { | ||
3663 | /* Do not restart in case we are C_DISCONNECTING */ | ||
3664 | ns = os; | ||
3665 | ns.conn = C_UNCONNECTED; | ||
3666 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
3667 | } | ||
3668 | spin_unlock_irq(&mdev->req_lock); | ||
3669 | |||
3670 | if (os.conn == C_DISCONNECTING) { | ||
3671 | struct hlist_head *h; | ||
3672 | wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); | ||
3673 | |||
3674 | /* we must not free the tl_hash | ||
3675 | * while application io is still on the fly */ | ||
3676 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); | ||
3677 | |||
3678 | spin_lock_irq(&mdev->req_lock); | ||
3679 | /* paranoia code */ | ||
3680 | for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) | ||
3681 | if (h->first) | ||
3682 | dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", | ||
3683 | (int)(h - mdev->ee_hash), h->first); | ||
3684 | kfree(mdev->ee_hash); | ||
3685 | mdev->ee_hash = NULL; | ||
3686 | mdev->ee_hash_s = 0; | ||
3687 | |||
3688 | /* paranoia code */ | ||
3689 | for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) | ||
3690 | if (h->first) | ||
3691 | dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", | ||
3692 | (int)(h - mdev->tl_hash), h->first); | ||
3693 | kfree(mdev->tl_hash); | ||
3694 | mdev->tl_hash = NULL; | ||
3695 | mdev->tl_hash_s = 0; | ||
3696 | spin_unlock_irq(&mdev->req_lock); | ||
3697 | |||
3698 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3699 | mdev->cram_hmac_tfm = NULL; | ||
3700 | |||
3701 | kfree(mdev->net_conf); | ||
3702 | mdev->net_conf = NULL; | ||
3703 | drbd_request_state(mdev, NS(conn, C_STANDALONE)); | ||
3704 | } | ||
3705 | |||
3706 | /* tcp_close and release of sendpage pages can be deferred. I don't | ||
3707 | * want to use SO_LINGER, because apparently it can be deferred for | ||
3708 | * more than 20 seconds (longest time I checked). | ||
3709 | * | ||
3710 | * Actually we don't care for exactly when the network stack does its | ||
3711 | * put_page(), but release our reference on these pages right here. | ||
3712 | */ | ||
3713 | i = drbd_release_ee(mdev, &mdev->net_ee); | ||
3714 | if (i) | ||
3715 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); | ||
3716 | i = atomic_read(&mdev->pp_in_use); | ||
3717 | if (i) | ||
3718 | dev_info(DEV, "pp_in_use = %u, expected 0\n", i); | ||
3719 | |||
3720 | D_ASSERT(list_empty(&mdev->read_ee)); | ||
3721 | D_ASSERT(list_empty(&mdev->active_ee)); | ||
3722 | D_ASSERT(list_empty(&mdev->sync_ee)); | ||
3723 | D_ASSERT(list_empty(&mdev->done_ee)); | ||
3724 | |||
3725 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | ||
3726 | atomic_set(&mdev->current_epoch->epoch_size, 0); | ||
3727 | D_ASSERT(list_empty(&mdev->current_epoch->list)); | ||
3728 | } | ||
3729 | |||
3730 | /* | ||
3731 | * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version | ||
3732 | * we can agree on is stored in agreed_pro_version. | ||
3733 | * | ||
3734 | * feature flags and the reserved array should be enough room for future | ||
3735 | * enhancements of the handshake protocol, and possible plugins... | ||
3736 | * | ||
3737 | * for now, they are expected to be zero, but ignored. | ||
3738 | */ | ||
3739 | static int drbd_send_handshake(struct drbd_conf *mdev) | ||
3740 | { | ||
3741 | /* ASSERT current == mdev->receiver ... */ | ||
3742 | struct p_handshake *p = &mdev->data.sbuf.handshake; | ||
3743 | int ok; | ||
3744 | |||
3745 | if (mutex_lock_interruptible(&mdev->data.mutex)) { | ||
3746 | dev_err(DEV, "interrupted during initial handshake\n"); | ||
3747 | return 0; /* interrupted. not ok. */ | ||
3748 | } | ||
3749 | |||
3750 | if (mdev->data.socket == NULL) { | ||
3751 | mutex_unlock(&mdev->data.mutex); | ||
3752 | return 0; | ||
3753 | } | ||
3754 | |||
3755 | memset(p, 0, sizeof(*p)); | ||
3756 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); | ||
3757 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); | ||
3758 | ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, | ||
3759 | (struct p_header *)p, sizeof(*p), 0 ); | ||
3760 | mutex_unlock(&mdev->data.mutex); | ||
3761 | return ok; | ||
3762 | } | ||
3763 | |||
3764 | /* | ||
3765 | * return values: | ||
3766 | * 1 yes, we have a valid connection | ||
3767 | * 0 oops, did not work out, please try again | ||
3768 | * -1 peer talks different language, | ||
3769 | * no point in trying again, please go standalone. | ||
3770 | */ | ||
3771 | static int drbd_do_handshake(struct drbd_conf *mdev) | ||
3772 | { | ||
3773 | /* ASSERT current == mdev->receiver ... */ | ||
3774 | struct p_handshake *p = &mdev->data.rbuf.handshake; | ||
3775 | const int expect = sizeof(struct p_handshake) | ||
3776 | -sizeof(struct p_header); | ||
3777 | int rv; | ||
3778 | |||
3779 | rv = drbd_send_handshake(mdev); | ||
3780 | if (!rv) | ||
3781 | return 0; | ||
3782 | |||
3783 | rv = drbd_recv_header(mdev, &p->head); | ||
3784 | if (!rv) | ||
3785 | return 0; | ||
3786 | |||
3787 | if (p->head.command != P_HAND_SHAKE) { | ||
3788 | dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", | ||
3789 | cmdname(p->head.command), p->head.command); | ||
3790 | return -1; | ||
3791 | } | ||
3792 | |||
3793 | if (p->head.length != expect) { | ||
3794 | dev_err(DEV, "expected HandShake length: %u, received: %u\n", | ||
3795 | expect, p->head.length); | ||
3796 | return -1; | ||
3797 | } | ||
3798 | |||
3799 | rv = drbd_recv(mdev, &p->head.payload, expect); | ||
3800 | |||
3801 | if (rv != expect) { | ||
3802 | dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); | ||
3803 | return 0; | ||
3804 | } | ||
3805 | |||
3806 | p->protocol_min = be32_to_cpu(p->protocol_min); | ||
3807 | p->protocol_max = be32_to_cpu(p->protocol_max); | ||
3808 | if (p->protocol_max == 0) | ||
3809 | p->protocol_max = p->protocol_min; | ||
3810 | |||
3811 | if (PRO_VERSION_MAX < p->protocol_min || | ||
3812 | PRO_VERSION_MIN > p->protocol_max) | ||
3813 | goto incompat; | ||
3814 | |||
3815 | mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); | ||
3816 | |||
3817 | dev_info(DEV, "Handshake successful: " | ||
3818 | "Agreed network protocol version %d\n", mdev->agreed_pro_version); | ||
3819 | |||
3820 | return 1; | ||
3821 | |||
3822 | incompat: | ||
3823 | dev_err(DEV, "incompatible DRBD dialects: " | ||
3824 | "I support %d-%d, peer supports %d-%d\n", | ||
3825 | PRO_VERSION_MIN, PRO_VERSION_MAX, | ||
3826 | p->protocol_min, p->protocol_max); | ||
3827 | return -1; | ||
3828 | } | ||
3829 | |||
3830 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | ||
3831 | static int drbd_do_auth(struct drbd_conf *mdev) | ||
3832 | { | ||
3833 | dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); | ||
3834 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | ||
3835 | return 0; | ||
3836 | } | ||
3837 | #else | ||
3838 | #define CHALLENGE_LEN 64 | ||
3839 | static int drbd_do_auth(struct drbd_conf *mdev) | ||
3840 | { | ||
3841 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ | ||
3842 | struct scatterlist sg; | ||
3843 | char *response = NULL; | ||
3844 | char *right_response = NULL; | ||
3845 | char *peers_ch = NULL; | ||
3846 | struct p_header p; | ||
3847 | unsigned int key_len = strlen(mdev->net_conf->shared_secret); | ||
3848 | unsigned int resp_size; | ||
3849 | struct hash_desc desc; | ||
3850 | int rv; | ||
3851 | |||
3852 | desc.tfm = mdev->cram_hmac_tfm; | ||
3853 | desc.flags = 0; | ||
3854 | |||
3855 | rv = crypto_hash_setkey(mdev->cram_hmac_tfm, | ||
3856 | (u8 *)mdev->net_conf->shared_secret, key_len); | ||
3857 | if (rv) { | ||
3858 | dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); | ||
3859 | rv = 0; | ||
3860 | goto fail; | ||
3861 | } | ||
3862 | |||
3863 | get_random_bytes(my_challenge, CHALLENGE_LEN); | ||
3864 | |||
3865 | rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); | ||
3866 | if (!rv) | ||
3867 | goto fail; | ||
3868 | |||
3869 | rv = drbd_recv_header(mdev, &p); | ||
3870 | if (!rv) | ||
3871 | goto fail; | ||
3872 | |||
3873 | if (p.command != P_AUTH_CHALLENGE) { | ||
3874 | dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", | ||
3875 | cmdname(p.command), p.command); | ||
3876 | rv = 0; | ||
3877 | goto fail; | ||
3878 | } | ||
3879 | |||
3880 | if (p.length > CHALLENGE_LEN*2) { | ||
3881 | dev_err(DEV, "expected AuthChallenge payload too big.\n"); | ||
3882 | rv = 0; | ||
3883 | goto fail; | ||
3884 | } | ||
3885 | |||
3886 | peers_ch = kmalloc(p.length, GFP_NOIO); | ||
3887 | if (peers_ch == NULL) { | ||
3888 | dev_err(DEV, "kmalloc of peers_ch failed\n"); | ||
3889 | rv = 0; | ||
3890 | goto fail; | ||
3891 | } | ||
3892 | |||
3893 | rv = drbd_recv(mdev, peers_ch, p.length); | ||
3894 | |||
3895 | if (rv != p.length) { | ||
3896 | dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); | ||
3897 | rv = 0; | ||
3898 | goto fail; | ||
3899 | } | ||
3900 | |||
3901 | resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); | ||
3902 | response = kmalloc(resp_size, GFP_NOIO); | ||
3903 | if (response == NULL) { | ||
3904 | dev_err(DEV, "kmalloc of response failed\n"); | ||
3905 | rv = 0; | ||
3906 | goto fail; | ||
3907 | } | ||
3908 | |||
3909 | sg_init_table(&sg, 1); | ||
3910 | sg_set_buf(&sg, peers_ch, p.length); | ||
3911 | |||
3912 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); | ||
3913 | if (rv) { | ||
3914 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | ||
3915 | rv = 0; | ||
3916 | goto fail; | ||
3917 | } | ||
3918 | |||
3919 | rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); | ||
3920 | if (!rv) | ||
3921 | goto fail; | ||
3922 | |||
3923 | rv = drbd_recv_header(mdev, &p); | ||
3924 | if (!rv) | ||
3925 | goto fail; | ||
3926 | |||
3927 | if (p.command != P_AUTH_RESPONSE) { | ||
3928 | dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", | ||
3929 | cmdname(p.command), p.command); | ||
3930 | rv = 0; | ||
3931 | goto fail; | ||
3932 | } | ||
3933 | |||
3934 | if (p.length != resp_size) { | ||
3935 | dev_err(DEV, "expected AuthResponse payload of wrong size\n"); | ||
3936 | rv = 0; | ||
3937 | goto fail; | ||
3938 | } | ||
3939 | |||
3940 | rv = drbd_recv(mdev, response , resp_size); | ||
3941 | |||
3942 | if (rv != resp_size) { | ||
3943 | dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); | ||
3944 | rv = 0; | ||
3945 | goto fail; | ||
3946 | } | ||
3947 | |||
3948 | right_response = kmalloc(resp_size, GFP_NOIO); | ||
3949 | if (response == NULL) { | ||
3950 | dev_err(DEV, "kmalloc of right_response failed\n"); | ||
3951 | rv = 0; | ||
3952 | goto fail; | ||
3953 | } | ||
3954 | |||
3955 | sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); | ||
3956 | |||
3957 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); | ||
3958 | if (rv) { | ||
3959 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | ||
3960 | rv = 0; | ||
3961 | goto fail; | ||
3962 | } | ||
3963 | |||
3964 | rv = !memcmp(response, right_response, resp_size); | ||
3965 | |||
3966 | if (rv) | ||
3967 | dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", | ||
3968 | resp_size, mdev->net_conf->cram_hmac_alg); | ||
3969 | |||
3970 | fail: | ||
3971 | kfree(peers_ch); | ||
3972 | kfree(response); | ||
3973 | kfree(right_response); | ||
3974 | |||
3975 | return rv; | ||
3976 | } | ||
3977 | #endif | ||
3978 | |||
3979 | int drbdd_init(struct drbd_thread *thi) | ||
3980 | { | ||
3981 | struct drbd_conf *mdev = thi->mdev; | ||
3982 | unsigned int minor = mdev_to_minor(mdev); | ||
3983 | int h; | ||
3984 | |||
3985 | sprintf(current->comm, "drbd%d_receiver", minor); | ||
3986 | |||
3987 | dev_info(DEV, "receiver (re)started\n"); | ||
3988 | |||
3989 | do { | ||
3990 | h = drbd_connect(mdev); | ||
3991 | if (h == 0) { | ||
3992 | drbd_disconnect(mdev); | ||
3993 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3994 | schedule_timeout(HZ); | ||
3995 | } | ||
3996 | if (h == -1) { | ||
3997 | dev_warn(DEV, "Discarding network configuration.\n"); | ||
3998 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
3999 | } | ||
4000 | } while (h == 0); | ||
4001 | |||
4002 | if (h > 0) { | ||
4003 | if (get_net_conf(mdev)) { | ||
4004 | drbdd(mdev); | ||
4005 | put_net_conf(mdev); | ||
4006 | } | ||
4007 | } | ||
4008 | |||
4009 | drbd_disconnect(mdev); | ||
4010 | |||
4011 | dev_info(DEV, "receiver terminated\n"); | ||
4012 | return 0; | ||
4013 | } | ||
4014 | |||
4015 | /* ********* acknowledge sender ******** */ | ||
4016 | |||
4017 | static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) | ||
4018 | { | ||
4019 | struct p_req_state_reply *p = (struct p_req_state_reply *)h; | ||
4020 | |||
4021 | int retcode = be32_to_cpu(p->retcode); | ||
4022 | |||
4023 | if (retcode >= SS_SUCCESS) { | ||
4024 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); | ||
4025 | } else { | ||
4026 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); | ||
4027 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", | ||
4028 | drbd_set_st_err_str(retcode), retcode); | ||
4029 | } | ||
4030 | wake_up(&mdev->state_wait); | ||
4031 | |||
4032 | return TRUE; | ||
4033 | } | ||
4034 | |||
/*
 * got_Ping() - answer a P_PING from the peer.
 *
 * Returns the result of drbd_send_ping_ack().
 */
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
{
	const int sent = drbd_send_ping_ack(mdev);

	return sent;
}
4040 | |||
4041 | static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) | ||
4042 | { | ||
4043 | /* restore idle timeout */ | ||
4044 | mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
4045 | |||
4046 | return TRUE; | ||
4047 | } | ||
4048 | |||
4049 | static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) | ||
4050 | { | ||
4051 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4052 | sector_t sector = be64_to_cpu(p->sector); | ||
4053 | int blksize = be32_to_cpu(p->blksize); | ||
4054 | |||
4055 | D_ASSERT(mdev->agreed_pro_version >= 89); | ||
4056 | |||
4057 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4058 | |||
4059 | drbd_rs_complete_io(mdev, sector); | ||
4060 | drbd_set_in_sync(mdev, sector, blksize); | ||
4061 | /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ | ||
4062 | mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); | ||
4063 | dec_rs_pending(mdev); | ||
4064 | |||
4065 | return TRUE; | ||
4066 | } | ||
4067 | |||
4068 | /* when we receive the ACK for a write request, | ||
4069 | * verify that we actually know about it */ | ||
4070 | static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, | ||
4071 | u64 id, sector_t sector) | ||
4072 | { | ||
4073 | struct hlist_head *slot = tl_hash_slot(mdev, sector); | ||
4074 | struct hlist_node *n; | ||
4075 | struct drbd_request *req; | ||
4076 | |||
4077 | hlist_for_each_entry(req, n, slot, colision) { | ||
4078 | if ((unsigned long)req == (unsigned long)id) { | ||
4079 | if (req->sector != sector) { | ||
4080 | dev_err(DEV, "_ack_id_to_req: found req %p but it has " | ||
4081 | "wrong sector (%llus versus %llus)\n", req, | ||
4082 | (unsigned long long)req->sector, | ||
4083 | (unsigned long long)sector); | ||
4084 | break; | ||
4085 | } | ||
4086 | return req; | ||
4087 | } | ||
4088 | } | ||
4089 | dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n", | ||
4090 | (void *)(unsigned long)id, (unsigned long long)sector); | ||
4091 | return NULL; | ||
4092 | } | ||
4093 | |||
4094 | typedef struct drbd_request *(req_validator_fn) | ||
4095 | (struct drbd_conf *mdev, u64 id, sector_t sector); | ||
4096 | |||
4097 | static int validate_req_change_req_state(struct drbd_conf *mdev, | ||
4098 | u64 id, sector_t sector, req_validator_fn validator, | ||
4099 | const char *func, enum drbd_req_event what) | ||
4100 | { | ||
4101 | struct drbd_request *req; | ||
4102 | struct bio_and_error m; | ||
4103 | |||
4104 | spin_lock_irq(&mdev->req_lock); | ||
4105 | req = validator(mdev, id, sector); | ||
4106 | if (unlikely(!req)) { | ||
4107 | spin_unlock_irq(&mdev->req_lock); | ||
4108 | dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); | ||
4109 | return FALSE; | ||
4110 | } | ||
4111 | __req_mod(req, what, &m); | ||
4112 | spin_unlock_irq(&mdev->req_lock); | ||
4113 | |||
4114 | if (m.bio) | ||
4115 | complete_master_bio(mdev, &m); | ||
4116 | return TRUE; | ||
4117 | } | ||
4118 | |||
4119 | static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) | ||
4120 | { | ||
4121 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4122 | sector_t sector = be64_to_cpu(p->sector); | ||
4123 | int blksize = be32_to_cpu(p->blksize); | ||
4124 | enum drbd_req_event what; | ||
4125 | |||
4126 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4127 | |||
4128 | if (is_syncer_block_id(p->block_id)) { | ||
4129 | drbd_set_in_sync(mdev, sector, blksize); | ||
4130 | dec_rs_pending(mdev); | ||
4131 | return TRUE; | ||
4132 | } | ||
4133 | switch (be16_to_cpu(h->command)) { | ||
4134 | case P_RS_WRITE_ACK: | ||
4135 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4136 | what = write_acked_by_peer_and_sis; | ||
4137 | break; | ||
4138 | case P_WRITE_ACK: | ||
4139 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4140 | what = write_acked_by_peer; | ||
4141 | break; | ||
4142 | case P_RECV_ACK: | ||
4143 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); | ||
4144 | what = recv_acked_by_peer; | ||
4145 | break; | ||
4146 | case P_DISCARD_ACK: | ||
4147 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4148 | what = conflict_discarded_by_peer; | ||
4149 | break; | ||
4150 | default: | ||
4151 | D_ASSERT(0); | ||
4152 | return FALSE; | ||
4153 | } | ||
4154 | |||
4155 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4156 | _ack_id_to_req, __func__ , what); | ||
4157 | } | ||
4158 | |||
4159 | static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) | ||
4160 | { | ||
4161 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4162 | sector_t sector = be64_to_cpu(p->sector); | ||
4163 | |||
4164 | if (__ratelimit(&drbd_ratelimit_state)) | ||
4165 | dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); | ||
4166 | |||
4167 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4168 | |||
4169 | if (is_syncer_block_id(p->block_id)) { | ||
4170 | int size = be32_to_cpu(p->blksize); | ||
4171 | dec_rs_pending(mdev); | ||
4172 | drbd_rs_failed_io(mdev, sector, size); | ||
4173 | return TRUE; | ||
4174 | } | ||
4175 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4176 | _ack_id_to_req, __func__ , neg_acked); | ||
4177 | } | ||
4178 | |||
4179 | static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) | ||
4180 | { | ||
4181 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4182 | sector_t sector = be64_to_cpu(p->sector); | ||
4183 | |||
4184 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4185 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", | ||
4186 | (unsigned long long)sector, be32_to_cpu(p->blksize)); | ||
4187 | |||
4188 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4189 | _ar_id_to_req, __func__ , neg_acked); | ||
4190 | } | ||
4191 | |||
4192 | static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) | ||
4193 | { | ||
4194 | sector_t sector; | ||
4195 | int size; | ||
4196 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4197 | |||
4198 | sector = be64_to_cpu(p->sector); | ||
4199 | size = be32_to_cpu(p->blksize); | ||
4200 | D_ASSERT(p->block_id == ID_SYNCER); | ||
4201 | |||
4202 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4203 | |||
4204 | dec_rs_pending(mdev); | ||
4205 | |||
4206 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
4207 | drbd_rs_complete_io(mdev, sector); | ||
4208 | drbd_rs_failed_io(mdev, sector, size); | ||
4209 | put_ldev(mdev); | ||
4210 | } | ||
4211 | |||
4212 | return TRUE; | ||
4213 | } | ||
4214 | |||
4215 | static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) | ||
4216 | { | ||
4217 | struct p_barrier_ack *p = (struct p_barrier_ack *)h; | ||
4218 | |||
4219 | tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); | ||
4220 | |||
4221 | return TRUE; | ||
4222 | } | ||
4223 | |||
4224 | static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) | ||
4225 | { | ||
4226 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4227 | struct drbd_work *w; | ||
4228 | sector_t sector; | ||
4229 | int size; | ||
4230 | |||
4231 | sector = be64_to_cpu(p->sector); | ||
4232 | size = be32_to_cpu(p->blksize); | ||
4233 | |||
4234 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4235 | |||
4236 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) | ||
4237 | drbd_ov_oos_found(mdev, sector, size); | ||
4238 | else | ||
4239 | ov_oos_print(mdev); | ||
4240 | |||
4241 | drbd_rs_complete_io(mdev, sector); | ||
4242 | dec_rs_pending(mdev); | ||
4243 | |||
4244 | if (--mdev->ov_left == 0) { | ||
4245 | w = kmalloc(sizeof(*w), GFP_NOIO); | ||
4246 | if (w) { | ||
4247 | w->cb = w_ov_finished; | ||
4248 | drbd_queue_work_front(&mdev->data.work, w); | ||
4249 | } else { | ||
4250 | dev_err(DEV, "kmalloc(w) failed."); | ||
4251 | ov_oos_print(mdev); | ||
4252 | drbd_resync_finished(mdev); | ||
4253 | } | ||
4254 | } | ||
4255 | return TRUE; | ||
4256 | } | ||
4257 | |||
4258 | struct asender_cmd { | ||
4259 | size_t pkt_size; | ||
4260 | int (*process)(struct drbd_conf *mdev, struct p_header *h); | ||
4261 | }; | ||
4262 | |||
4263 | static struct asender_cmd *get_asender_cmd(int cmd) | ||
4264 | { | ||
4265 | static struct asender_cmd asender_tbl[] = { | ||
4266 | /* anything missing from this table is in | ||
4267 | * the drbd_cmd_handler (drbd_default_handler) table, | ||
4268 | * see the beginning of drbdd() */ | ||
4269 | [P_PING] = { sizeof(struct p_header), got_Ping }, | ||
4270 | [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, | ||
4271 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4272 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4273 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4274 | [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4275 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, | ||
4276 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, | ||
4277 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, | ||
4278 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, | ||
4279 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | ||
4280 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | ||
4281 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | ||
4282 | [P_MAX_CMD] = { 0, NULL }, | ||
4283 | }; | ||
4284 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | ||
4285 | return NULL; | ||
4286 | return &asender_tbl[cmd]; | ||
4287 | } | ||
4288 | |||
4289 | int drbd_asender(struct drbd_thread *thi) | ||
4290 | { | ||
4291 | struct drbd_conf *mdev = thi->mdev; | ||
4292 | struct p_header *h = &mdev->meta.rbuf.header; | ||
4293 | struct asender_cmd *cmd = NULL; | ||
4294 | |||
4295 | int rv, len; | ||
4296 | void *buf = h; | ||
4297 | int received = 0; | ||
4298 | int expect = sizeof(struct p_header); | ||
4299 | int empty; | ||
4300 | |||
4301 | sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); | ||
4302 | |||
4303 | current->policy = SCHED_RR; /* Make this a realtime task! */ | ||
4304 | current->rt_priority = 2; /* more important than all other tasks */ | ||
4305 | |||
4306 | while (get_t_state(thi) == Running) { | ||
4307 | drbd_thread_current_set_cpu(mdev); | ||
4308 | if (test_and_clear_bit(SEND_PING, &mdev->flags)) { | ||
4309 | ERR_IF(!drbd_send_ping(mdev)) goto reconnect; | ||
4310 | mdev->meta.socket->sk->sk_rcvtimeo = | ||
4311 | mdev->net_conf->ping_timeo*HZ/10; | ||
4312 | } | ||
4313 | |||
4314 | /* conditionally cork; | ||
4315 | * it may hurt latency if we cork without much to send */ | ||
4316 | if (!mdev->net_conf->no_cork && | ||
4317 | 3 < atomic_read(&mdev->unacked_cnt)) | ||
4318 | drbd_tcp_cork(mdev->meta.socket); | ||
4319 | while (1) { | ||
4320 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4321 | flush_signals(current); | ||
4322 | if (!drbd_process_done_ee(mdev)) { | ||
4323 | dev_err(DEV, "process_done_ee() = NOT_OK\n"); | ||
4324 | goto reconnect; | ||
4325 | } | ||
4326 | /* to avoid race with newly queued ACKs */ | ||
4327 | set_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4328 | spin_lock_irq(&mdev->req_lock); | ||
4329 | empty = list_empty(&mdev->done_ee); | ||
4330 | spin_unlock_irq(&mdev->req_lock); | ||
4331 | /* new ack may have been queued right here, | ||
4332 | * but then there is also a signal pending, | ||
4333 | * and we start over... */ | ||
4334 | if (empty) | ||
4335 | break; | ||
4336 | } | ||
4337 | /* but unconditionally uncork unless disabled */ | ||
4338 | if (!mdev->net_conf->no_cork) | ||
4339 | drbd_tcp_uncork(mdev->meta.socket); | ||
4340 | |||
4341 | /* short circuit, recv_msg would return EINTR anyways. */ | ||
4342 | if (signal_pending(current)) | ||
4343 | continue; | ||
4344 | |||
4345 | rv = drbd_recv_short(mdev, mdev->meta.socket, | ||
4346 | buf, expect-received, 0); | ||
4347 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4348 | |||
4349 | flush_signals(current); | ||
4350 | |||
4351 | /* Note: | ||
4352 | * -EINTR (on meta) we got a signal | ||
4353 | * -EAGAIN (on meta) rcvtimeo expired | ||
4354 | * -ECONNRESET other side closed the connection | ||
4355 | * -ERESTARTSYS (on data) we got a signal | ||
4356 | * rv < 0 other than above: unexpected error! | ||
4357 | * rv == expected: full header or command | ||
4358 | * rv < expected: "woken" by signal during receive | ||
4359 | * rv == 0 : "connection shut down by peer" | ||
4360 | */ | ||
4361 | if (likely(rv > 0)) { | ||
4362 | received += rv; | ||
4363 | buf += rv; | ||
4364 | } else if (rv == 0) { | ||
4365 | dev_err(DEV, "meta connection shut down by peer.\n"); | ||
4366 | goto reconnect; | ||
4367 | } else if (rv == -EAGAIN) { | ||
4368 | if (mdev->meta.socket->sk->sk_rcvtimeo == | ||
4369 | mdev->net_conf->ping_timeo*HZ/10) { | ||
4370 | dev_err(DEV, "PingAck did not arrive in time.\n"); | ||
4371 | goto reconnect; | ||
4372 | } | ||
4373 | set_bit(SEND_PING, &mdev->flags); | ||
4374 | continue; | ||
4375 | } else if (rv == -EINTR) { | ||
4376 | continue; | ||
4377 | } else { | ||
4378 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
4379 | goto reconnect; | ||
4380 | } | ||
4381 | |||
4382 | if (received == expect && cmd == NULL) { | ||
4383 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | ||
4384 | dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", | ||
4385 | (long)be32_to_cpu(h->magic), | ||
4386 | h->command, h->length); | ||
4387 | goto reconnect; | ||
4388 | } | ||
4389 | cmd = get_asender_cmd(be16_to_cpu(h->command)); | ||
4390 | len = be16_to_cpu(h->length); | ||
4391 | if (unlikely(cmd == NULL)) { | ||
4392 | dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", | ||
4393 | (long)be32_to_cpu(h->magic), | ||
4394 | h->command, h->length); | ||
4395 | goto disconnect; | ||
4396 | } | ||
4397 | expect = cmd->pkt_size; | ||
4398 | ERR_IF(len != expect-sizeof(struct p_header)) | ||
4399 | goto reconnect; | ||
4400 | } | ||
4401 | if (received == expect) { | ||
4402 | D_ASSERT(cmd != NULL); | ||
4403 | if (!cmd->process(mdev, h)) | ||
4404 | goto reconnect; | ||
4405 | |||
4406 | buf = h; | ||
4407 | received = 0; | ||
4408 | expect = sizeof(struct p_header); | ||
4409 | cmd = NULL; | ||
4410 | } | ||
4411 | } | ||
4412 | |||
4413 | if (0) { | ||
4414 | reconnect: | ||
4415 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
4416 | } | ||
4417 | if (0) { | ||
4418 | disconnect: | ||
4419 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
4420 | } | ||
4421 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4422 | |||
4423 | D_ASSERT(mdev->state.conn < C_CONNECTED); | ||
4424 | dev_info(DEV, "asender terminated\n"); | ||
4425 | |||
4426 | return 0; | ||
4427 | } | ||
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 000000000000..3678d3d66c6c --- /dev/null +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -0,0 +1,1120 @@ | |||
1 | /* | ||
2 | drbd_req.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <linux/slab.h> | ||
29 | #include <linux/drbd.h> | ||
30 | #include "drbd_int.h" | ||
31 | #include "drbd_req.h" | ||
32 | |||
33 | |||
34 | /* Update disk stats at start of I/O request */ | ||
35 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | ||
36 | { | ||
37 | const int rw = bio_data_dir(bio); | ||
38 | int cpu; | ||
39 | cpu = part_stat_lock(); | ||
40 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | ||
41 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | ||
42 | part_stat_unlock(); | ||
43 | mdev->vdisk->part0.in_flight[rw]++; | ||
44 | } | ||
45 | |||
46 | /* Update disk stats when completing request upwards */ | ||
47 | static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) | ||
48 | { | ||
49 | int rw = bio_data_dir(req->master_bio); | ||
50 | unsigned long duration = jiffies - req->start_time; | ||
51 | int cpu; | ||
52 | cpu = part_stat_lock(); | ||
53 | part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); | ||
54 | part_round_stats(cpu, &mdev->vdisk->part0); | ||
55 | part_stat_unlock(); | ||
56 | mdev->vdisk->part0.in_flight[rw]--; | ||
57 | } | ||
58 | |||
59 | static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) | ||
60 | { | ||
61 | const unsigned long s = req->rq_state; | ||
62 | /* if it was a write, we may have to set the corresponding | ||
63 | * bit(s) out-of-sync first. If it had a local part, we need to | ||
64 | * release the reference to the activity log. */ | ||
65 | if (rw == WRITE) { | ||
66 | /* remove it from the transfer log. | ||
67 | * well, only if it had been there in the first | ||
68 | * place... if it had not (local only or conflicting | ||
69 | * and never sent), it should still be "empty" as | ||
70 | * initialized in drbd_req_new(), so we can list_del() it | ||
71 | * here unconditionally */ | ||
72 | list_del(&req->tl_requests); | ||
73 | /* Set out-of-sync unless both OK flags are set | ||
74 | * (local only or remote failed). | ||
75 | * Other places where we set out-of-sync: | ||
76 | * READ with local io-error */ | ||
77 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
78 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
79 | |||
80 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | ||
81 | drbd_set_in_sync(mdev, req->sector, req->size); | ||
82 | |||
83 | /* one might be tempted to move the drbd_al_complete_io | ||
84 | * to the local io completion callback drbd_endio_pri. | ||
85 | * but, if this was a mirror write, we may only | ||
86 | * drbd_al_complete_io after this is RQ_NET_DONE, | ||
87 | * otherwise the extent could be dropped from the al | ||
88 | * before it has actually been written on the peer. | ||
89 | * if we crash before our peer knows about the request, | ||
90 | * but after the extent has been dropped from the al, | ||
91 | * we would forget to resync the corresponding extent. | ||
92 | */ | ||
93 | if (s & RQ_LOCAL_MASK) { | ||
94 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
95 | drbd_al_complete_io(mdev, req->sector); | ||
96 | put_ldev(mdev); | ||
97 | } else if (__ratelimit(&drbd_ratelimit_state)) { | ||
98 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " | ||
99 | "but my Disk seems to have failed :(\n", | ||
100 | (unsigned long long) req->sector); | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | |||
105 | /* if it was a local io error, we want to notify our | ||
106 | * peer about that, and see if we need to | ||
107 | * detach the disk and stuff. | ||
108 | * to avoid allocating some special work | ||
109 | * struct, reuse the request. */ | ||
110 | |||
111 | /* THINK | ||
112 | * why do we do this not when we detect the error, | ||
113 | * but delay it until it is "done", i.e. possibly | ||
114 | * until the next barrier ack? */ | ||
115 | |||
116 | if (rw == WRITE && | ||
117 | ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { | ||
118 | if (!(req->w.list.next == LIST_POISON1 || | ||
119 | list_empty(&req->w.list))) { | ||
120 | /* DEBUG ASSERT only; if this triggers, we | ||
121 | * probably corrupt the worker list here */ | ||
122 | dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); | ||
123 | dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); | ||
124 | } | ||
125 | req->w.cb = w_io_error; | ||
126 | drbd_queue_work(&mdev->data.work, &req->w); | ||
127 | /* drbd_req_free() is done in w_io_error */ | ||
128 | } else { | ||
129 | drbd_req_free(req); | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void queue_barrier(struct drbd_conf *mdev) | ||
134 | { | ||
135 | struct drbd_tl_epoch *b; | ||
136 | |||
137 | /* We are within the req_lock. Once we queued the barrier for sending, | ||
138 | * we set the CREATE_BARRIER bit. It is cleared as soon as a new | ||
139 | * barrier/epoch object is added. This is the only place this bit is | ||
140 | * set. It indicates that the barrier for this epoch is already queued, | ||
141 | * and no new epoch has been created yet. */ | ||
142 | if (test_bit(CREATE_BARRIER, &mdev->flags)) | ||
143 | return; | ||
144 | |||
145 | b = mdev->newest_tle; | ||
146 | b->w.cb = w_send_barrier; | ||
147 | /* inc_ap_pending done here, so we won't | ||
148 | * get imbalanced on connection loss. | ||
149 | * dec_ap_pending will be done in got_BarrierAck | ||
150 | * or (on connection loss) in tl_clear. */ | ||
151 | inc_ap_pending(mdev); | ||
152 | drbd_queue_work(&mdev->data.work, &b->w); | ||
153 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
154 | } | ||
155 | |||
156 | static void _about_to_complete_local_write(struct drbd_conf *mdev, | ||
157 | struct drbd_request *req) | ||
158 | { | ||
159 | const unsigned long s = req->rq_state; | ||
160 | struct drbd_request *i; | ||
161 | struct drbd_epoch_entry *e; | ||
162 | struct hlist_node *n; | ||
163 | struct hlist_head *slot; | ||
164 | |||
165 | /* before we can signal completion to the upper layers, | ||
166 | * we may need to close the current epoch */ | ||
167 | if (mdev->state.conn >= C_CONNECTED && | ||
168 | req->epoch == mdev->newest_tle->br_number) | ||
169 | queue_barrier(mdev); | ||
170 | |||
171 | /* we need to do the conflict detection stuff, | ||
172 | * if we have the ee_hash (two_primaries) and | ||
173 | * this has been on the network */ | ||
174 | if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { | ||
175 | const sector_t sector = req->sector; | ||
176 | const int size = req->size; | ||
177 | |||
178 | /* ASSERT: | ||
179 | * there must be no conflicting requests, since | ||
180 | * they must have been failed on the spot */ | ||
181 | #define OVERLAPS overlaps(sector, size, i->sector, i->size) | ||
182 | slot = tl_hash_slot(mdev, sector); | ||
183 | hlist_for_each_entry(i, n, slot, colision) { | ||
184 | if (OVERLAPS) { | ||
185 | dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " | ||
186 | "other: %p %llus +%u\n", | ||
187 | req, (unsigned long long)sector, size, | ||
188 | i, (unsigned long long)i->sector, i->size); | ||
189 | } | ||
190 | } | ||
191 | |||
192 | /* maybe "wake" those conflicting epoch entries | ||
193 | * that wait for this request to finish. | ||
194 | * | ||
195 | * currently, there can be only _one_ such ee | ||
196 | * (well, or some more, which would be pending | ||
197 | * P_DISCARD_ACK not yet sent by the asender...), | ||
198 | * since we block the receiver thread upon the | ||
199 | * first conflict detection, which will wait on | ||
200 | * misc_wait. maybe we want to assert that? | ||
201 | * | ||
202 | * anyways, if we found one, | ||
203 | * we just have to do a wake_up. */ | ||
204 | #undef OVERLAPS | ||
205 | #define OVERLAPS overlaps(sector, size, e->sector, e->size) | ||
206 | slot = ee_hash_slot(mdev, req->sector); | ||
207 | hlist_for_each_entry(e, n, slot, colision) { | ||
208 | if (OVERLAPS) { | ||
209 | wake_up(&mdev->misc_wait); | ||
210 | break; | ||
211 | } | ||
212 | } | ||
213 | } | ||
214 | #undef OVERLAPS | ||
215 | } | ||
216 | |||
217 | void complete_master_bio(struct drbd_conf *mdev, | ||
218 | struct bio_and_error *m) | ||
219 | { | ||
220 | bio_endio(m->bio, m->error); | ||
221 | dec_ap_bio(mdev); | ||
222 | } | ||
223 | |||
224 | /* Helper for __req_mod(). | ||
225 | * Set m->bio to the master bio, if it is fit to be completed, | ||
226 | * or leave it alone (it is initialized to NULL in __req_mod), | ||
227 | * if it has already been completed, or cannot be completed yet. | ||
228 | * If m->bio is set, the error status to be returned is placed in m->error. | ||
229 | */ | ||
230 | void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | ||
231 | { | ||
232 | const unsigned long s = req->rq_state; | ||
233 | struct drbd_conf *mdev = req->mdev; | ||
234 | /* only WRITES may end up here without a master bio (on barrier ack) */ | ||
235 | int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; | ||
236 | |||
237 | /* we must not complete the master bio, while it is | ||
238 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) | ||
239 | * not yet acknowledged by the peer | ||
240 | * not yet completed by the local io subsystem | ||
241 | * these flags may get cleared in any order by | ||
242 | * the worker, | ||
243 | * the receiver, | ||
244 | * the bio_endio completion callbacks. | ||
245 | */ | ||
246 | if (s & RQ_NET_QUEUED) | ||
247 | return; | ||
248 | if (s & RQ_NET_PENDING) | ||
249 | return; | ||
250 | if (s & RQ_LOCAL_PENDING) | ||
251 | return; | ||
252 | |||
253 | if (req->master_bio) { | ||
254 | /* this is data_received (remote read) | ||
255 | * or protocol C P_WRITE_ACK | ||
256 | * or protocol B P_RECV_ACK | ||
257 | * or protocol A "handed_over_to_network" (SendAck) | ||
258 | * or canceled or failed, | ||
259 | * or killed from the transfer log due to connection loss. | ||
260 | */ | ||
261 | |||
262 | /* | ||
263 | * figure out whether to report success or failure. | ||
264 | * | ||
265 | * report success when at least one of the operations succeeded. | ||
266 | * or, to put the other way, | ||
267 | * only report failure, when both operations failed. | ||
268 | * | ||
269 | * what to do about the failures is handled elsewhere. | ||
270 | * what we need to do here is just: complete the master_bio. | ||
271 | * | ||
272 | * local completion error, if any, has been stored as ERR_PTR | ||
273 | * in private_bio within drbd_endio_pri. | ||
274 | */ | ||
275 | int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); | ||
276 | int error = PTR_ERR(req->private_bio); | ||
277 | |||
278 | /* remove the request from the conflict detection | ||
279 | * respective block_id verification hash */ | ||
280 | if (!hlist_unhashed(&req->colision)) | ||
281 | hlist_del(&req->colision); | ||
282 | else | ||
283 | D_ASSERT((s & RQ_NET_MASK) == 0); | ||
284 | |||
285 | /* for writes we need to do some extra housekeeping */ | ||
286 | if (rw == WRITE) | ||
287 | _about_to_complete_local_write(mdev, req); | ||
288 | |||
289 | /* Update disk stats */ | ||
290 | _drbd_end_io_acct(mdev, req); | ||
291 | |||
292 | m->error = ok ? 0 : (error ?: -EIO); | ||
293 | m->bio = req->master_bio; | ||
294 | req->master_bio = NULL; | ||
295 | } | ||
296 | |||
297 | if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { | ||
298 | /* this is disconnected (local only) operation, | ||
299 | * or protocol C P_WRITE_ACK, | ||
300 | * or protocol A or B P_BARRIER_ACK, | ||
301 | * or killed from the transfer log due to connection loss. */ | ||
302 | _req_is_done(mdev, req, rw); | ||
303 | } | ||
304 | /* else: network part and not DONE yet. that is | ||
305 | * protocol A or B, barrier ack still pending... */ | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * checks whether there was an overlapping request | ||
310 | * or ee already registered. | ||
311 | * | ||
312 | * if so, return 1, in which case this request is completed on the spot, | ||
313 | * without ever being submitted or send. | ||
314 | * | ||
315 | * return 0 if it is ok to submit this request. | ||
316 | * | ||
317 | * NOTE: | ||
318 | * paranoia: assume something above us is broken, and issues different write | ||
319 | * requests for the same block simultaneously... | ||
320 | * | ||
321 | * To ensure these won't be reordered differently on both nodes, resulting in | ||
322 | * diverging data sets, we discard the later one(s). Not that this is supposed | ||
323 | * to happen, but this is the rationale why we also have to check for | ||
324 | * conflicting requests with local origin, and why we have to do so regardless | ||
325 | * of whether we allowed multiple primaries. | ||
326 | * | ||
327 | * BTW, in case we only have one primary, the ee_hash is empty anyways, and the | ||
328 | * second hlist_for_each_entry becomes a noop. This is even simpler than to | ||
329 | * grab a reference on the net_conf, and check for the two_primaries flag... | ||
330 | */ | ||
331 | static int _req_conflicts(struct drbd_request *req) | ||
332 | { | ||
333 | struct drbd_conf *mdev = req->mdev; | ||
334 | const sector_t sector = req->sector; | ||
335 | const int size = req->size; | ||
336 | struct drbd_request *i; | ||
337 | struct drbd_epoch_entry *e; | ||
338 | struct hlist_node *n; | ||
339 | struct hlist_head *slot; | ||
340 | |||
341 | D_ASSERT(hlist_unhashed(&req->colision)); | ||
342 | |||
343 | if (!get_net_conf(mdev)) | ||
344 | return 0; | ||
345 | |||
346 | /* BUG_ON */ | ||
347 | ERR_IF (mdev->tl_hash_s == 0) | ||
348 | goto out_no_conflict; | ||
349 | BUG_ON(mdev->tl_hash == NULL); | ||
350 | |||
351 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
352 | slot = tl_hash_slot(mdev, sector); | ||
353 | hlist_for_each_entry(i, n, slot, colision) { | ||
354 | if (OVERLAPS) { | ||
355 | dev_alert(DEV, "%s[%u] Concurrent local write detected! " | ||
356 | "[DISCARD L] new: %llus +%u; " | ||
357 | "pending: %llus +%u\n", | ||
358 | current->comm, current->pid, | ||
359 | (unsigned long long)sector, size, | ||
360 | (unsigned long long)i->sector, i->size); | ||
361 | goto out_conflict; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | if (mdev->ee_hash_s) { | ||
366 | /* now, check for overlapping requests with remote origin */ | ||
367 | BUG_ON(mdev->ee_hash == NULL); | ||
368 | #undef OVERLAPS | ||
369 | #define OVERLAPS overlaps(e->sector, e->size, sector, size) | ||
370 | slot = ee_hash_slot(mdev, sector); | ||
371 | hlist_for_each_entry(e, n, slot, colision) { | ||
372 | if (OVERLAPS) { | ||
373 | dev_alert(DEV, "%s[%u] Concurrent remote write detected!" | ||
374 | " [DISCARD L] new: %llus +%u; " | ||
375 | "pending: %llus +%u\n", | ||
376 | current->comm, current->pid, | ||
377 | (unsigned long long)sector, size, | ||
378 | (unsigned long long)e->sector, e->size); | ||
379 | goto out_conflict; | ||
380 | } | ||
381 | } | ||
382 | } | ||
383 | #undef OVERLAPS | ||
384 | |||
385 | out_no_conflict: | ||
386 | /* this is like it should be, and what we expected. | ||
387 | * our users do behave after all... */ | ||
388 | put_net_conf(mdev); | ||
389 | return 0; | ||
390 | |||
391 | out_conflict: | ||
392 | put_net_conf(mdev); | ||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | /* obviously this could be coded as many single functions | ||
397 | * instead of one huge switch, | ||
398 | * or by putting the code directly in the respective locations | ||
399 | * (as it has been before). | ||
400 | * | ||
401 | * but having it this way | ||
402 | * enforces that it is all in this one place, where it is easier to audit, | ||
403 | * it makes it obvious that whatever "event" "happens" to a request should | ||
404 | * happen "atomically" within the req_lock, | ||
405 | * and it enforces that we have to think in a very structured manner | ||
406 | * about the "events" that may happen to a request during its life time ... | ||
407 | */ | ||
408 | void __req_mod(struct drbd_request *req, enum drbd_req_event what, | ||
409 | struct bio_and_error *m) | ||
410 | { | ||
411 | struct drbd_conf *mdev = req->mdev; | ||
412 | m->bio = NULL; | ||
413 | |||
414 | switch (what) { | ||
415 | default: | ||
416 | dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); | ||
417 | break; | ||
418 | |||
419 | /* does not happen... | ||
420 | * initialization done in drbd_req_new | ||
421 | case created: | ||
422 | break; | ||
423 | */ | ||
424 | |||
425 | case to_be_send: /* via network */ | ||
426 | /* reached via drbd_make_request_common | ||
427 | * and from w_read_retry_remote */ | ||
428 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
429 | req->rq_state |= RQ_NET_PENDING; | ||
430 | inc_ap_pending(mdev); | ||
431 | break; | ||
432 | |||
433 | case to_be_submitted: /* locally */ | ||
434 | /* reached via drbd_make_request_common */ | ||
435 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); | ||
436 | req->rq_state |= RQ_LOCAL_PENDING; | ||
437 | break; | ||
438 | |||
439 | case completed_ok: | ||
440 | if (bio_data_dir(req->master_bio) == WRITE) | ||
441 | mdev->writ_cnt += req->size>>9; | ||
442 | else | ||
443 | mdev->read_cnt += req->size>>9; | ||
444 | |||
445 | req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); | ||
446 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
447 | |||
448 | _req_may_be_done(req, m); | ||
449 | put_ldev(mdev); | ||
450 | break; | ||
451 | |||
452 | case write_completed_with_error: | ||
453 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
454 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
455 | |||
456 | dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", | ||
457 | (unsigned long long)req->sector, req->size); | ||
458 | /* and now: check how to handle local io error. */ | ||
459 | __drbd_chk_io_error(mdev, FALSE); | ||
460 | _req_may_be_done(req, m); | ||
461 | put_ldev(mdev); | ||
462 | break; | ||
463 | |||
464 | case read_ahead_completed_with_error: | ||
465 | /* it is legal to fail READA */ | ||
466 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
467 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
468 | _req_may_be_done(req, m); | ||
469 | put_ldev(mdev); | ||
470 | break; | ||
471 | |||
472 | case read_completed_with_error: | ||
473 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
474 | |||
475 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
476 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
477 | |||
478 | dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", | ||
479 | (unsigned long long)req->sector, req->size); | ||
480 | /* _req_mod(req,to_be_send); oops, recursion... */ | ||
481 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
482 | req->rq_state |= RQ_NET_PENDING; | ||
483 | inc_ap_pending(mdev); | ||
484 | |||
485 | __drbd_chk_io_error(mdev, FALSE); | ||
486 | put_ldev(mdev); | ||
487 | /* NOTE: if we have no connection, | ||
488 | * or know the peer has no good data either, | ||
489 | * then we don't actually need to "queue_for_net_read", | ||
490 | * but we do so anyways, since the drbd_io_error() | ||
491 | * and the potential state change to "Diskless" | ||
492 | * needs to be done from process context */ | ||
493 | |||
494 | /* fall through: _req_mod(req,queue_for_net_read); */ | ||
495 | |||
496 | case queue_for_net_read: | ||
497 | /* READ or READA, and | ||
498 | * no local disk, | ||
499 | * or target area marked as invalid, | ||
500 | * or just got an io-error. */ | ||
501 | /* from drbd_make_request_common | ||
502 | * or from bio_endio during read io-error recovery */ | ||
503 | |||
504 | /* so we can verify the handle in the answer packet | ||
505 | * corresponding hlist_del is in _req_may_be_done() */ | ||
506 | hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); | ||
507 | |||
508 | set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */ | ||
509 | |||
510 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
511 | req->rq_state |= RQ_NET_QUEUED; | ||
512 | req->w.cb = (req->rq_state & RQ_LOCAL_MASK) | ||
513 | ? w_read_retry_remote | ||
514 | : w_send_read_req; | ||
515 | drbd_queue_work(&mdev->data.work, &req->w); | ||
516 | break; | ||
517 | |||
518 | case queue_for_net_write: | ||
519 | /* assert something? */ | ||
520 | /* from drbd_make_request_common only */ | ||
521 | |||
522 | hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); | ||
523 | /* corresponding hlist_del is in _req_may_be_done() */ | ||
524 | |||
525 | /* NOTE | ||
526 | * In case the req ended up on the transfer log before being | ||
527 | * queued on the worker, it could lead to this request being | ||
528 | * missed during cleanup after connection loss. | ||
529 | * So we have to do both operations here, | ||
530 | * within the same lock that protects the transfer log. | ||
531 | * | ||
532 | * _req_add_to_epoch(req); this has to be after the | ||
533 | * _maybe_start_new_epoch(req); which happened in | ||
534 | * drbd_make_request_common, because we now may set the bit | ||
535 | * again ourselves to close the current epoch. | ||
536 | * | ||
537 | * Add req to the (now) current epoch (barrier). */ | ||
538 | |||
539 | /* see drbd_make_request_common, | ||
540 | * just after it grabs the req_lock */ | ||
541 | D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); | ||
542 | |||
543 | req->epoch = mdev->newest_tle->br_number; | ||
544 | list_add_tail(&req->tl_requests, | ||
545 | &mdev->newest_tle->requests); | ||
546 | |||
547 | /* increment size of current epoch */ | ||
548 | mdev->newest_tle->n_req++; | ||
549 | |||
550 | /* queue work item to send data */ | ||
551 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
552 | req->rq_state |= RQ_NET_QUEUED; | ||
553 | req->w.cb = w_send_dblock; | ||
554 | drbd_queue_work(&mdev->data.work, &req->w); | ||
555 | |||
556 | /* close the epoch, in case it outgrew the limit */ | ||
557 | if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) | ||
558 | queue_barrier(mdev); | ||
559 | |||
560 | break; | ||
561 | |||
562 | case send_canceled: | ||
563 | /* treat it the same */ | ||
564 | case send_failed: | ||
565 | /* real cleanup will be done from tl_clear. just update flags | ||
566 | * so it is no longer marked as on the worker queue */ | ||
567 | req->rq_state &= ~RQ_NET_QUEUED; | ||
568 | /* if we did it right, tl_clear should be scheduled only after | ||
569 | * this, so this should not be necessary! */ | ||
570 | _req_may_be_done(req, m); | ||
571 | break; | ||
572 | |||
573 | case handed_over_to_network: | ||
574 | /* assert something? */ | ||
575 | if (bio_data_dir(req->master_bio) == WRITE && | ||
576 | mdev->net_conf->wire_protocol == DRBD_PROT_A) { | ||
577 | /* this is what is dangerous about protocol A: | ||
578 | * pretend it was successfully written on the peer. */ | ||
579 | if (req->rq_state & RQ_NET_PENDING) { | ||
580 | dec_ap_pending(mdev); | ||
581 | req->rq_state &= ~RQ_NET_PENDING; | ||
582 | req->rq_state |= RQ_NET_OK; | ||
583 | } /* else: neg-ack was faster... */ | ||
584 | /* it is still not yet RQ_NET_DONE until the | ||
585 | * corresponding epoch barrier got acked as well, | ||
586 | * so we know what to dirty on connection loss */ | ||
587 | } | ||
588 | req->rq_state &= ~RQ_NET_QUEUED; | ||
589 | req->rq_state |= RQ_NET_SENT; | ||
590 | /* because _drbd_send_zc_bio could sleep, and may want to | ||
591 | * dereference the bio even after the "write_acked_by_peer" and | ||
592 | * "completed_ok" events came in, once we return from | ||
593 | * _drbd_send_zc_bio (drbd_send_dblock), we have to check | ||
594 | * whether it is done already, and end it. */ | ||
595 | _req_may_be_done(req, m); | ||
596 | break; | ||
597 | |||
598 | case connection_lost_while_pending: | ||
599 | /* transfer log cleanup after connection loss */ | ||
600 | /* assert something? */ | ||
601 | if (req->rq_state & RQ_NET_PENDING) | ||
602 | dec_ap_pending(mdev); | ||
603 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
604 | req->rq_state |= RQ_NET_DONE; | ||
605 | /* if it is still queued, we may not complete it here. | ||
606 | * it will be canceled soon. */ | ||
607 | if (!(req->rq_state & RQ_NET_QUEUED)) | ||
608 | _req_may_be_done(req, m); | ||
609 | break; | ||
610 | |||
611 | case write_acked_by_peer_and_sis: | ||
612 | req->rq_state |= RQ_NET_SIS; | ||
613 | case conflict_discarded_by_peer: | ||
614 | /* for discarded conflicting writes of multiple primaries, | ||
615 | * there is no need to keep anything in the tl, potential | ||
616 | * node crashes are covered by the activity log. */ | ||
617 | if (what == conflict_discarded_by_peer) | ||
618 | dev_alert(DEV, "Got DiscardAck packet %llus +%u!" | ||
619 | " DRBD is not a random data generator!\n", | ||
620 | (unsigned long long)req->sector, req->size); | ||
621 | req->rq_state |= RQ_NET_DONE; | ||
622 | /* fall through */ | ||
623 | case write_acked_by_peer: | ||
624 | /* protocol C; successfully written on peer. | ||
625 | * Nothing to do here. | ||
626 | * We want to keep the tl in place for all protocols, to cater | ||
627 | * for volatile write-back caches on lower level devices. | ||
628 | * | ||
629 | * A barrier request is expected to have forced all prior | ||
630 | * requests onto stable storage, so completion of a barrier | ||
631 | * request could set NET_DONE right here, and not wait for the | ||
632 | * P_BARRIER_ACK, but that is an unnecessary optimization. */ | ||
633 | |||
634 | /* this makes it effectively the same as for: */ | ||
635 | case recv_acked_by_peer: | ||
636 | /* protocol B; pretends to be successfully written on peer. | ||
637 | * see also notes above in handed_over_to_network about | ||
638 | * protocol != C */ | ||
639 | req->rq_state |= RQ_NET_OK; | ||
640 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
641 | dec_ap_pending(mdev); | ||
642 | req->rq_state &= ~RQ_NET_PENDING; | ||
643 | _req_may_be_done(req, m); | ||
644 | break; | ||
645 | |||
646 | case neg_acked: | ||
647 | /* assert something? */ | ||
648 | if (req->rq_state & RQ_NET_PENDING) | ||
649 | dec_ap_pending(mdev); | ||
650 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
651 | |||
652 | req->rq_state |= RQ_NET_DONE; | ||
653 | _req_may_be_done(req, m); | ||
654 | /* else: done by handed_over_to_network */ | ||
655 | break; | ||
656 | |||
657 | case barrier_acked: | ||
658 | if (req->rq_state & RQ_NET_PENDING) { | ||
659 | /* barrier came in before all requests have been acked. | ||
660 | * this is bad, because if the connection is lost now, | ||
661 | * we won't be able to clean them up... */ | ||
662 | dev_err(DEV, "FIXME (barrier_acked but pending)\n"); | ||
663 | list_move(&req->tl_requests, &mdev->out_of_sequence_requests); | ||
664 | } | ||
665 | D_ASSERT(req->rq_state & RQ_NET_SENT); | ||
666 | req->rq_state |= RQ_NET_DONE; | ||
667 | _req_may_be_done(req, m); | ||
668 | break; | ||
669 | |||
670 | case data_received: | ||
671 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
672 | dec_ap_pending(mdev); | ||
673 | req->rq_state &= ~RQ_NET_PENDING; | ||
674 | req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); | ||
675 | _req_may_be_done(req, m); | ||
676 | break; | ||
677 | }; | ||
678 | } | ||
679 | |||
680 | /* we may do a local read if: | ||
681 | * - we are consistent (of course), | ||
682 | * - or we are generally inconsistent, | ||
683 | * BUT we are still/already IN SYNC for this area. | ||
684 | * since size may be bigger than BM_BLOCK_SIZE, | ||
685 | * we may need to check several bits. | ||
686 | */ | ||
687 | static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) | ||
688 | { | ||
689 | unsigned long sbnr, ebnr; | ||
690 | sector_t esector, nr_sectors; | ||
691 | |||
692 | if (mdev->state.disk == D_UP_TO_DATE) | ||
693 | return 1; | ||
694 | if (mdev->state.disk >= D_OUTDATED) | ||
695 | return 0; | ||
696 | if (mdev->state.disk < D_INCONSISTENT) | ||
697 | return 0; | ||
698 | /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ | ||
699 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
700 | esector = sector + (size >> 9) - 1; | ||
701 | |||
702 | D_ASSERT(sector < nr_sectors); | ||
703 | D_ASSERT(esector < nr_sectors); | ||
704 | |||
705 | sbnr = BM_SECT_TO_BIT(sector); | ||
706 | ebnr = BM_SECT_TO_BIT(esector); | ||
707 | |||
708 | return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); | ||
709 | } | ||
710 | |||
/*
 * drbd_make_request_common() - set up and dispatch one application bio
 * @mdev: DRBD device
 * @bio:  bio from the upper layers; drbd_req_new() keeps it as master_bio
 *
 * Decides whether the request is served by the local disk ('local'), by the
 * peer ('remote'), or both:
 *  - WRITE: goes to the peer whenever the peer's disk state permits it.
 *  - READ/READA: served locally if get_ldev() succeeds and
 *    drbd_may_do_local_read() agrees, otherwise remotely.
 * The request is then registered under mdev->req_lock (adding a new epoch
 * first if CREATE_BARRIER is set), checked for conflicting writes, queued
 * for the network if remote and, outside the lock, its private bio is
 * submitted to the backing device if local.
 *
 * Every path that finishes the request right here calls dec_ap_bio(), so
 * callers are expected to have called inc_ap_bio() beforehand, as
 * drbd_make_request_26() does.
 *
 * Always returns 0; failures are reported by ending @bio with an error
 * (-ENOMEM, -EWOULDBLOCK for a READA that is not served locally, or -EIO).
 */
711 | static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) | ||
712 | { | ||
713 | const int rw = bio_rw(bio); | ||
714 | const int size = bio->bi_size; | ||
715 | const sector_t sector = bio->bi_sector; | ||
716 | struct drbd_tl_epoch *b = NULL; | ||
717 | struct drbd_request *req; | ||
718 | int local, remote; | ||
719 | int err = -EIO; | ||
720 | |||
721 | /* allocate outside of all locks; */ | ||
722 | req = drbd_req_new(mdev, bio); | ||
723 | if (!req) { | ||
724 | dec_ap_bio(mdev); | ||
725 | /* only pass the error to the upper layers. | ||
726 | * if user cannot handle io errors, that's not our business. */ | ||
727 | dev_err(DEV, "could not kmalloc() req\n"); | ||
728 | bio_endio(bio, -ENOMEM); | ||
729 | return 0; | ||
730 | } | ||
731 | |||
732 | local = get_ldev(mdev); | ||
733 | if (!local) { | ||
734 | bio_put(req->private_bio); /* or we get a bio leak */ | ||
735 | req->private_bio = NULL; | ||
736 | } | ||
737 | if (rw == WRITE) { | ||
738 | remote = 1; /* narrowed further down, depending on the peer's disk state */ | ||
739 | } else { | ||
740 | /* READ || READA */ | ||
741 | if (local) { | ||
742 | if (!drbd_may_do_local_read(mdev, sector, size)) { | ||
743 | /* we could kick the syncer to | ||
744 | * sync this extent asap, wait for | ||
745 | * it, then continue locally. | ||
746 | * Or just issue the request remotely. | ||
747 | */ | ||
748 | local = 0; | ||
749 | bio_put(req->private_bio); | ||
750 | req->private_bio = NULL; | ||
751 | put_ldev(mdev); | ||
752 | } | ||
753 | } | ||
754 | remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; | ||
755 | } | ||
756 | |||
757 | /* If we have a disk, but a READA request is mapped to remote, | ||
758 | * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. | ||
759 | * Just fail that READA request right here. | ||
760 | * | ||
761 | * THINK: maybe fail all READA when not local? | ||
762 | * or make this configurable... | ||
763 | * if network is slow, READA won't do any good. | ||
764 | */ | ||
765 | if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { | ||
766 | err = -EWOULDBLOCK; | ||
767 | goto fail_and_free_req; | ||
768 | } | ||
769 | |||
770 | /* For WRITES going to the local disk, grab a reference on the target | ||
771 | * extent. This waits for any resync activity in the corresponding | ||
772 | * resync extent to finish, and, if necessary, pulls in the target | ||
773 | * extent into the activity log, which involves further disk io because | ||
774 | * of transactional on-disk meta data updates. */ | ||
775 | if (rw == WRITE && local) | ||
776 | drbd_al_begin_io(mdev, sector); | ||
777 | |||
778 | remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || | ||
779 | (mdev->state.pdsk == D_INCONSISTENT && | ||
780 | mdev->state.conn >= C_CONNECTED)); | ||
781 | |||
782 | if (!(local || remote)) { | ||
783 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
784 | goto fail_free_complete; | ||
785 | } | ||
786 | |||
787 | /* For WRITE request, we have to make sure that we have an | ||
788 | * unused_spare_tle, in case we need to start a new epoch. | ||
789 | * I try to be smart and avoid to pre-allocate always "just in case", | ||
790 | * but there is a race between testing the bit and pointer outside the | ||
791 | * spinlock, and grabbing the spinlock. | ||
792 | * if we lost that race, we retry. */ | ||
793 | if (rw == WRITE && remote && | ||
794 | mdev->unused_spare_tle == NULL && | ||
795 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
796 | allocate_barrier: | ||
797 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); | ||
798 | if (!b) { | ||
799 | dev_err(DEV, "Failed to alloc barrier.\n"); | ||
800 | err = -ENOMEM; | ||
801 | goto fail_free_complete; | ||
802 | } | ||
803 | } | ||
804 | |||
805 | /* GOOD, everything prepared, grab the spin_lock */ | ||
806 | spin_lock_irq(&mdev->req_lock); | ||
807 | |||
808 | if (remote) { | ||
809 | remote = (mdev->state.pdsk == D_UP_TO_DATE || | ||
810 | (mdev->state.pdsk == D_INCONSISTENT && | ||
811 | mdev->state.conn >= C_CONNECTED)); | ||
812 | if (!remote) | ||
813 | dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); | ||
814 | if (!(local || remote)) { | ||
815 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
816 | spin_unlock_irq(&mdev->req_lock); | ||
817 | goto fail_free_complete; | ||
818 | } | ||
819 | } | ||
820 | |||
821 | if (b && mdev->unused_spare_tle == NULL) { | ||
822 | mdev->unused_spare_tle = b; | ||
823 | b = NULL; | ||
824 | } | ||
825 | if (rw == WRITE && remote && | ||
826 | mdev->unused_spare_tle == NULL && | ||
827 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
828 | /* someone closed the current epoch | ||
829 | * while we were grabbing the spinlock */ | ||
830 | spin_unlock_irq(&mdev->req_lock); | ||
831 | goto allocate_barrier; | ||
832 | } | ||
833 | |||
834 | |||
835 | /* Update disk stats */ | ||
836 | _drbd_start_io_acct(mdev, req, bio); | ||
837 | |||
838 | /* _maybe_start_new_epoch(mdev); | ||
839 | * If we need to generate a write barrier packet, we have to add the | ||
840 | * new epoch (barrier) object, and queue the barrier packet for sending, | ||
841 | * and queue the req's data after it _within the same lock_, otherwise | ||
842 | * we have race conditions where the reorder domains could be mixed up. | ||
843 | * | ||
844 | * Even read requests may start a new epoch and queue the corresponding | ||
845 | * barrier packet. To get the write ordering right, we only have to | ||
846 | * make sure that, if this is a write request and it triggered a | ||
847 | * barrier packet, this request is queued within the same spinlock. */ | ||
848 | if (remote && mdev->unused_spare_tle && | ||
849 | test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
850 | _tl_add_barrier(mdev, mdev->unused_spare_tle); | ||
851 | mdev->unused_spare_tle = NULL; | ||
852 | } else { | ||
853 | D_ASSERT(!(remote && rw == WRITE && | ||
854 | test_bit(CREATE_BARRIER, &mdev->flags))); | ||
855 | } | ||
856 | |||
857 | /* NOTE | ||
858 | * Actually, 'local' may be wrong here already, since we may have failed | ||
859 | * to write to the meta data, and may become wrong anytime because of | ||
860 | * local io-error for some other request, which would lead to us | ||
861 | * "detaching" the local disk. | ||
862 | * | ||
863 | * 'remote' may become wrong any time because the network could fail. | ||
864 | * | ||
865 | * This is a harmless race condition, though, since it is handled | ||
866 | * correctly at the appropriate places; so it just defers the failure | ||
867 | * of the respective operation. | ||
868 | */ | ||
869 | |||
870 | /* mark them early for readability. | ||
871 | * this just sets some state flags. */ | ||
872 | if (remote) | ||
873 | _req_mod(req, to_be_send); | ||
874 | if (local) | ||
875 | _req_mod(req, to_be_submitted); | ||
876 | |||
877 | /* check this request on the collision detection hash tables. | ||
878 | * if we have a conflict, just complete it here. | ||
879 | * THINK do we want to check reads, too? (I don't think so...) */ | ||
880 | if (rw == WRITE && _req_conflicts(req)) { | ||
881 | /* this is a conflicting request. | ||
882 | * even though it may have been only _partially_ | ||
883 | * overlapping with one of the currently pending requests, | ||
884 | * without even submitting or sending it, we will | ||
885 | * pretend that it was successfully served right now. | ||
886 | */ | ||
887 | if (local) { | ||
888 | bio_put(req->private_bio); | ||
889 | req->private_bio = NULL; | ||
890 | drbd_al_complete_io(mdev, req->sector); | ||
891 | put_ldev(mdev); | ||
892 | local = 0; | ||
893 | } | ||
894 | if (remote) | ||
895 | dec_ap_pending(mdev); | ||
896 | _drbd_end_io_acct(mdev, req); | ||
897 | /* THINK: do we want to fail it (-EIO), or pretend success? */ | ||
898 | bio_endio(req->master_bio, 0); | ||
899 | req->master_bio = NULL; | ||
900 | dec_ap_bio(mdev); | ||
901 | drbd_req_free(req); /* req is gone; with local and remote both 0 it is not used below */ | ||
902 | remote = 0; | ||
903 | } | ||
904 | |||
905 | /* NOTE remote first: to get the concurrent write detection right, | ||
906 | * we must register the request before start of local IO. */ | ||
907 | if (remote) { | ||
908 | /* either WRITE and C_CONNECTED, | ||
909 | * or READ, and no local disk, | ||
910 | * or READ, but not in sync. | ||
911 | */ | ||
912 | _req_mod(req, (rw == WRITE) | ||
913 | ? queue_for_net_write | ||
914 | : queue_for_net_read); | ||
915 | } | ||
916 | spin_unlock_irq(&mdev->req_lock); | ||
917 | kfree(b); /* if someone else has beaten us to it... */ | ||
918 | |||
919 | if (local) { | ||
920 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | ||
921 | |||
922 | if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR | ||
923 | : rw == READ ? DRBD_FAULT_DT_RD | ||
924 | : DRBD_FAULT_DT_RA)) | ||
925 | bio_endio(req->private_bio, -EIO); | ||
926 | else | ||
927 | generic_make_request(req->private_bio); | ||
928 | } | ||
929 | |||
930 | /* we need to plug ALWAYS since we possibly need to kick lo_dev. | ||
931 | * we plug after submit, so we won't miss an unplug event */ | ||
932 | drbd_plug_device(mdev); | ||
933 | |||
934 | return 0; | ||
935 | |||
936 | fail_free_complete: | ||
937 | if (rw == WRITE && local) | ||
938 | drbd_al_complete_io(mdev, sector); | ||
939 | fail_and_free_req: | ||
940 | if (local) { | ||
941 | bio_put(req->private_bio); | ||
942 | req->private_bio = NULL; | ||
943 | put_ldev(mdev); | ||
944 | } | ||
945 | bio_endio(bio, err); | ||
946 | drbd_req_free(req); | ||
947 | dec_ap_bio(mdev); | ||
948 | kfree(b); | ||
949 | |||
950 | return 0; | ||
951 | } | ||
952 | |||
953 | /* helper function for drbd_make_request | ||
954 | * if we can determine just by the mdev (state) that this request will fail, | ||
955 | * return 1 | ||
956 | * otherwise return 0 | ||
957 | */ | ||
958 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | ||
959 | { | ||
960 | /* Unconfigured */ | ||
961 | if (mdev->state.conn == C_DISCONNECTING && | ||
962 | mdev->state.disk == D_DISKLESS) | ||
963 | return 1; | ||
964 | |||
965 | if (mdev->state.role != R_PRIMARY && | ||
966 | (!allow_oos || is_write)) { | ||
967 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
968 | dev_err(DEV, "Process %s[%u] tried to %s; " | ||
969 | "since we are not in Primary state, " | ||
970 | "we cannot allow this\n", | ||
971 | current->comm, current->pid, | ||
972 | is_write ? "WRITE" : "READ"); | ||
973 | } | ||
974 | return 1; | ||
975 | } | ||
976 | |||
977 | /* | ||
978 | * Paranoia: we might have been primary, but sync target, or | ||
979 | * even diskless, then lost the connection. | ||
980 | * This should have been handled (panic? suspend?) somewhere | ||
981 | * else. But maybe it was not, so check again here. | ||
982 | * Caution: as long as we do not have a read/write lock on mdev, | ||
983 | * to serialize state changes, this is racy, since we may lose | ||
984 | * the connection *after* we test for the cstate. | ||
985 | */ | ||
986 | if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { | ||
987 | if (__ratelimit(&drbd_ratelimit_state)) | ||
988 | dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); | ||
989 | return 1; | ||
990 | } | ||
991 | |||
992 | return 0; | ||
993 | } | ||
994 | |||
995 | int drbd_make_request_26(struct request_queue *q, struct bio *bio) | ||
996 | { | ||
997 | unsigned int s_enr, e_enr; | ||
998 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | ||
999 | |||
1000 | if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { | ||
1001 | bio_endio(bio, -EPERM); | ||
1002 | return 0; | ||
1003 | } | ||
1004 | |||
1005 | /* Reject barrier requests if we know the underlying device does | ||
1006 | * not support them. | ||
1007 | * XXX: Need to get this info from peer as well some how so we | ||
1008 | * XXX: reject if EITHER side/data/metadata area does not support them. | ||
1009 | * | ||
1010 | * because of those XXX, this is not yet enabled, | ||
1011 | * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. | ||
1012 | */ | ||
1013 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { | ||
1014 | /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ | ||
1015 | bio_endio(bio, -EOPNOTSUPP); | ||
1016 | return 0; | ||
1017 | } | ||
1018 | |||
1019 | /* | ||
1020 | * what we "blindly" assume: | ||
1021 | */ | ||
1022 | D_ASSERT(bio->bi_size > 0); | ||
1023 | D_ASSERT((bio->bi_size & 0x1ff) == 0); | ||
1024 | D_ASSERT(bio->bi_idx == 0); | ||
1025 | |||
1026 | /* to make some things easier, force alignment of requests within the | ||
1027 | * granularity of our hash tables */ | ||
1028 | s_enr = bio->bi_sector >> HT_SHIFT; | ||
1029 | e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; | ||
1030 | |||
1031 | if (likely(s_enr == e_enr)) { | ||
1032 | inc_ap_bio(mdev, 1); | ||
1033 | return drbd_make_request_common(mdev, bio); | ||
1034 | } | ||
1035 | |||
1036 | /* can this bio be split generically? | ||
1037 | * Maybe add our own split-arbitrary-bios function. */ | ||
1038 | if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { | ||
1039 | /* rather error out here than BUG in bio_split */ | ||
1040 | dev_err(DEV, "bio would need to, but cannot, be split: " | ||
1041 | "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", | ||
1042 | bio->bi_vcnt, bio->bi_idx, bio->bi_size, | ||
1043 | (unsigned long long)bio->bi_sector); | ||
1044 | bio_endio(bio, -EINVAL); | ||
1045 | } else { | ||
1046 | /* This bio crosses some boundary, so we have to split it. */ | ||
1047 | struct bio_pair *bp; | ||
1048 | /* works for the "do not cross hash slot boundaries" case | ||
1049 | * e.g. sector 262269, size 4096 | ||
1050 | * s_enr = 262269 >> 6 = 4097 | ||
1051 | * e_enr = (262269+8-1) >> 6 = 4098 | ||
1052 | * HT_SHIFT = 6 | ||
1053 | * sps = 64, mask = 63 | ||
1054 | * first_sectors = 64 - (262269 & 63) = 3 | ||
1055 | */ | ||
1056 | const sector_t sect = bio->bi_sector; | ||
1057 | const int sps = 1 << HT_SHIFT; /* sectors per slot */ | ||
1058 | const int mask = sps - 1; | ||
1059 | const sector_t first_sectors = sps - (sect & mask); | ||
1060 | bp = bio_split(bio, | ||
1061 | #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) | ||
1062 | bio_split_pool, | ||
1063 | #endif | ||
1064 | first_sectors); | ||
1065 | |||
1066 | /* we need to get a "reference count" (ap_bio_cnt) | ||
1067 | * to avoid races with the disconnect/reconnect/suspend code. | ||
1068 | * In case we need to split the bio here, we need to get two references | ||
1069 | * atomically, otherwise we might deadlock when trying to submit the | ||
1070 | * second one! */ | ||
1071 | inc_ap_bio(mdev, 2); | ||
1072 | |||
1073 | D_ASSERT(e_enr == s_enr + 1); | ||
1074 | |||
1075 | drbd_make_request_common(mdev, &bp->bio1); | ||
1076 | drbd_make_request_common(mdev, &bp->bio2); | ||
1077 | bio_pair_release(bp); | ||
1078 | } | ||
1079 | return 0; | ||
1080 | } | ||
1081 | |||
1082 | /* This is called by bio_add_page(). With this function we reduce | ||
1083 | * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs | ||
1084 | * units (was AL_EXTENTs). | ||
1085 | * | ||
1086 | * we do the calculation within the lower 32bit of the byte offsets, | ||
1087 | * since we don't care for actual offset, but only check whether it | ||
1088 | * would cross "activity log extent" boundaries. | ||
1089 | * | ||
1090 | * As long as the BIO is empty we have to allow at least one bvec, | ||
1091 | * regardless of size and offset. so the resulting bio may still | ||
1092 | * cross extent boundaries. those are dealt with (bio_split) in | ||
1093 | * drbd_make_request_26. | ||
1094 | */ | ||
1095 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) | ||
1096 | { | ||
1097 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | ||
1098 | unsigned int bio_offset = | ||
1099 | (unsigned int)bvm->bi_sector << 9; /* 32 bit */ | ||
1100 | unsigned int bio_size = bvm->bi_size; | ||
1101 | int limit, backing_limit; | ||
1102 | |||
1103 | limit = DRBD_MAX_SEGMENT_SIZE | ||
1104 | - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); | ||
1105 | if (limit < 0) | ||
1106 | limit = 0; | ||
1107 | if (bio_size == 0) { | ||
1108 | if (limit <= bvec->bv_len) | ||
1109 | limit = bvec->bv_len; | ||
1110 | } else if (limit && get_ldev(mdev)) { | ||
1111 | struct request_queue * const b = | ||
1112 | mdev->ldev->backing_bdev->bd_disk->queue; | ||
1113 | if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { | ||
1114 | backing_limit = b->merge_bvec_fn(b, bvm, bvec); | ||
1115 | limit = min(limit, backing_limit); | ||
1116 | } | ||
1117 | put_ldev(mdev); | ||
1118 | } | ||
1119 | return limit; | ||
1120 | } | ||
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 000000000000..f22c1bc8ec7e --- /dev/null +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -0,0 +1,326 @@ | |||
1 | /* | ||
2 | drbd_req.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
8 | Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
9 | |||
10 | DRBD is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | DRBD is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _DRBD_REQ_H | ||
26 | #define _DRBD_REQ_H | ||
27 | |||
28 | #include <linux/module.h> | ||
29 | |||
30 | #include <linux/slab.h> | ||
31 | #include <linux/drbd.h> | ||
32 | #include "drbd_int.h" | ||
33 | #include "drbd_wrappers.h" | ||
34 | |||
35 | /* The request callbacks will be called in irq context by the IDE drivers, | ||
36 | and in Softirqs/Tasklets/BH context by the SCSI drivers, | ||
37 | and by the receiver and worker in kernel-thread context. | ||
38 | Try to get the locking right :) */ | ||
39 | |||
40 | /* | ||
41 | * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are | ||
42 | * associated with IO requests originating from the block layer above us. | ||
43 | * | ||
44 | * There are quite a few things that may happen to a drbd request | ||
45 | * during its lifetime. | ||
46 | * | ||
47 | * It will be created. | ||
48 | * It will be marked with the intention to be | ||
49 | * submitted to local disk and/or | ||
50 | * sent via the network. | ||
51 | * | ||
52 | * It has to be placed on the transfer log and other housekeeping lists, | ||
53 | * In case we have a network connection. | ||
54 | * | ||
55 | * It may be identified as a concurrent (write) request | ||
56 | * and be handled accordingly. | ||
57 | * | ||
58 | * It may be handed over to the local disk subsystem. | ||
59 | * It may be completed by the local disk subsystem, | ||
60 | * either successfully or with io-error. | ||
61 | * In case it is a READ request, and it failed locally, | ||
62 | * it may be retried remotely. | ||
63 | * | ||
64 | * It may be queued for sending. | ||
65 | * It may be handed over to the network stack, | ||
66 | * which may fail. | ||
67 | * It may be acknowledged by the "peer" according to the wire_protocol in use. | ||
68 | * this may be a negative ack. | ||
69 | * It may receive a faked ack when the network connection is lost and the | ||
70 | * transfer log is cleaned up. | ||
71 | * Sending may be canceled due to network connection loss. | ||
72 | * When it finally has outlived its time, | ||
73 | * corresponding dirty bits in the resync-bitmap may be cleared or set, | ||
74 | * it will be destroyed, | ||
75 | * and completion will be signalled to the originator, | ||
76 | * with or without "success". | ||
77 | */ | ||
78 | |||
/* What can happen to a drbd_request; passed as 'what' to
 * __req_mod(), _req_mod() and req_mod(). */
79 | enum drbd_req_event { | ||
80 | created, | ||
81 | to_be_send, /* request is going to be sent to the peer */ | ||
82 | to_be_submitted, /* request is going to be submitted to the local disk */ | ||
83 | |||
84 | /* XXX yes, now I am inconsistent... | ||
85 | * these two are not "events" but "actions" | ||
86 | * oh, well... */ | ||
87 | queue_for_net_write, | ||
88 | queue_for_net_read, | ||
89 | |||
90 | send_canceled, | ||
91 | send_failed, | ||
92 | handed_over_to_network, | ||
93 | connection_lost_while_pending, /* transfer log cleanup after connection loss */ | ||
94 | recv_acked_by_peer, /* protocol B */ | ||
95 | write_acked_by_peer, /* protocol C */ | ||
96 | write_acked_by_peer_and_sis, /* and set_in_sync */ | ||
97 | conflict_discarded_by_peer, /* peer sent a DiscardAck for a conflicting write */ | ||
98 | neg_acked, | ||
99 | barrier_acked, /* in protocol A and B */ | ||
100 | data_received, /* (remote read) */ | ||
101 | |||
102 | read_completed_with_error, | ||
103 | read_ahead_completed_with_error, | ||
104 | write_completed_with_error, | ||
105 | completed_ok, | ||
106 | nothing, /* for tracing only */ | ||
107 | }; | ||
108 | |||
109 | /* encoding of request states for now. we don't actually need that many bits. | ||
110 | * we don't need to do atomic bit operations either, since most of the time we | ||
111 | * need to look at the connection state and/or manipulate some lists at the | ||
112 | * same time, so we should hold the request lock anyways. | ||
113 | */ | ||
/* Bit positions within drbd_request.rq_state; the RQ_* masks are
 * (1UL << position).  Bits 0..2 describe the local disk part of a request,
 * the bits from __RQ_NET_PENDING up to (not including) __RQ_NET_MAX the
 * network part. */
114 | enum drbd_req_state_bits { | ||
115 | /* 210 | ||
116 | * 000: no local possible | ||
117 | * 001: to be submitted | ||
118 | * UNUSED, we could map: 011: submitted, completion still pending | ||
119 | * 110: completed ok | ||
120 | * 010: completed with error | ||
121 | */ | ||
122 | __RQ_LOCAL_PENDING, | ||
123 | __RQ_LOCAL_COMPLETED, | ||
124 | __RQ_LOCAL_OK, | ||
125 | |||
126 | /* 76543 | ||
127 | * 00000: no network possible | ||
128 | * 00001: to be sent | ||
129 | * 00011: to be sent, on worker queue | ||
130 | * 00101: sent, expecting recv_ack (B) or write_ack (C) | ||
131 | * 11101: sent, | ||
132 | * recv_ack (B) or implicit "ack" (A), | ||
133 | * still waiting for the barrier ack. | ||
134 | * master_bio may already be completed and invalidated. | ||
135 | * 11100: write_acked (C), | ||
136 | * data_received (for remote read, any protocol) | ||
137 | * or finally the barrier ack has arrived (B,A)... | ||
138 | * request can be freed | ||
139 | * 01100: neg-acked (write, protocol C) | ||
140 | * or neg-d-acked (read, any protocol) | ||
141 | * or killed from the transfer log | ||
142 | * during cleanup after connection loss | ||
143 | * request can be freed | ||
144 | * 01000: canceled or send failed... | ||
145 | * request can be freed | ||
    | * | ||
    | * NOTE(review): the table above has five columns (bits 7..3), but | ||
    | * there are six network bits (3..8, __RQ_NET_PENDING..__RQ_NET_SIS); | ||
    | * it may be out of date -- confirm against __req_mod(). | ||
146 | */ | ||
147 | |||
148 | /* if "SENT" is not set, yet, this can still fail or be canceled. | ||
149 | * if "SENT" is set already, we still wait for an Ack packet. | ||
150 | * when cleared, the master_bio may be completed. | ||
151 | * in (B,A) the request object may still linger on the transaction log | ||
152 | * until the corresponding barrier ack comes in */ | ||
153 | __RQ_NET_PENDING, | ||
154 | |||
155 | /* If it is QUEUED, and it is a WRITE, it is also registered in the | ||
156 | * transfer log. Currently we need this flag to avoid conflicts between | ||
157 | * worker canceling the request and tl_clear_barrier killing it from | ||
158 | * transfer log. We should restructure the code so this conflict does | ||
159 | * no longer occur. */ | ||
160 | __RQ_NET_QUEUED, | ||
161 | |||
162 | /* well, actually only "handed over to the network stack". | ||
163 | * | ||
164 | * TODO can potentially be dropped because of the similar meaning | ||
165 | * of RQ_NET_SENT and ~RQ_NET_QUEUED. | ||
166 | * however it is not exactly the same. before we drop it | ||
167 | * we must ensure that we can tell a request with network part | ||
168 | * from a request without, regardless of what happens to it. */ | ||
169 | __RQ_NET_SENT, | ||
170 | |||
171 | /* when set, the request may be freed (if RQ_NET_QUEUED is clear). | ||
172 | * basically this means the corresponding P_BARRIER_ACK was received */ | ||
173 | __RQ_NET_DONE, | ||
174 | |||
175 | /* whether or not we know (C) or pretend (B,A) that the write | ||
176 | * was successfully written on the peer. | ||
177 | */ | ||
178 | __RQ_NET_OK, | ||
179 | |||
180 | /* peer called drbd_set_in_sync() for this write */ | ||
181 | __RQ_NET_SIS, | ||
182 | |||
183 | /* keep this last, it's for the RQ_NET_MASK */ | ||
184 | __RQ_NET_MAX, | ||
185 | }; | ||
186 | |||
/* Bit masks for rq_state, one per bit position of enum drbd_req_state_bits,
 * plus the two masks selecting the local and the network group of bits. */
187 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) | ||
188 | #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) | ||
189 | #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) | ||
190 | |||
191 | #define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ | ||
192 | |||
193 | #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) | ||
194 | #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) | ||
195 | #define RQ_NET_SENT (1UL << __RQ_NET_SENT) | ||
196 | #define RQ_NET_DONE (1UL << __RQ_NET_DONE) | ||
197 | #define RQ_NET_OK (1UL << __RQ_NET_OK) | ||
198 | #define RQ_NET_SIS (1UL << __RQ_NET_SIS) | ||
199 | |||
200 | /* 0x1f8: every bit below __RQ_NET_MAX that is not part of RQ_LOCAL_MASK */ | ||
201 | #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) | ||
202 | |||
203 | /* epoch entries */ | ||
204 | static inline | ||
205 | struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
206 | { | ||
207 | BUG_ON(mdev->ee_hash_s == 0); | ||
208 | return mdev->ee_hash + | ||
209 | ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); | ||
210 | } | ||
211 | |||
212 | /* transfer log (drbd_request objects) */ | ||
213 | static inline | ||
214 | struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
215 | { | ||
216 | BUG_ON(mdev->tl_hash_s == 0); | ||
217 | return mdev->tl_hash + | ||
218 | ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); | ||
219 | } | ||
220 | |||
221 | /* application reads (drbd_request objects) */ | ||
222 | static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
223 | { | ||
224 | return mdev->app_reads_hash | ||
225 | + ((unsigned int)(sector) % APP_R_HSIZE); | ||
226 | } | ||
227 | |||
228 | /* when we receive the answer for a read request, | ||
229 | * verify that we actually know about it */ | ||
230 | static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, | ||
231 | u64 id, sector_t sector) | ||
232 | { | ||
233 | struct hlist_head *slot = ar_hash_slot(mdev, sector); | ||
234 | struct hlist_node *n; | ||
235 | struct drbd_request *req; | ||
236 | |||
237 | hlist_for_each_entry(req, n, slot, colision) { | ||
238 | if ((unsigned long)req == (unsigned long)id) { | ||
239 | D_ASSERT(req->sector == sector); | ||
240 | return req; | ||
241 | } | ||
242 | } | ||
243 | return NULL; | ||
244 | } | ||
245 | |||
246 | static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, | ||
247 | struct bio *bio_src) | ||
248 | { | ||
249 | struct bio *bio; | ||
250 | struct drbd_request *req = | ||
251 | mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
252 | if (likely(req)) { | ||
253 | bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ | ||
254 | |||
255 | req->rq_state = 0; | ||
256 | req->mdev = mdev; | ||
257 | req->master_bio = bio_src; | ||
258 | req->private_bio = bio; | ||
259 | req->epoch = 0; | ||
260 | req->sector = bio->bi_sector; | ||
261 | req->size = bio->bi_size; | ||
262 | req->start_time = jiffies; | ||
263 | INIT_HLIST_NODE(&req->colision); | ||
264 | INIT_LIST_HEAD(&req->tl_requests); | ||
265 | INIT_LIST_HEAD(&req->w.list); | ||
266 | |||
267 | bio->bi_private = req; | ||
268 | bio->bi_end_io = drbd_endio_pri; | ||
269 | bio->bi_next = NULL; | ||
270 | } | ||
271 | return req; | ||
272 | } | ||
273 | |||
274 | static inline void drbd_req_free(struct drbd_request *req) | ||
275 | { | ||
276 | mempool_free(req, drbd_request_mempool); | ||
277 | } | ||
278 | |||
279 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
280 | { | ||
281 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
282 | } | ||
283 | |||
284 | /* Short lived temporary struct on the stack. | ||
    | * __req_mod() fills it in; _req_mod() and req_mod() then pass it to | ||
    | * complete_master_bio() if bio is set. | ||
285 | * We could squirrel the error to be returned into | ||
286 | * bio->bi_size, or similar. But that would be too ugly. */ | ||
287 | struct bio_and_error { | ||
288 | struct bio *bio; /* NULL if there is nothing to complete */ | ||
289 | int error; /* presumably the error to complete bio with -- see complete_master_bio() */ | ||
290 | }; | ||
291 | |||
292 | extern void _req_may_be_done(struct drbd_request *req, | ||
293 | struct bio_and_error *m); | ||
294 | extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, | ||
295 | struct bio_and_error *m); | ||
296 | extern void complete_master_bio(struct drbd_conf *mdev, | ||
297 | struct bio_and_error *m); | ||
298 | |||
299 | /* use this if you don't want to deal with calling complete_master_bio() | ||
300 | * outside the spinlock, e.g. when walking some list on cleanup. */ | ||
301 | static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) | ||
302 | { | ||
303 | struct drbd_conf *mdev = req->mdev; | ||
304 | struct bio_and_error m; | ||
305 | |||
306 | /* __req_mod possibly frees req, do not touch req after that! */ | ||
307 | __req_mod(req, what, &m); | ||
308 | if (m.bio) | ||
309 | complete_master_bio(mdev, &m); | ||
310 | } | ||
311 | |||
312 | /* completion of master bio is outside of spinlock. | ||
313 | * If you need it irqsave, do it your self! */ | ||
314 | static inline void req_mod(struct drbd_request *req, | ||
315 | enum drbd_req_event what) | ||
316 | { | ||
317 | struct drbd_conf *mdev = req->mdev; | ||
318 | struct bio_and_error m; | ||
319 | spin_lock_irq(&mdev->req_lock); | ||
320 | __req_mod(req, what, &m); | ||
321 | spin_unlock_irq(&mdev->req_lock); | ||
322 | |||
323 | if (m.bio) | ||
324 | complete_master_bio(mdev, &m); | ||
325 | } | ||
326 | #endif | ||
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 000000000000..76863e3f05be --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -0,0 +1,113 @@ | |||
1 | /* | ||
2 | drbd_strings.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/drbd.h> | ||
27 | |||
28 | static const char *drbd_conn_s_names[] = { | ||
29 | [C_STANDALONE] = "StandAlone", | ||
30 | [C_DISCONNECTING] = "Disconnecting", | ||
31 | [C_UNCONNECTED] = "Unconnected", | ||
32 | [C_TIMEOUT] = "Timeout", | ||
33 | [C_BROKEN_PIPE] = "BrokenPipe", | ||
34 | [C_NETWORK_FAILURE] = "NetworkFailure", | ||
35 | [C_PROTOCOL_ERROR] = "ProtocolError", | ||
36 | [C_WF_CONNECTION] = "WFConnection", | ||
37 | [C_WF_REPORT_PARAMS] = "WFReportParams", | ||
38 | [C_TEAR_DOWN] = "TearDown", | ||
39 | [C_CONNECTED] = "Connected", | ||
40 | [C_STARTING_SYNC_S] = "StartingSyncS", | ||
41 | [C_STARTING_SYNC_T] = "StartingSyncT", | ||
42 | [C_WF_BITMAP_S] = "WFBitMapS", | ||
43 | [C_WF_BITMAP_T] = "WFBitMapT", | ||
44 | [C_WF_SYNC_UUID] = "WFSyncUUID", | ||
45 | [C_SYNC_SOURCE] = "SyncSource", | ||
46 | [C_SYNC_TARGET] = "SyncTarget", | ||
47 | [C_PAUSED_SYNC_S] = "PausedSyncS", | ||
48 | [C_PAUSED_SYNC_T] = "PausedSyncT", | ||
49 | [C_VERIFY_S] = "VerifyS", | ||
50 | [C_VERIFY_T] = "VerifyT", | ||
51 | }; | ||
52 | |||
53 | static const char *drbd_role_s_names[] = { | ||
54 | [R_PRIMARY] = "Primary", | ||
55 | [R_SECONDARY] = "Secondary", | ||
56 | [R_UNKNOWN] = "Unknown" | ||
57 | }; | ||
58 | |||
59 | static const char *drbd_disk_s_names[] = { | ||
60 | [D_DISKLESS] = "Diskless", | ||
61 | [D_ATTACHING] = "Attaching", | ||
62 | [D_FAILED] = "Failed", | ||
63 | [D_NEGOTIATING] = "Negotiating", | ||
64 | [D_INCONSISTENT] = "Inconsistent", | ||
65 | [D_OUTDATED] = "Outdated", | ||
66 | [D_UNKNOWN] = "DUnknown", | ||
67 | [D_CONSISTENT] = "Consistent", | ||
68 | [D_UP_TO_DATE] = "UpToDate", | ||
69 | }; | ||
70 | |||
71 | static const char *drbd_state_sw_errors[] = { | ||
72 | [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", | ||
73 | [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", | ||
74 | [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", | ||
75 | [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", | ||
76 | [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", | ||
77 | [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", | ||
78 | [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", | ||
79 | [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", | ||
80 | [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", | ||
81 | [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", | ||
82 | [-SS_DEVICE_IN_USE] = "Device is held open by someone", | ||
83 | [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", | ||
84 | [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", | ||
85 | [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", | ||
86 | [-SS_NOT_SUPPORTED] = "Peer does not support protocol", | ||
87 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | ||
88 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | ||
89 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | ||
90 | }; | ||
91 | |||
92 | const char *drbd_conn_str(enum drbd_conns s) | ||
93 | { | ||
94 | /* enums are unsigned... */ | ||
95 | return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; | ||
96 | } | ||
97 | |||
98 | const char *drbd_role_str(enum drbd_role s) | ||
99 | { | ||
100 | return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; | ||
101 | } | ||
102 | |||
103 | const char *drbd_disk_str(enum drbd_disk_state s) | ||
104 | { | ||
105 | return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; | ||
106 | } | ||
107 | |||
108 | const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) | ||
109 | { | ||
110 | return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : | ||
111 | err > SS_TWO_PRIMARIES ? "TOO_LARGE" | ||
112 | : drbd_state_sw_errors[-err]; | ||
113 | } | ||
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 000000000000..fc824006e721 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h | |||
@@ -0,0 +1,351 @@ | |||
1 | /* | ||
2 | -*- linux-c -*- | ||
3 | drbd_vli.h | ||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _DRBD_VLI_H | ||
26 | #define _DRBD_VLI_H | ||
27 | |||
28 | /* | ||
29 | * At a granularity of 4KiB storage represented per bit, | ||
30 | * and storage sizes of several TiB, | ||
31 | * and possibly small-bandwidth replication, | ||
32 | * the bitmap transfer time can take much too long, | ||
33 | * if transmitted in plain text. | ||
34 | * | ||
35 | * We try to reduce the transferred bitmap information | ||
36 | * by encoding runlengths of bit polarity. | ||
37 | * | ||
38 | * We never actually need to encode a "zero" (runlengths are positive). | ||
39 | * But then we have to store the value of the first bit. | ||
40 | * The first bit of information thus shall encode if the first runlength | ||
41 | * gives the number of set or unset bits. | ||
42 | * | ||
43 | * We assume that large areas are either completely set or unset, | ||
44 | * which gives good compression with any runlength method, | ||
45 | * even when encoding the runlength as fixed size 32bit/64bit integers. | ||
46 | * | ||
47 | * Still, there may be areas where the polarity flips every few bits, | ||
48 | * and encoding the runlength sequence of those areas with fix size | ||
49 | * integers would be much worse than plaintext. | ||
50 | * | ||
51 | * We want to encode small runlength values with minimum code length, | ||
52 | * while still being able to encode a Huge run of all zeros. | ||
53 | * | ||
54 | * Thus we need a Variable Length Integer encoding, VLI. | ||
55 | * | ||
56 | * For some cases, we produce more code bits than plaintext input. | ||
57 | * We need to send incompressible chunks as plaintext, skip over them | ||
58 | * and then see if the next chunk compresses better. | ||
59 | * | ||
60 | * We don't care too much about "excellent" compression ratio for large | ||
61 | * runlengths (all set/all clear): whether we achieve a factor of 100 | ||
62 | * or 1000 is not that much of an issue. | ||
63 | * We do not want to waste too much on short runlengths in the "noisy" | ||
64 | * parts of the bitmap, though. | ||
65 | * | ||
66 | * There are endless variants of VLI, we experimented with: | ||
67 | * * simple byte-based | ||
68 | * * various bit based with different code word length. | ||
69 | * | ||
70 | * To avoid yet another configuration parameter (choice of bitmap compression | ||
71 | * algorithm) which was difficult to explain and tune, we just chose the one | ||
72 | * variant that turned out best in all test cases. | ||
73 | * Based on real world usage patterns, with device sizes ranging from a few GiB | ||
74 | * to several TiB, file server/mailserver/webserver/mysql/postgres, | ||
75 | * mostly idle to really busy, the all time winner (though sometimes only | ||
76 | * marginally better) is: | ||
77 | */ | ||
78 | |||
79 | /* | ||
80 | * encoding is "visualised" as | ||
81 | * __little endian__ bitstream, least significant bit first (left most) | ||
82 | * | ||
83 | * this particular encoding is chosen so that the prefix code | ||
84 | * starts as unary encoding the level, then modified so that | ||
85 | * 10 levels can be described in 8bit, with minimal overhead | ||
86 | * for the smaller levels. | ||
87 | * | ||
88 | * Number of data bits follow fibonacci sequence, with the exception of the | ||
89 | * last level (+1 data bit, so it makes 64bit total). The only case where the | ||
90 | * code is longer than the plain run it encodes is 1 plain bit => 2 code bits. | ||
91 | prefix data bits max val Nº data bits | ||
92 | 0 x 0x2 1 | ||
93 | 10 x 0x4 1 | ||
94 | 110 xx 0x8 2 | ||
95 | 1110 xxx 0x10 3 | ||
96 | 11110 xxx xx 0x30 5 | ||
97 | 111110 xx xxxxxx 0x130 8 | ||
98 | 11111100 xxxxxxxx xxxxx 0x2130 13 | ||
99 | 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 | ||
100 | 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 | ||
101 | 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 | ||
102 | * maximum encodable value: 0x100000400202130 == 2**56 + some */ | ||
103 | |||
104 | /* compression "table": | ||
105 | transmitted x 0.29 | ||
106 | as plaintext x ........................ | ||
107 | x ........................ | ||
108 | x ........................ | ||
109 | x 0.59 0.21........................ | ||
110 | x ........................................................ | ||
111 | x .. c ................................................... | ||
112 | x 0.44.. o ................................................... | ||
113 | x .......... d ................................................... | ||
114 | x .......... e ................................................... | ||
115 | X............. ................................................... | ||
116 | x.............. b ................................................... | ||
117 | 2.0x............... i ................................................... | ||
118 | #X................ t ................................................... | ||
119 | #................. s ........................... plain bits .......... | ||
120 | -+----------------------------------------------------------------------- | ||
121 | 1 16 32 64 | ||
122 | */ | ||
123 | |||
/* LEVEL: (total bits, prefix bits, prefix value),
 * sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this. */

/* fibonacci data 1, 1, ...
 * data bits per level (total bits - prefix bits):
 * 1, 1, 2, 3, 5, 8, 13, 21, 34, 56 */
#define VLI_L_1_1() do { \
	LEVEL( 2, 1, 0x00); \
	LEVEL( 3, 2, 0x01); \
	LEVEL( 5, 3, 0x03); \
	LEVEL( 7, 4, 0x07); \
	LEVEL(10, 5, 0x0f); \
	LEVEL(14, 6, 0x1f); \
	LEVEL(21, 8, 0x3f); \
	LEVEL(29, 8, 0x7f); \
	LEVEL(42, 8, 0xbf); \
	LEVEL(64, 8, 0xff); \
} while (0)
141 | |||
142 | /* finds a suitable level to decode the least significant part of in. | ||
143 | * returns number of bits consumed. | ||
144 | * | ||
145 | * BUG() for bad input, as that would mean a buggy code table. */ | ||
146 | static inline int vli_decode_bits(u64 *out, const u64 in) | ||
147 | { | ||
148 | u64 adj = 1; | ||
149 | |||
150 | #define LEVEL(t,b,v) \ | ||
151 | do { \ | ||
152 | if ((in & ((1 << b) -1)) == v) { \ | ||
153 | *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ | ||
154 | return t; \ | ||
155 | } \ | ||
156 | adj += 1ULL << (t - b); \ | ||
157 | } while (0) | ||
158 | |||
159 | VLI_L_1_1(); | ||
160 | |||
161 | /* NOT REACHED, if VLI_LEVELS code table is defined properly */ | ||
162 | BUG(); | ||
163 | #undef LEVEL | ||
164 | } | ||
165 | |||
166 | /* return number of code bits needed, | ||
167 | * or negative error number */ | ||
168 | static inline int __vli_encode_bits(u64 *out, const u64 in) | ||
169 | { | ||
170 | u64 max = 0; | ||
171 | u64 adj = 1; | ||
172 | |||
173 | if (in == 0) | ||
174 | return -EINVAL; | ||
175 | |||
176 | #define LEVEL(t,b,v) do { \ | ||
177 | max += 1ULL << (t - b); \ | ||
178 | if (in <= max) { \ | ||
179 | if (out) \ | ||
180 | *out = ((in - adj) << b) | v; \ | ||
181 | return t; \ | ||
182 | } \ | ||
183 | adj = max + 1; \ | ||
184 | } while (0) | ||
185 | |||
186 | VLI_L_1_1(); | ||
187 | |||
188 | return -EOVERFLOW; | ||
189 | #undef LEVEL | ||
190 | } | ||
191 | |||
192 | #undef VLI_L_1_1 | ||
193 | |||
194 | /* code from here down is independent of actually used bit code */ | ||
195 | |||
196 | /* | ||
197 | * Code length is determined by some unique (e.g. unary) prefix. | ||
198 | * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, | ||
199 | * not a byte stream. | ||
200 | */ | ||
201 | |||
202 | /* for the bitstream, we need a cursor */ | ||
203 | struct bitstream_cursor { | ||
204 | /* the current byte */ | ||
205 | u8 *b; | ||
206 | /* the current bit within *b, nomalized: 0..7 */ | ||
207 | unsigned int bit; | ||
208 | }; | ||
209 | |||
210 | /* initialize cursor to point to first bit of stream */ | ||
211 | static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) | ||
212 | { | ||
213 | cur->b = s; | ||
214 | cur->bit = 0; | ||
215 | } | ||
216 | |||
217 | /* advance cursor by that many bits; maximum expected input value: 64, | ||
218 | * but depending on VLI implementation, it may be more. */ | ||
219 | static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) | ||
220 | { | ||
221 | bits += cur->bit; | ||
222 | cur->b = cur->b + (bits >> 3); | ||
223 | cur->bit = bits & 7; | ||
224 | } | ||
225 | |||
226 | /* the bitstream itself knows its length */ | ||
227 | struct bitstream { | ||
228 | struct bitstream_cursor cur; | ||
229 | unsigned char *buf; | ||
230 | size_t buf_len; /* in bytes */ | ||
231 | |||
232 | /* for input stream: | ||
233 | * number of trailing 0 bits for padding | ||
234 | * total number of valid bits in stream: buf_len * 8 - pad_bits */ | ||
235 | unsigned int pad_bits; | ||
236 | }; | ||
237 | |||
238 | static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) | ||
239 | { | ||
240 | bs->buf = s; | ||
241 | bs->buf_len = len; | ||
242 | bs->pad_bits = pad_bits; | ||
243 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
244 | } | ||
245 | |||
246 | static inline void bitstream_rewind(struct bitstream *bs) | ||
247 | { | ||
248 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
249 | memset(bs->buf, 0, bs->buf_len); | ||
250 | } | ||
251 | |||
252 | /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. | ||
253 | * Ignores "pad_bits". | ||
254 | * Returns zero if bits == 0 (nothing to do). | ||
255 | * Returns number of bits used if successful. | ||
256 | * | ||
257 | * If there is not enough room left in bitstream, | ||
258 | * leaves bitstream unchanged and returns -ENOBUFS. | ||
259 | */ | ||
260 | static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) | ||
261 | { | ||
262 | unsigned char *b = bs->cur.b; | ||
263 | unsigned int tmp; | ||
264 | |||
265 | if (bits == 0) | ||
266 | return 0; | ||
267 | |||
268 | if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) | ||
269 | return -ENOBUFS; | ||
270 | |||
271 | /* paranoia: strip off hi bits; they should not be set anyways. */ | ||
272 | if (bits < 64) | ||
273 | val &= ~0ULL >> (64 - bits); | ||
274 | |||
275 | *b++ |= (val & 0xff) << bs->cur.bit; | ||
276 | |||
277 | for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) | ||
278 | *b++ |= (val >> tmp) & 0xff; | ||
279 | |||
280 | bitstream_cursor_advance(&bs->cur, bits); | ||
281 | return bits; | ||
282 | } | ||
283 | |||
284 | /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. | ||
285 | * | ||
286 | * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. | ||
287 | * | ||
288 | * If there are less than the requested number of valid bits left in the | ||
289 | * bitstream, still fetches all available bits. | ||
290 | * | ||
291 | * Returns number of actually fetched bits. | ||
292 | */ | ||
293 | static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) | ||
294 | { | ||
295 | u64 val; | ||
296 | unsigned int n; | ||
297 | |||
298 | if (bits > 64) | ||
299 | return -EINVAL; | ||
300 | |||
301 | if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) | ||
302 | bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) | ||
303 | - bs->cur.bit - bs->pad_bits; | ||
304 | |||
305 | if (bits == 0) { | ||
306 | *out = 0; | ||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | /* get the high bits */ | ||
311 | val = 0; | ||
312 | n = (bs->cur.bit + bits + 7) >> 3; | ||
313 | /* n may be at most 9, if cur.bit + bits > 64 */ | ||
314 | /* which means this copies at most 8 byte */ | ||
315 | if (n) { | ||
316 | memcpy(&val, bs->cur.b+1, n - 1); | ||
317 | val = le64_to_cpu(val) << (8 - bs->cur.bit); | ||
318 | } | ||
319 | |||
320 | /* we still need the low bits */ | ||
321 | val |= bs->cur.b[0] >> bs->cur.bit; | ||
322 | |||
323 | /* and mask out bits we don't want */ | ||
324 | val &= ~0ULL >> (64 - bits); | ||
325 | |||
326 | bitstream_cursor_advance(&bs->cur, bits); | ||
327 | *out = val; | ||
328 | |||
329 | return bits; | ||
330 | } | ||
331 | |||
332 | /* encodes @in as vli into @bs; | ||
333 | |||
334 | * return values | ||
335 | * > 0: number of bits successfully stored in bitstream | ||
336 | * -ENOBUFS @bs is full | ||
337 | * -EINVAL input zero (invalid) | ||
338 | * -EOVERFLOW input too large for this vli code (invalid) | ||
339 | */ | ||
340 | static inline int vli_encode_bits(struct bitstream *bs, u64 in) | ||
341 | { | ||
342 | u64 code = code; | ||
343 | int bits = __vli_encode_bits(&code, in); | ||
344 | |||
345 | if (bits <= 0) | ||
346 | return bits; | ||
347 | |||
348 | return bitstream_put_bits(bs, code, bits); | ||
349 | } | ||
350 | |||
351 | #endif | ||
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 000000000000..ed8796f1112d --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -0,0 +1,1512 @@ | |||
1 | /* | ||
2 | drbd_worker.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/version.h> | ||
28 | #include <linux/drbd.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/wait.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/memcontrol.h> | ||
34 | #include <linux/mm_inline.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/mm.h> | ||
38 | #include <linux/string.h> | ||
39 | #include <linux/scatterlist.h> | ||
40 | |||
41 | #include "drbd_int.h" | ||
42 | #include "drbd_req.h" | ||
43 | |||
/* a tenth of HZ */
#define SLEEP_TIME (HZ/10)

/* forward declaration of a file-local work callback */
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
47 | |||
48 | |||
49 | |||
50 | /* defined here: | ||
51 | drbd_md_io_complete | ||
52 | drbd_endio_write_sec | ||
53 | drbd_endio_read_sec | ||
54 | drbd_endio_pri | ||
55 | |||
56 | * more endio handlers: | ||
57 | atodb_endio in drbd_actlog.c | ||
58 | drbd_bm_async_io_complete in drbd_bitmap.c | ||
59 | |||
60 | * For all these callbacks, note the following: | ||
61 | * The callbacks will be called in irq context by the IDE drivers, | ||
62 | * and in Softirqs/Tasklets/BH context by the SCSI drivers. | ||
63 | * Try to get the locking right :) | ||
64 | * | ||
65 | */ | ||
66 | |||
67 | |||
68 | /* About the global_state_lock | ||
69 | Each state transition on an device holds a read lock. In case we have | ||
70 | to evaluate the sync after dependencies, we grab a write lock, because | ||
71 | we need stable states on all devices for that. */ | ||
72 | rwlock_t global_state_lock; | ||
73 | |||
74 | /* used for synchronous meta data and bitmap IO | ||
75 | * submitted by drbd_md_sync_page_io() | ||
76 | */ | ||
77 | void drbd_md_io_complete(struct bio *bio, int error) | ||
78 | { | ||
79 | struct drbd_md_io *md_io; | ||
80 | |||
81 | md_io = (struct drbd_md_io *)bio->bi_private; | ||
82 | md_io->error = error; | ||
83 | |||
84 | complete(&md_io->event); | ||
85 | } | ||
86 | |||
87 | /* reads on behalf of the partner, | ||
88 | * "submitted" by the receiver | ||
89 | */ | ||
90 | void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) | ||
91 | { | ||
92 | unsigned long flags = 0; | ||
93 | struct drbd_epoch_entry *e = NULL; | ||
94 | struct drbd_conf *mdev; | ||
95 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
96 | |||
97 | e = bio->bi_private; | ||
98 | mdev = e->mdev; | ||
99 | |||
100 | if (error) | ||
101 | dev_warn(DEV, "read: error=%d s=%llus\n", error, | ||
102 | (unsigned long long)e->sector); | ||
103 | if (!error && !uptodate) { | ||
104 | dev_warn(DEV, "read: setting error to -EIO s=%llus\n", | ||
105 | (unsigned long long)e->sector); | ||
106 | /* strange behavior of some lower level drivers... | ||
107 | * fail the request by clearing the uptodate flag, | ||
108 | * but do not return any error?! */ | ||
109 | error = -EIO; | ||
110 | } | ||
111 | |||
112 | D_ASSERT(e->block_id != ID_VACANT); | ||
113 | |||
114 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
115 | mdev->read_cnt += e->size >> 9; | ||
116 | list_del(&e->w.list); | ||
117 | if (list_empty(&mdev->read_ee)) | ||
118 | wake_up(&mdev->ee_wait); | ||
119 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
120 | |||
121 | drbd_chk_io_error(mdev, error, FALSE); | ||
122 | drbd_queue_work(&mdev->data.work, &e->w); | ||
123 | put_ldev(mdev); | ||
124 | } | ||
125 | |||
126 | /* writes on behalf of the partner, or resync writes, | ||
127 | * "submitted" by the receiver. | ||
128 | */ | ||
129 | void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | ||
130 | { | ||
131 | unsigned long flags = 0; | ||
132 | struct drbd_epoch_entry *e = NULL; | ||
133 | struct drbd_conf *mdev; | ||
134 | sector_t e_sector; | ||
135 | int do_wake; | ||
136 | int is_syncer_req; | ||
137 | int do_al_complete_io; | ||
138 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
139 | int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); | ||
140 | |||
141 | e = bio->bi_private; | ||
142 | mdev = e->mdev; | ||
143 | |||
144 | if (error) | ||
145 | dev_warn(DEV, "write: error=%d s=%llus\n", error, | ||
146 | (unsigned long long)e->sector); | ||
147 | if (!error && !uptodate) { | ||
148 | dev_warn(DEV, "write: setting error to -EIO s=%llus\n", | ||
149 | (unsigned long long)e->sector); | ||
150 | /* strange behavior of some lower level drivers... | ||
151 | * fail the request by clearing the uptodate flag, | ||
152 | * but do not return any error?! */ | ||
153 | error = -EIO; | ||
154 | } | ||
155 | |||
156 | /* error == -ENOTSUPP would be a better test, | ||
157 | * alas it is not reliable */ | ||
158 | if (error && is_barrier && e->flags & EE_IS_BARRIER) { | ||
159 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | ||
160 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
161 | list_del(&e->w.list); | ||
162 | e->w.cb = w_e_reissue; | ||
163 | /* put_ldev actually happens below, once we come here again. */ | ||
164 | __release(local); | ||
165 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
166 | drbd_queue_work(&mdev->data.work, &e->w); | ||
167 | return; | ||
168 | } | ||
169 | |||
170 | D_ASSERT(e->block_id != ID_VACANT); | ||
171 | |||
172 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
173 | mdev->writ_cnt += e->size >> 9; | ||
174 | is_syncer_req = is_syncer_block_id(e->block_id); | ||
175 | |||
176 | /* after we moved e to done_ee, | ||
177 | * we may no longer access it, | ||
178 | * it may be freed/reused already! | ||
179 | * (as soon as we release the req_lock) */ | ||
180 | e_sector = e->sector; | ||
181 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | ||
182 | |||
183 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | ||
184 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
185 | |||
186 | /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, | ||
187 | * neither did we wake possibly waiting conflicting requests. | ||
188 | * done from "drbd_process_done_ee" within the appropriate w.cb | ||
189 | * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ | ||
190 | |||
191 | do_wake = is_syncer_req | ||
192 | ? list_empty(&mdev->sync_ee) | ||
193 | : list_empty(&mdev->active_ee); | ||
194 | |||
195 | if (error) | ||
196 | __drbd_chk_io_error(mdev, FALSE); | ||
197 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
198 | |||
199 | if (is_syncer_req) | ||
200 | drbd_rs_complete_io(mdev, e_sector); | ||
201 | |||
202 | if (do_wake) | ||
203 | wake_up(&mdev->ee_wait); | ||
204 | |||
205 | if (do_al_complete_io) | ||
206 | drbd_al_complete_io(mdev, e_sector); | ||
207 | |||
208 | wake_asender(mdev); | ||
209 | put_ldev(mdev); | ||
210 | |||
211 | } | ||
212 | |||
213 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | ||
214 | */ | ||
215 | void drbd_endio_pri(struct bio *bio, int error) | ||
216 | { | ||
217 | unsigned long flags; | ||
218 | struct drbd_request *req = bio->bi_private; | ||
219 | struct drbd_conf *mdev = req->mdev; | ||
220 | struct bio_and_error m; | ||
221 | enum drbd_req_event what; | ||
222 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
223 | |||
224 | if (error) | ||
225 | dev_warn(DEV, "p %s: error=%d\n", | ||
226 | bio_data_dir(bio) == WRITE ? "write" : "read", error); | ||
227 | if (!error && !uptodate) { | ||
228 | dev_warn(DEV, "p %s: setting error to -EIO\n", | ||
229 | bio_data_dir(bio) == WRITE ? "write" : "read"); | ||
230 | /* strange behavior of some lower level drivers... | ||
231 | * fail the request by clearing the uptodate flag, | ||
232 | * but do not return any error?! */ | ||
233 | error = -EIO; | ||
234 | } | ||
235 | |||
236 | /* to avoid recursion in __req_mod */ | ||
237 | if (unlikely(error)) { | ||
238 | what = (bio_data_dir(bio) == WRITE) | ||
239 | ? write_completed_with_error | ||
240 | : (bio_rw(bio) == READA) | ||
241 | ? read_completed_with_error | ||
242 | : read_ahead_completed_with_error; | ||
243 | } else | ||
244 | what = completed_ok; | ||
245 | |||
246 | bio_put(req->private_bio); | ||
247 | req->private_bio = ERR_PTR(error); | ||
248 | |||
249 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
250 | __req_mod(req, what, &m); | ||
251 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
252 | |||
253 | if (m.bio) | ||
254 | complete_master_bio(mdev, &m); | ||
255 | } | ||
256 | |||
257 | int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
258 | { | ||
259 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
260 | |||
261 | /* NOTE: mdev->ldev can be NULL by the time we get here! */ | ||
262 | /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ | ||
263 | |||
264 | /* the only way this callback is scheduled is from _req_may_be_done, | ||
265 | * when it is done and had a local write error, see comments there */ | ||
266 | drbd_req_free(req); | ||
267 | |||
268 | return TRUE; | ||
269 | } | ||
270 | |||
271 | int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
272 | { | ||
273 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
274 | |||
275 | /* We should not detach for read io-error, | ||
276 | * but try to WRITE the P_DATA_REPLY to the failed location, | ||
277 | * to give the disk the chance to relocate that block */ | ||
278 | |||
279 | spin_lock_irq(&mdev->req_lock); | ||
280 | if (cancel || | ||
281 | mdev->state.conn < C_CONNECTED || | ||
282 | mdev->state.pdsk <= D_INCONSISTENT) { | ||
283 | _req_mod(req, send_canceled); | ||
284 | spin_unlock_irq(&mdev->req_lock); | ||
285 | dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); | ||
286 | return 1; | ||
287 | } | ||
288 | spin_unlock_irq(&mdev->req_lock); | ||
289 | |||
290 | return w_send_read_req(mdev, w, 0); | ||
291 | } | ||
292 | |||
293 | int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
294 | { | ||
295 | ERR_IF(cancel) return 1; | ||
296 | dev_err(DEV, "resync inactive, but callback triggered??\n"); | ||
297 | return 1; /* Simply ignore this! */ | ||
298 | } | ||
299 | |||
300 | void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) | ||
301 | { | ||
302 | struct hash_desc desc; | ||
303 | struct scatterlist sg; | ||
304 | struct bio_vec *bvec; | ||
305 | int i; | ||
306 | |||
307 | desc.tfm = tfm; | ||
308 | desc.flags = 0; | ||
309 | |||
310 | sg_init_table(&sg, 1); | ||
311 | crypto_hash_init(&desc); | ||
312 | |||
313 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
314 | sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); | ||
315 | crypto_hash_update(&desc, &sg, sg.length); | ||
316 | } | ||
317 | crypto_hash_final(&desc, digest); | ||
318 | } | ||
319 | |||
320 | static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
321 | { | ||
322 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
323 | int digest_size; | ||
324 | void *digest; | ||
325 | int ok; | ||
326 | |||
327 | D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); | ||
328 | |||
329 | if (unlikely(cancel)) { | ||
330 | drbd_free_ee(mdev, e); | ||
331 | return 1; | ||
332 | } | ||
333 | |||
334 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
335 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | ||
336 | digest = kmalloc(digest_size, GFP_NOIO); | ||
337 | if (digest) { | ||
338 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | ||
339 | |||
340 | inc_rs_pending(mdev); | ||
341 | ok = drbd_send_drequest_csum(mdev, | ||
342 | e->sector, | ||
343 | e->size, | ||
344 | digest, | ||
345 | digest_size, | ||
346 | P_CSUM_RS_REQUEST); | ||
347 | kfree(digest); | ||
348 | } else { | ||
349 | dev_err(DEV, "kmalloc() of digest failed.\n"); | ||
350 | ok = 0; | ||
351 | } | ||
352 | } else | ||
353 | ok = 1; | ||
354 | |||
355 | drbd_free_ee(mdev, e); | ||
356 | |||
357 | if (unlikely(!ok)) | ||
358 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); | ||
359 | return ok; | ||
360 | } | ||
361 | |||
362 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | ||
363 | |||
364 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | ||
365 | { | ||
366 | struct drbd_epoch_entry *e; | ||
367 | |||
368 | if (!get_ldev(mdev)) | ||
369 | return 0; | ||
370 | |||
371 | /* GFP_TRY, because if there is no memory available right now, this may | ||
372 | * be rescheduled for later. It is "only" background resync, after all. */ | ||
373 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | ||
374 | if (!e) { | ||
375 | put_ldev(mdev); | ||
376 | return 2; | ||
377 | } | ||
378 | |||
379 | spin_lock_irq(&mdev->req_lock); | ||
380 | list_add(&e->w.list, &mdev->read_ee); | ||
381 | spin_unlock_irq(&mdev->req_lock); | ||
382 | |||
383 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
384 | e->private_bio->bi_rw = READ; | ||
385 | e->w.cb = w_e_send_csum; | ||
386 | |||
387 | mdev->read_cnt += size >> 9; | ||
388 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); | ||
389 | |||
390 | return 1; | ||
391 | } | ||
392 | |||
393 | void resync_timer_fn(unsigned long data) | ||
394 | { | ||
395 | unsigned long flags; | ||
396 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
397 | int queue; | ||
398 | |||
399 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
400 | |||
401 | if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { | ||
402 | queue = 1; | ||
403 | if (mdev->state.conn == C_VERIFY_S) | ||
404 | mdev->resync_work.cb = w_make_ov_request; | ||
405 | else | ||
406 | mdev->resync_work.cb = w_make_resync_request; | ||
407 | } else { | ||
408 | queue = 0; | ||
409 | mdev->resync_work.cb = w_resync_inactive; | ||
410 | } | ||
411 | |||
412 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
413 | |||
414 | /* harmless race: list_empty outside data.work.q_lock */ | ||
415 | if (list_empty(&mdev->resync_work.list) && queue) | ||
416 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | ||
417 | } | ||
418 | |||
419 | int w_make_resync_request(struct drbd_conf *mdev, | ||
420 | struct drbd_work *w, int cancel) | ||
421 | { | ||
422 | unsigned long bit; | ||
423 | sector_t sector; | ||
424 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
425 | int max_segment_size = queue_max_segment_size(mdev->rq_queue); | ||
426 | int number, i, size, pe, mx; | ||
427 | int align, queued, sndbuf; | ||
428 | |||
429 | if (unlikely(cancel)) | ||
430 | return 1; | ||
431 | |||
432 | if (unlikely(mdev->state.conn < C_CONNECTED)) { | ||
433 | dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); | ||
434 | return 0; | ||
435 | } | ||
436 | |||
437 | if (mdev->state.conn != C_SYNC_TARGET) | ||
438 | dev_err(DEV, "%s in w_make_resync_request\n", | ||
439 | drbd_conn_str(mdev->state.conn)); | ||
440 | |||
441 | if (!get_ldev(mdev)) { | ||
442 | /* Since we only need to access mdev->rsync a | ||
443 | get_ldev_if_state(mdev,D_FAILED) would be sufficient, but | ||
444 | to continue resync with a broken disk makes no sense at | ||
445 | all */ | ||
446 | dev_err(DEV, "Disk broke down during resync!\n"); | ||
447 | mdev->resync_work.cb = w_resync_inactive; | ||
448 | return 1; | ||
449 | } | ||
450 | |||
451 | number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | ||
452 | pe = atomic_read(&mdev->rs_pending_cnt); | ||
453 | |||
454 | mutex_lock(&mdev->data.mutex); | ||
455 | if (mdev->data.socket) | ||
456 | mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); | ||
457 | else | ||
458 | mx = 1; | ||
459 | mutex_unlock(&mdev->data.mutex); | ||
460 | |||
461 | /* For resync rates >160MB/sec, allow more pending RS requests */ | ||
462 | if (number > mx) | ||
463 | mx = number; | ||
464 | |||
465 | /* Limit the number of pending RS requests to no more than the peer's receive buffer */ | ||
466 | if ((pe + number) > mx) { | ||
467 | number = mx - pe; | ||
468 | } | ||
469 | |||
470 | for (i = 0; i < number; i++) { | ||
471 | /* Stop generating RS requests, when half of the send buffer is filled */ | ||
472 | mutex_lock(&mdev->data.mutex); | ||
473 | if (mdev->data.socket) { | ||
474 | queued = mdev->data.socket->sk->sk_wmem_queued; | ||
475 | sndbuf = mdev->data.socket->sk->sk_sndbuf; | ||
476 | } else { | ||
477 | queued = 1; | ||
478 | sndbuf = 0; | ||
479 | } | ||
480 | mutex_unlock(&mdev->data.mutex); | ||
481 | if (queued > sndbuf / 2) | ||
482 | goto requeue; | ||
483 | |||
484 | next_sector: | ||
485 | size = BM_BLOCK_SIZE; | ||
486 | bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); | ||
487 | |||
488 | if (bit == -1UL) { | ||
489 | mdev->bm_resync_fo = drbd_bm_bits(mdev); | ||
490 | mdev->resync_work.cb = w_resync_inactive; | ||
491 | put_ldev(mdev); | ||
492 | return 1; | ||
493 | } | ||
494 | |||
495 | sector = BM_BIT_TO_SECT(bit); | ||
496 | |||
497 | if (drbd_try_rs_begin_io(mdev, sector)) { | ||
498 | mdev->bm_resync_fo = bit; | ||
499 | goto requeue; | ||
500 | } | ||
501 | mdev->bm_resync_fo = bit + 1; | ||
502 | |||
503 | if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) { | ||
504 | drbd_rs_complete_io(mdev, sector); | ||
505 | goto next_sector; | ||
506 | } | ||
507 | |||
508 | #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE | ||
509 | /* try to find some adjacent bits. | ||
510 | * we stop if we have already the maximum req size. | ||
511 | * | ||
512 | * Additionally always align bigger requests, in order to | ||
513 | * be prepared for all stripe sizes of software RAIDs. | ||
514 | * | ||
515 | * we _do_ care about the agreed-upon q->max_segment_size | ||
516 | * here, as splitting up the requests on the other side is more | ||
517 | * difficult. the consequence is, that on lvm and md and other | ||
518 | * "indirect" devices, this is dead code, since | ||
519 | * q->max_segment_size will be PAGE_SIZE. | ||
520 | */ | ||
521 | align = 1; | ||
522 | for (;;) { | ||
523 | if (size + BM_BLOCK_SIZE > max_segment_size) | ||
524 | break; | ||
525 | |||
526 | /* Be always aligned */ | ||
527 | if (sector & ((1<<(align+3))-1)) | ||
528 | break; | ||
529 | |||
530 | /* do not cross extent boundaries */ | ||
531 | if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) | ||
532 | break; | ||
533 | /* now, is it actually dirty, after all? | ||
534 | * caution, drbd_bm_test_bit is tri-state for some | ||
535 | * obscure reason; ( b == 0 ) would get the out-of-band | ||
536 | * only accidentally right because of the "oddly sized" | ||
537 | * adjustment below */ | ||
538 | if (drbd_bm_test_bit(mdev, bit+1) != 1) | ||
539 | break; | ||
540 | bit++; | ||
541 | size += BM_BLOCK_SIZE; | ||
542 | if ((BM_BLOCK_SIZE << align) <= size) | ||
543 | align++; | ||
544 | i++; | ||
545 | } | ||
546 | /* if we merged some, | ||
547 | * reset the offset to start the next drbd_bm_find_next from */ | ||
548 | if (size > BM_BLOCK_SIZE) | ||
549 | mdev->bm_resync_fo = bit + 1; | ||
550 | #endif | ||
551 | |||
552 | /* adjust very last sectors, in case we are oddly sized */ | ||
553 | if (sector + (size>>9) > capacity) | ||
554 | size = (capacity-sector)<<9; | ||
555 | if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { | ||
556 | switch (read_for_csum(mdev, sector, size)) { | ||
557 | case 0: /* Disk failure*/ | ||
558 | put_ldev(mdev); | ||
559 | return 0; | ||
560 | case 2: /* Allocation failed */ | ||
561 | drbd_rs_complete_io(mdev, sector); | ||
562 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | ||
563 | goto requeue; | ||
564 | /* case 1: everything ok */ | ||
565 | } | ||
566 | } else { | ||
567 | inc_rs_pending(mdev); | ||
568 | if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, | ||
569 | sector, size, ID_SYNCER)) { | ||
570 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); | ||
571 | dec_rs_pending(mdev); | ||
572 | put_ldev(mdev); | ||
573 | return 0; | ||
574 | } | ||
575 | } | ||
576 | } | ||
577 | |||
578 | if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) { | ||
579 | /* last syncer _request_ was sent, | ||
580 | * but the P_RS_DATA_REPLY not yet received. sync will end (and | ||
581 | * next sync group will resume), as soon as we receive the last | ||
582 | * resync data block, and the last bit is cleared. | ||
583 | * until then resync "work" is "inactive" ... | ||
584 | */ | ||
585 | mdev->resync_work.cb = w_resync_inactive; | ||
586 | put_ldev(mdev); | ||
587 | return 1; | ||
588 | } | ||
589 | |||
590 | requeue: | ||
591 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | ||
592 | put_ldev(mdev); | ||
593 | return 1; | ||
594 | } | ||
595 | |||
596 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
597 | { | ||
598 | int number, i, size; | ||
599 | sector_t sector; | ||
600 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
601 | |||
602 | if (unlikely(cancel)) | ||
603 | return 1; | ||
604 | |||
605 | if (unlikely(mdev->state.conn < C_CONNECTED)) { | ||
606 | dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected"); | ||
607 | return 0; | ||
608 | } | ||
609 | |||
610 | number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | ||
611 | if (atomic_read(&mdev->rs_pending_cnt) > number) | ||
612 | goto requeue; | ||
613 | |||
614 | number -= atomic_read(&mdev->rs_pending_cnt); | ||
615 | |||
616 | sector = mdev->ov_position; | ||
617 | for (i = 0; i < number; i++) { | ||
618 | if (sector >= capacity) { | ||
619 | mdev->resync_work.cb = w_resync_inactive; | ||
620 | return 1; | ||
621 | } | ||
622 | |||
623 | size = BM_BLOCK_SIZE; | ||
624 | |||
625 | if (drbd_try_rs_begin_io(mdev, sector)) { | ||
626 | mdev->ov_position = sector; | ||
627 | goto requeue; | ||
628 | } | ||
629 | |||
630 | if (sector + (size>>9) > capacity) | ||
631 | size = (capacity-sector)<<9; | ||
632 | |||
633 | inc_rs_pending(mdev); | ||
634 | if (!drbd_send_ov_request(mdev, sector, size)) { | ||
635 | dec_rs_pending(mdev); | ||
636 | return 0; | ||
637 | } | ||
638 | sector += BM_SECT_PER_BIT; | ||
639 | } | ||
640 | mdev->ov_position = sector; | ||
641 | |||
642 | requeue: | ||
643 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | ||
644 | return 1; | ||
645 | } | ||
646 | |||
647 | |||
/*
 * w_ov_finished() - worker callback run when online verify is done
 * @mdev:	DRBD device.
 * @w:		work object; released here with kfree().
 * @cancel:	not examined.
 *
 * Prints the pending out-of-sync range via ov_oos_print() and then runs
 * drbd_resync_finished().  Always returns 1.
 */
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work object is owned by this callback */
	kfree(w);

	ov_oos_print(mdev);
	drbd_resync_finished(mdev);

	return 1;
}
656 | |||
/*
 * w_resync_finished() - worker callback that retries drbd_resync_finished()
 * @mdev:	DRBD device.
 * @w:		work object; released here with kfree().
 * @cancel:	not examined.
 *
 * Queued by drbd_resync_finished() when the resync LRU could not be emptied
 * yet.  Always returns 1.
 */
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work object was kmalloc()ed by whoever queued it */
	kfree(w);
	drbd_resync_finished(mdev);

	return 1;
}
665 | |||
666 | int drbd_resync_finished(struct drbd_conf *mdev) | ||
667 | { | ||
668 | unsigned long db, dt, dbdt; | ||
669 | unsigned long n_oos; | ||
670 | union drbd_state os, ns; | ||
671 | struct drbd_work *w; | ||
672 | char *khelper_cmd = NULL; | ||
673 | |||
674 | /* Remove all elements from the resync LRU. Since future actions | ||
675 | * might set bits in the (main) bitmap, then the entries in the | ||
676 | * resync LRU would be wrong. */ | ||
677 | if (drbd_rs_del_all(mdev)) { | ||
678 | /* In case this is not possible now, most probably because | ||
679 | * there are P_RS_DATA_REPLY Packets lingering on the worker's | ||
680 | * queue (or even the read operations for those packets | ||
681 | * is not finished by now). Retry in 100ms. */ | ||
682 | |||
683 | drbd_kick_lo(mdev); | ||
684 | __set_current_state(TASK_INTERRUPTIBLE); | ||
685 | schedule_timeout(HZ / 10); | ||
686 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); | ||
687 | if (w) { | ||
688 | w->cb = w_resync_finished; | ||
689 | drbd_queue_work(&mdev->data.work, w); | ||
690 | return 1; | ||
691 | } | ||
692 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); | ||
693 | } | ||
694 | |||
695 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | ||
696 | if (dt <= 0) | ||
697 | dt = 1; | ||
698 | db = mdev->rs_total; | ||
699 | dbdt = Bit2KB(db/dt); | ||
700 | mdev->rs_paused /= HZ; | ||
701 | |||
702 | if (!get_ldev(mdev)) | ||
703 | goto out; | ||
704 | |||
705 | spin_lock_irq(&mdev->req_lock); | ||
706 | os = mdev->state; | ||
707 | |||
708 | /* This protects us against multiple calls (that can happen in the presence | ||
709 | of application IO), and against connectivity loss just before we arrive here. */ | ||
710 | if (os.conn <= C_CONNECTED) | ||
711 | goto out_unlock; | ||
712 | |||
713 | ns = os; | ||
714 | ns.conn = C_CONNECTED; | ||
715 | |||
716 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", | ||
717 | (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? | ||
718 | "Online verify " : "Resync", | ||
719 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); | ||
720 | |||
721 | n_oos = drbd_bm_total_weight(mdev); | ||
722 | |||
723 | if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { | ||
724 | if (n_oos) { | ||
725 | dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n", | ||
726 | n_oos, Bit2KB(1)); | ||
727 | khelper_cmd = "out-of-sync"; | ||
728 | } | ||
729 | } else { | ||
730 | D_ASSERT((n_oos - mdev->rs_failed) == 0); | ||
731 | |||
732 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) | ||
733 | khelper_cmd = "after-resync-target"; | ||
734 | |||
735 | if (mdev->csums_tfm && mdev->rs_total) { | ||
736 | const unsigned long s = mdev->rs_same_csum; | ||
737 | const unsigned long t = mdev->rs_total; | ||
738 | const int ratio = | ||
739 | (t == 0) ? 0 : | ||
740 | (t < 100000) ? ((s*100)/t) : (s/(t/100)); | ||
741 | dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " | ||
742 | "transferred %luK total %luK\n", | ||
743 | ratio, | ||
744 | Bit2KB(mdev->rs_same_csum), | ||
745 | Bit2KB(mdev->rs_total - mdev->rs_same_csum), | ||
746 | Bit2KB(mdev->rs_total)); | ||
747 | } | ||
748 | } | ||
749 | |||
750 | if (mdev->rs_failed) { | ||
751 | dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed); | ||
752 | |||
753 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { | ||
754 | ns.disk = D_INCONSISTENT; | ||
755 | ns.pdsk = D_UP_TO_DATE; | ||
756 | } else { | ||
757 | ns.disk = D_UP_TO_DATE; | ||
758 | ns.pdsk = D_INCONSISTENT; | ||
759 | } | ||
760 | } else { | ||
761 | ns.disk = D_UP_TO_DATE; | ||
762 | ns.pdsk = D_UP_TO_DATE; | ||
763 | |||
764 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { | ||
765 | if (mdev->p_uuid) { | ||
766 | int i; | ||
767 | for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) | ||
768 | _drbd_uuid_set(mdev, i, mdev->p_uuid[i]); | ||
769 | drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]); | ||
770 | _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]); | ||
771 | } else { | ||
772 | dev_err(DEV, "mdev->p_uuid is NULL! BUG\n"); | ||
773 | } | ||
774 | } | ||
775 | |||
776 | drbd_uuid_set_bm(mdev, 0UL); | ||
777 | |||
778 | if (mdev->p_uuid) { | ||
779 | /* Now the two UUID sets are equal, update what we | ||
780 | * know of the peer. */ | ||
781 | int i; | ||
782 | for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) | ||
783 | mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; | ||
784 | } | ||
785 | } | ||
786 | |||
787 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
788 | out_unlock: | ||
789 | spin_unlock_irq(&mdev->req_lock); | ||
790 | put_ldev(mdev); | ||
791 | out: | ||
792 | mdev->rs_total = 0; | ||
793 | mdev->rs_failed = 0; | ||
794 | mdev->rs_paused = 0; | ||
795 | mdev->ov_start_sector = 0; | ||
796 | |||
797 | if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { | ||
798 | dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); | ||
799 | drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); | ||
800 | } | ||
801 | |||
802 | if (khelper_cmd) | ||
803 | drbd_khelper(mdev, khelper_cmd); | ||
804 | |||
805 | return 1; | ||
806 | } | ||
807 | |||
808 | /* helper */ | ||
809 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
810 | { | ||
811 | if (drbd_bio_has_active_page(e->private_bio)) { | ||
812 | /* This might happen if sendpage() has not finished */ | ||
813 | spin_lock_irq(&mdev->req_lock); | ||
814 | list_add_tail(&e->w.list, &mdev->net_ee); | ||
815 | spin_unlock_irq(&mdev->req_lock); | ||
816 | } else | ||
817 | drbd_free_ee(mdev, e); | ||
818 | } | ||
819 | |||
820 | /** | ||
821 | * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST | ||
822 | * @mdev: DRBD device. | ||
823 | * @w: work object. | ||
824 | * @cancel: The connection will be closed anyways | ||
825 | */ | ||
826 | int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
827 | { | ||
828 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
829 | int ok; | ||
830 | |||
831 | if (unlikely(cancel)) { | ||
832 | drbd_free_ee(mdev, e); | ||
833 | dec_unacked(mdev); | ||
834 | return 1; | ||
835 | } | ||
836 | |||
837 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
838 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | ||
839 | } else { | ||
840 | if (__ratelimit(&drbd_ratelimit_state)) | ||
841 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", | ||
842 | (unsigned long long)e->sector); | ||
843 | |||
844 | ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); | ||
845 | } | ||
846 | |||
847 | dec_unacked(mdev); | ||
848 | |||
849 | move_to_net_ee_or_free(mdev, e); | ||
850 | |||
851 | if (unlikely(!ok)) | ||
852 | dev_err(DEV, "drbd_send_block() failed\n"); | ||
853 | return ok; | ||
854 | } | ||
855 | |||
856 | /** | ||
857 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS | ||
858 | * @mdev: DRBD device. | ||
859 | * @w: work object. | ||
860 | * @cancel: The connection will be closed anyways | ||
861 | */ | ||
862 | int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
863 | { | ||
864 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
865 | int ok; | ||
866 | |||
867 | if (unlikely(cancel)) { | ||
868 | drbd_free_ee(mdev, e); | ||
869 | dec_unacked(mdev); | ||
870 | return 1; | ||
871 | } | ||
872 | |||
873 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
874 | drbd_rs_complete_io(mdev, e->sector); | ||
875 | put_ldev(mdev); | ||
876 | } | ||
877 | |||
878 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
879 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | ||
880 | inc_rs_pending(mdev); | ||
881 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | ||
882 | } else { | ||
883 | if (__ratelimit(&drbd_ratelimit_state)) | ||
884 | dev_err(DEV, "Not sending RSDataReply, " | ||
885 | "partner DISKLESS!\n"); | ||
886 | ok = 1; | ||
887 | } | ||
888 | } else { | ||
889 | if (__ratelimit(&drbd_ratelimit_state)) | ||
890 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", | ||
891 | (unsigned long long)e->sector); | ||
892 | |||
893 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
894 | |||
895 | /* update resync data with failure */ | ||
896 | drbd_rs_failed_io(mdev, e->sector, e->size); | ||
897 | } | ||
898 | |||
899 | dec_unacked(mdev); | ||
900 | |||
901 | move_to_net_ee_or_free(mdev, e); | ||
902 | |||
903 | if (unlikely(!ok)) | ||
904 | dev_err(DEV, "drbd_send_block() failed\n"); | ||
905 | return ok; | ||
906 | } | ||
907 | |||
908 | int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
909 | { | ||
910 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
911 | struct digest_info *di; | ||
912 | int digest_size; | ||
913 | void *digest = NULL; | ||
914 | int ok, eq = 0; | ||
915 | |||
916 | if (unlikely(cancel)) { | ||
917 | drbd_free_ee(mdev, e); | ||
918 | dec_unacked(mdev); | ||
919 | return 1; | ||
920 | } | ||
921 | |||
922 | drbd_rs_complete_io(mdev, e->sector); | ||
923 | |||
924 | di = (struct digest_info *)(unsigned long)e->block_id; | ||
925 | |||
926 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
927 | /* quick hack to try to avoid a race against reconfiguration. | ||
928 | * a real fix would be much more involved, | ||
929 | * introducing more locking mechanisms */ | ||
930 | if (mdev->csums_tfm) { | ||
931 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | ||
932 | D_ASSERT(digest_size == di->digest_size); | ||
933 | digest = kmalloc(digest_size, GFP_NOIO); | ||
934 | } | ||
935 | if (digest) { | ||
936 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | ||
937 | eq = !memcmp(digest, di->digest, digest_size); | ||
938 | kfree(digest); | ||
939 | } | ||
940 | |||
941 | if (eq) { | ||
942 | drbd_set_in_sync(mdev, e->sector, e->size); | ||
943 | mdev->rs_same_csum++; | ||
944 | ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); | ||
945 | } else { | ||
946 | inc_rs_pending(mdev); | ||
947 | e->block_id = ID_SYNCER; | ||
948 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | ||
949 | } | ||
950 | } else { | ||
951 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
952 | if (__ratelimit(&drbd_ratelimit_state)) | ||
953 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | ||
954 | } | ||
955 | |||
956 | dec_unacked(mdev); | ||
957 | |||
958 | kfree(di); | ||
959 | |||
960 | move_to_net_ee_or_free(mdev, e); | ||
961 | |||
962 | if (unlikely(!ok)) | ||
963 | dev_err(DEV, "drbd_send_block/ack() failed\n"); | ||
964 | return ok; | ||
965 | } | ||
966 | |||
967 | int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
968 | { | ||
969 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
970 | int digest_size; | ||
971 | void *digest; | ||
972 | int ok = 1; | ||
973 | |||
974 | if (unlikely(cancel)) | ||
975 | goto out; | ||
976 | |||
977 | if (unlikely(!drbd_bio_uptodate(e->private_bio))) | ||
978 | goto out; | ||
979 | |||
980 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | ||
981 | /* FIXME if this allocation fails, online verify will not terminate! */ | ||
982 | digest = kmalloc(digest_size, GFP_NOIO); | ||
983 | if (digest) { | ||
984 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | ||
985 | inc_rs_pending(mdev); | ||
986 | ok = drbd_send_drequest_csum(mdev, e->sector, e->size, | ||
987 | digest, digest_size, P_OV_REPLY); | ||
988 | if (!ok) | ||
989 | dec_rs_pending(mdev); | ||
990 | kfree(digest); | ||
991 | } | ||
992 | |||
993 | out: | ||
994 | drbd_free_ee(mdev, e); | ||
995 | |||
996 | dec_unacked(mdev); | ||
997 | |||
998 | return ok; | ||
999 | } | ||
1000 | |||
1001 | void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | ||
1002 | { | ||
1003 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { | ||
1004 | mdev->ov_last_oos_size += size>>9; | ||
1005 | } else { | ||
1006 | mdev->ov_last_oos_start = sector; | ||
1007 | mdev->ov_last_oos_size = size>>9; | ||
1008 | } | ||
1009 | drbd_set_out_of_sync(mdev, sector, size); | ||
1010 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
1011 | } | ||
1012 | |||
1013 | int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1014 | { | ||
1015 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
1016 | struct digest_info *di; | ||
1017 | int digest_size; | ||
1018 | void *digest; | ||
1019 | int ok, eq = 0; | ||
1020 | |||
1021 | if (unlikely(cancel)) { | ||
1022 | drbd_free_ee(mdev, e); | ||
1023 | dec_unacked(mdev); | ||
1024 | return 1; | ||
1025 | } | ||
1026 | |||
1027 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all | ||
1028 | * the resync lru has been cleaned up already */ | ||
1029 | drbd_rs_complete_io(mdev, e->sector); | ||
1030 | |||
1031 | di = (struct digest_info *)(unsigned long)e->block_id; | ||
1032 | |||
1033 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1034 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | ||
1035 | digest = kmalloc(digest_size, GFP_NOIO); | ||
1036 | if (digest) { | ||
1037 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | ||
1038 | |||
1039 | D_ASSERT(digest_size == di->digest_size); | ||
1040 | eq = !memcmp(digest, di->digest, digest_size); | ||
1041 | kfree(digest); | ||
1042 | } | ||
1043 | } else { | ||
1044 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
1045 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1046 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | ||
1047 | } | ||
1048 | |||
1049 | dec_unacked(mdev); | ||
1050 | |||
1051 | kfree(di); | ||
1052 | |||
1053 | if (!eq) | ||
1054 | drbd_ov_oos_found(mdev, e->sector, e->size); | ||
1055 | else | ||
1056 | ov_oos_print(mdev); | ||
1057 | |||
1058 | ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, | ||
1059 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); | ||
1060 | |||
1061 | drbd_free_ee(mdev, e); | ||
1062 | |||
1063 | if (--mdev->ov_left == 0) { | ||
1064 | ov_oos_print(mdev); | ||
1065 | drbd_resync_finished(mdev); | ||
1066 | } | ||
1067 | |||
1068 | return ok; | ||
1069 | } | ||
1070 | |||
1071 | int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1072 | { | ||
1073 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); | ||
1074 | complete(&b->done); | ||
1075 | return 1; | ||
1076 | } | ||
1077 | |||
1078 | int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1079 | { | ||
1080 | struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); | ||
1081 | struct p_barrier *p = &mdev->data.sbuf.barrier; | ||
1082 | int ok = 1; | ||
1083 | |||
1084 | /* really avoid racing with tl_clear. w.cb may have been referenced | ||
1085 | * just before it was reassigned and re-queued, so double check that. | ||
1086 | * actually, this race was harmless, since we only try to send the | ||
1087 | * barrier packet here, and otherwise do nothing with the object. | ||
1088 | * but compare with the head of w_clear_epoch */ | ||
1089 | spin_lock_irq(&mdev->req_lock); | ||
1090 | if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) | ||
1091 | cancel = 1; | ||
1092 | spin_unlock_irq(&mdev->req_lock); | ||
1093 | if (cancel) | ||
1094 | return 1; | ||
1095 | |||
1096 | if (!drbd_get_data_sock(mdev)) | ||
1097 | return 0; | ||
1098 | p->barrier = b->br_number; | ||
1099 | /* inc_ap_pending was done where this was queued. | ||
1100 | * dec_ap_pending will be done in got_BarrierAck | ||
1101 | * or (on connection loss) in w_clear_epoch. */ | ||
1102 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, | ||
1103 | (struct p_header *)p, sizeof(*p), 0); | ||
1104 | drbd_put_data_sock(mdev); | ||
1105 | |||
1106 | return ok; | ||
1107 | } | ||
1108 | |||
1109 | int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1110 | { | ||
1111 | if (cancel) | ||
1112 | return 1; | ||
1113 | return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); | ||
1114 | } | ||
1115 | |||
1116 | /** | ||
1117 | * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request | ||
1118 | * @mdev: DRBD device. | ||
1119 | * @w: work object. | ||
1120 | * @cancel: The connection will be closed anyways | ||
1121 | */ | ||
1122 | int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1123 | { | ||
1124 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
1125 | int ok; | ||
1126 | |||
1127 | if (unlikely(cancel)) { | ||
1128 | req_mod(req, send_canceled); | ||
1129 | return 1; | ||
1130 | } | ||
1131 | |||
1132 | ok = drbd_send_dblock(mdev, req); | ||
1133 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1134 | |||
1135 | return ok; | ||
1136 | } | ||
1137 | |||
1138 | /** | ||
1139 | * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet | ||
1140 | * @mdev: DRBD device. | ||
1141 | * @w: work object. | ||
1142 | * @cancel: The connection will be closed anyways | ||
1143 | */ | ||
1144 | int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1145 | { | ||
1146 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
1147 | int ok; | ||
1148 | |||
1149 | if (unlikely(cancel)) { | ||
1150 | req_mod(req, send_canceled); | ||
1151 | return 1; | ||
1152 | } | ||
1153 | |||
1154 | ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, | ||
1155 | (unsigned long)req); | ||
1156 | |||
1157 | if (!ok) { | ||
1158 | /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); | ||
1159 | * so this is probably redundant */ | ||
1160 | if (mdev->state.conn >= C_CONNECTED) | ||
1161 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
1162 | } | ||
1163 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1164 | |||
1165 | return ok; | ||
1166 | } | ||
1167 | |||
1168 | static int _drbd_may_sync_now(struct drbd_conf *mdev) | ||
1169 | { | ||
1170 | struct drbd_conf *odev = mdev; | ||
1171 | |||
1172 | while (1) { | ||
1173 | if (odev->sync_conf.after == -1) | ||
1174 | return 1; | ||
1175 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1176 | ERR_IF(!odev) return 1; | ||
1177 | if ((odev->state.conn >= C_SYNC_SOURCE && | ||
1178 | odev->state.conn <= C_PAUSED_SYNC_T) || | ||
1179 | odev->state.aftr_isp || odev->state.peer_isp || | ||
1180 | odev->state.user_isp) | ||
1181 | return 0; | ||
1182 | } | ||
1183 | } | ||
1184 | |||
1185 | /** | ||
1186 | * _drbd_pause_after() - Pause resync on all devices that may not resync now | ||
1187 | * @mdev: DRBD device. | ||
1188 | * | ||
1189 | * Called from process context only (admin command and after_state_ch). | ||
1190 | */ | ||
1191 | static int _drbd_pause_after(struct drbd_conf *mdev) | ||
1192 | { | ||
1193 | struct drbd_conf *odev; | ||
1194 | int i, rv = 0; | ||
1195 | |||
1196 | for (i = 0; i < minor_count; i++) { | ||
1197 | odev = minor_to_mdev(i); | ||
1198 | if (!odev) | ||
1199 | continue; | ||
1200 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | ||
1201 | continue; | ||
1202 | if (!_drbd_may_sync_now(odev)) | ||
1203 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) | ||
1204 | != SS_NOTHING_TO_DO); | ||
1205 | } | ||
1206 | |||
1207 | return rv; | ||
1208 | } | ||
1209 | |||
1210 | /** | ||
1211 | * _drbd_resume_next() - Resume resync on all devices that may resync now | ||
1212 | * @mdev: DRBD device. | ||
1213 | * | ||
1214 | * Called from process context only (admin command and worker). | ||
1215 | */ | ||
1216 | static int _drbd_resume_next(struct drbd_conf *mdev) | ||
1217 | { | ||
1218 | struct drbd_conf *odev; | ||
1219 | int i, rv = 0; | ||
1220 | |||
1221 | for (i = 0; i < minor_count; i++) { | ||
1222 | odev = minor_to_mdev(i); | ||
1223 | if (!odev) | ||
1224 | continue; | ||
1225 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | ||
1226 | continue; | ||
1227 | if (odev->state.aftr_isp) { | ||
1228 | if (_drbd_may_sync_now(odev)) | ||
1229 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), | ||
1230 | CS_HARD, NULL) | ||
1231 | != SS_NOTHING_TO_DO) ; | ||
1232 | } | ||
1233 | } | ||
1234 | return rv; | ||
1235 | } | ||
1236 | |||
1237 | void resume_next_sg(struct drbd_conf *mdev) | ||
1238 | { | ||
1239 | write_lock_irq(&global_state_lock); | ||
1240 | _drbd_resume_next(mdev); | ||
1241 | write_unlock_irq(&global_state_lock); | ||
1242 | } | ||
1243 | |||
1244 | void suspend_other_sg(struct drbd_conf *mdev) | ||
1245 | { | ||
1246 | write_lock_irq(&global_state_lock); | ||
1247 | _drbd_pause_after(mdev); | ||
1248 | write_unlock_irq(&global_state_lock); | ||
1249 | } | ||
1250 | |||
1251 | static int sync_after_error(struct drbd_conf *mdev, int o_minor) | ||
1252 | { | ||
1253 | struct drbd_conf *odev; | ||
1254 | |||
1255 | if (o_minor == -1) | ||
1256 | return NO_ERROR; | ||
1257 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | ||
1258 | return ERR_SYNC_AFTER; | ||
1259 | |||
1260 | /* check for loops */ | ||
1261 | odev = minor_to_mdev(o_minor); | ||
1262 | while (1) { | ||
1263 | if (odev == mdev) | ||
1264 | return ERR_SYNC_AFTER_CYCLE; | ||
1265 | |||
1266 | /* dependency chain ends here, no cycles. */ | ||
1267 | if (odev->sync_conf.after == -1) | ||
1268 | return NO_ERROR; | ||
1269 | |||
1270 | /* follow the dependency chain */ | ||
1271 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1272 | } | ||
1273 | } | ||
1274 | |||
1275 | int drbd_alter_sa(struct drbd_conf *mdev, int na) | ||
1276 | { | ||
1277 | int changes; | ||
1278 | int retcode; | ||
1279 | |||
1280 | write_lock_irq(&global_state_lock); | ||
1281 | retcode = sync_after_error(mdev, na); | ||
1282 | if (retcode == NO_ERROR) { | ||
1283 | mdev->sync_conf.after = na; | ||
1284 | do { | ||
1285 | changes = _drbd_pause_after(mdev); | ||
1286 | changes |= _drbd_resume_next(mdev); | ||
1287 | } while (changes); | ||
1288 | } | ||
1289 | write_unlock_irq(&global_state_lock); | ||
1290 | return retcode; | ||
1291 | } | ||
1292 | |||
1293 | /** | ||
1294 | * drbd_start_resync() - Start the resync process | ||
1295 | * @mdev: DRBD device. | ||
1296 | * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET | ||
1297 | * | ||
1298 | * This function might bring you directly into one of the | ||
1299 | * C_PAUSED_SYNC_* states. | ||
1300 | */ | ||
1301 | void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | ||
1302 | { | ||
1303 | union drbd_state ns; | ||
1304 | int r; | ||
1305 | |||
1306 | if (mdev->state.conn >= C_SYNC_SOURCE) { | ||
1307 | dev_err(DEV, "Resync already running!\n"); | ||
1308 | return; | ||
1309 | } | ||
1310 | |||
1311 | /* In case a previous resync run was aborted by an IO error/detach on the peer. */ | ||
1312 | drbd_rs_cancel_all(mdev); | ||
1313 | |||
1314 | if (side == C_SYNC_TARGET) { | ||
1315 | /* Since application IO was locked out during C_WF_BITMAP_T and | ||
1316 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET | ||
1317 | we check that we might make the data inconsistent. */ | ||
1318 | r = drbd_khelper(mdev, "before-resync-target"); | ||
1319 | r = (r >> 8) & 0xff; | ||
1320 | if (r > 0) { | ||
1321 | dev_info(DEV, "before-resync-target handler returned %d, " | ||
1322 | "dropping connection.\n", r); | ||
1323 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1324 | return; | ||
1325 | } | ||
1326 | } | ||
1327 | |||
1328 | drbd_state_lock(mdev); | ||
1329 | |||
1330 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1331 | drbd_state_unlock(mdev); | ||
1332 | return; | ||
1333 | } | ||
1334 | |||
1335 | if (side == C_SYNC_TARGET) { | ||
1336 | mdev->bm_resync_fo = 0; | ||
1337 | } else /* side == C_SYNC_SOURCE */ { | ||
1338 | u64 uuid; | ||
1339 | |||
1340 | get_random_bytes(&uuid, sizeof(u64)); | ||
1341 | drbd_uuid_set(mdev, UI_BITMAP, uuid); | ||
1342 | drbd_send_sync_uuid(mdev, uuid); | ||
1343 | |||
1344 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); | ||
1345 | } | ||
1346 | |||
1347 | write_lock_irq(&global_state_lock); | ||
1348 | ns = mdev->state; | ||
1349 | |||
1350 | ns.aftr_isp = !_drbd_may_sync_now(mdev); | ||
1351 | |||
1352 | ns.conn = side; | ||
1353 | |||
1354 | if (side == C_SYNC_TARGET) | ||
1355 | ns.disk = D_INCONSISTENT; | ||
1356 | else /* side == C_SYNC_SOURCE */ | ||
1357 | ns.pdsk = D_INCONSISTENT; | ||
1358 | |||
1359 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
1360 | ns = mdev->state; | ||
1361 | |||
1362 | if (ns.conn < C_CONNECTED) | ||
1363 | r = SS_UNKNOWN_ERROR; | ||
1364 | |||
1365 | if (r == SS_SUCCESS) { | ||
1366 | mdev->rs_total = | ||
1367 | mdev->rs_mark_left = drbd_bm_total_weight(mdev); | ||
1368 | mdev->rs_failed = 0; | ||
1369 | mdev->rs_paused = 0; | ||
1370 | mdev->rs_start = | ||
1371 | mdev->rs_mark_time = jiffies; | ||
1372 | mdev->rs_same_csum = 0; | ||
1373 | _drbd_pause_after(mdev); | ||
1374 | } | ||
1375 | write_unlock_irq(&global_state_lock); | ||
1376 | drbd_state_unlock(mdev); | ||
1377 | put_ldev(mdev); | ||
1378 | |||
1379 | if (r == SS_SUCCESS) { | ||
1380 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", | ||
1381 | drbd_conn_str(ns.conn), | ||
1382 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | ||
1383 | (unsigned long) mdev->rs_total); | ||
1384 | |||
1385 | if (mdev->rs_total == 0) { | ||
1386 | /* Peer still reachable? Beware of failing before-resync-target handlers! */ | ||
1387 | request_ping(mdev); | ||
1388 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1389 | schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ | ||
1390 | drbd_resync_finished(mdev); | ||
1391 | return; | ||
1392 | } | ||
1393 | |||
1394 | /* ns.conn may already be != mdev->state.conn, | ||
1395 | * we may have been paused in between, or become paused until | ||
1396 | * the timer triggers. | ||
1397 | * No matter, that is handled in resync_timer_fn() */ | ||
1398 | if (ns.conn == C_SYNC_TARGET) | ||
1399 | mod_timer(&mdev->resync_timer, jiffies); | ||
1400 | |||
1401 | drbd_md_sync(mdev); | ||
1402 | } | ||
1403 | } | ||
1404 | |||
1405 | int drbd_worker(struct drbd_thread *thi) | ||
1406 | { | ||
1407 | struct drbd_conf *mdev = thi->mdev; | ||
1408 | struct drbd_work *w = NULL; | ||
1409 | LIST_HEAD(work_list); | ||
1410 | int intr = 0, i; | ||
1411 | |||
1412 | sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); | ||
1413 | |||
1414 | while (get_t_state(thi) == Running) { | ||
1415 | drbd_thread_current_set_cpu(mdev); | ||
1416 | |||
1417 | if (down_trylock(&mdev->data.work.s)) { | ||
1418 | mutex_lock(&mdev->data.mutex); | ||
1419 | if (mdev->data.socket && !mdev->net_conf->no_cork) | ||
1420 | drbd_tcp_uncork(mdev->data.socket); | ||
1421 | mutex_unlock(&mdev->data.mutex); | ||
1422 | |||
1423 | intr = down_interruptible(&mdev->data.work.s); | ||
1424 | |||
1425 | mutex_lock(&mdev->data.mutex); | ||
1426 | if (mdev->data.socket && !mdev->net_conf->no_cork) | ||
1427 | drbd_tcp_cork(mdev->data.socket); | ||
1428 | mutex_unlock(&mdev->data.mutex); | ||
1429 | } | ||
1430 | |||
1431 | if (intr) { | ||
1432 | D_ASSERT(intr == -EINTR); | ||
1433 | flush_signals(current); | ||
1434 | ERR_IF (get_t_state(thi) == Running) | ||
1435 | continue; | ||
1436 | break; | ||
1437 | } | ||
1438 | |||
1439 | if (get_t_state(thi) != Running) | ||
1440 | break; | ||
1441 | /* With this break, we have done a down() but not consumed | ||
1442 | the entry from the list. The cleanup code takes care of | ||
1443 | this... */ | ||
1444 | |||
1445 | w = NULL; | ||
1446 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1447 | ERR_IF(list_empty(&mdev->data.work.q)) { | ||
1448 | /* something terribly wrong in our logic. | ||
1449 | * we were able to down() the semaphore, | ||
1450 | * but the list is empty... doh. | ||
1451 | * | ||
1452 | * what is the best thing to do now? | ||
1453 | * try again from scratch, restarting the receiver, | ||
1454 | * asender, whatnot? could break even more ugly, | ||
1455 | * e.g. when we are primary, but no good local data. | ||
1456 | * | ||
1457 | * I'll try to get away just starting over this loop. | ||
1458 | */ | ||
1459 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1460 | continue; | ||
1461 | } | ||
1462 | w = list_entry(mdev->data.work.q.next, struct drbd_work, list); | ||
1463 | list_del_init(&w->list); | ||
1464 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1465 | |||
1466 | if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { | ||
1467 | /* dev_warn(DEV, "worker: a callback failed! \n"); */ | ||
1468 | if (mdev->state.conn >= C_CONNECTED) | ||
1469 | drbd_force_state(mdev, | ||
1470 | NS(conn, C_NETWORK_FAILURE)); | ||
1471 | } | ||
1472 | } | ||
1473 | D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); | ||
1474 | D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); | ||
1475 | |||
1476 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1477 | i = 0; | ||
1478 | while (!list_empty(&mdev->data.work.q)) { | ||
1479 | list_splice_init(&mdev->data.work.q, &work_list); | ||
1480 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1481 | |||
1482 | while (!list_empty(&work_list)) { | ||
1483 | w = list_entry(work_list.next, struct drbd_work, list); | ||
1484 | list_del_init(&w->list); | ||
1485 | w->cb(mdev, w, 1); | ||
1486 | i++; /* dead debugging code */ | ||
1487 | } | ||
1488 | |||
1489 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1490 | } | ||
1491 | sema_init(&mdev->data.work.s, 0); | ||
1492 | /* DANGEROUS race: if someone did queue his work within the spinlock, | ||
1493 | * but up() ed outside the spinlock, we could get an up() on the | ||
1494 | * semaphore without corresponding list entry. | ||
1495 | * So don't do that. | ||
1496 | */ | ||
1497 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1498 | |||
1499 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1500 | /* _drbd_set_state only uses stop_nowait. | ||
1501 | * wait here for the Exiting receiver. */ | ||
1502 | drbd_thread_stop(&mdev->receiver); | ||
1503 | drbd_mdev_cleanup(mdev); | ||
1504 | |||
1505 | dev_info(DEV, "worker terminated\n"); | ||
1506 | |||
1507 | clear_bit(DEVICE_DYING, &mdev->flags); | ||
1508 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
1509 | wake_up(&mdev->state_wait); | ||
1510 | |||
1511 | return 0; | ||
1512 | } | ||
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h new file mode 100644 index 000000000000..f93fa111ce50 --- /dev/null +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -0,0 +1,91 @@ | |||
1 | #ifndef _DRBD_WRAPPERS_H | ||
2 | #define _DRBD_WRAPPERS_H | ||
3 | |||
4 | #include <linux/ctype.h> | ||
5 | #include <linux/mm.h> | ||
6 | |||
7 | /* see get_sb_bdev and bd_claim */ | ||
8 | extern char *drbd_sec_holder; | ||
9 | |||
10 | /* sets the number of 512 byte sectors of our virtual device */ | ||
11 | static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | ||
12 | sector_t size) | ||
13 | { | ||
14 | /* set_capacity(mdev->this_bdev->bd_disk, size); */ | ||
15 | set_capacity(mdev->vdisk, size); | ||
16 | mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; | ||
17 | } | ||
18 | |||
/* non-zero if the BIO_UPTODATE flag is set on @bio */
#define drbd_bio_uptodate(bio) bio_flagged((bio), BIO_UPTODATE)
20 | |||
21 | static inline int drbd_bio_has_active_page(struct bio *bio) | ||
22 | { | ||
23 | struct bio_vec *bvec; | ||
24 | int i; | ||
25 | |||
26 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
27 | if (page_count(bvec->bv_page) > 1) | ||
28 | return 1; | ||
29 | } | ||
30 | |||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | /* bi_end_io handlers */ | ||
35 | extern void drbd_md_io_complete(struct bio *bio, int error); | ||
36 | extern void drbd_endio_read_sec(struct bio *bio, int error); | ||
37 | extern void drbd_endio_write_sec(struct bio *bio, int error); | ||
38 | extern void drbd_endio_pri(struct bio *bio, int error); | ||
39 | |||
40 | /* | ||
41 | * used to submit our private bio | ||
42 | */ | ||
43 | static inline void drbd_generic_make_request(struct drbd_conf *mdev, | ||
44 | int fault_type, struct bio *bio) | ||
45 | { | ||
46 | __release(local); | ||
47 | if (!bio->bi_bdev) { | ||
48 | printk(KERN_ERR "drbd%d: drbd_generic_make_request: " | ||
49 | "bio->bi_bdev == NULL\n", | ||
50 | mdev_to_minor(mdev)); | ||
51 | dump_stack(); | ||
52 | bio_endio(bio, -ENODEV); | ||
53 | return; | ||
54 | } | ||
55 | |||
56 | if (FAULT_ACTIVE(mdev, fault_type)) | ||
57 | bio_endio(bio, -EIO); | ||
58 | else | ||
59 | generic_make_request(bio); | ||
60 | } | ||
61 | |||
62 | static inline void drbd_plug_device(struct drbd_conf *mdev) | ||
63 | { | ||
64 | struct request_queue *q; | ||
65 | q = bdev_get_queue(mdev->this_bdev); | ||
66 | |||
67 | spin_lock_irq(q->queue_lock); | ||
68 | |||
69 | /* XXX the check on !blk_queue_plugged is redundant, | ||
70 | * implicitly checked in blk_plug_device */ | ||
71 | |||
72 | if (!blk_queue_plugged(q)) { | ||
73 | blk_plug_device(q); | ||
74 | del_timer(&q->unplug_timer); | ||
75 | /* unplugging should not happen automatically... */ | ||
76 | } | ||
77 | spin_unlock_irq(q->queue_lock); | ||
78 | } | ||
79 | |||
80 | static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) | ||
81 | { | ||
82 | return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) | ||
83 | == CRYPTO_ALG_TYPE_HASH; | ||
84 | } | ||
85 | |||
/* When __CHECKER__ is not defined, replace any existing __cond_lock()
 * definition with one that simply evaluates to the condition. */
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
#endif
90 | |||
91 | #endif | ||