diff options
author | Philipp Reisner <philipp.reisner@linbit.com> | 2009-09-25 19:07:19 -0400 |
---|---|---|
committer | Jens Axboe <jens.axboe@oracle.com> | 2009-10-01 15:17:49 -0400 |
commit | b411b3637fa71fce9cf2acf0639009500f5892fe (patch) | |
tree | 6b88e5202e0f137fef50e95b0441bcafdbf91990 /drivers | |
parent | 1a35e0f6443f4266dad4c569c55c57a9032596fa (diff) |
The DRBD driver
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/block/Kconfig | 2 | ||||
-rw-r--r-- | drivers/block/Makefile | 1 | ||||
-rw-r--r-- | drivers/block/drbd/Kconfig | 82 | ||||
-rw-r--r-- | drivers/block/drbd/Makefile | 8 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 1484 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 1327 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 2258 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 3735 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 2365 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 266 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 4456 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 1132 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 327 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_strings.c | 113 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_tracing.c | 752 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_tracing.h | 87 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_vli.h | 351 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 1529 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_wrappers.h | 91 |
19 files changed, 20366 insertions, 0 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 1d886e079c5..77bfce52e9c 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP | |||
271 | instead, which can be configured to be on-disk compatible with the | 271 | instead, which can be configured to be on-disk compatible with the |
272 | cryptoloop device. | 272 | cryptoloop device. |
273 | 273 | ||
274 | source "drivers/block/drbd/Kconfig" | ||
275 | |||
274 | config BLK_DEV_NBD | 276 | config BLK_DEV_NBD |
275 | tristate "Network block device support" | 277 | tristate "Network block device support" |
276 | depends on NET | 278 | depends on NET |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile index cdaa3f8fddf..aff5ac925c3 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile | |||
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o | |||
36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o | 36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o |
37 | 37 | ||
38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o | 38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o |
39 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ | ||
39 | 40 | ||
40 | swim_mod-objs := swim.o swim_asm.o | 41 | swim_mod-objs := swim.o swim_asm.o |
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig new file mode 100644 index 00000000000..4e6f90f487c --- /dev/null +++ b/drivers/block/drbd/Kconfig | |||
@@ -0,0 +1,82 @@ | |||
1 | # | ||
2 | # DRBD device driver configuration | ||
3 | # | ||
4 | |||
5 | comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" | ||
6 | depends on !PROC_FS || !INET || !CONNECTOR | ||
7 | |||
8 | config BLK_DEV_DRBD | ||
9 | tristate "DRBD Distributed Replicated Block Device support" | ||
10 | depends on PROC_FS && INET && CONNECTOR | ||
11 | select LRU_CACHE | ||
12 | default n | ||
13 | help | ||
14 | |||
15 | NOTE: In order to authenticate connections you have to select | ||
16 | CRYPTO_HMAC and a hash function as well. | ||
17 | |||
18 | DRBD is a shared-nothing, synchronously replicated block device. It | ||
19 | is designed to serve as a building block for high availability | ||
20 | clusters and in this context, is a "drop-in" replacement for shared | ||
21 | storage. Simplistically, you could see it as a network RAID 1. | ||
22 | |||
23 | Each minor device has a role, which can be 'primary' or 'secondary'. | ||
24 | On the node with the primary device the application is supposed to | ||
25 | run and to access the device (/dev/drbdX). Every write is sent to | ||
26 | the local 'lower level block device' and, across the network, to the | ||
27 | node with the device in 'secondary' state. The secondary device | ||
28 | simply writes the data to its lower level block device. | ||
29 | |||
30 | DRBD can also be used in dual-Primary mode (device writable on both | ||
31 | nodes), which means it can exhibit shared disk semantics in a | ||
32 | shared-nothing cluster. Needless to say, on top of dual-Primary | ||
33 | DRBD a cluster file system is necessary to maintain | ||
34 | cache coherency. | ||
35 | |||
36 | For automatic failover you need a cluster manager (e.g. heartbeat). | ||
37 | See also: http://www.drbd.org/, http://www.linux-ha.org | ||
38 | |||
39 | If unsure, say N. | ||
40 | |||
41 | config DRBD_TRACE | ||
42 | tristate "DRBD tracing" | ||
43 | depends on BLK_DEV_DRBD | ||
44 | select TRACEPOINTS | ||
45 | default n | ||
46 | help | ||
47 | |||
48 | Say Y here if you want to be able to trace various events in DRBD. | ||
49 | |||
50 | If unsure, say N. | ||
51 | |||
52 | config DRBD_FAULT_INJECTION | ||
53 | bool "DRBD fault injection" | ||
54 | depends on BLK_DEV_DRBD | ||
55 | help | ||
56 | |||
57 | Say Y here if you want to simulate IO errors, in order to test DRBD's | ||
58 | behavior. | ||
59 | |||
60 | The actual simulation of IO errors is done by writing 3 values to | ||
61 | /sys/module/drbd/parameters/ | ||
62 | |||
63 | enable_faults: bitmask of... | ||
64 | 1 meta data write | ||
65 | 2 meta data read | ||
66 | 4 resync data write | ||
67 | 8 resync data read | ||
68 | 16 data write | ||
69 | 32 data read | ||
70 | 64 read ahead | ||
71 | 128 kmalloc of bitmap | ||
72 | 256 allocation of EE (epoch_entries) | ||
73 | |||
74 | fault_devs: bitmask of minor numbers | ||
75 | fault_rate: frequency in percent | ||
76 | |||
77 | Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. | ||
78 | echo 16 > /sys/module/drbd/parameters/enable_faults | ||
79 | echo 1 > /sys/module/drbd/parameters/fault_devs | ||
80 | echo 5 > /sys/module/drbd/parameters/fault_rate | ||
81 | |||
82 | If unsure, say N. | ||
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile new file mode 100644 index 00000000000..7d86ef8a8b4 --- /dev/null +++ b/drivers/block/drbd/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | drbd-y := drbd_bitmap.o drbd_proc.o | ||
2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o | ||
3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o | ||
4 | |||
5 | drbd_trace-y := drbd_tracing.o | ||
6 | |||
7 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o | ||
8 | obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o | ||
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c new file mode 100644 index 00000000000..74b4835d310 --- /dev/null +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -0,0 +1,1484 @@ | |||
1 | /* | ||
2 | drbd_actlog.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/slab.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include "drbd_int.h" | ||
29 | #include "drbd_tracing.h" | ||
30 | #include "drbd_wrappers.h" | ||
31 | |||
32 | /* We maintain a trivial check sum in our on disk activity log. | ||
33 | * With that we can ensure correct operation even when the storage | ||
34 | * device might do a partial (last) sector write while loosing power. | ||
35 | */ | ||
36 | struct __packed al_transaction { | ||
37 | u32 magic; | ||
38 | u32 tr_number; | ||
39 | struct __packed { | ||
40 | u32 pos; | ||
41 | u32 extent; } updates[1 + AL_EXTENTS_PT]; | ||
42 | u32 xor_sum; | ||
43 | }; | ||
44 | |||
/* Work item for the worker thread that carries one extent number.
 * NOTE(review): presumably "odbm" means on disk bitmap; the code that
 * queues and handles this item is not visible here -- confirm. */
struct update_odbm_work {
	struct drbd_work w;
	unsigned int enr;
};
49 | |||
/* Request to write one activity log transaction.  drbd_al_begin_io() fills
 * it in on its stack, queues it to the worker with w.cb set to
 * w_al_write_transaction(), and then waits on 'event'. */
struct update_al_work {
	struct drbd_work w;
	/* activity log element that is being assigned to extent 'enr' */
	struct lc_element *al_ext;
	/* completed by w_al_write_transaction() once it is done */
	struct completion event;
	/* extent number the element is changed to */
	unsigned int enr;
	/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
	unsigned int old_enr;
};
58 | |||
/* Shared wait context for the bios submitted by drbd_al_to_on_disk_bm()
 * ("activity log to on disk bitmap"). */
struct drbd_atodb_wait {
	/* incremented per prepared bio, decremented in atodb_endio() */
	atomic_t count;
	/* completed by atodb_endio() when count drops to zero */
	struct completion io_done;
	struct drbd_conf *mdev;
	/* first non-zero error seen by atodb_endio(), 0 otherwise */
	int error;
};
65 | |||
66 | |||
67 | int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); | ||
68 | |||
/* Variadic front end for the resync tracepoint: the tracepoint itself needs
 * a constant number of known arguments, so the variable arguments are
 * packed into a va_list and handed to trace__drbd_resync().
 */
void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	trace__drbd_resync(mdev, level, fmt, args);
	va_end(args);
}
79 | |||
80 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | ||
81 | struct drbd_backing_dev *bdev, | ||
82 | struct page *page, sector_t sector, | ||
83 | int rw, int size) | ||
84 | { | ||
85 | struct bio *bio; | ||
86 | struct drbd_md_io md_io; | ||
87 | int ok; | ||
88 | |||
89 | md_io.mdev = mdev; | ||
90 | init_completion(&md_io.event); | ||
91 | md_io.error = 0; | ||
92 | |||
93 | if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) | ||
94 | rw |= (1 << BIO_RW_BARRIER); | ||
95 | rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)); | ||
96 | |||
97 | retry: | ||
98 | bio = bio_alloc(GFP_NOIO, 1); | ||
99 | bio->bi_bdev = bdev->md_bdev; | ||
100 | bio->bi_sector = sector; | ||
101 | ok = (bio_add_page(bio, page, size, 0) == size); | ||
102 | if (!ok) | ||
103 | goto out; | ||
104 | bio->bi_private = &md_io; | ||
105 | bio->bi_end_io = drbd_md_io_complete; | ||
106 | bio->bi_rw = rw; | ||
107 | |||
108 | trace_drbd_bio(mdev, "Md", bio, 0, NULL); | ||
109 | |||
110 | if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) | ||
111 | bio_endio(bio, -EIO); | ||
112 | else | ||
113 | submit_bio(rw, bio); | ||
114 | wait_for_completion(&md_io.event); | ||
115 | ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; | ||
116 | |||
117 | /* check for unsupported barrier op. | ||
118 | * would rather check on EOPNOTSUPP, but that is not reliable. | ||
119 | * don't try again for ANY return value != 0 */ | ||
120 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { | ||
121 | /* Try again with no barrier */ | ||
122 | dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); | ||
123 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
124 | rw &= ~(1 << BIO_RW_BARRIER); | ||
125 | bio_put(bio); | ||
126 | goto retry; | ||
127 | } | ||
128 | out: | ||
129 | bio_put(bio); | ||
130 | return ok; | ||
131 | } | ||
132 | |||
133 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | ||
134 | sector_t sector, int rw) | ||
135 | { | ||
136 | int logical_block_size, mask, ok; | ||
137 | int offset = 0; | ||
138 | struct page *iop = mdev->md_io_page; | ||
139 | |||
140 | D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); | ||
141 | |||
142 | BUG_ON(!bdev->md_bdev); | ||
143 | |||
144 | logical_block_size = bdev_logical_block_size(bdev->md_bdev); | ||
145 | if (logical_block_size == 0) | ||
146 | logical_block_size = MD_SECTOR_SIZE; | ||
147 | |||
148 | /* in case logical_block_size != 512 [ s390 only? ] */ | ||
149 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
150 | mask = (logical_block_size / MD_SECTOR_SIZE) - 1; | ||
151 | D_ASSERT(mask == 1 || mask == 3 || mask == 7); | ||
152 | D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); | ||
153 | offset = sector & mask; | ||
154 | sector = sector & ~mask; | ||
155 | iop = mdev->md_io_tmpp; | ||
156 | |||
157 | if (rw & WRITE) { | ||
158 | /* these are GFP_KERNEL pages, pre-allocated | ||
159 | * on device initialization */ | ||
160 | void *p = page_address(mdev->md_io_page); | ||
161 | void *hp = page_address(mdev->md_io_tmpp); | ||
162 | |||
163 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, | ||
164 | READ, logical_block_size); | ||
165 | |||
166 | if (unlikely(!ok)) { | ||
167 | dev_err(DEV, "drbd_md_sync_page_io(,%llus," | ||
168 | "READ [logical_block_size!=512]) failed!\n", | ||
169 | (unsigned long long)sector); | ||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | if (sector < drbd_md_first_sector(bdev) || | ||
178 | sector > drbd_md_last_sector(bdev)) | ||
179 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", | ||
180 | current->comm, current->pid, __func__, | ||
181 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | ||
182 | |||
183 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); | ||
184 | if (unlikely(!ok)) { | ||
185 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", | ||
186 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | ||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { | ||
191 | void *p = page_address(mdev->md_io_page); | ||
192 | void *hp = page_address(mdev->md_io_tmpp); | ||
193 | |||
194 | memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); | ||
195 | } | ||
196 | |||
197 | return ok; | ||
198 | } | ||
199 | |||
200 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | ||
201 | { | ||
202 | struct lc_element *al_ext; | ||
203 | struct lc_element *tmp; | ||
204 | unsigned long al_flags = 0; | ||
205 | |||
206 | spin_lock_irq(&mdev->al_lock); | ||
207 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | ||
208 | if (unlikely(tmp != NULL)) { | ||
209 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | ||
210 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
211 | spin_unlock_irq(&mdev->al_lock); | ||
212 | return NULL; | ||
213 | } | ||
214 | } | ||
215 | al_ext = lc_get(mdev->act_log, enr); | ||
216 | al_flags = mdev->act_log->flags; | ||
217 | spin_unlock_irq(&mdev->al_lock); | ||
218 | |||
219 | /* | ||
220 | if (!al_ext) { | ||
221 | if (al_flags & LC_STARVING) | ||
222 | dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); | ||
223 | if (al_flags & LC_DIRTY) | ||
224 | dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); | ||
225 | } | ||
226 | */ | ||
227 | |||
228 | return al_ext; | ||
229 | } | ||
230 | |||
231 | void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
232 | { | ||
233 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
234 | struct lc_element *al_ext; | ||
235 | struct update_al_work al_work; | ||
236 | |||
237 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | ||
238 | |||
239 | trace_drbd_actlog(mdev, sector, "al_begin_io"); | ||
240 | |||
241 | wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); | ||
242 | |||
243 | if (al_ext->lc_number != enr) { | ||
244 | /* drbd_al_write_transaction(mdev,al_ext,enr); | ||
245 | * recurses into generic_make_request(), which | ||
246 | * disallows recursion, bios being serialized on the | ||
247 | * current->bio_tail list now. | ||
248 | * we have to delegate updates to the activity log | ||
249 | * to the worker thread. */ | ||
250 | init_completion(&al_work.event); | ||
251 | al_work.al_ext = al_ext; | ||
252 | al_work.enr = enr; | ||
253 | al_work.old_enr = al_ext->lc_number; | ||
254 | al_work.w.cb = w_al_write_transaction; | ||
255 | drbd_queue_work_front(&mdev->data.work, &al_work.w); | ||
256 | wait_for_completion(&al_work.event); | ||
257 | |||
258 | mdev->al_writ_cnt++; | ||
259 | |||
260 | spin_lock_irq(&mdev->al_lock); | ||
261 | lc_changed(mdev->act_log, al_ext); | ||
262 | spin_unlock_irq(&mdev->al_lock); | ||
263 | wake_up(&mdev->al_wait); | ||
264 | } | ||
265 | } | ||
266 | |||
267 | void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) | ||
268 | { | ||
269 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
270 | struct lc_element *extent; | ||
271 | unsigned long flags; | ||
272 | |||
273 | trace_drbd_actlog(mdev, sector, "al_complete_io"); | ||
274 | |||
275 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
276 | |||
277 | extent = lc_find(mdev->act_log, enr); | ||
278 | |||
279 | if (!extent) { | ||
280 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
281 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); | ||
282 | return; | ||
283 | } | ||
284 | |||
285 | if (lc_put(mdev->act_log, extent) == 0) | ||
286 | wake_up(&mdev->al_wait); | ||
287 | |||
288 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
289 | } | ||
290 | |||
291 | int | ||
292 | w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
293 | { | ||
294 | struct update_al_work *aw = container_of(w, struct update_al_work, w); | ||
295 | struct lc_element *updated = aw->al_ext; | ||
296 | const unsigned int new_enr = aw->enr; | ||
297 | const unsigned int evicted = aw->old_enr; | ||
298 | struct al_transaction *buffer; | ||
299 | sector_t sector; | ||
300 | int i, n, mx; | ||
301 | unsigned int extent_nr; | ||
302 | u32 xor_sum = 0; | ||
303 | |||
304 | if (!get_ldev(mdev)) { | ||
305 | dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); | ||
306 | complete(&((struct update_al_work *)w)->event); | ||
307 | return 1; | ||
308 | } | ||
309 | /* do we have to do a bitmap write, first? | ||
310 | * TODO reduce maximum latency: | ||
311 | * submit both bios, then wait for both, | ||
312 | * instead of doing two synchronous sector writes. */ | ||
313 | if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) | ||
314 | drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); | ||
315 | |||
316 | mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ | ||
317 | buffer = (struct al_transaction *)page_address(mdev->md_io_page); | ||
318 | |||
319 | buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); | ||
320 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); | ||
321 | |||
322 | n = lc_index_of(mdev->act_log, updated); | ||
323 | |||
324 | buffer->updates[0].pos = cpu_to_be32(n); | ||
325 | buffer->updates[0].extent = cpu_to_be32(new_enr); | ||
326 | |||
327 | xor_sum ^= new_enr; | ||
328 | |||
329 | mx = min_t(int, AL_EXTENTS_PT, | ||
330 | mdev->act_log->nr_elements - mdev->al_tr_cycle); | ||
331 | for (i = 0; i < mx; i++) { | ||
332 | unsigned idx = mdev->al_tr_cycle + i; | ||
333 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; | ||
334 | buffer->updates[i+1].pos = cpu_to_be32(idx); | ||
335 | buffer->updates[i+1].extent = cpu_to_be32(extent_nr); | ||
336 | xor_sum ^= extent_nr; | ||
337 | } | ||
338 | for (; i < AL_EXTENTS_PT; i++) { | ||
339 | buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); | ||
340 | buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); | ||
341 | xor_sum ^= LC_FREE; | ||
342 | } | ||
343 | mdev->al_tr_cycle += AL_EXTENTS_PT; | ||
344 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | ||
345 | mdev->al_tr_cycle = 0; | ||
346 | |||
347 | buffer->xor_sum = cpu_to_be32(xor_sum); | ||
348 | |||
349 | sector = mdev->ldev->md.md_offset | ||
350 | + mdev->ldev->md.al_offset + mdev->al_tr_pos; | ||
351 | |||
352 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) | ||
353 | drbd_chk_io_error(mdev, 1, TRUE); | ||
354 | |||
355 | if (++mdev->al_tr_pos > | ||
356 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
357 | mdev->al_tr_pos = 0; | ||
358 | |||
359 | D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); | ||
360 | mdev->al_tr_number++; | ||
361 | |||
362 | mutex_unlock(&mdev->md_io_mutex); | ||
363 | |||
364 | complete(&((struct update_al_work *)w)->event); | ||
365 | put_ldev(mdev); | ||
366 | |||
367 | return 1; | ||
368 | } | ||
369 | |||
370 | /** | ||
371 | * drbd_al_read_tr() - Read a single transaction from the on disk activity log | ||
372 | * @mdev: DRBD device. | ||
373 | * @bdev: Block device to read form. | ||
374 | * @b: pointer to an al_transaction. | ||
375 | * @index: On disk slot of the transaction to read. | ||
376 | * | ||
377 | * Returns -1 on IO error, 0 on checksum error and 1 upon success. | ||
378 | */ | ||
379 | static int drbd_al_read_tr(struct drbd_conf *mdev, | ||
380 | struct drbd_backing_dev *bdev, | ||
381 | struct al_transaction *b, | ||
382 | int index) | ||
383 | { | ||
384 | sector_t sector; | ||
385 | int rv, i; | ||
386 | u32 xor_sum = 0; | ||
387 | |||
388 | sector = bdev->md.md_offset + bdev->md.al_offset + index; | ||
389 | |||
390 | /* Dont process error normally, | ||
391 | * as this is done before disk is attached! */ | ||
392 | if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) | ||
393 | return -1; | ||
394 | |||
395 | rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); | ||
396 | |||
397 | for (i = 0; i < AL_EXTENTS_PT + 1; i++) | ||
398 | xor_sum ^= be32_to_cpu(b->updates[i].extent); | ||
399 | rv &= (xor_sum == be32_to_cpu(b->xor_sum)); | ||
400 | |||
401 | return rv; | ||
402 | } | ||
403 | |||
404 | /** | ||
405 | * drbd_al_read_log() - Restores the activity log from its on disk representation. | ||
406 | * @mdev: DRBD device. | ||
407 | * @bdev: Block device to read form. | ||
408 | * | ||
409 | * Returns 1 on success, returns 0 when reading the log failed due to IO errors. | ||
410 | */ | ||
411 | int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
412 | { | ||
413 | struct al_transaction *buffer; | ||
414 | int i; | ||
415 | int rv; | ||
416 | int mx; | ||
417 | int active_extents = 0; | ||
418 | int transactions = 0; | ||
419 | int found_valid = 0; | ||
420 | int from = 0; | ||
421 | int to = 0; | ||
422 | u32 from_tnr = 0; | ||
423 | u32 to_tnr = 0; | ||
424 | u32 cnr; | ||
425 | |||
426 | mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); | ||
427 | |||
428 | /* lock out all other meta data io for now, | ||
429 | * and make sure the page is mapped. | ||
430 | */ | ||
431 | mutex_lock(&mdev->md_io_mutex); | ||
432 | buffer = page_address(mdev->md_io_page); | ||
433 | |||
434 | /* Find the valid transaction in the log */ | ||
435 | for (i = 0; i <= mx; i++) { | ||
436 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
437 | if (rv == 0) | ||
438 | continue; | ||
439 | if (rv == -1) { | ||
440 | mutex_unlock(&mdev->md_io_mutex); | ||
441 | return 0; | ||
442 | } | ||
443 | cnr = be32_to_cpu(buffer->tr_number); | ||
444 | |||
445 | if (++found_valid == 1) { | ||
446 | from = i; | ||
447 | to = i; | ||
448 | from_tnr = cnr; | ||
449 | to_tnr = cnr; | ||
450 | continue; | ||
451 | } | ||
452 | if ((int)cnr - (int)from_tnr < 0) { | ||
453 | D_ASSERT(from_tnr - cnr + i - from == mx+1); | ||
454 | from = i; | ||
455 | from_tnr = cnr; | ||
456 | } | ||
457 | if ((int)cnr - (int)to_tnr > 0) { | ||
458 | D_ASSERT(cnr - to_tnr == i - to); | ||
459 | to = i; | ||
460 | to_tnr = cnr; | ||
461 | } | ||
462 | } | ||
463 | |||
464 | if (!found_valid) { | ||
465 | dev_warn(DEV, "No usable activity log found.\n"); | ||
466 | mutex_unlock(&mdev->md_io_mutex); | ||
467 | return 1; | ||
468 | } | ||
469 | |||
470 | /* Read the valid transactions. | ||
471 | * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ | ||
472 | i = from; | ||
473 | while (1) { | ||
474 | int j, pos; | ||
475 | unsigned int extent_nr; | ||
476 | unsigned int trn; | ||
477 | |||
478 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
479 | ERR_IF(rv == 0) goto cancel; | ||
480 | if (rv == -1) { | ||
481 | mutex_unlock(&mdev->md_io_mutex); | ||
482 | return 0; | ||
483 | } | ||
484 | |||
485 | trn = be32_to_cpu(buffer->tr_number); | ||
486 | |||
487 | spin_lock_irq(&mdev->al_lock); | ||
488 | |||
489 | /* This loop runs backwards because in the cyclic | ||
490 | elements there might be an old version of the | ||
491 | updated element (in slot 0). So the element in slot 0 | ||
492 | can overwrite old versions. */ | ||
493 | for (j = AL_EXTENTS_PT; j >= 0; j--) { | ||
494 | pos = be32_to_cpu(buffer->updates[j].pos); | ||
495 | extent_nr = be32_to_cpu(buffer->updates[j].extent); | ||
496 | |||
497 | if (extent_nr == LC_FREE) | ||
498 | continue; | ||
499 | |||
500 | lc_set(mdev->act_log, extent_nr, pos); | ||
501 | active_extents++; | ||
502 | } | ||
503 | spin_unlock_irq(&mdev->al_lock); | ||
504 | |||
505 | transactions++; | ||
506 | |||
507 | cancel: | ||
508 | if (i == to) | ||
509 | break; | ||
510 | i++; | ||
511 | if (i > mx) | ||
512 | i = 0; | ||
513 | } | ||
514 | |||
515 | mdev->al_tr_number = to_tnr+1; | ||
516 | mdev->al_tr_pos = to; | ||
517 | if (++mdev->al_tr_pos > | ||
518 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
519 | mdev->al_tr_pos = 0; | ||
520 | |||
521 | /* ok, we are done with it */ | ||
522 | mutex_unlock(&mdev->md_io_mutex); | ||
523 | |||
524 | dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", | ||
525 | transactions, active_extents); | ||
526 | |||
527 | return 1; | ||
528 | } | ||
529 | |||
530 | static void atodb_endio(struct bio *bio, int error) | ||
531 | { | ||
532 | struct drbd_atodb_wait *wc = bio->bi_private; | ||
533 | struct drbd_conf *mdev = wc->mdev; | ||
534 | struct page *page; | ||
535 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
536 | |||
537 | /* strange behavior of some lower level drivers... | ||
538 | * fail the request by clearing the uptodate flag, | ||
539 | * but do not return any error?! */ | ||
540 | if (!error && !uptodate) | ||
541 | error = -EIO; | ||
542 | |||
543 | drbd_chk_io_error(mdev, error, TRUE); | ||
544 | if (error && wc->error == 0) | ||
545 | wc->error = error; | ||
546 | |||
547 | if (atomic_dec_and_test(&wc->count)) | ||
548 | complete(&wc->io_done); | ||
549 | |||
550 | page = bio->bi_io_vec[0].bv_page; | ||
551 | put_page(page); | ||
552 | bio_put(bio); | ||
553 | mdev->bm_writ_cnt++; | ||
554 | put_ldev(mdev); | ||
555 | } | ||
556 | |||
557 | #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
558 | /* activity log to on disk bitmap -- prepare bio unless that sector | ||
559 | * is already covered by previously prepared bios */ | ||
560 | static int atodb_prepare_unless_covered(struct drbd_conf *mdev, | ||
561 | struct bio **bios, | ||
562 | unsigned int enr, | ||
563 | struct drbd_atodb_wait *wc) __must_hold(local) | ||
564 | { | ||
565 | struct bio *bio; | ||
566 | struct page *page; | ||
567 | sector_t on_disk_sector = enr + mdev->ldev->md.md_offset | ||
568 | + mdev->ldev->md.bm_offset; | ||
569 | unsigned int page_offset = PAGE_SIZE; | ||
570 | int offset; | ||
571 | int i = 0; | ||
572 | int err = -ENOMEM; | ||
573 | |||
574 | /* Check if that enr is already covered by an already created bio. | ||
575 | * Caution, bios[] is not NULL terminated, | ||
576 | * but only initialized to all NULL. | ||
577 | * For completely scattered activity log, | ||
578 | * the last invocation iterates over all bios, | ||
579 | * and finds the last NULL entry. | ||
580 | */ | ||
581 | while ((bio = bios[i])) { | ||
582 | if (bio->bi_sector == on_disk_sector) | ||
583 | return 0; | ||
584 | i++; | ||
585 | } | ||
586 | /* bios[i] == NULL, the next not yet used slot */ | ||
587 | |||
588 | /* GFP_KERNEL, we are not in the write-out path */ | ||
589 | bio = bio_alloc(GFP_KERNEL, 1); | ||
590 | if (bio == NULL) | ||
591 | return -ENOMEM; | ||
592 | |||
593 | if (i > 0) { | ||
594 | const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; | ||
595 | page_offset = prev_bv->bv_offset + prev_bv->bv_len; | ||
596 | page = prev_bv->bv_page; | ||
597 | } | ||
598 | if (page_offset == PAGE_SIZE) { | ||
599 | page = alloc_page(__GFP_HIGHMEM); | ||
600 | if (page == NULL) | ||
601 | goto out_bio_put; | ||
602 | page_offset = 0; | ||
603 | } else { | ||
604 | get_page(page); | ||
605 | } | ||
606 | |||
607 | offset = S2W(enr); | ||
608 | drbd_bm_get_lel(mdev, offset, | ||
609 | min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), | ||
610 | kmap(page) + page_offset); | ||
611 | kunmap(page); | ||
612 | |||
613 | bio->bi_private = wc; | ||
614 | bio->bi_end_io = atodb_endio; | ||
615 | bio->bi_bdev = mdev->ldev->md_bdev; | ||
616 | bio->bi_sector = on_disk_sector; | ||
617 | |||
618 | if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) | ||
619 | goto out_put_page; | ||
620 | |||
621 | atomic_inc(&wc->count); | ||
622 | /* we already know that we may do this... | ||
623 | * get_ldev_if_state(mdev,D_ATTACHING); | ||
624 | * just get the extra reference, so that the local_cnt reflects | ||
625 | * the number of pending IO requests DRBD at its backing device. | ||
626 | */ | ||
627 | atomic_inc(&mdev->local_cnt); | ||
628 | |||
629 | bios[i] = bio; | ||
630 | |||
631 | return 0; | ||
632 | |||
633 | out_put_page: | ||
634 | err = -EINVAL; | ||
635 | put_page(page); | ||
636 | out_bio_put: | ||
637 | bio_put(bio); | ||
638 | return err; | ||
639 | } | ||
640 | |||
641 | /** | ||
642 | * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents | ||
643 | * @mdev: DRBD device. | ||
644 | * | ||
645 | * Called when we detach (unconfigure) local storage, | ||
646 | * or when we go from R_PRIMARY to R_SECONDARY role. | ||
647 | */ | ||
648 | void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) | ||
649 | { | ||
650 | int i, nr_elements; | ||
651 | unsigned int enr; | ||
652 | struct bio **bios; | ||
653 | struct drbd_atodb_wait wc; | ||
654 | |||
655 | ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
656 | return; /* sorry, I don't have any act_log etc... */ | ||
657 | |||
658 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
659 | |||
660 | nr_elements = mdev->act_log->nr_elements; | ||
661 | |||
662 | /* GFP_KERNEL, we are not in anyone's write-out path */ | ||
663 | bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); | ||
664 | if (!bios) | ||
665 | goto submit_one_by_one; | ||
666 | |||
667 | atomic_set(&wc.count, 0); | ||
668 | init_completion(&wc.io_done); | ||
669 | wc.mdev = mdev; | ||
670 | wc.error = 0; | ||
671 | |||
672 | for (i = 0; i < nr_elements; i++) { | ||
673 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
674 | if (enr == LC_FREE) | ||
675 | continue; | ||
676 | /* next statement also does atomic_inc wc.count and local_cnt */ | ||
677 | if (atodb_prepare_unless_covered(mdev, bios, | ||
678 | enr/AL_EXT_PER_BM_SECT, | ||
679 | &wc)) | ||
680 | goto free_bios_submit_one_by_one; | ||
681 | } | ||
682 | |||
683 | /* unnecessary optimization? */ | ||
684 | lc_unlock(mdev->act_log); | ||
685 | wake_up(&mdev->al_wait); | ||
686 | |||
687 | /* all prepared, submit them */ | ||
688 | for (i = 0; i < nr_elements; i++) { | ||
689 | if (bios[i] == NULL) | ||
690 | break; | ||
691 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { | ||
692 | bios[i]->bi_rw = WRITE; | ||
693 | bio_endio(bios[i], -EIO); | ||
694 | } else { | ||
695 | submit_bio(WRITE, bios[i]); | ||
696 | } | ||
697 | } | ||
698 | |||
699 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); | ||
700 | |||
701 | /* always (try to) flush bitmap to stable storage */ | ||
702 | drbd_md_flush(mdev); | ||
703 | |||
704 | /* In case we did not submit a single IO do not wait for | ||
705 | * them to complete. ( Because we would wait forever here. ) | ||
706 | * | ||
707 | * In case we had IOs and they are already complete, there | ||
708 | * is not point in waiting anyways. | ||
709 | * Therefore this if () ... */ | ||
710 | if (atomic_read(&wc.count)) | ||
711 | wait_for_completion(&wc.io_done); | ||
712 | |||
713 | put_ldev(mdev); | ||
714 | |||
715 | kfree(bios); | ||
716 | return; | ||
717 | |||
718 | free_bios_submit_one_by_one: | ||
719 | /* free everything by calling the endio callback directly. */ | ||
720 | for (i = 0; i < nr_elements && bios[i]; i++) | ||
721 | bio_endio(bios[i], 0); | ||
722 | |||
723 | kfree(bios); | ||
724 | |||
725 | submit_one_by_one: | ||
726 | dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); | ||
727 | |||
728 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
729 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
730 | if (enr == LC_FREE) | ||
731 | continue; | ||
732 | /* Really slow: if we have al-extents 16..19 active, | ||
733 | * sector 4 will be written four times! Synchronous! */ | ||
734 | drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); | ||
735 | } | ||
736 | |||
737 | lc_unlock(mdev->act_log); | ||
738 | wake_up(&mdev->al_wait); | ||
739 | put_ldev(mdev); | ||
740 | } | ||
741 | |||
742 | /** | ||
743 | * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents | ||
744 | * @mdev: DRBD device. | ||
745 | */ | ||
746 | void drbd_al_apply_to_bm(struct drbd_conf *mdev) | ||
747 | { | ||
748 | unsigned int enr; | ||
749 | unsigned long add = 0; | ||
750 | char ppb[10]; | ||
751 | int i; | ||
752 | |||
753 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
754 | |||
755 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
756 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | ||
757 | if (enr == LC_FREE) | ||
758 | continue; | ||
759 | add += drbd_bm_ALe_set_all(mdev, enr); | ||
760 | } | ||
761 | |||
762 | lc_unlock(mdev->act_log); | ||
763 | wake_up(&mdev->al_wait); | ||
764 | |||
765 | dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", | ||
766 | ppsize(ppb, Bit2KB(add))); | ||
767 | } | ||
768 | |||
769 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | ||
770 | { | ||
771 | int rv; | ||
772 | |||
773 | spin_lock_irq(&mdev->al_lock); | ||
774 | rv = (al_ext->refcnt == 0); | ||
775 | if (likely(rv)) | ||
776 | lc_del(mdev->act_log, al_ext); | ||
777 | spin_unlock_irq(&mdev->al_lock); | ||
778 | |||
779 | return rv; | ||
780 | } | ||
781 | |||
782 | /** | ||
783 | * drbd_al_shrink() - Removes all active extents form the activity log | ||
784 | * @mdev: DRBD device. | ||
785 | * | ||
786 | * Removes all active extents form the activity log, waiting until | ||
787 | * the reference count of each entry dropped to 0 first, of course. | ||
788 | * | ||
789 | * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() | ||
790 | */ | ||
791 | void drbd_al_shrink(struct drbd_conf *mdev) | ||
792 | { | ||
793 | struct lc_element *al_ext; | ||
794 | int i; | ||
795 | |||
796 | D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); | ||
797 | |||
798 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | ||
799 | al_ext = lc_element_by_index(mdev->act_log, i); | ||
800 | if (al_ext->lc_number == LC_FREE) | ||
801 | continue; | ||
802 | wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); | ||
803 | } | ||
804 | |||
805 | wake_up(&mdev->al_wait); | ||
806 | } | ||
807 | |||
808 | static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
809 | { | ||
810 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | ||
811 | |||
812 | if (!get_ldev(mdev)) { | ||
813 | if (__ratelimit(&drbd_ratelimit_state)) | ||
814 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); | ||
815 | kfree(udw); | ||
816 | return 1; | ||
817 | } | ||
818 | |||
819 | drbd_bm_write_sect(mdev, udw->enr); | ||
820 | put_ldev(mdev); | ||
821 | |||
822 | kfree(udw); | ||
823 | |||
824 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { | ||
825 | switch (mdev->state.conn) { | ||
826 | case C_SYNC_SOURCE: case C_SYNC_TARGET: | ||
827 | case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: | ||
828 | drbd_resync_finished(mdev); | ||
829 | default: | ||
830 | /* nothing to do */ | ||
831 | break; | ||
832 | } | ||
833 | } | ||
834 | drbd_bcast_sync_progress(mdev); | ||
835 | |||
836 | return 1; | ||
837 | } | ||
838 | |||
839 | |||
840 | /* ATTENTION. The AL's extents are 4MB each, while the extents in the | ||
841 | * resync LRU-cache are 16MB each. | ||
842 | * The caller of this function has to hold an get_ldev() reference. | ||
843 | * | ||
844 | * TODO will be obsoleted once we have a caching lru of the on disk bitmap | ||
845 | */ | ||
846 | static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | ||
847 | int count, int success) | ||
848 | { | ||
849 | struct lc_element *e; | ||
850 | struct update_odbm_work *udw; | ||
851 | |||
852 | unsigned int enr; | ||
853 | |||
854 | D_ASSERT(atomic_read(&mdev->local_cnt)); | ||
855 | |||
856 | /* I simply assume that a sector/size pair never crosses | ||
857 | * a 16 MB extent border. (Currently this is true...) */ | ||
858 | enr = BM_SECT_TO_EXT(sector); | ||
859 | |||
860 | e = lc_get(mdev->resync, enr); | ||
861 | if (e) { | ||
862 | struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); | ||
863 | if (ext->lce.lc_number == enr) { | ||
864 | if (success) | ||
865 | ext->rs_left -= count; | ||
866 | else | ||
867 | ext->rs_failed += count; | ||
868 | if (ext->rs_left < ext->rs_failed) { | ||
869 | dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " | ||
870 | "rs_failed=%d count=%d\n", | ||
871 | (unsigned long long)sector, | ||
872 | ext->lce.lc_number, ext->rs_left, | ||
873 | ext->rs_failed, count); | ||
874 | dump_stack(); | ||
875 | |||
876 | lc_put(mdev->resync, &ext->lce); | ||
877 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
878 | return; | ||
879 | } | ||
880 | } else { | ||
881 | /* Normally this element should be in the cache, | ||
882 | * since drbd_rs_begin_io() pulled it already in. | ||
883 | * | ||
884 | * But maybe an application write finished, and we set | ||
885 | * something outside the resync lru_cache in sync. | ||
886 | */ | ||
887 | int rs_left = drbd_bm_e_weight(mdev, enr); | ||
888 | if (ext->flags != 0) { | ||
889 | dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" | ||
890 | " -> %d[%u;00]\n", | ||
891 | ext->lce.lc_number, ext->rs_left, | ||
892 | ext->flags, enr, rs_left); | ||
893 | ext->flags = 0; | ||
894 | } | ||
895 | if (ext->rs_failed) { | ||
896 | dev_warn(DEV, "Kicking resync_lru element enr=%u " | ||
897 | "out with rs_failed=%d\n", | ||
898 | ext->lce.lc_number, ext->rs_failed); | ||
899 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
900 | } | ||
901 | ext->rs_left = rs_left; | ||
902 | ext->rs_failed = success ? 0 : count; | ||
903 | lc_changed(mdev->resync, &ext->lce); | ||
904 | } | ||
905 | lc_put(mdev->resync, &ext->lce); | ||
906 | /* no race, we are within the al_lock! */ | ||
907 | |||
908 | if (ext->rs_left == ext->rs_failed) { | ||
909 | ext->rs_failed = 0; | ||
910 | |||
911 | udw = kmalloc(sizeof(*udw), GFP_ATOMIC); | ||
912 | if (udw) { | ||
913 | udw->enr = ext->lce.lc_number; | ||
914 | udw->w.cb = w_update_odbm; | ||
915 | drbd_queue_work_front(&mdev->data.work, &udw->w); | ||
916 | } else { | ||
917 | dev_warn(DEV, "Could not kmalloc an udw\n"); | ||
918 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
919 | } | ||
920 | } | ||
921 | } else { | ||
922 | dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", | ||
923 | mdev->resync_locked, | ||
924 | mdev->resync->nr_elements, | ||
925 | mdev->resync->flags); | ||
926 | } | ||
927 | } | ||
928 | |||
929 | /* clear the bit corresponding to the piece of storage in question: | ||
930 | * size byte of data starting from sector. Only clear a bits of the affected | ||
931 | * one ore more _aligned_ BM_BLOCK_SIZE blocks. | ||
932 | * | ||
933 | * called by worker on C_SYNC_TARGET and receiver on SyncSource. | ||
934 | * | ||
935 | */ | ||
936 | void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | ||
937 | const char *file, const unsigned int line) | ||
938 | { | ||
939 | /* Is called from worker and receiver context _only_ */ | ||
940 | unsigned long sbnr, ebnr, lbnr; | ||
941 | unsigned long count = 0; | ||
942 | sector_t esector, nr_sectors; | ||
943 | int wake_up = 0; | ||
944 | unsigned long flags; | ||
945 | |||
946 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
947 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | ||
948 | (unsigned long long)sector, size); | ||
949 | return; | ||
950 | } | ||
951 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
952 | esector = sector + (size >> 9) - 1; | ||
953 | |||
954 | ERR_IF(sector >= nr_sectors) return; | ||
955 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | ||
956 | |||
957 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
958 | |||
959 | /* we clear it (in sync). | ||
960 | * round up start sector, round down end sector. we make sure we only | ||
961 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
962 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
963 | return; | ||
964 | if (unlikely(esector == (nr_sectors-1))) | ||
965 | ebnr = lbnr; | ||
966 | else | ||
967 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
968 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
969 | |||
970 | trace_drbd_resync(mdev, TRACE_LVL_METRICS, | ||
971 | "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", | ||
972 | (unsigned long long)sector, size, sbnr, ebnr); | ||
973 | |||
974 | if (sbnr > ebnr) | ||
975 | return; | ||
976 | |||
977 | /* | ||
978 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
979 | * we count rs_{total,left} in bits, not sectors. | ||
980 | */ | ||
981 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
982 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); | ||
983 | if (count) { | ||
984 | /* we need the lock for drbd_try_clear_on_disk_bm */ | ||
985 | if (jiffies - mdev->rs_mark_time > HZ*10) { | ||
986 | /* should be rolling marks, | ||
987 | * but we estimate only anyways. */ | ||
988 | if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && | ||
989 | mdev->state.conn != C_PAUSED_SYNC_T && | ||
990 | mdev->state.conn != C_PAUSED_SYNC_S) { | ||
991 | mdev->rs_mark_time = jiffies; | ||
992 | mdev->rs_mark_left = drbd_bm_total_weight(mdev); | ||
993 | } | ||
994 | } | ||
995 | if (get_ldev(mdev)) { | ||
996 | drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); | ||
997 | put_ldev(mdev); | ||
998 | } | ||
999 | /* just wake_up unconditional now, various lc_chaged(), | ||
1000 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
1001 | wake_up = 1; | ||
1002 | } | ||
1003 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1004 | if (wake_up) | ||
1005 | wake_up(&mdev->al_wait); | ||
1006 | } | ||
1007 | |||
1008 | /* | ||
1009 | * this is intended to set one request worth of data out of sync. | ||
1010 | * affects at least 1 bit, | ||
1011 | * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. | ||
1012 | * | ||
1013 | * called by tl_clear and drbd_send_dblock (==drbd_make_request). | ||
1014 | * so this can be _any_ process. | ||
1015 | */ | ||
1016 | void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | ||
1017 | const char *file, const unsigned int line) | ||
1018 | { | ||
1019 | unsigned long sbnr, ebnr, lbnr, flags; | ||
1020 | sector_t esector, nr_sectors; | ||
1021 | unsigned int enr, count; | ||
1022 | struct lc_element *e; | ||
1023 | |||
1024 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1025 | dev_err(DEV, "sector: %llus, size: %d\n", | ||
1026 | (unsigned long long)sector, size); | ||
1027 | return; | ||
1028 | } | ||
1029 | |||
1030 | if (!get_ldev(mdev)) | ||
1031 | return; /* no disk, no metadata, no bitmap to set bits in */ | ||
1032 | |||
1033 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
1034 | esector = sector + (size >> 9) - 1; | ||
1035 | |||
1036 | ERR_IF(sector >= nr_sectors) | ||
1037 | goto out; | ||
1038 | ERR_IF(esector >= nr_sectors) | ||
1039 | esector = (nr_sectors-1); | ||
1040 | |||
1041 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1042 | |||
1043 | /* we set it out of sync, | ||
1044 | * we do not need to round anything here */ | ||
1045 | sbnr = BM_SECT_TO_BIT(sector); | ||
1046 | ebnr = BM_SECT_TO_BIT(esector); | ||
1047 | |||
1048 | trace_drbd_resync(mdev, TRACE_LVL_METRICS, | ||
1049 | "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", | ||
1050 | (unsigned long long)sector, size, sbnr, ebnr); | ||
1051 | |||
1052 | /* ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1053 | * we count rs_{total,left} in bits, not sectors. */ | ||
1054 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
1055 | count = drbd_bm_set_bits(mdev, sbnr, ebnr); | ||
1056 | |||
1057 | enr = BM_SECT_TO_EXT(sector); | ||
1058 | e = lc_find(mdev->resync, enr); | ||
1059 | if (e) | ||
1060 | lc_entry(e, struct bm_extent, lce)->rs_left += count; | ||
1061 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1062 | |||
1063 | out: | ||
1064 | put_ldev(mdev); | ||
1065 | } | ||
1066 | |||
1067 | static | ||
1068 | struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | ||
1069 | { | ||
1070 | struct lc_element *e; | ||
1071 | struct bm_extent *bm_ext; | ||
1072 | int wakeup = 0; | ||
1073 | unsigned long rs_flags; | ||
1074 | |||
1075 | spin_lock_irq(&mdev->al_lock); | ||
1076 | if (mdev->resync_locked > mdev->resync->nr_elements/2) { | ||
1077 | spin_unlock_irq(&mdev->al_lock); | ||
1078 | return NULL; | ||
1079 | } | ||
1080 | e = lc_get(mdev->resync, enr); | ||
1081 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1082 | if (bm_ext) { | ||
1083 | if (bm_ext->lce.lc_number != enr) { | ||
1084 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | ||
1085 | bm_ext->rs_failed = 0; | ||
1086 | lc_changed(mdev->resync, &bm_ext->lce); | ||
1087 | wakeup = 1; | ||
1088 | } | ||
1089 | if (bm_ext->lce.refcnt == 1) | ||
1090 | mdev->resync_locked++; | ||
1091 | set_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1092 | } | ||
1093 | rs_flags = mdev->resync->flags; | ||
1094 | spin_unlock_irq(&mdev->al_lock); | ||
1095 | if (wakeup) | ||
1096 | wake_up(&mdev->al_wait); | ||
1097 | |||
1098 | if (!bm_ext) { | ||
1099 | if (rs_flags & LC_STARVING) | ||
1100 | dev_warn(DEV, "Have to wait for element" | ||
1101 | " (resync LRU too small?)\n"); | ||
1102 | BUG_ON(rs_flags & LC_DIRTY); | ||
1103 | } | ||
1104 | |||
1105 | return bm_ext; | ||
1106 | } | ||
1107 | |||
1108 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) | ||
1109 | { | ||
1110 | struct lc_element *al_ext; | ||
1111 | int rv = 0; | ||
1112 | |||
1113 | spin_lock_irq(&mdev->al_lock); | ||
1114 | if (unlikely(enr == mdev->act_log->new_number)) | ||
1115 | rv = 1; | ||
1116 | else { | ||
1117 | al_ext = lc_find(mdev->act_log, enr); | ||
1118 | if (al_ext) { | ||
1119 | if (al_ext->refcnt) | ||
1120 | rv = 1; | ||
1121 | } | ||
1122 | } | ||
1123 | spin_unlock_irq(&mdev->al_lock); | ||
1124 | |||
1125 | /* | ||
1126 | if (unlikely(rv)) { | ||
1127 | dev_info(DEV, "Delaying sync read until app's write is done\n"); | ||
1128 | } | ||
1129 | */ | ||
1130 | return rv; | ||
1131 | } | ||
1132 | |||
1133 | /** | ||
1134 | * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED | ||
1135 | * @mdev: DRBD device. | ||
1136 | * @sector: The sector number. | ||
1137 | * | ||
1138 | * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. | ||
1139 | */ | ||
1140 | int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | ||
1141 | { | ||
1142 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1143 | struct bm_extent *bm_ext; | ||
1144 | int i, sig; | ||
1145 | |||
1146 | trace_drbd_resync(mdev, TRACE_LVL_ALL, | ||
1147 | "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n", | ||
1148 | (unsigned long long)sector, enr); | ||
1149 | |||
1150 | sig = wait_event_interruptible(mdev->al_wait, | ||
1151 | (bm_ext = _bme_get(mdev, enr))); | ||
1152 | if (sig) | ||
1153 | return 0; | ||
1154 | |||
1155 | if (test_bit(BME_LOCKED, &bm_ext->flags)) | ||
1156 | return 1; | ||
1157 | |||
1158 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | ||
1159 | sig = wait_event_interruptible(mdev->al_wait, | ||
1160 | !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); | ||
1161 | if (sig) { | ||
1162 | spin_lock_irq(&mdev->al_lock); | ||
1163 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) { | ||
1164 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1165 | mdev->resync_locked--; | ||
1166 | wake_up(&mdev->al_wait); | ||
1167 | } | ||
1168 | spin_unlock_irq(&mdev->al_lock); | ||
1169 | return 0; | ||
1170 | } | ||
1171 | } | ||
1172 | |||
1173 | set_bit(BME_LOCKED, &bm_ext->flags); | ||
1174 | |||
1175 | return 1; | ||
1176 | } | ||
1177 | |||
/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 *
 * -EAGAIN is also returned when too many resync extents are already
 * locked or when lc_get() cannot provide an element.  If an extent
 * reference is held when -EAGAIN is returned, its number is remembered in
 * mdev->resync_wenr so that a later call can either reuse or drop it.
 */
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	/* first activity log extent covered by resync extent enr */
	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n",
			  (unsigned long long)sector);

	spin_lock_irq(&mdev->al_lock);
	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
		/* in case you have very heavy scattered io, it may
		 * stall the syncer indefinitely if we give up the ref count
		 * when we try again and requeue.
		 *
		 * if we don't give up the refcount, but the next time
		 * we are scheduled this extent has been "synced" by new
		 * application writes, we'd miss the lc_put on the
		 * extent we keep the refcount on.
		 * so we remembered which extent we had to try again, and
		 * if the next requested one is something else, we do
		 * the lc_put here...
		 * we also have to wake_up
		 */

		trace_drbd_resync(mdev, TRACE_LVL_ALL,
				  "dropping %u, apparently got 'synced' by application io\n",
				  mdev->resync_wenr);

		e = lc_find(mdev->resync, mdev->resync_wenr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (bm_ext) {
			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			mdev->resync_wenr = LC_FREE;
			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
				mdev->resync_locked--;
			wake_up(&mdev->al_wait);
		} else {
			dev_alert(DEV, "LOGIC BUG\n");
		}
	}
	/* TRY. */
	e = lc_try_get(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		/* extent is already fully locked, nothing left to check */
		if (test_bit(BME_LOCKED, &bm_ext->flags))
			goto proceed;
		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
			/* we are the ones who set BME_NO_WRITES just now */
			mdev->resync_locked++;
		} else {
			/* we did set the BME_NO_WRITES,
			 * but then could not set BME_LOCKED,
			 * so we tried again.
			 * drop the extra reference. */
			trace_drbd_resync(mdev, TRACE_LVL_ALL,
					  "dropping extra reference on %u\n", enr);

			bm_ext->lce.refcnt--;
			D_ASSERT(bm_ext->lce.refcnt > 0);
		}
		goto check_al;
	} else {
		/* do we rather want to try later? */
		if (mdev->resync_locked > mdev->resync->nr_elements-3) {
			trace_drbd_resync(mdev, TRACE_LVL_ALL,
					  "resync_locked = %u!\n", mdev->resync_locked);

			goto try_again;
		}
		/* Do or do not. There is no try. -- Yoda */
		e = lc_get(mdev->resync, enr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (!bm_ext) {
			const unsigned long rs_flags = mdev->resync->flags;
			if (rs_flags & LC_STARVING)
				dev_warn(DEV, "Have to wait for element"
				     " (resync LRU too small?)\n");
			BUG_ON(rs_flags & LC_DIRTY);
			goto try_again;
		}
		if (bm_ext->lce.lc_number != enr) {
			/* element is being re-labelled for enr:
			 * start with fresh counters */
			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
			bm_ext->rs_failed = 0;
			lc_changed(mdev->resync, &bm_ext->lce);
			wake_up(&mdev->al_wait);
			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
		}
		set_bit(BME_NO_WRITES, &bm_ext->flags);
		D_ASSERT(bm_ext->lce.refcnt == 1);
		mdev->resync_locked++;
		goto check_al;
	}
check_al:
	trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr);

	/* any activity on one of the covered activity log extents
	 * means we have to come back later */
	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		if (unlikely(al_enr+i == mdev->act_log->new_number))
			goto try_again;
		if (lc_is_used(mdev->act_log, al_enr+i))
			goto try_again;
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
	mdev->resync_wenr = LC_FREE;
	spin_unlock_irq(&mdev->al_lock);
	return 0;

try_again:
	trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr);
	/* remember the extent only if we actually hold a reference on it */
	if (bm_ext)
		mdev->resync_wenr = enr;
	spin_unlock_irq(&mdev->al_lock);
	return -EAGAIN;
}
1305 | |||
1306 | void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) | ||
1307 | { | ||
1308 | unsigned int enr = BM_SECT_TO_EXT(sector); | ||
1309 | struct lc_element *e; | ||
1310 | struct bm_extent *bm_ext; | ||
1311 | unsigned long flags; | ||
1312 | |||
1313 | trace_drbd_resync(mdev, TRACE_LVL_ALL, | ||
1314 | "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n", | ||
1315 | (long long)sector, enr); | ||
1316 | |||
1317 | spin_lock_irqsave(&mdev->al_lock, flags); | ||
1318 | e = lc_find(mdev->resync, enr); | ||
1319 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1320 | if (!bm_ext) { | ||
1321 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1322 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1323 | dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); | ||
1324 | return; | ||
1325 | } | ||
1326 | |||
1327 | if (bm_ext->lce.refcnt == 0) { | ||
1328 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1329 | dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " | ||
1330 | "but refcnt is 0!?\n", | ||
1331 | (unsigned long long)sector, enr); | ||
1332 | return; | ||
1333 | } | ||
1334 | |||
1335 | if (lc_put(mdev->resync, &bm_ext->lce) == 0) { | ||
1336 | clear_bit(BME_LOCKED, &bm_ext->flags); | ||
1337 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1338 | mdev->resync_locked--; | ||
1339 | wake_up(&mdev->al_wait); | ||
1340 | } | ||
1341 | |||
1342 | spin_unlock_irqrestore(&mdev->al_lock, flags); | ||
1343 | } | ||
1344 | |||
1345 | /** | ||
1346 | * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) | ||
1347 | * @mdev: DRBD device. | ||
1348 | */ | ||
1349 | void drbd_rs_cancel_all(struct drbd_conf *mdev) | ||
1350 | { | ||
1351 | trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n"); | ||
1352 | |||
1353 | spin_lock_irq(&mdev->al_lock); | ||
1354 | |||
1355 | if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ | ||
1356 | lc_reset(mdev->resync); | ||
1357 | put_ldev(mdev); | ||
1358 | } | ||
1359 | mdev->resync_locked = 0; | ||
1360 | mdev->resync_wenr = LC_FREE; | ||
1361 | spin_unlock_irq(&mdev->al_lock); | ||
1362 | wake_up(&mdev->al_wait); | ||
1363 | } | ||
1364 | |||
1365 | /** | ||
1366 | * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU | ||
1367 | * @mdev: DRBD device. | ||
1368 | * | ||
1369 | * Returns 0 upon success, -EAGAIN if at least one reference count was | ||
1370 | * not zero. | ||
1371 | */ | ||
1372 | int drbd_rs_del_all(struct drbd_conf *mdev) | ||
1373 | { | ||
1374 | struct lc_element *e; | ||
1375 | struct bm_extent *bm_ext; | ||
1376 | int i; | ||
1377 | |||
1378 | trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n"); | ||
1379 | |||
1380 | spin_lock_irq(&mdev->al_lock); | ||
1381 | |||
1382 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
1383 | /* ok, ->resync is there. */ | ||
1384 | for (i = 0; i < mdev->resync->nr_elements; i++) { | ||
1385 | e = lc_element_by_index(mdev->resync, i); | ||
1386 | bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; | ||
1387 | if (bm_ext->lce.lc_number == LC_FREE) | ||
1388 | continue; | ||
1389 | if (bm_ext->lce.lc_number == mdev->resync_wenr) { | ||
1390 | dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" | ||
1391 | " got 'synced' by application io\n", | ||
1392 | mdev->resync_wenr); | ||
1393 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1394 | D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1395 | clear_bit(BME_NO_WRITES, &bm_ext->flags); | ||
1396 | mdev->resync_wenr = LC_FREE; | ||
1397 | lc_put(mdev->resync, &bm_ext->lce); | ||
1398 | } | ||
1399 | if (bm_ext->lce.refcnt != 0) { | ||
1400 | dev_info(DEV, "Retrying drbd_rs_del_all() later. " | ||
1401 | "refcnt=%d\n", bm_ext->lce.refcnt); | ||
1402 | put_ldev(mdev); | ||
1403 | spin_unlock_irq(&mdev->al_lock); | ||
1404 | return -EAGAIN; | ||
1405 | } | ||
1406 | D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); | ||
1407 | D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); | ||
1408 | lc_del(mdev->resync, &bm_ext->lce); | ||
1409 | } | ||
1410 | D_ASSERT(mdev->resync->used == 0); | ||
1411 | put_ldev(mdev); | ||
1412 | } | ||
1413 | spin_unlock_irq(&mdev->al_lock); | ||
1414 | |||
1415 | return 0; | ||
1416 | } | ||
1417 | |||
1418 | /** | ||
1419 | * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks | ||
1420 | * @mdev: DRBD device. | ||
1421 | * @sector: The sector number. | ||
1422 | * @size: Size of failed IO operation, in byte. | ||
1423 | */ | ||
1424 | void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | ||
1425 | { | ||
1426 | /* Is called from worker and receiver context _only_ */ | ||
1427 | unsigned long sbnr, ebnr, lbnr; | ||
1428 | unsigned long count; | ||
1429 | sector_t esector, nr_sectors; | ||
1430 | int wake_up = 0; | ||
1431 | |||
1432 | trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, | ||
1433 | "drbd_rs_failed_io: sector=%llus, size=%u\n", | ||
1434 | (unsigned long long)sector, size); | ||
1435 | |||
1436 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1437 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | ||
1438 | (unsigned long long)sector, size); | ||
1439 | return; | ||
1440 | } | ||
1441 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
1442 | esector = sector + (size >> 9) - 1; | ||
1443 | |||
1444 | ERR_IF(sector >= nr_sectors) return; | ||
1445 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | ||
1446 | |||
1447 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1448 | |||
1449 | /* | ||
1450 | * round up start sector, round down end sector. we make sure we only | ||
1451 | * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
1452 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
1453 | return; | ||
1454 | if (unlikely(esector == (nr_sectors-1))) | ||
1455 | ebnr = lbnr; | ||
1456 | else | ||
1457 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
1458 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
1459 | |||
1460 | if (sbnr > ebnr) | ||
1461 | return; | ||
1462 | |||
1463 | /* | ||
1464 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1465 | * we count rs_{total,left} in bits, not sectors. | ||
1466 | */ | ||
1467 | spin_lock_irq(&mdev->al_lock); | ||
1468 | count = drbd_bm_count_bits(mdev, sbnr, ebnr); | ||
1469 | if (count) { | ||
1470 | mdev->rs_failed += count; | ||
1471 | |||
1472 | if (get_ldev(mdev)) { | ||
1473 | drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); | ||
1474 | put_ldev(mdev); | ||
1475 | } | ||
1476 | |||
1477 | /* just wake_up unconditional now, various lc_chaged(), | ||
1478 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
1479 | wake_up = 1; | ||
1480 | } | ||
1481 | spin_unlock_irq(&mdev->al_lock); | ||
1482 | if (wake_up) | ||
1483 | wake_up(&mdev->al_wait); | ||
1484 | } | ||
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 00000000000..b61057e7788 --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -0,0 +1,1327 @@ | |||
1 | /* | ||
2 | drbd_bitmap.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #include <linux/bitops.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/drbd.h> | ||
29 | #include <asm/kmap_types.h> | ||
30 | #include "drbd_int.h" | ||
31 | |||
32 | /* OPAQUE outside this file! | ||
33 | * interface defined in drbd_int.h | ||
34 | |||
35 | * convention: | ||
36 | * function name drbd_bm_... => used elsewhere, "public". | ||
37 | * function name bm_... => internal to implementation, "private". | ||
38 | |||
39 | * Note that since find_first_bit returns int, at the current granularity of | ||
40 | * the bitmap (4KB per bit), this implementation "only" supports up to | ||
41 | * 1<<(32+12) == 16 TB... | ||
42 | */ | ||
43 | |||
44 | /* | ||
45 | * NOTE | ||
46 | * Access to the *bm_pages is protected by bm_lock. | ||
47 | * It is safe to read the other members within the lock. | ||
48 | * | ||
49 | * drbd_bm_set_bits is called from bio_endio callbacks, | ||
50 | * We may be called with irq already disabled, | ||
51 | * so we need spin_lock_irqsave(). | ||
52 | * And we need the kmap_atomic. | ||
53 | */ | ||
54 | struct drbd_bitmap { | ||
55 | struct page **bm_pages; /* table of bm_number_of_pages pages backing the bitmap words */ | ||
56 | spinlock_t bm_lock; /* taken (irq-safe) around accesses to the bitmap pages and counters */ | ||
57 | /* WARNING unsigned long bm_*: | ||
58 | * 32bit number of bit offset is just enough for 512 MB bitmap. | ||
59 | * it will blow up if we make the bitmap bigger... | ||
60 | * not that it makes much sense to have a bitmap that large, | ||
61 | * rather change the granularity to 16k or 64k or something. | ||
62 | * (that implies other problems, however...) | ||
63 | */ | ||
64 | unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ | ||
65 | unsigned long bm_bits; /* number of valid bits, derived from the capacity in drbd_bm_resize() */ | ||
66 | size_t bm_words; /* number of long words in use; bm_bits rounded up to a multiple of 64 bits */ | ||
67 | size_t bm_number_of_pages; /* number of entries in bm_pages */ | ||
68 | sector_t bm_dev_capacity; /* capacity value the bitmap was last resized for */ | ||
69 | struct semaphore bm_change; /* serializes resize operations */ | ||
70 | |||
71 | atomic_t bm_async_io; /* bitmap page bios still in flight; bm_rw() waits for zero */ | ||
72 | wait_queue_head_t bm_io_wait; /* woken by the bio completion that brings bm_async_io to zero */ | ||
73 | |||
74 | unsigned long bm_flags; /* bit numbers: BM_LOCKED, BM_MD_IO_ERROR, BM_P_VMALLOCED */ | ||
75 | |||
76 | /* debugging aid, in case we are still racy somewhere */ | ||
77 | char *bm_why; /* reason string handed to drbd_bm_lock() by the current holder */ | ||
78 | struct task_struct *bm_task; /* task that called drbd_bm_lock(); NULL when unlocked */ | ||
79 | }; | ||
80 | |||
81 | /* definition of bits in bm_flags */ | ||
82 | #define BM_LOCKED 0 | ||
83 | #define BM_MD_IO_ERROR 1 | ||
84 | #define BM_P_VMALLOCED 2 | ||
85 | |||
86 | static int bm_is_locked(struct drbd_bitmap *b) | ||
87 | { | ||
88 | return test_bit(BM_LOCKED, &b->bm_flags); | ||
89 | } | ||
90 | |||
91 | #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) | ||
92 | static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) | ||
93 | { | ||
94 | struct drbd_bitmap *b = mdev->bitmap; | ||
95 | if (!__ratelimit(&drbd_ratelimit_state)) | ||
96 | return; | ||
97 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", | ||
98 | current == mdev->receiver.task ? "receiver" : | ||
99 | current == mdev->asender.task ? "asender" : | ||
100 | current == mdev->worker.task ? "worker" : current->comm, | ||
101 | func, b->bm_why ?: "?", | ||
102 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
103 | b->bm_task == mdev->asender.task ? "asender" : | ||
104 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
105 | } | ||
106 | |||
107 | void drbd_bm_lock(struct drbd_conf *mdev, char *why) | ||
108 | { | ||
109 | struct drbd_bitmap *b = mdev->bitmap; | ||
110 | int trylock_failed; | ||
111 | |||
112 | if (!b) { | ||
113 | dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n"); | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | trylock_failed = down_trylock(&b->bm_change); | ||
118 | |||
119 | if (trylock_failed) { | ||
120 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", | ||
121 | current == mdev->receiver.task ? "receiver" : | ||
122 | current == mdev->asender.task ? "asender" : | ||
123 | current == mdev->worker.task ? "worker" : current->comm, | ||
124 | why, b->bm_why ?: "?", | ||
125 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
126 | b->bm_task == mdev->asender.task ? "asender" : | ||
127 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
128 | down(&b->bm_change); | ||
129 | } | ||
130 | if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) | ||
131 | dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); | ||
132 | |||
133 | b->bm_why = why; | ||
134 | b->bm_task = current; | ||
135 | } | ||
136 | |||
137 | void drbd_bm_unlock(struct drbd_conf *mdev) | ||
138 | { | ||
139 | struct drbd_bitmap *b = mdev->bitmap; | ||
140 | if (!b) { | ||
141 | dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n"); | ||
142 | return; | ||
143 | } | ||
144 | |||
145 | if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) | ||
146 | dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); | ||
147 | |||
148 | b->bm_why = NULL; | ||
149 | b->bm_task = NULL; | ||
150 | up(&b->bm_change); | ||
151 | } | ||
152 | |||
153 | /* word offset to long pointer */ | ||
154 | static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) | ||
155 | { | ||
156 | struct page *page; | ||
157 | unsigned long page_nr; | ||
158 | |||
159 | /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ | ||
160 | page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); | ||
161 | BUG_ON(page_nr >= b->bm_number_of_pages); | ||
162 | page = b->bm_pages[page_nr]; | ||
163 | |||
164 | return (unsigned long *) kmap_atomic(page, km); | ||
165 | } | ||
166 | |||
167 | static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) | ||
168 | { | ||
169 | return __bm_map_paddr(b, offset, KM_IRQ1); | ||
170 | } | ||
171 | |||
172 | static void __bm_unmap(unsigned long *p_addr, const enum km_type km) | ||
173 | { | ||
174 | kunmap_atomic(p_addr, km); | ||
175 | }; | ||
176 | |||
177 | static void bm_unmap(unsigned long *p_addr) | ||
178 | { | ||
179 | return __bm_unmap(p_addr, KM_IRQ1); | ||
180 | } | ||
181 | |||
182 | /* long word offset of _bitmap_ sector */ | ||
183 | #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
184 | /* word offset from start of bitmap to word number _in_page_ | ||
185 | * modulo longs per page | ||
186 | #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) | ||
187 | hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) | ||
188 | so do it explicitly: | ||
189 | */ | ||
190 | #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) | ||
191 | |||
192 | /* Long words per page */ | ||
193 | #define LWPP (PAGE_SIZE/sizeof(long)) | ||
194 | |||
195 | /* | ||
196 | * actually most functions herein should take a struct drbd_bitmap*, not a | ||
197 | * struct drbd_conf*, but for the debug macros I like to have the mdev around | ||
198 | * to be able to report device specific. | ||
199 | */ | ||
200 | |||
201 | static void bm_free_pages(struct page **pages, unsigned long number) | ||
202 | { | ||
203 | unsigned long i; | ||
204 | if (!pages) | ||
205 | return; | ||
206 | |||
207 | for (i = 0; i < number; i++) { | ||
208 | if (!pages[i]) { | ||
209 | printk(KERN_ALERT "drbd: bm_free_pages tried to free " | ||
210 | "a NULL pointer; i=%lu n=%lu\n", | ||
211 | i, number); | ||
212 | continue; | ||
213 | } | ||
214 | __free_page(pages[i]); | ||
215 | pages[i] = NULL; | ||
216 | } | ||
217 | } | ||
218 | |||
/* Free @ptr with vfree() when @v is non-zero, with kfree() otherwise. */
static void bm_vk_free(void *ptr, int v)
{
	if (!v) {
		kfree(ptr);
		return;
	}
	vfree(ptr);
}
226 | |||
227 | /* | ||
228 | * "have" and "want" are NUMBER OF PAGES. | ||
229 | */ | ||
230 | static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | ||
231 | { | ||
232 | struct page **old_pages = b->bm_pages; | ||
233 | struct page **new_pages, *page; | ||
234 | unsigned int i, bytes, vmalloced = 0; | ||
235 | unsigned long have = b->bm_number_of_pages; | ||
236 | |||
237 | BUG_ON(have == 0 && old_pages != NULL); | ||
238 | BUG_ON(have != 0 && old_pages == NULL); | ||
239 | |||
240 | if (have == want) | ||
241 | return old_pages; | ||
242 | |||
243 | /* Trying kmalloc first, falling back to vmalloc. | ||
244 | * GFP_KERNEL is ok, as this is done when a lower level disk is | ||
245 | * "attached" to the drbd. Context is receiver thread or cqueue | ||
246 | * thread. As we have no disk yet, we are not in the IO path, | ||
247 | * not even the IO path of the peer. */ | ||
248 | bytes = sizeof(struct page *)*want; | ||
249 | new_pages = kmalloc(bytes, GFP_KERNEL); | ||
250 | if (!new_pages) { | ||
251 | new_pages = vmalloc(bytes); | ||
252 | if (!new_pages) | ||
253 | return NULL; | ||
254 | vmalloced = 1; | ||
255 | } | ||
256 | |||
257 | memset(new_pages, 0, bytes); | ||
258 | if (want >= have) { | ||
259 | for (i = 0; i < have; i++) | ||
260 | new_pages[i] = old_pages[i]; | ||
261 | for (; i < want; i++) { | ||
262 | page = alloc_page(GFP_HIGHUSER); | ||
263 | if (!page) { | ||
264 | bm_free_pages(new_pages + have, i - have); | ||
265 | bm_vk_free(new_pages, vmalloced); | ||
266 | return NULL; | ||
267 | } | ||
268 | new_pages[i] = page; | ||
269 | } | ||
270 | } else { | ||
271 | for (i = 0; i < want; i++) | ||
272 | new_pages[i] = old_pages[i]; | ||
273 | /* NOT HERE, we are outside the spinlock! | ||
274 | bm_free_pages(old_pages + want, have - want); | ||
275 | */ | ||
276 | } | ||
277 | |||
278 | if (vmalloced) | ||
279 | set_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
280 | else | ||
281 | clear_bit(BM_P_VMALLOCED, &b->bm_flags); | ||
282 | |||
283 | return new_pages; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * called on driver init only. TODO call when a device is created. | ||
288 | * allocates the drbd_bitmap, and stores it in mdev->bitmap. | ||
289 | */ | ||
290 | int drbd_bm_init(struct drbd_conf *mdev) | ||
291 | { | ||
292 | struct drbd_bitmap *b = mdev->bitmap; | ||
293 | WARN_ON(b != NULL); | ||
294 | b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); | ||
295 | if (!b) | ||
296 | return -ENOMEM; | ||
297 | spin_lock_init(&b->bm_lock); | ||
298 | init_MUTEX(&b->bm_change); | ||
299 | init_waitqueue_head(&b->bm_io_wait); | ||
300 | |||
301 | mdev->bitmap = b; | ||
302 | |||
303 | return 0; | ||
304 | } | ||
305 | |||
306 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) | ||
307 | { | ||
308 | ERR_IF(!mdev->bitmap) return 0; | ||
309 | return mdev->bitmap->bm_dev_capacity; | ||
310 | } | ||
311 | |||
312 | /* called on driver unload. TODO: call when a device is destroyed. | ||
313 | */ | ||
314 | void drbd_bm_cleanup(struct drbd_conf *mdev) | ||
315 | { | ||
316 | ERR_IF (!mdev->bitmap) return; | ||
317 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); | ||
318 | bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); | ||
319 | kfree(mdev->bitmap); | ||
320 | mdev->bitmap = NULL; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * since (b->bm_bits % BITS_PER_LONG) != 0, | ||
325 | * this masks out the remaining bits. | ||
326 | * Returns the number of bits cleared. | ||
327 | */ | ||
328 | static int bm_clear_surplus(struct drbd_bitmap *b) | ||
329 | { | ||
330 | const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; | ||
331 | size_t w = b->bm_bits >> LN2_BPL; | ||
332 | int cleared = 0; | ||
333 | unsigned long *p_addr, *bm; | ||
334 | |||
335 | p_addr = bm_map_paddr(b, w); | ||
336 | bm = p_addr + MLPP(w); | ||
337 | if (w < b->bm_words) { | ||
338 | cleared = hweight_long(*bm & ~mask); | ||
339 | *bm &= mask; | ||
340 | w++; bm++; | ||
341 | } | ||
342 | |||
343 | if (w < b->bm_words) { | ||
344 | cleared += hweight_long(*bm); | ||
345 | *bm = 0; | ||
346 | } | ||
347 | bm_unmap(p_addr); | ||
348 | return cleared; | ||
349 | } | ||
350 | |||
351 | static void bm_set_surplus(struct drbd_bitmap *b) | ||
352 | { | ||
353 | const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; | ||
354 | size_t w = b->bm_bits >> LN2_BPL; | ||
355 | unsigned long *p_addr, *bm; | ||
356 | |||
357 | p_addr = bm_map_paddr(b, w); | ||
358 | bm = p_addr + MLPP(w); | ||
359 | if (w < b->bm_words) { | ||
360 | *bm |= ~mask; | ||
361 | bm++; w++; | ||
362 | } | ||
363 | |||
364 | if (w < b->bm_words) { | ||
365 | *bm = ~(0UL); | ||
366 | } | ||
367 | bm_unmap(p_addr); | ||
368 | } | ||
369 | |||
370 | static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) | ||
371 | { | ||
372 | unsigned long *p_addr, *bm, offset = 0; | ||
373 | unsigned long bits = 0; | ||
374 | unsigned long i, do_now; | ||
375 | |||
376 | while (offset < b->bm_words) { | ||
377 | i = do_now = min_t(size_t, b->bm_words-offset, LWPP); | ||
378 | p_addr = __bm_map_paddr(b, offset, KM_USER0); | ||
379 | bm = p_addr + MLPP(offset); | ||
380 | while (i--) { | ||
381 | #ifndef __LITTLE_ENDIAN | ||
382 | if (swap_endian) | ||
383 | *bm = lel_to_cpu(*bm); | ||
384 | #endif | ||
385 | bits += hweight_long(*bm++); | ||
386 | } | ||
387 | __bm_unmap(p_addr, KM_USER0); | ||
388 | offset += do_now; | ||
389 | cond_resched(); | ||
390 | } | ||
391 | |||
392 | return bits; | ||
393 | } | ||
394 | |||
/* Count set bits without touching the byte order of the words. */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
	const unsigned long set_bits = __bm_count_bits(b, 0);

	return set_bits;
}
399 | |||
/* Count set bits, requesting the in-place byte order conversion as well. */
static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
	const unsigned long set_bits = __bm_count_bits(b, 1);

	return set_bits;
}
404 | |||
405 | /* offset and len in long words.*/ | ||
406 | static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) | ||
407 | { | ||
408 | unsigned long *p_addr, *bm; | ||
409 | size_t do_now, end; | ||
410 | |||
411 | #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) | ||
412 | |||
413 | end = offset + len; | ||
414 | |||
415 | if (end > b->bm_words) { | ||
416 | printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); | ||
417 | return; | ||
418 | } | ||
419 | |||
420 | while (offset < end) { | ||
421 | do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; | ||
422 | p_addr = bm_map_paddr(b, offset); | ||
423 | bm = p_addr + MLPP(offset); | ||
424 | if (bm+do_now > p_addr + LWPP) { | ||
425 | printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", | ||
426 | p_addr, bm, (int)do_now); | ||
427 | break; /* breaks to after catch_oob_access_end() only! */ | ||
428 | } | ||
429 | memset(bm, c, do_now * sizeof(long)); | ||
430 | bm_unmap(p_addr); | ||
431 | offset += do_now; | ||
432 | } | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * make sure the bitmap has enough room for the attached storage, | ||
437 | * if necessary, resize. | ||
438 | * called whenever we may have changed the device size. | ||
439 | * returns -ENOMEM if we could not allocate enough memory, 0 on success. | ||
440 | * In case this is actually a resize, we copy the old bitmap into the new one. | ||
441 | * Otherwise, the bitmap is initialized to all bits set. | ||
442 | */ | ||
443 | int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) | ||
444 | { | ||
445 | struct drbd_bitmap *b = mdev->bitmap; | ||
446 | unsigned long bits, words, owords, obits, *p_addr, *bm; | ||
447 | unsigned long want, have, onpages; /* number of pages */ | ||
448 | struct page **npages, **opages = NULL; | ||
449 | int err = 0, growing; | ||
450 | int opages_vmalloced; | ||
451 | |||
452 | ERR_IF(!b) return -ENOMEM; | ||
453 | |||
454 | drbd_bm_lock(mdev, "resize"); /* released at the out: label on every path below */ | ||
455 | |||
456 | dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", | ||
457 | (unsigned long long)capacity); | ||
458 | |||
459 | if (capacity == b->bm_dev_capacity) /* nothing to do, size unchanged */ | ||
460 | goto out; | ||
461 | |||
462 | opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); /* sampled now: bm_realloc_pages() rewrites this flag for the new table */ | ||
463 | |||
464 | if (capacity == 0) { /* shrink to nothing: detach everything under the spinlock, free afterwards */ | ||
465 | spin_lock_irq(&b->bm_lock); | ||
466 | opages = b->bm_pages; | ||
467 | onpages = b->bm_number_of_pages; | ||
468 | owords = b->bm_words; | ||
469 | b->bm_pages = NULL; | ||
470 | b->bm_number_of_pages = | ||
471 | b->bm_set = | ||
472 | b->bm_bits = | ||
473 | b->bm_words = | ||
474 | b->bm_dev_capacity = 0; | ||
475 | spin_unlock_irq(&b->bm_lock); | ||
476 | bm_free_pages(opages, onpages); | ||
477 | bm_vk_free(opages, opages_vmalloced); | ||
478 | goto out; | ||
479 | } | ||
480 | bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); /* capacity rounded up to a whole bit */ | ||
481 | |||
482 | /* if we would use | ||
483 | words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; | ||
484 | a 32bit host could present the wrong number of words | ||
485 | to a 64bit host. | ||
486 | */ | ||
487 | words = ALIGN(bits, 64) >> LN2_BPL; | ||
488 | |||
489 | if (get_ldev(mdev)) { | ||
490 | D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); | ||
491 | put_ldev(mdev); | ||
492 | } | ||
493 | |||
494 | /* one extra long to catch off by one errors */ | ||
495 | want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; | ||
496 | have = b->bm_number_of_pages; | ||
497 | if (want == have) { /* same page count: keep using the current page table */ | ||
498 | D_ASSERT(b->bm_pages != NULL); | ||
499 | npages = b->bm_pages; | ||
500 | } else { | ||
501 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) /* fault injection: behave as if the allocation failed */ | ||
502 | npages = NULL; | ||
503 | else | ||
504 | npages = bm_realloc_pages(b, want); | ||
505 | } | ||
506 | |||
507 | if (!npages) { | ||
508 | err = -ENOMEM; | ||
509 | goto out; | ||
510 | } | ||
511 | |||
512 | spin_lock_irq(&b->bm_lock); | ||
513 | opages = b->bm_pages; | ||
514 | owords = b->bm_words; | ||
515 | obits = b->bm_bits; | ||
516 | |||
517 | growing = bits > obits; | ||
518 | if (opages) | ||
519 | bm_set_surplus(b); /* b still describes the old geometry: set the bits above the old bm_bits */ | ||
520 | |||
521 | b->bm_pages = npages; | ||
522 | b->bm_number_of_pages = want; | ||
523 | b->bm_bits = bits; | ||
524 | b->bm_words = words; | ||
525 | b->bm_dev_capacity = capacity; | ||
526 | |||
527 | if (growing) { | ||
528 | bm_memset(b, owords, 0xff, words-owords); /* words added by the resize start out with all bits set */ | ||
529 | b->bm_set += bits - obits; | ||
530 | } | ||
531 | |||
532 | if (want < have) { | ||
533 | /* implicit: (opages != NULL) && (opages != npages) */ | ||
534 | bm_free_pages(opages + want, have - want); | ||
535 | } | ||
536 | |||
537 | p_addr = bm_map_paddr(b, words); | ||
538 | bm = p_addr + MLPP(words); | ||
539 | *bm = DRBD_MAGIC; /* marker in the extra long allocated past the last bitmap word */ | ||
540 | bm_unmap(p_addr); | ||
541 | |||
542 | (void)bm_clear_surplus(b); /* mask off everything above the new bm_bits */ | ||
543 | |||
544 | spin_unlock_irq(&b->bm_lock); | ||
545 | if (opages != npages) | ||
546 | bm_vk_free(opages, opages_vmalloced); /* old table only; its pages were carried over or freed above */ | ||
547 | if (!growing) | ||
548 | b->bm_set = bm_count_bits(b); /* not grown: recount from the bitmap contents */ | ||
549 | dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); | ||
550 | |||
551 | out: | ||
552 | drbd_bm_unlock(mdev); | ||
553 | return err; | ||
554 | } | ||
555 | |||
556 | /* inherently racy: | ||
557 | * if not protected by other means, return value may be out of date when | ||
558 | * leaving this function... | ||
559 | * we still need to lock it, since it is important that this returns | ||
560 | * bm_set == 0 precisely. | ||
561 | * | ||
562 | * maybe bm_set should be atomic_t ? | ||
563 | */ | ||
564 | static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) | ||
565 | { | ||
566 | struct drbd_bitmap *b = mdev->bitmap; | ||
567 | unsigned long s; | ||
568 | unsigned long flags; | ||
569 | |||
570 | ERR_IF(!b) return 0; | ||
571 | ERR_IF(!b->bm_pages) return 0; | ||
572 | |||
573 | spin_lock_irqsave(&b->bm_lock, flags); | ||
574 | s = b->bm_set; | ||
575 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
576 | |||
577 | return s; | ||
578 | } | ||
579 | |||
580 | unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) | ||
581 | { | ||
582 | unsigned long s; | ||
583 | /* if I don't have a disk, I don't know about out-of-sync status */ | ||
584 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | ||
585 | return 0; | ||
586 | s = _drbd_bm_total_weight(mdev); | ||
587 | put_ldev(mdev); | ||
588 | return s; | ||
589 | } | ||
590 | |||
591 | size_t drbd_bm_words(struct drbd_conf *mdev) | ||
592 | { | ||
593 | struct drbd_bitmap *b = mdev->bitmap; | ||
594 | ERR_IF(!b) return 0; | ||
595 | ERR_IF(!b->bm_pages) return 0; | ||
596 | |||
597 | return b->bm_words; | ||
598 | } | ||
599 | |||
600 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) | ||
601 | { | ||
602 | struct drbd_bitmap *b = mdev->bitmap; | ||
603 | ERR_IF(!b) return 0; | ||
604 | |||
605 | return b->bm_bits; | ||
606 | } | ||
607 | |||
608 | /* merge number words from buffer into the bitmap starting at offset. | ||
609 | * buffer[i] is expected to be little endian unsigned long. | ||
610 | * bitmap must be locked by drbd_bm_lock. | ||
611 | * currently only used from receive_bitmap. | ||
612 | */ | ||
613 | void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, | ||
614 | unsigned long *buffer) | ||
615 | { | ||
616 | struct drbd_bitmap *b = mdev->bitmap; | ||
617 | unsigned long *p_addr, *bm; | ||
618 | unsigned long word, bits; | ||
619 | size_t end, do_now; | ||
620 | |||
621 | end = offset + number; | ||
622 | |||
623 | ERR_IF(!b) return; | ||
624 | ERR_IF(!b->bm_pages) return; | ||
625 | if (number == 0) | ||
626 | return; | ||
627 | WARN_ON(offset >= b->bm_words); | ||
628 | WARN_ON(end > b->bm_words); | ||
629 | |||
630 | spin_lock_irq(&b->bm_lock); | ||
631 | while (offset < end) { | ||
632 | do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; | ||
633 | p_addr = bm_map_paddr(b, offset); | ||
634 | bm = p_addr + MLPP(offset); | ||
635 | offset += do_now; | ||
636 | while (do_now--) { | ||
637 | bits = hweight_long(*bm); | ||
638 | word = *bm | lel_to_cpu(*buffer++); | ||
639 | *bm++ = word; | ||
640 | b->bm_set += hweight_long(word) - bits; | ||
641 | } | ||
642 | bm_unmap(p_addr); | ||
643 | } | ||
644 | /* with 32bit <-> 64bit cross-platform connect | ||
645 | * this is only correct for current usage, | ||
646 | * where we _know_ that we are 64 bit aligned, | ||
647 | * and know that this function is used in this way, too... | ||
648 | */ | ||
649 | if (end == b->bm_words) | ||
650 | b->bm_set -= bm_clear_surplus(b); | ||
651 | |||
652 | spin_unlock_irq(&b->bm_lock); | ||
653 | } | ||
654 | |||
655 | /* copy number words from the bitmap starting at offset into the buffer. | ||
656 | * buffer[i] will be little endian unsigned long. | ||
657 | */ | ||
658 | void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | ||
659 | unsigned long *buffer) | ||
660 | { | ||
661 | struct drbd_bitmap *b = mdev->bitmap; | ||
662 | unsigned long *p_addr, *bm; | ||
663 | size_t end, do_now; | ||
664 | |||
665 | end = offset + number; | ||
666 | |||
667 | ERR_IF(!b) return; | ||
668 | ERR_IF(!b->bm_pages) return; | ||
669 | |||
670 | spin_lock_irq(&b->bm_lock); | ||
671 | if ((offset >= b->bm_words) || | ||
672 | (end > b->bm_words) || | ||
673 | (number <= 0)) | ||
674 | dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n", | ||
675 | (unsigned long) offset, | ||
676 | (unsigned long) number, | ||
677 | (unsigned long) b->bm_words); | ||
678 | else { | ||
679 | while (offset < end) { | ||
680 | do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; | ||
681 | p_addr = bm_map_paddr(b, offset); | ||
682 | bm = p_addr + MLPP(offset); | ||
683 | offset += do_now; | ||
684 | while (do_now--) | ||
685 | *buffer++ = cpu_to_lel(*bm++); | ||
686 | bm_unmap(p_addr); | ||
687 | } | ||
688 | } | ||
689 | spin_unlock_irq(&b->bm_lock); | ||
690 | } | ||
691 | |||
692 | /* set all bits in the bitmap */ | ||
693 | void drbd_bm_set_all(struct drbd_conf *mdev) | ||
694 | { | ||
695 | struct drbd_bitmap *b = mdev->bitmap; | ||
696 | ERR_IF(!b) return; | ||
697 | ERR_IF(!b->bm_pages) return; | ||
698 | |||
699 | spin_lock_irq(&b->bm_lock); | ||
700 | bm_memset(b, 0, 0xff, b->bm_words); | ||
701 | (void)bm_clear_surplus(b); | ||
702 | b->bm_set = b->bm_bits; | ||
703 | spin_unlock_irq(&b->bm_lock); | ||
704 | } | ||
705 | |||
706 | /* clear all bits in the bitmap */ | ||
707 | void drbd_bm_clear_all(struct drbd_conf *mdev) | ||
708 | { | ||
709 | struct drbd_bitmap *b = mdev->bitmap; | ||
710 | ERR_IF(!b) return; | ||
711 | ERR_IF(!b->bm_pages) return; | ||
712 | |||
713 | spin_lock_irq(&b->bm_lock); | ||
714 | bm_memset(b, 0, 0, b->bm_words); | ||
715 | b->bm_set = 0; | ||
716 | spin_unlock_irq(&b->bm_lock); | ||
717 | } | ||
718 | |||
719 | static void bm_async_io_complete(struct bio *bio, int error) | ||
720 | { | ||
721 | struct drbd_bitmap *b = bio->bi_private; | ||
722 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
723 | |||
724 | |||
725 | /* strange behavior of some lower level drivers... | ||
726 | * fail the request by clearing the uptodate flag, | ||
727 | * but do not return any error?! | ||
728 | * do we want to WARN() on this? */ | ||
729 | if (!error && !uptodate) | ||
730 | error = -EIO; | ||
731 | |||
732 | if (error) { | ||
733 | /* doh. what now? | ||
734 | * for now, set all bits, and flag MD_IO_ERROR */ | ||
735 | __set_bit(BM_MD_IO_ERROR, &b->bm_flags); | ||
736 | } | ||
737 | if (atomic_dec_and_test(&b->bm_async_io)) | ||
738 | wake_up(&b->bm_io_wait); | ||
739 | |||
740 | bio_put(bio); | ||
741 | } | ||
742 | |||
743 | static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) | ||
744 | { | ||
745 | /* we are process context. we always get a bio */ | ||
746 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); | ||
747 | unsigned int len; | ||
748 | sector_t on_disk_sector = | ||
749 | mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; | ||
750 | on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); | ||
751 | |||
752 | /* this might happen with very small | ||
753 | * flexible external meta data device */ | ||
754 | len = min_t(unsigned int, PAGE_SIZE, | ||
755 | (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); | ||
756 | |||
757 | bio->bi_bdev = mdev->ldev->md_bdev; | ||
758 | bio->bi_sector = on_disk_sector; | ||
759 | bio_add_page(bio, b->bm_pages[page_nr], len, 0); | ||
760 | bio->bi_private = b; | ||
761 | bio->bi_end_io = bm_async_io_complete; | ||
762 | |||
763 | if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { | ||
764 | bio->bi_rw |= rw; | ||
765 | bio_endio(bio, -EIO); | ||
766 | } else { | ||
767 | submit_bio(rw, bio); | ||
768 | } | ||
769 | } | ||
770 | |||
771 | # if defined(__LITTLE_ENDIAN) | ||
772 | /* nothing to do, on disk == in memory */ | ||
773 | # define bm_cpu_to_lel(x) ((void)0) | ||
774 | # else | ||
775 | void bm_cpu_to_lel(struct drbd_bitmap *b) | ||
776 | { | ||
777 | /* need to cpu_to_lel all the pages ... | ||
778 | * this may be optimized by using | ||
779 | * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; | ||
780 | * the following is still not optimal, but better than nothing */ | ||
781 | unsigned int i; | ||
782 | unsigned long *p_addr, *bm; | ||
783 | if (b->bm_set == 0) { | ||
784 | /* no page at all; avoid swap if all is 0 */ | ||
785 | i = b->bm_number_of_pages; | ||
786 | } else if (b->bm_set == b->bm_bits) { | ||
787 | /* only the last page */ | ||
788 | i = b->bm_number_of_pages - 1; | ||
789 | } else { | ||
790 | /* all pages */ | ||
791 | i = 0; | ||
792 | } | ||
793 | for (; i < b->bm_number_of_pages; i++) { | ||
794 | p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); | ||
795 | for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) | ||
796 | *bm = cpu_to_lel(*bm); | ||
797 | kunmap_atomic(p_addr, KM_USER0); | ||
798 | } | ||
799 | } | ||
800 | # endif | ||
801 | /* lel_to_cpu == cpu_to_lel */ | ||
802 | # define bm_lel_to_cpu(x) bm_cpu_to_lel(x) | ||
803 | |||
804 | /* | ||
805 | * bm_rw: read/write the whole bitmap from/to its on disk location. | ||
806 | */ | ||
807 | static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) /* rw is READ or WRITE; returns 0, or -EIO if any page bio failed */ | ||
808 | { | ||
809 | struct drbd_bitmap *b = mdev->bitmap; | ||
810 | /* sector_t sector; */ | ||
811 | int bm_words, num_pages, i; | ||
812 | unsigned long now; | ||
813 | char ppb[10]; | ||
814 | int err = 0; | ||
815 | |||
816 | WARN_ON(!bm_is_locked(b)); /* callers are expected to hold drbd_bm_lock() */ | ||
817 | |||
818 | /* no spinlock here, the drbd_bm_lock should be enough! */ | ||
819 | |||
820 | bm_words = drbd_bm_words(mdev); | ||
821 | num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; /* pages needed for bm_words, rounded up */ | ||
822 | |||
823 | /* on disk bitmap is little endian */ | ||
824 | if (rw == WRITE) | ||
825 | bm_cpu_to_lel(b); | ||
826 | |||
827 | now = jiffies; /* NOTE(review): overwritten below before it is ever read */ | ||
828 | atomic_set(&b->bm_async_io, num_pages); /* one count per bio submitted below; completions decrement it */ | ||
829 | __clear_bit(BM_MD_IO_ERROR, &b->bm_flags); | ||
830 | |||
831 | /* let the layers below us try to merge these bios... */ | ||
832 | for (i = 0; i < num_pages; i++) | ||
833 | bm_page_io_async(mdev, b, i, rw); | ||
834 | |||
835 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); | ||
836 | wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); /* sleep until the last completion has run */ | ||
837 | |||
838 | if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { | ||
839 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); | ||
840 | drbd_chk_io_error(mdev, 1, TRUE); | ||
841 | err = -EIO; | ||
842 | } | ||
843 | |||
844 | now = jiffies; /* start of the recount interval reported in the READ branch */ | ||
845 | if (rw == WRITE) { | ||
846 | /* swap back endianness */ | ||
847 | bm_lel_to_cpu(b); | ||
848 | /* flush bitmap to stable storage */ | ||
849 | drbd_md_flush(mdev); | ||
850 | } else /* rw == READ */ { | ||
851 | /* just read, if necessary adjust endianness */ | ||
852 | b->bm_set = bm_count_bits_swap_endian(b); | ||
853 | dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", | ||
854 | jiffies - now); | ||
855 | } | ||
856 | now = b->bm_set; /* variable reused: from here on it holds the set-bit count for the message */ | ||
857 | |||
858 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", | ||
859 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | ||
860 | |||
861 | return err; | ||
862 | } | ||
863 | |||
864 | /** | ||
865 | * drbd_bm_read() - Read the whole bitmap from its on disk location. | ||
866 | * @mdev: DRBD device. | ||
867 | */ | ||
868 | int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) | ||
869 | { | ||
870 | return bm_rw(mdev, READ); | ||
871 | } | ||
872 | |||
873 | /** | ||
874 | * drbd_bm_write() - Write the whole bitmap to its on disk location. | ||
875 | * @mdev: DRBD device. | ||
876 | */ | ||
877 | int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) | ||
878 | { | ||
879 | return bm_rw(mdev, WRITE); | ||
880 | } | ||
881 | |||
882 | /** | ||
883 | * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap | ||
884 | * @mdev: DRBD device. | ||
885 | * @enr: Extent number in the resync lru (happens to be sector offset) | ||
886 | * | ||
887 | * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered | ||
888 | * by a single sector write. Therefore enr == sector offset from the | ||
889 | * start of the bitmap. | ||
890 | */ | ||
891 | int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) | ||
892 | { | ||
893 | sector_t on_disk_sector = enr + mdev->ldev->md.md_offset | ||
894 | + mdev->ldev->md.bm_offset; | ||
895 | int bm_words, num_words, offset; | ||
896 | int err = 0; | ||
897 | |||
898 | mutex_lock(&mdev->md_io_mutex); | ||
899 | bm_words = drbd_bm_words(mdev); | ||
900 | offset = S2W(enr); /* word offset into bitmap */ | ||
901 | num_words = min(S2W(1), bm_words - offset); | ||
902 | if (num_words < S2W(1)) | ||
903 | memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); | ||
904 | drbd_bm_get_lel(mdev, offset, num_words, | ||
905 | page_address(mdev->md_io_page)); | ||
906 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { | ||
907 | int i; | ||
908 | err = -EIO; | ||
909 | dev_err(DEV, "IO ERROR writing bitmap sector %lu " | ||
910 | "(meta-disk sector %llus)\n", | ||
911 | enr, (unsigned long long)on_disk_sector); | ||
912 | drbd_chk_io_error(mdev, 1, TRUE); | ||
913 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) | ||
914 | drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); | ||
915 | } | ||
916 | mdev->bm_writ_cnt++; | ||
917 | mutex_unlock(&mdev->md_io_mutex); | ||
918 | return err; | ||
919 | } | ||
920 | |||
921 | /* NOTE | ||
922 | * find_first_bit returns int, we return unsigned long. | ||
923 | * should not make much difference anyways, but ... | ||
924 | * | ||
925 | * this returns a bit number, NOT a sector! | ||
926 | */ | ||
927 | #define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) | ||
928 | static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, | ||
929 | const int find_zero_bit, const enum km_type km) | ||
930 | { | ||
931 | struct drbd_bitmap *b = mdev->bitmap; | ||
932 | unsigned long i = -1UL; | ||
933 | unsigned long *p_addr; | ||
934 | unsigned long bit_offset; /* bit offset of the mapped page. */ | ||
935 | |||
936 | if (bm_fo > b->bm_bits) { | ||
937 | dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); | ||
938 | } else { | ||
939 | while (bm_fo < b->bm_bits) { | ||
940 | unsigned long offset; | ||
941 | bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ | ||
942 | offset = bit_offset >> LN2_BPL; /* word offset of the page */ | ||
943 | p_addr = __bm_map_paddr(b, offset, km); | ||
944 | |||
945 | if (find_zero_bit) | ||
946 | i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); | ||
947 | else | ||
948 | i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); | ||
949 | |||
950 | __bm_unmap(p_addr, km); | ||
951 | if (i < PAGE_SIZE*8) { | ||
952 | i = bit_offset + i; | ||
953 | if (i >= b->bm_bits) | ||
954 | break; | ||
955 | goto found; | ||
956 | } | ||
957 | bm_fo = bit_offset + PAGE_SIZE*8; | ||
958 | } | ||
959 | i = -1UL; | ||
960 | } | ||
961 | found: | ||
962 | return i; | ||
963 | } | ||
964 | |||
965 | static unsigned long bm_find_next(struct drbd_conf *mdev, | ||
966 | unsigned long bm_fo, const int find_zero_bit) | ||
967 | { | ||
968 | struct drbd_bitmap *b = mdev->bitmap; | ||
969 | unsigned long i = -1UL; | ||
970 | |||
971 | ERR_IF(!b) return i; | ||
972 | ERR_IF(!b->bm_pages) return i; | ||
973 | |||
974 | spin_lock_irq(&b->bm_lock); | ||
975 | if (bm_is_locked(b)) | ||
976 | bm_print_lock_info(mdev); | ||
977 | |||
978 | i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); | ||
979 | |||
980 | spin_unlock_irq(&b->bm_lock); | ||
981 | return i; | ||
982 | } | ||
983 | |||
/* Next set bit at or after bm_fo; takes the bitmap spinlock itself.
 * Returns a bit number, or -1UL if there is none. */
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	const int find_zero_bit = 0;

	return bm_find_next(mdev, bm_fo, find_zero_bit);
}
988 | |||
989 | #if 0 | ||
990 | /* not yet needed for anything. */ | ||
991 | unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) | ||
992 | { | ||
993 | return bm_find_next(mdev, bm_fo, 1); | ||
994 | } | ||
995 | #endif | ||
996 | |||
997 | /* does not spin_lock_irqsave. | ||
998 | * you must take drbd_bm_lock() first */ | ||
999 | unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) | ||
1000 | { | ||
1001 | /* WARN_ON(!bm_is_locked(mdev)); */ | ||
1002 | return __bm_find_next(mdev, bm_fo, 0, KM_USER1); | ||
1003 | } | ||
1004 | |||
1005 | unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) | ||
1006 | { | ||
1007 | /* WARN_ON(!bm_is_locked(mdev)); */ | ||
1008 | return __bm_find_next(mdev, bm_fo, 1, KM_USER1); | ||
1009 | } | ||
1010 | |||
1011 | /* returns number of bits actually changed. | ||
1012 | * for val != 0, we change 0 -> 1, return code positive | ||
1013 | * for val == 0, we change 1 -> 0, return code negative | ||
1014 | * wants bitnr, not sector. | ||
1015 | * expected to be called for only a few bits (e - s about BITS_PER_LONG). | ||
1016 | * Must hold bitmap lock already. */ | ||
1017 | int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
1018 | unsigned long e, int val, const enum km_type km) | ||
1019 | { | ||
1020 | struct drbd_bitmap *b = mdev->bitmap; | ||
1021 | unsigned long *p_addr = NULL; | ||
1022 | unsigned long bitnr; | ||
1023 | unsigned long last_page_nr = -1UL; | ||
1024 | int c = 0; | ||
1025 | |||
1026 | if (e >= b->bm_bits) { | ||
1027 | dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", | ||
1028 | s, e, b->bm_bits); | ||
1029 | e = b->bm_bits ? b->bm_bits -1 : 0; | ||
1030 | } | ||
1031 | for (bitnr = s; bitnr <= e; bitnr++) { | ||
1032 | unsigned long offset = bitnr>>LN2_BPL; | ||
1033 | unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); | ||
1034 | if (page_nr != last_page_nr) { | ||
1035 | if (p_addr) | ||
1036 | __bm_unmap(p_addr, km); | ||
1037 | p_addr = __bm_map_paddr(b, offset, km); | ||
1038 | last_page_nr = page_nr; | ||
1039 | } | ||
1040 | if (val) | ||
1041 | c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); | ||
1042 | else | ||
1043 | c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); | ||
1044 | } | ||
1045 | if (p_addr) | ||
1046 | __bm_unmap(p_addr, km); | ||
1047 | b->bm_set += c; | ||
1048 | return c; | ||
1049 | } | ||
1050 | |||
1051 | /* returns number of bits actually changed. | ||
1052 | * for val != 0, we change 0 -> 1, return code positive | ||
1053 | * for val == 0, we change 1 -> 0, return code negative | ||
1054 | * wants bitnr, not sector */ | ||
1055 | int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | ||
1056 | const unsigned long e, int val) | ||
1057 | { | ||
1058 | unsigned long flags; | ||
1059 | struct drbd_bitmap *b = mdev->bitmap; | ||
1060 | int c = 0; | ||
1061 | |||
1062 | ERR_IF(!b) return 1; | ||
1063 | ERR_IF(!b->bm_pages) return 0; | ||
1064 | |||
1065 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1066 | if (bm_is_locked(b)) | ||
1067 | bm_print_lock_info(mdev); | ||
1068 | |||
1069 | c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); | ||
1070 | |||
1071 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1072 | return c; | ||
1073 | } | ||
1074 | |||
/* Set bits s..e (inclusive); returns how many of them changed 0 -> 1. */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	const int val = 1;

	return bm_change_bits_to(mdev, s, e, val);
}
1080 | |||
/* Clear bits s..e (inclusive); returns how many of them changed 1 -> 0.
 * bm_change_bits_to() reports cleared bits as a negative count, so the
 * sign is flipped here. */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	int changed = bm_change_bits_to(mdev, s, e, 0);

	return -changed;
}
1086 | |||
1087 | /* sets all bits in full words, | ||
1088 | * from first_word up to, but not including, last_word */ | ||
1089 | static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, | ||
1090 | int page_nr, int first_word, int last_word) | ||
1091 | { | ||
1092 | int i; | ||
1093 | int bits; | ||
1094 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); | ||
1095 | for (i = first_word; i < last_word; i++) { | ||
1096 | bits = hweight_long(paddr[i]); | ||
1097 | paddr[i] = ~0UL; | ||
1098 | b->bm_set += BITS_PER_LONG - bits; | ||
1099 | } | ||
1100 | kunmap_atomic(paddr, KM_USER0); | ||
1101 | } | ||
1102 | |||
1103 | /* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. | ||
1104 | * You must first drbd_bm_lock(). | ||
1105 | * Can be called to set the whole bitmap in one go. | ||
1106 | * Sets bits from s to e _inclusive_. */ | ||
1107 | void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1108 | { | ||
1109 | /* First set_bit from the first bit (s) | ||
1110 | * up to the next long boundary (sl), | ||
1111 | * then assign full words up to the last long boundary (el), | ||
1112 | * then set_bit up to and including the last bit (e). | ||
1113 | * | ||
1114 | * Do not use memset, because we must account for changes, | ||
1115 | * so we need to loop over the words with hweight() anyways. | ||
1116 | */ | ||
1117 | unsigned long sl = ALIGN(s,BITS_PER_LONG); | ||
1118 | unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); | ||
1119 | int first_page; | ||
1120 | int last_page; | ||
1121 | int page_nr; | ||
1122 | int first_word; | ||
1123 | int last_word; | ||
1124 | |||
1125 | if (e - s <= 3*BITS_PER_LONG) { | ||
1126 | /* don't bother; el and sl may even be wrong. */ | ||
1127 | __bm_change_bits_to(mdev, s, e, 1, KM_USER0); | ||
1128 | return; | ||
1129 | } | ||
1130 | |||
1131 | /* difference is large enough that we can trust sl and el */ | ||
1132 | |||
1133 | /* bits filling the current long */ | ||
1134 | if (sl) | ||
1135 | __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); | ||
1136 | |||
1137 | first_page = sl >> (3 + PAGE_SHIFT); | ||
1138 | last_page = el >> (3 + PAGE_SHIFT); | ||
1139 | |||
1140 | /* MLPP: modulo longs per page */ | ||
1141 | /* LWPP: long words per page */ | ||
1142 | first_word = MLPP(sl >> LN2_BPL); | ||
1143 | last_word = LWPP; | ||
1144 | |||
1145 | /* first and full pages, unless first page == last page */ | ||
1146 | for (page_nr = first_page; page_nr < last_page; page_nr++) { | ||
1147 | bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); | ||
1148 | cond_resched(); | ||
1149 | first_word = 0; | ||
1150 | } | ||
1151 | |||
1152 | /* last page (respectively only page, for first page == last page) */ | ||
1153 | last_word = MLPP(el >> LN2_BPL); | ||
1154 | bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); | ||
1155 | |||
1156 | /* possibly trailing bits. | ||
1157 | * example: (e & 63) == 63, el will be e+1. | ||
1158 | * if that even was the very last bit, | ||
1159 | * it would trigger an assert in __bm_change_bits_to() | ||
1160 | */ | ||
1161 | if (el <= e) | ||
1162 | __bm_change_bits_to(mdev, el, e, 1, KM_USER0); | ||
1163 | } | ||
1164 | |||
1165 | /* returns bit state | ||
1166 | * wants bitnr, NOT sector. | ||
1167 | * inherently racy... area needs to be locked by means of {al,rs}_lru | ||
1168 | * 1 ... bit set | ||
1169 | * 0 ... bit not set | ||
1170 | * -1 ... first out of bounds access, stop testing for bits! | ||
1171 | */ | ||
1172 | int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) | ||
1173 | { | ||
1174 | unsigned long flags; | ||
1175 | struct drbd_bitmap *b = mdev->bitmap; | ||
1176 | unsigned long *p_addr; | ||
1177 | int i; | ||
1178 | |||
1179 | ERR_IF(!b) return 0; | ||
1180 | ERR_IF(!b->bm_pages) return 0; | ||
1181 | |||
1182 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1183 | if (bm_is_locked(b)) | ||
1184 | bm_print_lock_info(mdev); | ||
1185 | if (bitnr < b->bm_bits) { | ||
1186 | unsigned long offset = bitnr>>LN2_BPL; | ||
1187 | p_addr = bm_map_paddr(b, offset); | ||
1188 | i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0; | ||
1189 | bm_unmap(p_addr); | ||
1190 | } else if (bitnr == b->bm_bits) { | ||
1191 | i = -1; | ||
1192 | } else { /* (bitnr > b->bm_bits) */ | ||
1193 | dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1194 | i = 0; | ||
1195 | } | ||
1196 | |||
1197 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1198 | return i; | ||
1199 | } | ||
1200 | |||
1201 | /* returns number of bits set in the range [s, e] */ | ||
1202 | int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) | ||
1203 | { | ||
1204 | unsigned long flags; | ||
1205 | struct drbd_bitmap *b = mdev->bitmap; | ||
1206 | unsigned long *p_addr = NULL, page_nr = -1; | ||
1207 | unsigned long bitnr; | ||
1208 | int c = 0; | ||
1209 | size_t w; | ||
1210 | |||
1211 | /* If this is called without a bitmap, that is a bug. But just to be | ||
1212 | * robust in case we screwed up elsewhere, in that case pretend there | ||
1213 | * was one dirty bit in the requested area, so we won't try to do a | ||
1214 | * local read there (no bitmap probably implies no disk) */ | ||
1215 | ERR_IF(!b) return 1; | ||
1216 | ERR_IF(!b->bm_pages) return 1; | ||
1217 | |||
1218 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1219 | if (bm_is_locked(b)) | ||
1220 | bm_print_lock_info(mdev); | ||
1221 | for (bitnr = s; bitnr <= e; bitnr++) { | ||
1222 | w = bitnr >> LN2_BPL; | ||
1223 | if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { | ||
1224 | page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); | ||
1225 | if (p_addr) | ||
1226 | bm_unmap(p_addr); | ||
1227 | p_addr = bm_map_paddr(b, w); | ||
1228 | } | ||
1229 | ERR_IF (bitnr >= b->bm_bits) { | ||
1230 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1231 | } else { | ||
1232 | c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); | ||
1233 | } | ||
1234 | } | ||
1235 | if (p_addr) | ||
1236 | bm_unmap(p_addr); | ||
1237 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1238 | return c; | ||
1239 | } | ||
1240 | |||
1241 | |||
1242 | /* inherently racy... | ||
1243 | * return value may be already out-of-date when this function returns. | ||
1244 | * but the general usage is that this is only use during a cstate when bits are | ||
1245 | * only cleared, not set, and typically only care for the case when the return | ||
1246 | * value is zero, or we already "locked" this "bitmap extent" by other means. | ||
1247 | * | ||
1248 | * enr is bm-extent number, since we chose to name one sector (512 bytes) | ||
1249 | * worth of the bitmap a "bitmap extent". | ||
1250 | * | ||
1251 | * TODO | ||
1252 | * I think since we use it like a reference count, we should use the real | ||
1253 | * reference count of some bitmap extent element from some lru instead... | ||
1254 | * | ||
1255 | */ | ||
1256 | int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | ||
1257 | { | ||
1258 | struct drbd_bitmap *b = mdev->bitmap; | ||
1259 | int count, s, e; | ||
1260 | unsigned long flags; | ||
1261 | unsigned long *p_addr, *bm; | ||
1262 | |||
1263 | ERR_IF(!b) return 0; | ||
1264 | ERR_IF(!b->bm_pages) return 0; | ||
1265 | |||
1266 | spin_lock_irqsave(&b->bm_lock, flags); | ||
1267 | if (bm_is_locked(b)) | ||
1268 | bm_print_lock_info(mdev); | ||
1269 | |||
1270 | s = S2W(enr); | ||
1271 | e = min((size_t)S2W(enr+1), b->bm_words); | ||
1272 | count = 0; | ||
1273 | if (s < b->bm_words) { | ||
1274 | int n = e-s; | ||
1275 | p_addr = bm_map_paddr(b, s); | ||
1276 | bm = p_addr + MLPP(s); | ||
1277 | while (n--) | ||
1278 | count += hweight_long(*bm++); | ||
1279 | bm_unmap(p_addr); | ||
1280 | } else { | ||
1281 | dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); | ||
1282 | } | ||
1283 | spin_unlock_irqrestore(&b->bm_lock, flags); | ||
1284 | return count; | ||
1285 | } | ||
1286 | |||
1287 | /* set all bits covered by the AL-extent al_enr */ | ||
1288 | unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) | ||
1289 | { | ||
1290 | struct drbd_bitmap *b = mdev->bitmap; | ||
1291 | unsigned long *p_addr, *bm; | ||
1292 | unsigned long weight; | ||
1293 | int count, s, e, i, do_now; | ||
1294 | ERR_IF(!b) return 0; | ||
1295 | ERR_IF(!b->bm_pages) return 0; | ||
1296 | |||
1297 | spin_lock_irq(&b->bm_lock); | ||
1298 | if (bm_is_locked(b)) | ||
1299 | bm_print_lock_info(mdev); | ||
1300 | weight = b->bm_set; | ||
1301 | |||
1302 | s = al_enr * BM_WORDS_PER_AL_EXT; | ||
1303 | e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); | ||
1304 | /* assert that s and e are on the same page */ | ||
1305 | D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) | ||
1306 | == s >> (PAGE_SHIFT - LN2_BPL + 3)); | ||
1307 | count = 0; | ||
1308 | if (s < b->bm_words) { | ||
1309 | i = do_now = e-s; | ||
1310 | p_addr = bm_map_paddr(b, s); | ||
1311 | bm = p_addr + MLPP(s); | ||
1312 | while (i--) { | ||
1313 | count += hweight_long(*bm); | ||
1314 | *bm = -1UL; | ||
1315 | bm++; | ||
1316 | } | ||
1317 | bm_unmap(p_addr); | ||
1318 | b->bm_set += do_now*BITS_PER_LONG - count; | ||
1319 | if (e == b->bm_words) | ||
1320 | b->bm_set -= bm_clear_surplus(b); | ||
1321 | } else { | ||
1322 | dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); | ||
1323 | } | ||
1324 | weight = b->bm_set - weight; | ||
1325 | spin_unlock_irq(&b->bm_lock); | ||
1326 | return weight; | ||
1327 | } | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 index 00000000000..8da602e010b --- /dev/null +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -0,0 +1,2258 @@ | |||
1 | /* | ||
2 | drbd_int.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #ifndef _DRBD_INT_H | ||
27 | #define _DRBD_INT_H | ||
28 | |||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/version.h> | ||
32 | #include <linux/list.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <linux/bitops.h> | ||
35 | #include <linux/slab.h> | ||
36 | #include <linux/crypto.h> | ||
37 | #include <linux/tcp.h> | ||
38 | #include <linux/mutex.h> | ||
39 | #include <linux/major.h> | ||
40 | #include <linux/blkdev.h> | ||
41 | #include <linux/genhd.h> | ||
42 | #include <net/tcp.h> | ||
43 | #include <linux/lru_cache.h> | ||
44 | |||
45 | #ifdef __CHECKER__ | ||
46 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | ||
47 | # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) | ||
48 | # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) | ||
49 | # define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) | ||
50 | #else | ||
51 | # define __protected_by(x) | ||
52 | # define __protected_read_by(x) | ||
53 | # define __protected_write_by(x) | ||
54 | # define __must_hold(x) | ||
55 | #endif | ||
56 | |||
57 | #define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) | ||
58 | |||
59 | /* module parameter, defined in drbd_main.c */ | ||
60 | extern unsigned int minor_count; | ||
61 | extern int disable_sendpage; | ||
62 | extern int allow_oos; | ||
63 | extern unsigned int cn_idx; | ||
64 | |||
65 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
66 | extern int enable_faults; | ||
67 | extern int fault_rate; | ||
68 | extern int fault_devs; | ||
69 | #endif | ||
70 | |||
71 | extern char usermode_helper[]; | ||
72 | |||
73 | |||
74 | #ifndef TRUE | ||
75 | #define TRUE 1 | ||
76 | #endif | ||
77 | #ifndef FALSE | ||
78 | #define FALSE 0 | ||
79 | #endif | ||
80 | |||
81 | /* I don't remember why XCPU ... | ||
82 | * This is used to wake the asender, | ||
83 | * and to interrupt sending the sending task | ||
84 | * on disconnect. | ||
85 | */ | ||
86 | #define DRBD_SIG SIGXCPU | ||
87 | |||
88 | /* This is used to stop/restart our threads. | ||
89 | * Cannot use SIGTERM nor SIGKILL, since these | ||
90 | * are sent out by init on runlevel changes | ||
91 | * I choose SIGHUP for now. | ||
92 | */ | ||
93 | #define DRBD_SIGKILL SIGHUP | ||
94 | |||
95 | /* All EEs on the free list should have ID_VACANT (== 0) | ||
96 | * freshly allocated EEs get !ID_VACANT (== 1) | ||
97 | * so if it says "cannot dereference null pointer at address 0x00000001", | ||
98 | * it is most likely one of these :( */ | ||
99 | |||
100 | #define ID_IN_SYNC (4711ULL) | ||
101 | #define ID_OUT_OF_SYNC (4712ULL) | ||
102 | |||
103 | #define ID_SYNCER (-1ULL) | ||
104 | #define ID_VACANT 0 | ||
105 | #define is_syncer_block_id(id) ((id) == ID_SYNCER) | ||
106 | |||
107 | struct drbd_conf; | ||
108 | |||
109 | |||
110 | /* to shorten dev_warn(DEV, "msg"); and relatives statements */ | ||
111 | #define DEV (disk_to_dev(mdev->vdisk)) | ||
112 | |||
/* Soft assertion: if @exp evaluates to false, log the failed expression
 * together with file and line via dev_err(); execution continues.
 * Needs a `mdev` in scope (through DEV).
 *
 * Wrapped in do { } while (0) so that the macro behaves like one single
 * statement: a bare `if` here would silently capture the `else` of an
 * enclosing `if (x) D_ASSERT(y); else ...`. */
#define D_ASSERT(exp)	do {						\
	if (!(exp))							\
		dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n",		\
			__FILE__, __LINE__);				\
} while (0)
115 | |||
116 | #define ERR_IF(exp) if (({ \ | ||
117 | int _b = (exp) != 0; \ | ||
118 | if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ | ||
119 | __func__, #exp, __FILE__, __LINE__); \ | ||
120 | _b; \ | ||
121 | })) | ||
122 | |||
123 | /* Defines to control fault insertion */ | ||
124 | enum { | ||
125 | DRBD_FAULT_MD_WR = 0, /* meta data write */ | ||
126 | DRBD_FAULT_MD_RD = 1, /* read */ | ||
127 | DRBD_FAULT_RS_WR = 2, /* resync */ | ||
128 | DRBD_FAULT_RS_RD = 3, | ||
129 | DRBD_FAULT_DT_WR = 4, /* data */ | ||
130 | DRBD_FAULT_DT_RD = 5, | ||
131 | DRBD_FAULT_DT_RA = 6, /* data read ahead */ | ||
132 | DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ | ||
133 | DRBD_FAULT_AL_EE = 8, /* alloc ee */ | ||
134 | |||
135 | DRBD_FAULT_MAX, | ||
136 | }; | ||
137 | |||
138 | extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...); | ||
139 | |||
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);

/* Decide whether a fault of the given DRBD_FAULT_* type shall be injected.
 * Cheap checks come first: injection must be enabled at all (fault_rate)
 * and for this type (bit `type` in enable_faults); only then is
 * _drbd_insert_fault() consulted.  Returns 1 to inject, 0 otherwise. */
static inline int
drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	if (!fault_rate)
		return 0;
	if (!(enable_faults & (1<<type)))
		return 0;
	return _drbd_insert_fault(mdev, type) != 0;
}
#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))

#else
/* fault injection compiled out: never active */
#define FAULT_ACTIVE(_m, _t) (0)
#endif
154 | |||
155 | /* integer division, round _UP_ to the next integer */ | ||
156 | #define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) | ||
157 | /* usual integer division */ | ||
158 | #define div_floor(A, B) ((A)/(B)) | ||
159 | |||
160 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
161 | /* 4th incarnation of the disk layout. */ | ||
162 | #define DRBD_MD_MAGIC (DRBD_MAGIC+4) | ||
163 | |||
164 | extern struct drbd_conf **minor_table; | ||
165 | extern struct ratelimit_state drbd_ratelimit_state; | ||
166 | |||
/* on the wire */
enum drbd_packets {
	/* receiver (data socket) */
	P_DATA		      = 0x00,
	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
	P_BARRIER	      = 0x03,
	P_BITMAP	      = 0x04,
	P_BECOME_SYNC_TARGET  = 0x05,
	P_BECOME_SYNC_SOURCE  = 0x06,
	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
	P_SYNC_PARAM	      = 0x0a,
	P_PROTOCOL	      = 0x0b,
	P_UUIDS		      = 0x0c,
	P_SIZES		      = 0x0d,
	P_STATE		      = 0x0e,
	P_SYNC_UUID	      = 0x0f,
	P_AUTH_CHALLENGE      = 0x10,
	P_AUTH_RESPONSE	      = 0x11,
	P_STATE_CHG_REQ	      = 0x12,

	/* asender (meta socket) */
	P_PING		      = 0x13,
	P_PING_ACK	      = 0x14,
	P_RECV_ACK	      = 0x15, /* Used in protocol B */
	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
	P_BARRIER_ACK	      = 0x1c,
	P_STATE_CHG_REPLY     = 0x1d,

	/* "new" commands, no longer fitting into the ordering scheme above */

	P_OV_REQUEST	      = 0x1e, /* data socket */
	P_OV_REPLY	      = 0x1f,
	P_OV_RESULT	      = 0x20, /* meta socket */
	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */

	P_MAX_CMD	      = 0x25,
	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
	P_MAX_OPT_CMD	      = 0x101,

	/* special command ids for handshake */

	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */

	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
};

/* Map a packet command to a printable name, for log and trace output.
 *
 * The three handshake ids have dedicated names.  Every regular command
 * below P_MAX_CMD has a table entry; anything else yields "Unknown".
 * Never returns NULL, so the result can be passed to a "%s" format
 * without further checks. */
static inline const char *cmdname(enum drbd_packets cmd)
{
	/* THINK may need to become several global tables
	 * when we want to support more than
	 * one PRO_VERSION */
	static const char *cmdnames[] = {
		[P_DATA]	        = "Data",
		[P_DATA_REPLY]	        = "DataReply",
		[P_RS_DATA_REPLY]	= "RSDataReply",
		[P_BARRIER]	        = "Barrier",
		[P_BITMAP]	        = "ReportBitMap",
		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
		[P_UNPLUG_REMOTE]	= "UnplugRemote",
		[P_DATA_REQUEST]	= "DataRequest",
		[P_RS_DATA_REQUEST]     = "RSDataRequest",
		[P_SYNC_PARAM]	        = "SyncParam",
		[P_SYNC_PARAM89]	= "SyncParam89",
		[P_PROTOCOL]            = "ReportProtocol",
		[P_UUIDS]	        = "ReportUUIDs",
		[P_SIZES]	        = "ReportSizes",
		[P_STATE]	        = "ReportState",
		[P_SYNC_UUID]           = "ReportSyncUUID",
		[P_AUTH_CHALLENGE]      = "AuthChallenge",
		[P_AUTH_RESPONSE]	= "AuthResponse",
		[P_PING]		= "Ping",
		[P_PING_ACK]	        = "PingAck",
		[P_RECV_ACK]	        = "RecvAck",
		[P_WRITE_ACK]	        = "WriteAck",
		[P_RS_WRITE_ACK]	= "RSWriteAck",
		[P_DISCARD_ACK]	        = "DiscardAck",
		[P_NEG_ACK]	        = "NegAck",
		[P_NEG_DREPLY]	        = "NegDReply",
		[P_NEG_RS_DREPLY]	= "NegRSDReply",
		[P_BARRIER_ACK]	        = "BarrierAck",
		[P_STATE_CHG_REQ]       = "StateChgRequest",
		[P_STATE_CHG_REPLY]     = "StateChgReply",
		[P_OV_REQUEST]          = "OVRequest",
		[P_OV_REPLY]            = "OVReply",
		[P_OV_RESULT]           = "OVResult",
		/* these three were missing from the table, which made
		 * cmdname() return NULL for perfectly valid commands */
		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
		[P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
		[P_COMPRESSED_BITMAP]   = "CBitmap",
		[P_MAX_CMD]	        = NULL,
	};
	const char *name;

	if (cmd == P_HAND_SHAKE_M)
		return "HandShakeM";
	if (cmd == P_HAND_SHAKE_S)
		return "HandShakeS";
	if (cmd == P_HAND_SHAKE)
		return "HandShake";
	if (cmd >= P_MAX_CMD)
		return "Unknown";

	/* guard against holes, should a command ever be added to the enum
	 * but forgotten in the table again */
	name = cmdnames[cmd];
	return name ? name : "Unknown";
}
278 | |||
279 | /* for sending/receiving the bitmap, | ||
280 | * possibly in some encoding scheme */ | ||
281 | struct bm_xfer_ctx { | ||
282 | /* "const" | ||
283 | * stores total bits and long words | ||
284 | * of the bitmap, so we don't need to | ||
285 | * call the accessor functions over and again. */ | ||
286 | unsigned long bm_bits; | ||
287 | unsigned long bm_words; | ||
288 | /* during xfer, current position within the bitmap */ | ||
289 | unsigned long bit_offset; | ||
290 | unsigned long word_offset; | ||
291 | |||
292 | /* statistics; index: (h->command == P_BITMAP) */ | ||
293 | unsigned packets[2]; | ||
294 | unsigned bytes[2]; | ||
295 | }; | ||
296 | |||
297 | extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, | ||
298 | const char *direction, struct bm_xfer_ctx *c); | ||
299 | |||
300 | static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) | ||
301 | { | ||
302 | /* word_offset counts "native long words" (32 or 64 bit), | ||
303 | * aligned at 64 bit. | ||
304 | * Encoded packet may end at an unaligned bit offset. | ||
305 | * In case a fallback clear text packet is transmitted in | ||
306 | * between, we adjust this offset back to the last 64bit | ||
307 | * aligned "native long word", which makes coding and decoding | ||
308 | * the plain text bitmap much more convenient. */ | ||
309 | #if BITS_PER_LONG == 64 | ||
310 | c->word_offset = c->bit_offset >> 6; | ||
311 | #elif BITS_PER_LONG == 32 | ||
312 | c->word_offset = c->bit_offset >> 5; | ||
313 | c->word_offset &= ~(1UL); | ||
314 | #else | ||
315 | # error "unsupported BITS_PER_LONG" | ||
316 | #endif | ||
317 | } | ||
318 | |||
319 | #ifndef __packed | ||
320 | #define __packed __attribute__((packed)) | ||
321 | #endif | ||
322 | |||
323 | /* This is the layout for a packet on the wire. | ||
324 | * The byteorder is the network byte order. | ||
325 | * (except block_id and barrier fields. | ||
326 | * these are pointers to local structs | ||
327 | * and have no relevance for the partner, | ||
328 | * which just echoes them as received.) | ||
329 | * | ||
330 | * NOTE that the payload starts at a long aligned offset, | ||
331 | * regardless of 32 or 64 bit arch! | ||
332 | */ | ||
333 | struct p_header { | ||
334 | u32 magic; | ||
335 | u16 command; | ||
336 | u16 length; /* bytes of data after this header */ | ||
337 | u8 payload[0]; | ||
338 | } __packed; | ||
339 | /* 8 bytes. packet FIXED for the next century! */ | ||
340 | |||
341 | /* | ||
342 | * short commands, packets without payload, plain p_header: | ||
343 | * P_PING | ||
344 | * P_PING_ACK | ||
345 | * P_BECOME_SYNC_TARGET | ||
346 | * P_BECOME_SYNC_SOURCE | ||
347 | * P_UNPLUG_REMOTE | ||
348 | */ | ||
349 | |||
350 | /* | ||
351 | * commands with out-of-struct payload: | ||
352 | * P_BITMAP (no additional fields) | ||
353 | * P_DATA, P_DATA_REPLY (see p_data) | ||
354 | * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) | ||
355 | */ | ||
356 | |||
357 | /* these defines must not be changed without changing the protocol version */ | ||
358 | #define DP_HARDBARRIER 1 | ||
359 | #define DP_RW_SYNC 2 | ||
360 | #define DP_MAY_SET_IN_SYNC 4 | ||
361 | |||
362 | struct p_data { | ||
363 | struct p_header head; | ||
364 | u64 sector; /* 64 bits sector number */ | ||
365 | u64 block_id; /* to identify the request in protocol B&C */ | ||
366 | u32 seq_num; | ||
367 | u32 dp_flags; | ||
368 | } __packed; | ||
369 | |||
370 | /* | ||
371 | * commands which share a struct: | ||
372 | * p_block_ack: | ||
373 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), | ||
374 | * P_DISCARD_ACK (proto C, two-primaries conflict detection) | ||
375 | * p_block_req: | ||
376 | * P_DATA_REQUEST, P_RS_DATA_REQUEST | ||
377 | */ | ||
378 | struct p_block_ack { | ||
379 | struct p_header head; | ||
380 | u64 sector; | ||
381 | u64 block_id; | ||
382 | u32 blksize; | ||
383 | u32 seq_num; | ||
384 | } __packed; | ||
385 | |||
386 | |||
387 | struct p_block_req { | ||
388 | struct p_header head; | ||
389 | u64 sector; | ||
390 | u64 block_id; | ||
391 | u32 blksize; | ||
392 | u32 pad; /* to multiple of 8 Byte */ | ||
393 | } __packed; | ||
394 | |||
395 | /* | ||
396 | * commands with their own struct for additional fields: | ||
397 | * P_HAND_SHAKE | ||
398 | * P_BARRIER | ||
399 | * P_BARRIER_ACK | ||
400 | * P_SYNC_PARAM | ||
401 | * ReportParams | ||
402 | */ | ||
403 | |||
404 | struct p_handshake { | ||
405 | struct p_header head; /* 8 bytes */ | ||
406 | u32 protocol_min; | ||
407 | u32 feature_flags; | ||
408 | u32 protocol_max; | ||
409 | |||
410 | /* should be more than enough for future enhancements | ||
411 | * for now, feature_flags and the reserverd array shall be zero. | ||
412 | */ | ||
413 | |||
414 | u32 _pad; | ||
415 | u64 reserverd[7]; | ||
416 | } __packed; | ||
417 | /* 80 bytes, FIXED for the next century */ | ||
418 | |||
419 | struct p_barrier { | ||
420 | struct p_header head; | ||
421 | u32 barrier; /* barrier number _handle_ only */ | ||
422 | u32 pad; /* to multiple of 8 Byte */ | ||
423 | } __packed; | ||
424 | |||
425 | struct p_barrier_ack { | ||
426 | struct p_header head; | ||
427 | u32 barrier; | ||
428 | u32 set_size; | ||
429 | } __packed; | ||
430 | |||
431 | struct p_rs_param { | ||
432 | struct p_header head; | ||
433 | u32 rate; | ||
434 | |||
435 | /* Since protocol version 88 and higher. */ | ||
436 | char verify_alg[0]; | ||
437 | } __packed; | ||
438 | |||
439 | struct p_rs_param_89 { | ||
440 | struct p_header head; | ||
441 | u32 rate; | ||
442 | /* protocol version 89: */ | ||
443 | char verify_alg[SHARED_SECRET_MAX]; | ||
444 | char csums_alg[SHARED_SECRET_MAX]; | ||
445 | } __packed; | ||
446 | |||
447 | struct p_protocol { | ||
448 | struct p_header head; | ||
449 | u32 protocol; | ||
450 | u32 after_sb_0p; | ||
451 | u32 after_sb_1p; | ||
452 | u32 after_sb_2p; | ||
453 | u32 want_lose; | ||
454 | u32 two_primaries; | ||
455 | |||
456 | /* Since protocol version 87 and higher. */ | ||
457 | char integrity_alg[0]; | ||
458 | |||
459 | } __packed; | ||
460 | |||
461 | struct p_uuids { | ||
462 | struct p_header head; | ||
463 | u64 uuid[UI_EXTENDED_SIZE]; | ||
464 | } __packed; | ||
465 | |||
466 | struct p_rs_uuid { | ||
467 | struct p_header head; | ||
468 | u64 uuid; | ||
469 | } __packed; | ||
470 | |||
471 | struct p_sizes { | ||
472 | struct p_header head; | ||
473 | u64 d_size; /* size of disk */ | ||
474 | u64 u_size; /* user requested size */ | ||
475 | u64 c_size; /* current exported size */ | ||
476 | u32 max_segment_size; /* Maximal size of a BIO */ | ||
477 | u32 queue_order_type; | ||
478 | } __packed; | ||
479 | |||
480 | struct p_state { | ||
481 | struct p_header head; | ||
482 | u32 state; | ||
483 | } __packed; | ||
484 | |||
485 | struct p_req_state { | ||
486 | struct p_header head; | ||
487 | u32 mask; | ||
488 | u32 val; | ||
489 | } __packed; | ||
490 | |||
491 | struct p_req_state_reply { | ||
492 | struct p_header head; | ||
493 | u32 retcode; | ||
494 | } __packed; | ||
495 | |||
496 | struct p_drbd06_param { | ||
497 | u64 size; | ||
498 | u32 state; | ||
499 | u32 blksize; | ||
500 | u32 protocol; | ||
501 | u32 version; | ||
502 | u32 gen_cnt[5]; | ||
503 | u32 bit_map_gen[5]; | ||
504 | } __packed; | ||
505 | |||
506 | struct p_discard { | ||
507 | struct p_header head; | ||
508 | u64 block_id; | ||
509 | u32 seq_num; | ||
510 | u32 pad; | ||
511 | } __packed; | ||
512 | |||
513 | /* Valid values for the encoding field. | ||
514 | * Bump proto version when changing this. */ | ||
515 | enum drbd_bitmap_code { | ||
516 | /* RLE_VLI_Bytes = 0, | ||
517 | * and other bit variants had been defined during | ||
518 | * algorithm evaluation. */ | ||
519 | RLE_VLI_Bits = 2, | ||
520 | }; | ||
521 | |||
522 | struct p_compressed_bm { | ||
523 | struct p_header head; | ||
524 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code | ||
525 | * (encoding & 0x80): polarity (set/unset) of first runlength | ||
526 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits | ||
527 | * used to pad up to head.length bytes | ||
528 | */ | ||
529 | u8 encoding; | ||
530 | |||
531 | u8 code[0]; | ||
532 | } __packed; | ||
533 | |||
534 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | ||
535 | static inline enum drbd_bitmap_code | ||
536 | DCBP_get_code(struct p_compressed_bm *p) | ||
537 | { | ||
538 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
539 | } | ||
540 | |||
541 | static inline void | ||
542 | DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) | ||
543 | { | ||
544 | BUG_ON(code & ~0xf); | ||
545 | p->encoding = (p->encoding & ~0xf) | code; | ||
546 | } | ||
547 | |||
548 | static inline int | ||
549 | DCBP_get_start(struct p_compressed_bm *p) | ||
550 | { | ||
551 | return (p->encoding & 0x80) != 0; | ||
552 | } | ||
553 | |||
554 | static inline void | ||
555 | DCBP_set_start(struct p_compressed_bm *p, int set) | ||
556 | { | ||
557 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
558 | } | ||
559 | |||
560 | static inline int | ||
561 | DCBP_get_pad_bits(struct p_compressed_bm *p) | ||
562 | { | ||
563 | return (p->encoding >> 4) & 0x7; | ||
564 | } | ||
565 | |||
566 | static inline void | ||
567 | DCBP_set_pad_bits(struct p_compressed_bm *p, int n) | ||
568 | { | ||
569 | BUG_ON(n & ~0x7); /* n must fit the 3 bit pad_bits field, encoding bits 4..6 */ | ||
570 | p->encoding = (p->encoding & ~(0x7 << 4)) | (n << 4); /* clear only bits 4..6: keeps code (bits 0..3) and start (bit 7) */ | ||
571 | } | ||
572 | |||
573 | /* one bitmap packet, including the p_header, | ||
574 | * should fit within one _architecture independent_ page. | ||
575 | * so we need to use the fixed size 4KiB page size | ||
576 | * most architectures have used for a long time. | ||
577 | */ | ||
578 | #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) | ||
579 | #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) | ||
580 | #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) | ||
581 | #if (PAGE_SIZE < 4096) | ||
582 | /* drbd_send_bitmap / receive_bitmap would break horribly */ | ||
583 | #error "PAGE_SIZE too small" | ||
584 | #endif | ||
585 | |||
586 | union p_polymorph { | ||
587 | struct p_header header; | ||
588 | struct p_handshake handshake; | ||
589 | struct p_data data; | ||
590 | struct p_block_ack block_ack; | ||
591 | struct p_barrier barrier; | ||
592 | struct p_barrier_ack barrier_ack; | ||
593 | struct p_rs_param_89 rs_param_89; | ||
594 | struct p_protocol protocol; | ||
595 | struct p_sizes sizes; | ||
596 | struct p_uuids uuids; | ||
597 | struct p_state state; | ||
598 | struct p_req_state req_state; | ||
599 | struct p_req_state_reply req_state_reply; | ||
600 | struct p_block_req block_req; | ||
601 | } __packed; | ||
602 | |||
603 | /**********************************************************************/ | ||
604 | enum drbd_thread_state { | ||
605 | None, | ||
606 | Running, | ||
607 | Exiting, | ||
608 | Restarting | ||
609 | }; | ||
610 | |||
611 | struct drbd_thread { | ||
612 | spinlock_t t_lock; | ||
613 | struct task_struct *task; | ||
614 | struct completion stop; | ||
615 | enum drbd_thread_state t_state; | ||
616 | int (*function) (struct drbd_thread *); | ||
617 | struct drbd_conf *mdev; | ||
618 | int reset_cpu_mask; | ||
619 | }; | ||
620 | |||
621 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | ||
622 | { | ||
623 | /* THINK testing the t_state seems to be uncritical in all cases | ||
624 | * (but thread_{start,stop}), so we can read it *without* the lock. | ||
625 | * --lge */ | ||
626 | |||
627 | smp_rmb(); | ||
628 | return thi->t_state; | ||
629 | } | ||
630 | |||
631 | |||
632 | /* | ||
633 | * Having this as the first member of a struct provides sort of "inheritance". | ||
634 | * "derived" structs can be "drbd_queue_work()"ed. | ||
635 | * The callback should know and cast back to the descendant struct. | ||
636 | * drbd_request and drbd_epoch_entry are descendants of drbd_work. | ||
637 | */ | ||
638 | struct drbd_work; | ||
639 | typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); | ||
640 | struct drbd_work { | ||
641 | struct list_head list; | ||
642 | drbd_work_cb cb; | ||
643 | }; | ||
644 | |||
645 | struct drbd_tl_epoch; | ||
646 | struct drbd_request { | ||
647 | struct drbd_work w; | ||
648 | struct drbd_conf *mdev; | ||
649 | |||
650 | /* if local IO is not allowed, will be NULL. | ||
651 | * if local IO _is_ allowed, holds the locally submitted bio clone, | ||
652 | * or, after local IO completion, the ERR_PTR(error). | ||
653 | * see drbd_endio_pri(). */ | ||
654 | struct bio *private_bio; | ||
655 | |||
656 | struct hlist_node colision; | ||
657 | sector_t sector; | ||
658 | unsigned int size; | ||
659 | unsigned int epoch; /* barrier_nr */ | ||
660 | |||
661 | /* barrier_nr: used to check on "completion" whether this req was in | ||
662 | * the current epoch, and we therefore have to close it, | ||
663 | * starting a new epoch... | ||
664 | */ | ||
665 | |||
666 | /* up to here, the struct layout is identical to drbd_epoch_entry; | ||
667 | * we might be able to use that to our advantage... */ | ||
668 | |||
669 | struct list_head tl_requests; /* ring list in the transfer log */ | ||
670 | struct bio *master_bio; /* master bio pointer */ | ||
671 | unsigned long rq_state; /* see comments above _req_mod() */ | ||
672 | int seq_num; | ||
673 | unsigned long start_time; | ||
674 | }; | ||
675 | |||
676 | struct drbd_tl_epoch { | ||
677 | struct drbd_work w; | ||
678 | struct list_head requests; /* requests before */ | ||
679 | struct drbd_tl_epoch *next; /* pointer to the next barrier */ | ||
680 | unsigned int br_number; /* the barriers identifier. */ | ||
681 | int n_req; /* number of requests attached before this barrier */ | ||
682 | }; | ||
683 | |||
684 | struct drbd_request; | ||
685 | |||
686 | /* These Tl_epoch_entries may be in one of the following lists: | ||
687 | active_ee .. data packet being written | ||
688 | sync_ee .. syncer block being written | ||
689 | done_ee .. block written, need to send P_WRITE_ACK | ||
690 | read_ee .. [RS]P_DATA_REQUEST being read | ||
691 | */ | ||
692 | |||
693 | struct drbd_epoch { | ||
694 | struct list_head list; | ||
695 | unsigned int barrier_nr; | ||
696 | atomic_t epoch_size; /* increased on every request added. */ | ||
697 | atomic_t active; /* increased on every req. added, and dec on every finished. */ | ||
698 | unsigned long flags; | ||
699 | }; | ||
700 | |||
701 | /* drbd_epoch flag bits */ | ||
702 | enum { | ||
703 | DE_BARRIER_IN_NEXT_EPOCH_ISSUED, | ||
704 | DE_BARRIER_IN_NEXT_EPOCH_DONE, | ||
705 | DE_CONTAINS_A_BARRIER, | ||
706 | DE_HAVE_BARRIER_NUMBER, | ||
707 | DE_IS_FINISHING, | ||
708 | }; | ||
709 | |||
710 | enum epoch_event { | ||
711 | EV_PUT, | ||
712 | EV_GOT_BARRIER_NR, | ||
713 | EV_BARRIER_DONE, | ||
714 | EV_BECAME_LAST, | ||
715 | EV_TRACE_FLUSH, /* TRACE_ are not real events, only used for tracing */ | ||
716 | EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */ | ||
717 | EV_TRACE_SETTING_BI, /* Barrier is expressed with the first write of the next epoch */ | ||
718 | EV_TRACE_ALLOC, | ||
719 | EV_TRACE_FREE, | ||
720 | EV_CLEANUP = 32, /* used as flag */ | ||
721 | }; | ||
722 | |||
723 | struct drbd_epoch_entry { | ||
724 | struct drbd_work w; | ||
725 | struct drbd_conf *mdev; | ||
726 | struct bio *private_bio; | ||
727 | struct hlist_node colision; | ||
728 | sector_t sector; | ||
729 | unsigned int size; | ||
730 | struct drbd_epoch *epoch; | ||
731 | |||
732 | /* up to here, the struct layout is identical to drbd_request; | ||
733 | * we might be able to use that to our advantage... */ | ||
734 | |||
735 | unsigned int flags; | ||
736 | u64 block_id; | ||
737 | }; | ||
738 | |||
739 | struct drbd_wq_barrier { | ||
740 | struct drbd_work w; | ||
741 | struct completion done; | ||
742 | }; | ||
743 | |||
744 | struct digest_info { | ||
745 | int digest_size; | ||
746 | void *digest; | ||
747 | }; | ||
748 | |||
749 | /* ee flag bits */ | ||
750 | enum { | ||
751 | __EE_CALL_AL_COMPLETE_IO, | ||
752 | __EE_CONFLICT_PENDING, | ||
753 | __EE_MAY_SET_IN_SYNC, | ||
754 | __EE_IS_BARRIER, | ||
755 | }; | ||
756 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | ||
757 | #define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) | ||
758 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | ||
759 | #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) | ||
760 | |||
761 | /* global flag bits */ | ||
762 | enum { | ||
763 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
764 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
765 | SEND_PING, /* whether asender should send a ping asap */ | ||
766 | |||
767 | STOP_SYNC_TIMER, /* tell timer to cancel itself */ | ||
768 | UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ | ||
769 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ | ||
770 | MD_DIRTY, /* current uuids and flags not yet on disk */ | ||
771 | DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ | ||
772 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ | ||
773 | CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ | ||
774 | CL_ST_CHG_SUCCESS, | ||
775 | CL_ST_CHG_FAIL, | ||
776 | CRASHED_PRIMARY, /* This node was a crashed primary. | ||
777 | * Gets cleared when the state.conn | ||
778 | * goes into C_CONNECTED state. */ | ||
779 | WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ | ||
780 | NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ | ||
781 | CONSIDER_RESYNC, | ||
782 | |||
783 | MD_NO_BARRIER, /* meta data device does not support barriers, | ||
784 | so don't even try */ | ||
785 | SUSPEND_IO, /* suspend application io */ | ||
786 | BITMAP_IO, /* suspend application io; | ||
787 | once no more io in flight, start bitmap io */ | ||
788 | BITMAP_IO_QUEUED, /* Started bitmap IO */ | ||
789 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | ||
790 | NET_CONGESTED, /* The data socket is congested */ | ||
791 | |||
792 | CONFIG_PENDING, /* serialization of (re)configuration requests. | ||
793 | * if set, also prevents the device from dying */ | ||
794 | DEVICE_DYING, /* device became unconfigured, | ||
795 | * but worker thread is still handling the cleanup. | ||
796 | * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed, | ||
797 | * while this is set. */ | ||
798 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from | ||
799 | * the peer, if it changed there as well. */ | ||
800 | }; | ||
801 | |||
802 | struct drbd_bitmap; /* opaque for drbd_conf */ | ||
803 | |||
804 | /* TODO sort members for performance | ||
805 | * MAYBE group them further */ | ||
806 | |||
807 | /* THINK maybe we actually want to use the default "event/%s" worker threads | ||
808 | * or similar in linux 2.6, which uses per cpu data and threads. | ||
809 | * | ||
810 | * To be general, this might need a spin_lock member. | ||
811 | * For now, please use the mdev->req_lock to protect list_head, | ||
812 | * see drbd_queue_work below. | ||
813 | */ | ||
814 | struct drbd_work_queue { | ||
815 | struct list_head q; | ||
816 | struct semaphore s; /* producers up it, worker down()s it */ | ||
817 | spinlock_t q_lock; /* to protect the list. */ | ||
818 | }; | ||
819 | |||
820 | struct drbd_socket { | ||
821 | struct drbd_work_queue work; | ||
822 | struct mutex mutex; | ||
823 | struct socket *socket; | ||
824 | /* this way we get our | ||
825 | * send/receive buffers off the stack */ | ||
826 | union p_polymorph sbuf; | ||
827 | union p_polymorph rbuf; | ||
828 | }; | ||
829 | |||
830 | struct drbd_md { | ||
831 | u64 md_offset; /* sector offset to 'super' block */ | ||
832 | |||
833 | u64 la_size_sect; /* last agreed size, unit sectors */ | ||
834 | u64 uuid[UI_SIZE]; | ||
835 | u64 device_uuid; | ||
836 | u32 flags; | ||
837 | u32 md_size_sect; | ||
838 | |||
839 | s32 al_offset; /* signed relative sector offset to al area */ | ||
840 | s32 bm_offset; /* signed relative sector offset to bitmap */ | ||
841 | |||
842 | /* u32 al_nr_extents; important for restoring the AL | ||
843 | * is stored into sync_conf.al_extents, which in turn | ||
844 | * gets applied to act_log->nr_elements | ||
845 | */ | ||
846 | }; | ||
847 | |||
848 | /* for sync_conf and other types... */ | ||
849 | #define NL_PACKET(name, number, fields) struct name { fields }; | ||
850 | #define NL_INTEGER(pn,pr,member) int member; | ||
851 | #define NL_INT64(pn,pr,member) __u64 member; | ||
852 | #define NL_BIT(pn,pr,member) unsigned member:1; | ||
853 | #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; | ||
854 | #include "linux/drbd_nl.h" | ||
855 | |||
856 | struct drbd_backing_dev { | ||
857 | struct block_device *backing_bdev; | ||
858 | struct block_device *md_bdev; | ||
859 | struct file *lo_file; | ||
860 | struct file *md_file; | ||
861 | struct drbd_md md; | ||
862 | struct disk_conf dc; /* The user provided config... */ | ||
863 | sector_t known_size; /* last known size of that backing device */ | ||
864 | }; | ||
865 | |||
866 | struct drbd_md_io { | ||
867 | struct drbd_conf *mdev; | ||
868 | struct completion event; | ||
869 | int error; | ||
870 | }; | ||
871 | |||
872 | struct bm_io_work { | ||
873 | struct drbd_work w; | ||
874 | char *why; | ||
875 | int (*io_fn)(struct drbd_conf *mdev); | ||
876 | void (*done)(struct drbd_conf *mdev, int rv); | ||
877 | }; | ||
878 | |||
879 | enum write_ordering_e { | ||
880 | WO_none, | ||
881 | WO_drain_io, | ||
882 | WO_bdev_flush, | ||
883 | WO_bio_barrier | ||
884 | }; | ||
885 | |||
886 | struct drbd_conf { | ||
887 | /* things that are stored as / read from meta data on disk */ | ||
888 | unsigned long flags; | ||
889 | |||
890 | /* configured by drbdsetup */ | ||
891 | struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ | ||
892 | struct syncer_conf sync_conf; | ||
893 | struct drbd_backing_dev *ldev __protected_by(local); | ||
894 | |||
895 | sector_t p_size; /* partner's disk size */ | ||
896 | struct request_queue *rq_queue; | ||
897 | struct block_device *this_bdev; | ||
898 | struct gendisk *vdisk; | ||
899 | |||
900 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | ||
901 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
902 | int agreed_pro_version; /* actually used protocol version */ | ||
903 | unsigned long last_received; /* in jiffies, either socket */ | ||
904 | unsigned int ko_count; | ||
905 | struct drbd_work resync_work, | ||
906 | unplug_work, | ||
907 | md_sync_work; | ||
908 | struct timer_list resync_timer; | ||
909 | struct timer_list md_sync_timer; | ||
910 | |||
911 | /* Used after attach while negotiating new disk state. */ | ||
912 | union drbd_state new_state_tmp; | ||
913 | |||
914 | union drbd_state state; | ||
915 | wait_queue_head_t misc_wait; | ||
916 | wait_queue_head_t state_wait; /* upon each state change. */ | ||
917 | unsigned int send_cnt; | ||
918 | unsigned int recv_cnt; | ||
919 | unsigned int read_cnt; | ||
920 | unsigned int writ_cnt; | ||
921 | unsigned int al_writ_cnt; | ||
922 | unsigned int bm_writ_cnt; | ||
923 | atomic_t ap_bio_cnt; /* Requests we need to complete */ | ||
924 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ | ||
925 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ | ||
926 | atomic_t unacked_cnt; /* Need to send replies for */ | ||
927 | atomic_t local_cnt; /* Waiting for local completion */ | ||
928 | atomic_t net_cnt; /* Users of net_conf */ | ||
929 | spinlock_t req_lock; | ||
930 | struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ | ||
931 | struct drbd_tl_epoch *newest_tle; | ||
932 | struct drbd_tl_epoch *oldest_tle; | ||
933 | struct list_head out_of_sequence_requests; | ||
934 | struct hlist_head *tl_hash; | ||
935 | unsigned int tl_hash_s; | ||
936 | |||
937 | /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ | ||
938 | unsigned long rs_total; | ||
939 | /* number of sync IOs that failed in this run */ | ||
940 | unsigned long rs_failed; | ||
941 | /* Syncer's start time [unit jiffies] */ | ||
942 | unsigned long rs_start; | ||
943 | /* cumulated time in PausedSyncX state [unit jiffies] */ | ||
944 | unsigned long rs_paused; | ||
945 | /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ | ||
946 | unsigned long rs_mark_left; | ||
947 | /* mark's time [unit jiffies] */ | ||
948 | unsigned long rs_mark_time; | ||
949 | /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ | ||
950 | unsigned long rs_same_csum; | ||
951 | |||
952 | /* where does the admin want us to start? (sector) */ | ||
953 | sector_t ov_start_sector; | ||
954 | /* where are we now? (sector) */ | ||
955 | sector_t ov_position; | ||
956 | /* Start sector of out of sync range (to merge printk reporting). */ | ||
957 | sector_t ov_last_oos_start; | ||
958 | /* size of out-of-sync range in sectors. */ | ||
959 | sector_t ov_last_oos_size; | ||
960 | unsigned long ov_left; /* in bits */ | ||
961 | struct crypto_hash *csums_tfm; | ||
962 | struct crypto_hash *verify_tfm; | ||
963 | |||
964 | struct drbd_thread receiver; | ||
965 | struct drbd_thread worker; | ||
966 | struct drbd_thread asender; | ||
967 | struct drbd_bitmap *bitmap; | ||
968 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ | ||
969 | |||
970 | /* Used to track operations of resync... */ | ||
971 | struct lru_cache *resync; | ||
972 | /* Number of locked elements in resync LRU */ | ||
973 | unsigned int resync_locked; | ||
974 | /* resync extent number waiting for application requests */ | ||
975 | unsigned int resync_wenr; | ||
976 | |||
977 | int open_cnt; | ||
978 | u64 *p_uuid; | ||
979 | struct drbd_epoch *current_epoch; | ||
980 | spinlock_t epoch_lock; | ||
981 | unsigned int epochs; | ||
982 | enum write_ordering_e write_ordering; | ||
983 | struct list_head active_ee; /* IO in progress */ | ||
984 | struct list_head sync_ee; /* IO in progress */ | ||
985 | struct list_head done_ee; /* send ack */ | ||
986 | struct list_head read_ee; /* IO in progress */ | ||
987 | struct list_head net_ee; /* zero-copy network send in progress */ | ||
988 | struct hlist_head *ee_hash; /* is protected by req_lock! */ | ||
989 | unsigned int ee_hash_s; | ||
990 | |||
991 | /* this one is protected by ee_lock, single thread */ | ||
992 | struct drbd_epoch_entry *last_write_w_barrier; | ||
993 | |||
994 | int next_barrier_nr; | ||
995 | struct hlist_head *app_reads_hash; /* is protected by req_lock */ | ||
996 | struct list_head resync_reads; | ||
997 | atomic_t pp_in_use; | ||
998 | wait_queue_head_t ee_wait; | ||
999 | struct page *md_io_page; /* one page buffer for md_io */ | ||
1000 | struct page *md_io_tmpp; /* for logical_block_size != 512 */ | ||
1001 | struct mutex md_io_mutex; /* protects the md_io_buffer */ | ||
1002 | spinlock_t al_lock; | ||
1003 | wait_queue_head_t al_wait; | ||
1004 | struct lru_cache *act_log; /* activity log */ | ||
1005 | unsigned int al_tr_number; | ||
1006 | int al_tr_cycle; | ||
1007 | int al_tr_pos; /* position of the next transaction in the journal */ | ||
1008 | struct crypto_hash *cram_hmac_tfm; | ||
1009 | struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ | ||
1010 | struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ | ||
1011 | void *int_dig_out; | ||
1012 | void *int_dig_in; | ||
1013 | void *int_dig_vv; | ||
1014 | wait_queue_head_t seq_wait; | ||
1015 | atomic_t packet_seq; | ||
1016 | unsigned int peer_seq; | ||
1017 | spinlock_t peer_seq_lock; | ||
1018 | unsigned int minor; | ||
1019 | unsigned long comm_bm_set; /* communicated number of set bits. */ | ||
1020 | cpumask_var_t cpu_mask; | ||
1021 | struct bm_io_work bm_io_work; | ||
1022 | u64 ed_uuid; /* UUID of the exposed data */ | ||
1023 | struct mutex state_mutex; | ||
1024 | char congestion_reason; /* Why we were congested... */ | ||
1025 | }; | ||
1026 | |||
1027 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | ||
1028 | { | ||
1029 | /* minors at or beyond minor_count have no slot in minor_table */ | ||
1030 | if (minor >= minor_count) | ||
1031 | return NULL; | ||
1032 | |||
1033 | return minor_table[minor]; | ||
1034 | } | ||
1035 | |||
1036 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | ||
1037 | { | ||
1038 | return mdev->minor; | ||
1039 | } | ||
1040 | |||
1041 | /* returns 1 if it was successful, | ||
1042 | * returns 0 if there was no data socket. | ||
1043 | * so wherever you are going to use the data.socket, e.g. do | ||
1044 | * if (!drbd_get_data_sock(mdev)) | ||
1045 | * return 0; | ||
1046 | * CODE(); | ||
1047 | * drbd_put_data_sock(mdev); | ||
1048 | */ | ||
1049 | static inline int drbd_get_data_sock(struct drbd_conf *mdev) | ||
1050 | { | ||
1051 | mutex_lock(&mdev->data.mutex); | ||
1052 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1053 | * while we were waiting for data.mutex in mutex_lock()... */ | ||
1054 | if (unlikely(mdev->data.socket == NULL)) { | ||
1055 | mutex_unlock(&mdev->data.mutex); | ||
1056 | return 0; | ||
1057 | } | ||
1058 | return 1; /* data.mutex stays locked; drbd_put_data_sock() unlocks it */ | ||
1059 | } | ||
1060 | |||
1061 | static inline void drbd_put_data_sock(struct drbd_conf *mdev) | ||
1062 | { | ||
1063 | mutex_unlock(&mdev->data.mutex); | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * function declarations | ||
1068 | *************************/ | ||
1069 | |||
1070 | /* drbd_main.c */ | ||
1071 | |||
1072 | enum chg_state_flags { | ||
1073 | CS_HARD = 1, | ||
1074 | CS_VERBOSE = 2, | ||
1075 | CS_WAIT_COMPLETE = 4, | ||
1076 | CS_SERIALIZE = 8, | ||
1077 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
1078 | }; | ||
1079 | |||
1080 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | ||
1081 | extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
1082 | union drbd_state mask, union drbd_state val); | ||
1083 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
1084 | union drbd_state); | ||
1085 | extern int _drbd_request_state(struct drbd_conf *, union drbd_state, | ||
1086 | union drbd_state, enum chg_state_flags); | ||
1087 | extern int __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
1088 | enum chg_state_flags, struct completion *done); | ||
1089 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
1090 | union drbd_state, int); | ||
1091 | extern int drbd_thread_start(struct drbd_thread *thi); | ||
1092 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); | ||
1093 | #ifdef CONFIG_SMP | ||
1094 | extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); | ||
1095 | extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); | ||
1096 | #else | ||
1097 | #define drbd_thread_current_set_cpu(A) ({}) | ||
1098 | #define drbd_calc_cpu_mask(A) ({}) | ||
1099 | #endif | ||
1100 | extern void drbd_free_resources(struct drbd_conf *mdev); | ||
1101 | extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
1102 | unsigned int set_size); | ||
1103 | extern void tl_clear(struct drbd_conf *mdev); | ||
1104 | extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); | ||
1105 | extern void drbd_free_sock(struct drbd_conf *mdev); | ||
1106 | extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, | ||
1107 | void *buf, size_t size, unsigned msg_flags); | ||
1108 | extern int drbd_send_protocol(struct drbd_conf *mdev); | ||
1109 | extern int drbd_send_uuids(struct drbd_conf *mdev); | ||
1110 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | ||
1111 | extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); | ||
1112 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); | ||
1113 | extern int _drbd_send_state(struct drbd_conf *mdev); | ||
1114 | extern int drbd_send_state(struct drbd_conf *mdev); | ||
1115 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | ||
1116 | enum drbd_packets cmd, struct p_header *h, | ||
1117 | size_t size, unsigned msg_flags); | ||
1118 | #define USE_DATA_SOCKET 1 | ||
1119 | #define USE_META_SOCKET 0 | ||
1120 | extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1121 | enum drbd_packets cmd, struct p_header *h, | ||
1122 | size_t size); | ||
1123 | extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1124 | char *data, size_t size); | ||
1125 | extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); | ||
1126 | extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, | ||
1127 | u32 set_size); | ||
1128 | extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1129 | struct drbd_epoch_entry *e); | ||
1130 | extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1131 | struct p_block_req *rp); | ||
1132 | extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1133 | struct p_data *dp); | ||
1134 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1135 | sector_t sector, int blksize, u64 block_id); | ||
1136 | extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1137 | struct drbd_epoch_entry *e); | ||
1138 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); | ||
1139 | extern int _drbd_send_barrier(struct drbd_conf *mdev, | ||
1140 | struct drbd_tl_epoch *barrier); | ||
1141 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | ||
1142 | sector_t sector, int size, u64 block_id); | ||
1143 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, | ||
1144 | sector_t sector,int size, | ||
1145 | void *digest, int digest_size, | ||
1146 | enum drbd_packets cmd); | ||
1147 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); | ||
1148 | |||
1149 | extern int drbd_send_bitmap(struct drbd_conf *mdev); | ||
1150 | extern int _drbd_send_bitmap(struct drbd_conf *mdev); | ||
1151 | extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); | ||
1152 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); | ||
1153 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | ||
1154 | |||
1155 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
1156 | extern void drbd_md_sync(struct drbd_conf *mdev); | ||
1157 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | ||
1158 | /* maybe define them below as inline? */ | ||
1159 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1160 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1161 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1162 | extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1163 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); | ||
1164 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); | ||
1165 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); | ||
1166 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); | ||
1167 | extern void drbd_md_mark_dirty(struct drbd_conf *mdev); | ||
1168 | extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, | ||
1169 | int (*io_fn)(struct drbd_conf *), | ||
1170 | void (*done)(struct drbd_conf *, int), | ||
1171 | char *why); | ||
1172 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | ||
1173 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | ||
1174 | extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); | ||
1175 | |||
1176 | |||
1177 | /* Meta data layout | ||
1178 | We reserve a 128MB Block (4k aligned) | ||
1179 | * either at the end of the backing device | ||
1180 | * or on a separate meta data device. */ | ||
1181 | |||
/* Unless noted otherwise, the following numbers are in units of sectors. */
#define MD_RESERVED_SECT	(128LU << 11)	/* 128 MB, unit sectors */
#define MD_AL_OFFSET		8	/* 8 sectors after start of meta area */
#define MD_AL_MAX_SIZE		64	/* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
/* The bitmap area starts right behind the activity log area.
 * Allows up to about 3.8TB */
#define MD_BM_OFFSET		(MD_AL_OFFSET + MD_AL_MAX_SIZE)

/* The smallest IO unit is usually 512 bytes */
#define MD_SECTOR_SHIFT		9
#define MD_SECTOR_SIZE		(1 << MD_SECTOR_SHIFT)

/* activity log */
/* 61; extents per 512B sector */
#define AL_EXTENTS_PT		((MD_SECTOR_SIZE - 12) / 8 - 1)
/* one extent represents 4M of storage */
#define AL_EXTENT_SHIFT		22
#define AL_EXTENT_SIZE		(1 << AL_EXTENT_SHIFT)
1197 | |||
1198 | #if BITS_PER_LONG == 32 | ||
1199 | #define LN2_BPL 5 | ||
1200 | #define cpu_to_lel(A) cpu_to_le32(A) | ||
1201 | #define lel_to_cpu(A) le32_to_cpu(A) | ||
1202 | #elif BITS_PER_LONG == 64 | ||
1203 | #define LN2_BPL 6 | ||
1204 | #define cpu_to_lel(A) cpu_to_le64(A) | ||
1205 | #define lel_to_cpu(A) le64_to_cpu(A) | ||
1206 | #else | ||
1207 | #error "LN2 of BITS_PER_LONG unknown!" | ||
1208 | #endif | ||
1209 | |||
1210 | /* resync bitmap */ | ||
1211 | /* 16MB sized 'bitmap extent' to track syncer usage */ | ||
1212 | struct bm_extent { | ||
1213 | int rs_left; /* number of bits set (out of sync) in this extent. */ | ||
1214 | int rs_failed; /* number of failed resync requests in this extent. */ | ||
1215 | unsigned long flags; | ||
1216 | struct lc_element lce; | ||
1217 | }; | ||
1218 | |||
1219 | #define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ | ||
1220 | #define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ | ||
1221 | |||
1222 | /* drbd_bitmap.c */ | ||
1223 | /* | ||
1224 | * We need to store one bit for a block. | ||
1225 | * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. | ||
1226 | * Bit 0 ==> local node thinks this block is binary identical on both nodes | ||
1227 | * Bit 1 ==> local node thinks this block needs to be synced. | ||
1228 | */ | ||
1229 | |||
1230 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | ||
1231 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) | ||
1232 | /* (9+3) : 512 bytes @ 8 bits; representing 16M storage | ||
1233 | * per sector of on disk bitmap */ | ||
1234 | #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ | ||
1235 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) | ||
1236 | |||
1237 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) | ||
1238 | #error "HAVE YOU FIXED drbdmeta AS WELL??" | ||
1239 | #endif | ||
1240 | |||
/* this many _storage_ sectors are described by one bit */
#define BM_SECT_TO_BIT(x)	((x) >> (BM_BLOCK_SHIFT - 9))
#define BM_BIT_TO_SECT(x)	((sector_t)(x) << (BM_BLOCK_SHIFT - 9))
#define BM_SECT_PER_BIT		BM_BIT_TO_SECT(1)

/* number of bits to the number of kilo bytes they represent */
#define Bit2KB(bits)		((bits) << (BM_BLOCK_SHIFT - 10))

/* the _bitmap_ extent (resp. sector) in which the bit for a certain
 * _storage_ sector is located */
#define BM_SECT_TO_EXT(x)	((x) >> (BM_EXT_SHIFT - 9))

/* how many _storage_ sectors we have per bitmap sector */
#define BM_EXT_TO_SECT(x)	((sector_t)(x) << (BM_EXT_SHIFT - 9))
#define BM_SECT_PER_EXT		BM_EXT_TO_SECT(1)

/* one sector of the bitmap covers this many activity_log extents */
#define AL_EXT_PER_BM_SECT	(1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
#define BM_WORDS_PER_AL_EXT	(1 << (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT - LN2_BPL))

/* shift and mask derived from BM_EXT_SHIFT - BM_BLOCK_SHIFT */
#define BM_BLOCKS_PER_BM_EXT_B		(BM_EXT_SHIFT - BM_BLOCK_SHIFT)
#define BM_BLOCKS_PER_BM_EXT_MASK	((1 << BM_BLOCKS_PER_BM_EXT_B) - 1)
1263 | |||
1264 | /* the extent in "PER_EXTENT" below is an activity log extent | ||
1265 | * we need that many (long words/bytes) to store the bitmap | ||
1266 | * of one AL_EXTENT_SIZE chunk of storage. | ||
1267 | * we can store the bitmap for that many AL_EXTENTS within | ||
1268 | * one sector of the _on_disk_ bitmap: | ||
1269 | * bit 0 bit 37 bit 38 bit (512*8)-1 | ||
1270 | * ...|........|........|.. // ..|........| | ||
1271 | * sect. 0 `296 `304 ^(512*8*8)-1 | ||
1272 | * | ||
1273 | #define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) | ||
1274 | #define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 | ||
1275 | #define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 | ||
1276 | */ | ||
1277 | |||
#define DRBD_MAX_SECTORS_32	(0xffffffffLU)
#define DRBD_MAX_SECTORS_BM \
	((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL << (BM_EXT_SHIFT - 9)))

#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
/* the bitmap derived limit is below the 32 bit limit */
# define DRBD_MAX_SECTORS	DRBD_MAX_SECTORS_BM
# define DRBD_MAX_SECTORS_FLEX	DRBD_MAX_SECTORS_BM
#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
/* 32 bit longs and CONFIG_LBD not set */
# define DRBD_MAX_SECTORS	DRBD_MAX_SECTORS_32
# define DRBD_MAX_SECTORS_FLEX	DRBD_MAX_SECTORS_32
#else
# define DRBD_MAX_SECTORS	DRBD_MAX_SECTORS_BM
/* 16 TB in units of sectors */
# if BITS_PER_LONG == 32
/* adjust by one page worth of bitmap,
 * so we won't wrap around in drbd_bm_find_next_bit.
 * you should use 64bit OS for that much storage, anyways. */
#  define DRBD_MAX_SECTORS_FLEX	BM_BIT_TO_SECT(0xffff7fff)
# else
#  define DRBD_MAX_SECTORS_FLEX	BM_BIT_TO_SECT(0x1LU << 32)
# endif
#endif
1299 | |||
/*
 * Sector shift value used by the "hash" functions of the tl_hash and
 * ee_hash tables.  With a value of 6, all IO within one 32K block ends up
 * in the same slot of the hash table.
 */
#define HT_SHIFT		6
#define DRBD_MAX_SEGMENT_SIZE	(1U << (9 + HT_SHIFT))

/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE		15
1308 | |||
1309 | extern int drbd_bm_init(struct drbd_conf *mdev); | ||
1310 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); | ||
1311 | extern void drbd_bm_cleanup(struct drbd_conf *mdev); | ||
1312 | extern void drbd_bm_set_all(struct drbd_conf *mdev); | ||
1313 | extern void drbd_bm_clear_all(struct drbd_conf *mdev); | ||
1314 | extern int drbd_bm_set_bits( | ||
1315 | struct drbd_conf *mdev, unsigned long s, unsigned long e); | ||
1316 | extern int drbd_bm_clear_bits( | ||
1317 | struct drbd_conf *mdev, unsigned long s, unsigned long e); | ||
1318 | /* bm_set_bits variant for use while holding drbd_bm_lock */ | ||
1319 | extern void _drbd_bm_set_bits(struct drbd_conf *mdev, | ||
1320 | const unsigned long s, const unsigned long e); | ||
1321 | extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); | ||
1322 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | ||
1323 | extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); | ||
1324 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | ||
1325 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | ||
1326 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | ||
1327 | unsigned long al_enr); | ||
1328 | extern size_t drbd_bm_words(struct drbd_conf *mdev); | ||
1329 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); | ||
1330 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); | ||
1331 | extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1332 | /* bm_find_next variants for use while you hold drbd_bm_lock() */ | ||
1333 | extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1334 | extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); | ||
1335 | extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); | ||
1336 | extern int drbd_bm_rs_done(struct drbd_conf *mdev); | ||
1337 | /* for receive_bitmap */ | ||
1338 | extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, | ||
1339 | size_t number, unsigned long *buffer); | ||
1340 | /* for _drbd_send_bitmap and drbd_bm_write_sect */ | ||
1341 | extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, | ||
1342 | size_t number, unsigned long *buffer); | ||
1343 | |||
1344 | extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); | ||
1345 | extern void drbd_bm_unlock(struct drbd_conf *mdev); | ||
1346 | |||
1347 | extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); | ||
1348 | /* drbd_main.c */ | ||
1349 | |||
1350 | extern struct kmem_cache *drbd_request_cache; | ||
1351 | extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ | ||
1352 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | ||
1353 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | ||
1354 | extern mempool_t *drbd_request_mempool; | ||
1355 | extern mempool_t *drbd_ee_mempool; | ||
1356 | |||
1357 | extern struct page *drbd_pp_pool; /* drbd's page pool */ | ||
1358 | extern spinlock_t drbd_pp_lock; | ||
1359 | extern int drbd_pp_vacant; | ||
1360 | extern wait_queue_head_t drbd_pp_wait; | ||
1361 | |||
1362 | extern rwlock_t global_state_lock; | ||
1363 | |||
1364 | extern struct drbd_conf *drbd_new_device(unsigned int minor); | ||
1365 | extern void drbd_free_mdev(struct drbd_conf *mdev); | ||
1366 | |||
1367 | extern int proc_details; | ||
1368 | |||
1369 | /* drbd_req */ | ||
1370 | extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); | ||
1371 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | ||
1372 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); | ||
1373 | extern int is_valid_ar_handle(struct drbd_request *, sector_t); | ||
1374 | |||
1375 | |||
1376 | /* drbd_nl.c */ | ||
1377 | extern void drbd_suspend_io(struct drbd_conf *mdev); | ||
1378 | extern void drbd_resume_io(struct drbd_conf *mdev); | ||
1379 | extern char *ppsize(char *buf, unsigned long long size); | ||
1380 | extern sector_t drbd_new_dev_size(struct drbd_conf *, | ||
1381 | struct drbd_backing_dev *); | ||
1382 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | ||
1383 | extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local); | ||
1384 | extern void resync_after_online_grow(struct drbd_conf *); | ||
1385 | extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); | ||
1386 | extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, | ||
1387 | int force); | ||
1388 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); | ||
1389 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); | ||
1390 | |||
1391 | /* drbd_worker.c */ | ||
1392 | extern int drbd_worker(struct drbd_thread *thi); | ||
1393 | extern int drbd_alter_sa(struct drbd_conf *mdev, int na); | ||
1394 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); | ||
1395 | extern void resume_next_sg(struct drbd_conf *mdev); | ||
1396 | extern void suspend_other_sg(struct drbd_conf *mdev); | ||
1397 | extern int drbd_resync_finished(struct drbd_conf *mdev); | ||
1398 | /* maybe rather drbd_main.c ? */ | ||
1399 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, | ||
1400 | struct drbd_backing_dev *bdev, sector_t sector, int rw); | ||
1401 | extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); | ||
1402 | |||
1403 | static inline void ov_oos_print(struct drbd_conf *mdev) | ||
1404 | { | ||
1405 | if (mdev->ov_last_oos_size) { | ||
1406 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", | ||
1407 | (unsigned long long)mdev->ov_last_oos_start, | ||
1408 | (unsigned long)mdev->ov_last_oos_size); | ||
1409 | } | ||
1410 | mdev->ov_last_oos_size=0; | ||
1411 | } | ||
1412 | |||
1413 | |||
void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);

/* worker callbacks; all share one signature */
int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
int w_io_error(struct drbd_conf *, struct drbd_work *, int);
int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);

void resync_timer_fn(unsigned long data);
1436 | |||
1437 | /* drbd_receiver.c */ | ||
1438 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | ||
1439 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | ||
1440 | u64 id, | ||
1441 | sector_t sector, | ||
1442 | unsigned int data_size, | ||
1443 | gfp_t gfp_mask) __must_hold(local); | ||
1444 | extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); | ||
1445 | extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1446 | struct list_head *head); | ||
1447 | extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1448 | struct list_head *head); | ||
1449 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); | ||
1450 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); | ||
1451 | extern void drbd_flush_workqueue(struct drbd_conf *mdev); | ||
1452 | |||
1453 | /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to | ||
1454 | * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ | ||
1455 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, | ||
1456 | char __user *optval, int optlen) | ||
1457 | { | ||
1458 | int err; | ||
1459 | if (level == SOL_SOCKET) | ||
1460 | err = sock_setsockopt(sock, level, optname, optval, optlen); | ||
1461 | else | ||
1462 | err = sock->ops->setsockopt(sock, level, optname, optval, | ||
1463 | optlen); | ||
1464 | return err; | ||
1465 | } | ||
1466 | |||
1467 | static inline void drbd_tcp_cork(struct socket *sock) | ||
1468 | { | ||
1469 | int __user val = 1; | ||
1470 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | ||
1471 | (char __user *)&val, sizeof(val)); | ||
1472 | } | ||
1473 | |||
1474 | static inline void drbd_tcp_uncork(struct socket *sock) | ||
1475 | { | ||
1476 | int __user val = 0; | ||
1477 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | ||
1478 | (char __user *)&val, sizeof(val)); | ||
1479 | } | ||
1480 | |||
1481 | static inline void drbd_tcp_nodelay(struct socket *sock) | ||
1482 | { | ||
1483 | int __user val = 1; | ||
1484 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, | ||
1485 | (char __user *)&val, sizeof(val)); | ||
1486 | } | ||
1487 | |||
1488 | static inline void drbd_tcp_quickack(struct socket *sock) | ||
1489 | { | ||
1490 | int __user val = 1; | ||
1491 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, | ||
1492 | (char __user *)&val, sizeof(val)); | ||
1493 | } | ||
1494 | |||
void drbd_bump_write_ordering(struct drbd_conf *mdev,
			      enum write_ordering_e wo);

/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
extern struct file_operations drbd_proc_fops;
const char *drbd_conn_str(enum drbd_conns s);
const char *drbd_role_str(enum drbd_role s);
1502 | |||
1503 | /* drbd_actlog.c */ | ||
1504 | extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1505 | extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); | ||
1506 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | ||
1507 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1508 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | ||
1509 | extern void drbd_rs_cancel_all(struct drbd_conf *mdev); | ||
1510 | extern int drbd_rs_del_all(struct drbd_conf *mdev); | ||
1511 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, | ||
1512 | sector_t sector, int size); | ||
1513 | extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); | ||
1514 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, | ||
1515 | int size, const char *file, const unsigned int line); | ||
1516 | #define drbd_set_in_sync(mdev, sector, size) \ | ||
1517 | __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) | ||
1518 | extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | ||
1519 | int size, const char *file, const unsigned int line); | ||
1520 | #define drbd_set_out_of_sync(mdev, sector, size) \ | ||
1521 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | ||
1522 | extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); | ||
1523 | extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); | ||
1524 | extern void drbd_al_shrink(struct drbd_conf *mdev); | ||
1525 | |||
1526 | |||
1527 | /* drbd_nl.c */ | ||
1528 | |||
1529 | void drbd_nl_cleanup(void); | ||
1530 | int __init drbd_nl_init(void); | ||
1531 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); | ||
1532 | void drbd_bcast_sync_progress(struct drbd_conf *mdev); | ||
1533 | void drbd_bcast_ee(struct drbd_conf *mdev, | ||
1534 | const char *reason, const int dgs, | ||
1535 | const char* seen_hash, const char* calc_hash, | ||
1536 | const struct drbd_epoch_entry* e); | ||
1537 | |||
1538 | |||
1539 | /** | ||
1540 | * DOC: DRBD State macros | ||
1541 | * | ||
1542 | * These macros are used to express state changes in easily readable form. | ||
1543 | * | ||
1544 | * The NS macros expand to a mask and a value, that can be bit ored onto the | ||
1545 | * current state as soon as the spinlock (req_lock) was taken. | ||
1546 | * | ||
1547 | * The _NS macros are used for state functions that get called with the | ||
1548 | * spinlock. These macros expand directly to the new state value. | ||
1549 | * | ||
1550 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
1551 | * to express state changes that affect more than one aspect of the state. | ||
1552 | * | ||
1553 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
1554 | * Means that the network connection was established and that the peer | ||
1555 | * is in secondary role. | ||
1556 | */ | ||
/* <field>_MASK names; the NS() style macros build them via T##_MASK. */
#define role_MASK	R_MASK
#define peer_MASK	R_MASK
#define disk_MASK	D_MASK
#define pdsk_MASK	D_MASK
#define conn_MASK	C_MASK
#define susp_MASK	1
#define user_isp_MASK	1
#define aftr_isp_MASK	1
1565 | |||
/*
 * NS(), NS2(), NS3(): each expands to TWO comma separated expressions of
 * type union drbd_state, a mask followed by a value, meant to be used as
 * two consecutive function arguments.
 *
 *  T, T1..T3: field names of union drbd_state; T##_MASK must be defined.
 *  S, S1..S3: the values to put into those fields.
 *
 * The temporaries carry an __ns_ prefix so that a caller expression in S
 * that mentions a variable named "mask" or "val" is not captured by the
 * statement expression's own local.
 */
#define NS(T, S) \
	({ union drbd_state __ns_mask; __ns_mask.i = 0; \
	   __ns_mask.T = T##_MASK; __ns_mask; }), \
	({ union drbd_state __ns_val; __ns_val.i = 0; \
	   __ns_val.T = (S); __ns_val; })
#define NS2(T1, S1, T2, S2) \
	({ union drbd_state __ns_mask; __ns_mask.i = 0; \
	   __ns_mask.T1 = T1##_MASK; __ns_mask.T2 = T2##_MASK; \
	   __ns_mask; }), \
	({ union drbd_state __ns_val; __ns_val.i = 0; \
	   __ns_val.T1 = (S1); __ns_val.T2 = (S2); __ns_val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
	({ union drbd_state __ns_mask; __ns_mask.i = 0; \
	   __ns_mask.T1 = T1##_MASK; __ns_mask.T2 = T2##_MASK; \
	   __ns_mask.T3 = T3##_MASK; __ns_mask; }), \
	({ union drbd_state __ns_val; __ns_val.i = 0; \
	   __ns_val.T1 = (S1); __ns_val.T2 = (S2); \
	   __ns_val.T3 = (S3); __ns_val; })
1579 | |||
/*
 * _NS(), _NS2(), _NS3(): each expands to TWO comma separated expressions,
 * the device D followed by a union drbd_state that starts as a copy of
 * D->state with the named field(s) replaced.
 *
 *  D:         pointer expression; it is evaluated twice.
 *  T, T1..T3: field names of union drbd_state.
 *  S, S1..S3: the new values for those fields.
 *
 * D is parenthesized where it is dereferenced, so that pointer expressions
 * other than a plain identifier bind as intended.
 */
#define _NS(D, T, S) \
	D, ({ union drbd_state __ns; __ns.i = (D)->state.i; \
	      __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
	D, ({ union drbd_state __ns; __ns.i = (D)->state.i; \
	      __ns.T1 = (S1); __ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
	D, ({ union drbd_state __ns; __ns.i = (D)->state.i; \
	      __ns.T1 = (S1); __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
1588 | |||
1589 | /* | ||
1590 | * inline helper functions | ||
1591 | *************************/ | ||
1592 | |||
1593 | static inline void drbd_state_lock(struct drbd_conf *mdev) | ||
1594 | { | ||
1595 | wait_event(mdev->misc_wait, | ||
1596 | !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
1597 | } | ||
1598 | |||
1599 | static inline void drbd_state_unlock(struct drbd_conf *mdev) | ||
1600 | { | ||
1601 | clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); | ||
1602 | wake_up(&mdev->misc_wait); | ||
1603 | } | ||
1604 | |||
1605 | static inline int _drbd_set_state(struct drbd_conf *mdev, | ||
1606 | union drbd_state ns, enum chg_state_flags flags, | ||
1607 | struct completion *done) | ||
1608 | { | ||
1609 | int rv; | ||
1610 | |||
1611 | read_lock(&global_state_lock); | ||
1612 | rv = __drbd_set_state(mdev, ns, flags, done); | ||
1613 | read_unlock(&global_state_lock); | ||
1614 | |||
1615 | return rv; | ||
1616 | } | ||
1617 | |||
1618 | /** | ||
1619 | * drbd_request_state() - Reqest a state change | ||
1620 | * @mdev: DRBD device. | ||
1621 | * @mask: mask of state bits to change. | ||
1622 | * @val: value of new state bits. | ||
1623 | * | ||
1624 | * This is the most graceful way of requesting a state change. It is verbose | ||
1625 | * quite verbose in case the state change is not possible, and all those | ||
1626 | * state changes are globally serialized. | ||
1627 | */ | ||
1628 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
1629 | union drbd_state mask, | ||
1630 | union drbd_state val) | ||
1631 | { | ||
1632 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | ||
1633 | } | ||
1634 | |||
1635 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) | ||
1636 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) | ||
1637 | { | ||
1638 | switch (mdev->ldev->dc.on_io_error) { | ||
1639 | case EP_PASS_ON: | ||
1640 | if (!forcedetach) { | ||
1641 | if (printk_ratelimit()) | ||
1642 | dev_err(DEV, "Local IO failed in %s." | ||
1643 | "Passing error on...\n", where); | ||
1644 | break; | ||
1645 | } | ||
1646 | /* NOTE fall through to detach case if forcedetach set */ | ||
1647 | case EP_DETACH: | ||
1648 | case EP_CALL_HELPER: | ||
1649 | if (mdev->state.disk > D_FAILED) { | ||
1650 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | ||
1651 | dev_err(DEV, "Local IO failed in %s." | ||
1652 | "Detaching...\n", where); | ||
1653 | } | ||
1654 | break; | ||
1655 | } | ||
1656 | } | ||
1657 | |||
1658 | /** | ||
1659 | * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers | ||
1660 | * @mdev: DRBD device. | ||
1661 | * @error: Error code passed to the IO completion callback | ||
1662 | * @forcedetach: Force detach. I.e. the error happened while accessing the meta data | ||
1663 | * | ||
1664 | * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) | ||
1665 | */ | ||
1666 | #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) | ||
1667 | static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | ||
1668 | int error, int forcedetach, const char *where) | ||
1669 | { | ||
1670 | if (error) { | ||
1671 | unsigned long flags; | ||
1672 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
1673 | __drbd_chk_io_error_(mdev, forcedetach, where); | ||
1674 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
1675 | } | ||
1676 | } | ||
1677 | |||
1678 | |||
1679 | /** | ||
1680 | * drbd_md_first_sector() - Returns the first sector number of the meta data area | ||
1681 | * @bdev: Meta data block device. | ||
1682 | * | ||
1683 | * BTW, for internal meta data, this happens to be the maximum capacity | ||
1684 | * we could agree upon with our peer node. | ||
1685 | */ | ||
1686 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1687 | { | ||
1688 | switch (bdev->dc.meta_dev_idx) { | ||
1689 | case DRBD_MD_INDEX_INTERNAL: | ||
1690 | case DRBD_MD_INDEX_FLEX_INT: | ||
1691 | return bdev->md.md_offset + bdev->md.bm_offset; | ||
1692 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1693 | default: | ||
1694 | return bdev->md.md_offset; | ||
1695 | } | ||
1696 | } | ||
1697 | |||
1698 | /** | ||
1699 | * drbd_md_last_sector() - Return the last sector number of the meta data area | ||
1700 | * @bdev: Meta data block device. | ||
1701 | */ | ||
1702 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | ||
1703 | { | ||
1704 | switch (bdev->dc.meta_dev_idx) { | ||
1705 | case DRBD_MD_INDEX_INTERNAL: | ||
1706 | case DRBD_MD_INDEX_FLEX_INT: | ||
1707 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | ||
1708 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1709 | default: | ||
1710 | return bdev->md.md_offset + bdev->md.md_size_sect; | ||
1711 | } | ||
1712 | } | ||
1713 | |||
1714 | /* Returns the number of 512 byte sectors of the device */ | ||
1715 | static inline sector_t drbd_get_capacity(struct block_device *bdev) | ||
1716 | { | ||
1717 | /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ | ||
1718 | return bdev ? bdev->bd_inode->i_size >> 9 : 0; | ||
1719 | } | ||
1720 | |||
1721 | /** | ||
1722 | * drbd_get_max_capacity() - Returns the capacity we announce to out peer | ||
1723 | * @bdev: Meta data block device. | ||
1724 | * | ||
1725 | * returns the capacity we announce to out peer. we clip ourselves at the | ||
1726 | * various MAX_SECTORS, because if we don't, current implementation will | ||
1727 | * oops sooner or later | ||
1728 | */ | ||
1729 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | ||
1730 | { | ||
1731 | sector_t s; | ||
1732 | switch (bdev->dc.meta_dev_idx) { | ||
1733 | case DRBD_MD_INDEX_INTERNAL: | ||
1734 | case DRBD_MD_INDEX_FLEX_INT: | ||
1735 | s = drbd_get_capacity(bdev->backing_bdev) | ||
1736 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | ||
1737 | drbd_md_first_sector(bdev)) | ||
1738 | : 0; | ||
1739 | break; | ||
1740 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1741 | s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | ||
1742 | drbd_get_capacity(bdev->backing_bdev)); | ||
1743 | /* clip at maximum size the meta device can support */ | ||
1744 | s = min_t(sector_t, s, | ||
1745 | BM_EXT_TO_SECT(bdev->md.md_size_sect | ||
1746 | - bdev->md.bm_offset)); | ||
1747 | break; | ||
1748 | default: | ||
1749 | s = min_t(sector_t, DRBD_MAX_SECTORS, | ||
1750 | drbd_get_capacity(bdev->backing_bdev)); | ||
1751 | } | ||
1752 | return s; | ||
1753 | } | ||
1754 | |||
1755 | /** | ||
1756 | * drbd_md_ss__() - Return the sector number of our meta data super block | ||
1757 | * @mdev: DRBD device. | ||
1758 | * @bdev: Meta data block device. | ||
1759 | */ | ||
1760 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | ||
1761 | struct drbd_backing_dev *bdev) | ||
1762 | { | ||
1763 | switch (bdev->dc.meta_dev_idx) { | ||
1764 | default: /* external, some index */ | ||
1765 | return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; | ||
1766 | case DRBD_MD_INDEX_INTERNAL: | ||
1767 | /* with drbd08, internal meta data is always "flexible" */ | ||
1768 | case DRBD_MD_INDEX_FLEX_INT: | ||
1769 | /* sizeof(struct md_on_disk_07) == 4k | ||
1770 | * position: last 4k aligned block of 4k size */ | ||
1771 | if (!bdev->backing_bdev) { | ||
1772 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1773 | dev_err(DEV, "bdev->backing_bdev==NULL\n"); | ||
1774 | dump_stack(); | ||
1775 | } | ||
1776 | return 0; | ||
1777 | } | ||
1778 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) | ||
1779 | - MD_AL_OFFSET; | ||
1780 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1781 | return 0; | ||
1782 | } | ||
1783 | } | ||
1784 | |||
1785 | static inline void | ||
1786 | _drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | ||
1787 | { | ||
1788 | list_add_tail(&w->list, &q->q); | ||
1789 | up(&q->s); | ||
1790 | } | ||
1791 | |||
1792 | static inline void | ||
1793 | drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) | ||
1794 | { | ||
1795 | unsigned long flags; | ||
1796 | spin_lock_irqsave(&q->q_lock, flags); | ||
1797 | list_add(&w->list, &q->q); | ||
1798 | up(&q->s); /* within the spinlock, | ||
1799 | see comment near end of drbd_worker() */ | ||
1800 | spin_unlock_irqrestore(&q->q_lock, flags); | ||
1801 | } | ||
1802 | |||
1803 | static inline void | ||
1804 | drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | ||
1805 | { | ||
1806 | unsigned long flags; | ||
1807 | spin_lock_irqsave(&q->q_lock, flags); | ||
1808 | list_add_tail(&w->list, &q->q); | ||
1809 | up(&q->s); /* within the spinlock, | ||
1810 | see comment near end of drbd_worker() */ | ||
1811 | spin_unlock_irqrestore(&q->q_lock, flags); | ||
1812 | } | ||
1813 | |||
1814 | static inline void wake_asender(struct drbd_conf *mdev) | ||
1815 | { | ||
1816 | if (test_bit(SIGNAL_ASENDER, &mdev->flags)) | ||
1817 | force_sig(DRBD_SIG, mdev->asender.task); | ||
1818 | } | ||
1819 | |||
1820 | static inline void request_ping(struct drbd_conf *mdev) | ||
1821 | { | ||
1822 | set_bit(SEND_PING, &mdev->flags); | ||
1823 | wake_asender(mdev); | ||
1824 | } | ||
1825 | |||
1826 | static inline int drbd_send_short_cmd(struct drbd_conf *mdev, | ||
1827 | enum drbd_packets cmd) | ||
1828 | { | ||
1829 | struct p_header h; | ||
1830 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); | ||
1831 | } | ||
1832 | |||
1833 | static inline int drbd_send_ping(struct drbd_conf *mdev) | ||
1834 | { | ||
1835 | struct p_header h; | ||
1836 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); | ||
1837 | } | ||
1838 | |||
1839 | static inline int drbd_send_ping_ack(struct drbd_conf *mdev) | ||
1840 | { | ||
1841 | struct p_header h; | ||
1842 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); | ||
1843 | } | ||
1844 | |||
1845 | static inline void drbd_thread_stop(struct drbd_thread *thi) | ||
1846 | { | ||
1847 | _drbd_thread_stop(thi, FALSE, TRUE); | ||
1848 | } | ||
1849 | |||
1850 | static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) | ||
1851 | { | ||
1852 | _drbd_thread_stop(thi, FALSE, FALSE); | ||
1853 | } | ||
1854 | |||
1855 | static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) | ||
1856 | { | ||
1857 | _drbd_thread_stop(thi, TRUE, FALSE); | ||
1858 | } | ||
1859 | |||
1860 | /* counts how many answer packets packets we expect from our peer, | ||
1861 | * for either explicit application requests, | ||
1862 | * or implicit barrier packets as necessary. | ||
1863 | * increased: | ||
1864 | * w_send_barrier | ||
1865 | * _req_mod(req, queue_for_net_write or queue_for_net_read); | ||
1866 | * it is much easier and equally valid to count what we queue for the | ||
1867 | * worker, even before it actually was queued or sent. | ||
1868 | * (drbd_make_request_common; recovery path on read io-error) | ||
1869 | * decreased: | ||
1870 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) | ||
1871 | * _req_mod(req, data_received) | ||
1872 | * [from receive_DataReply] | ||
1873 | * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) | ||
1874 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] | ||
1875 | * for some reason it is NOT decreased in got_NegAck, | ||
1876 | * but in the resulting cleanup code from report_params. | ||
1877 | * we should try to remember the reason for that... | ||
1878 | * _req_mod(req, send_failed or send_canceled) | ||
1879 | * _req_mod(req, connection_lost_while_pending) | ||
1880 | * [from tl_clear_barrier] | ||
1881 | */ | ||
1882 | static inline void inc_ap_pending(struct drbd_conf *mdev) | ||
1883 | { | ||
1884 | atomic_inc(&mdev->ap_pending_cnt); | ||
1885 | } | ||
1886 | |||
/* Log an error when the atomic counter mdev-><which> has dropped below zero.
 *
 * Relies on a variable named "mdev" being in scope at the point of use.
 * Wrapped in do { } while (0) so that the expansion is exactly one
 * statement: a bare "if" would silently capture an "else" that follows
 * the macro invocation. */
#define ERR_IF_CNT_IS_NEGATIVE(which)					\
	do {								\
		if (atomic_read(&mdev->which) < 0)			\
			dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
				__func__ , __LINE__ ,			\
				atomic_read(&mdev->which));		\
	} while (0)
1892 | |||
/* One expected answer packet less: decrement mdev->ap_pending_cnt and wake
 * sleepers on misc_wait when it reaches zero; complain if it went negative.
 * NOTE(review): the negative check expands to a literal "mdev", so the
 * argument is expected to be a variable of exactly that name. */
#define dec_ap_pending(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
			wake_up(&mdev->misc_wait);			\
		ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt);			\
	} while (0)
1898 | |||
1899 | /* counts how many resync-related answers we still expect from the peer | ||
1900 | * increase decrease | ||
1901 | * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) | ||
1902 | * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) | ||
1903 | * (or P_NEG_ACK with ID_SYNCER) | ||
1904 | */ | ||
1905 | static inline void inc_rs_pending(struct drbd_conf *mdev) | ||
1906 | { | ||
1907 | atomic_inc(&mdev->rs_pending_cnt); | ||
1908 | } | ||
1909 | |||
/* One resync-related answer less to wait for: decrement
 * mdev->rs_pending_cnt and complain if it went negative.
 * NOTE(review): the negative check expands to a literal "mdev". */
#define dec_rs_pending(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_dec(&mdev->rs_pending_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt);			\
	} while (0)
1914 | |||
1915 | /* counts how many answers we still need to send to the peer. | ||
1916 | * increased on | ||
1917 | * receive_Data unless protocol A; | ||
1918 | * we need to send a P_RECV_ACK (proto B) | ||
1919 | * or P_WRITE_ACK (proto C) | ||
1920 | * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK | ||
1921 | * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA | ||
1922 | * receive_Barrier_* we need to send a P_BARRIER_ACK | ||
1923 | */ | ||
1924 | static inline void inc_unacked(struct drbd_conf *mdev) | ||
1925 | { | ||
1926 | atomic_inc(&mdev->unacked_cnt); | ||
1927 | } | ||
1928 | |||
/* One owed answer less: decrement mdev->unacked_cnt and complain if it
 * went negative.
 * NOTE(review): the negative check expands to a literal "mdev". */
#define dec_unacked(mdev)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_dec(&mdev->unacked_cnt);				\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);			\
	} while (0)
1933 | |||
/* @n owed answers less: subtract @n from mdev->unacked_cnt and complain if
 * the counter went negative.
 * NOTE(review): the negative check expands to a literal "mdev". */
#define sub_unacked(mdev, n)						\
	do {								\
		typecheck(struct drbd_conf *, mdev);			\
		atomic_sub(n, &mdev->unacked_cnt);			\
		ERR_IF_CNT_IS_NEGATIVE(unacked_cnt);			\
	} while (0)
1938 | |||
1939 | |||
1940 | static inline void put_net_conf(struct drbd_conf *mdev) | ||
1941 | { | ||
1942 | if (atomic_dec_and_test(&mdev->net_cnt)) | ||
1943 | wake_up(&mdev->misc_wait); | ||
1944 | } | ||
1945 | |||
1946 | /** | ||
1947 | * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there | ||
1948 | * @mdev: DRBD device. | ||
1949 | * | ||
1950 | * You have to call put_net_conf() when finished working with mdev->net_conf. | ||
1951 | */ | ||
1952 | static inline int get_net_conf(struct drbd_conf *mdev) | ||
1953 | { | ||
1954 | int have_net_conf; | ||
1955 | |||
1956 | atomic_inc(&mdev->net_cnt); | ||
1957 | have_net_conf = mdev->state.conn >= C_UNCONNECTED; | ||
1958 | if (!have_net_conf) | ||
1959 | put_net_conf(mdev); | ||
1960 | return have_net_conf; | ||
1961 | } | ||
1962 | |||
1963 | /** | ||
1964 | * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev | ||
1965 | * @M: DRBD device. | ||
1966 | * | ||
1967 | * You have to call put_ldev() when finished working with mdev->ldev. | ||
1968 | */ | ||
/* get_ldev_if_state(): take a local-disk reference if disk state >= MINS.
 * get_ldev(): the same with MINS fixed to D_INCONSISTENT.
 * Both are wrapped in __cond_lock(local, ...). */
#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
#define get_ldev(M) get_ldev_if_state(M,D_INCONSISTENT)
1971 | |||
1972 | static inline void put_ldev(struct drbd_conf *mdev) | ||
1973 | { | ||
1974 | __release(local); | ||
1975 | if (atomic_dec_and_test(&mdev->local_cnt)) | ||
1976 | wake_up(&mdev->misc_wait); | ||
1977 | D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); | ||
1978 | } | ||
1979 | |||
1980 | #ifndef __CHECKER__ | ||
1981 | static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | ||
1982 | { | ||
1983 | int io_allowed; | ||
1984 | |||
1985 | atomic_inc(&mdev->local_cnt); | ||
1986 | io_allowed = (mdev->state.disk >= mins); | ||
1987 | if (!io_allowed) | ||
1988 | put_ldev(mdev); | ||
1989 | return io_allowed; | ||
1990 | } | ||
1991 | #else | ||
1992 | extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins); | ||
1993 | #endif | ||
1994 | |||
1995 | /* you must have an "get_ldev" reference */ | ||
1996 | static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, | ||
1997 | unsigned long *bits_left, unsigned int *per_mil_done) | ||
1998 | { | ||
1999 | /* | ||
2000 | * this is to break it at compile time when we change that | ||
2001 | * (we may feel 4TB maximum storage per drbd is not enough) | ||
2002 | */ | ||
2003 | typecheck(unsigned long, mdev->rs_total); | ||
2004 | |||
2005 | /* note: both rs_total and rs_left are in bits, i.e. in | ||
2006 | * units of BM_BLOCK_SIZE. | ||
2007 | * for the percentage, we don't care. */ | ||
2008 | |||
2009 | *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; | ||
2010 | /* >> 10 to prevent overflow, | ||
2011 | * +1 to prevent division by zero */ | ||
2012 | if (*bits_left > mdev->rs_total) { | ||
2013 | /* doh. maybe a logic bug somewhere. | ||
2014 | * may also be just a race condition | ||
2015 | * between this and a disconnect during sync. | ||
2016 | * for now, just prevent in-kernel buffer overflow. | ||
2017 | */ | ||
2018 | smp_rmb(); | ||
2019 | dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", | ||
2020 | drbd_conn_str(mdev->state.conn), | ||
2021 | *bits_left, mdev->rs_total, mdev->rs_failed); | ||
2022 | *per_mil_done = 0; | ||
2023 | } else { | ||
2024 | /* make sure the calculation happens in long context */ | ||
2025 | unsigned long tmp = 1000UL - | ||
2026 | (*bits_left >> 10)*1000UL | ||
2027 | / ((mdev->rs_total >> 10) + 1UL); | ||
2028 | *per_mil_done = tmp; | ||
2029 | } | ||
2030 | } | ||
2031 | |||
2032 | |||
2033 | /* this throttles on-the-fly application requests | ||
2034 | * according to max_buffers settings; | ||
2035 | * maybe re-implement using semaphores? */ | ||
2036 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) | ||
2037 | { | ||
2038 | int mxb = 1000000; /* arbitrary limit on open requests */ | ||
2039 | if (get_net_conf(mdev)) { | ||
2040 | mxb = mdev->net_conf->max_buffers; | ||
2041 | put_net_conf(mdev); | ||
2042 | } | ||
2043 | return mxb; | ||
2044 | } | ||
2045 | |||
2046 | static inline int drbd_state_is_stable(union drbd_state s) | ||
2047 | { | ||
2048 | |||
2049 | /* DO NOT add a default clause, we want the compiler to warn us | ||
2050 | * for any newly introduced state we may have forgotten to add here */ | ||
2051 | |||
2052 | switch ((enum drbd_conns)s.conn) { | ||
2053 | /* new io only accepted when there is no connection, ... */ | ||
2054 | case C_STANDALONE: | ||
2055 | case C_WF_CONNECTION: | ||
2056 | /* ... or there is a well established connection. */ | ||
2057 | case C_CONNECTED: | ||
2058 | case C_SYNC_SOURCE: | ||
2059 | case C_SYNC_TARGET: | ||
2060 | case C_VERIFY_S: | ||
2061 | case C_VERIFY_T: | ||
2062 | case C_PAUSED_SYNC_S: | ||
2063 | case C_PAUSED_SYNC_T: | ||
2064 | /* maybe stable, look at the disk state */ | ||
2065 | break; | ||
2066 | |||
2067 | /* no new io accepted during tansitional states | ||
2068 | * like handshake or teardown */ | ||
2069 | case C_DISCONNECTING: | ||
2070 | case C_UNCONNECTED: | ||
2071 | case C_TIMEOUT: | ||
2072 | case C_BROKEN_PIPE: | ||
2073 | case C_NETWORK_FAILURE: | ||
2074 | case C_PROTOCOL_ERROR: | ||
2075 | case C_TEAR_DOWN: | ||
2076 | case C_WF_REPORT_PARAMS: | ||
2077 | case C_STARTING_SYNC_S: | ||
2078 | case C_STARTING_SYNC_T: | ||
2079 | case C_WF_BITMAP_S: | ||
2080 | case C_WF_BITMAP_T: | ||
2081 | case C_WF_SYNC_UUID: | ||
2082 | case C_MASK: | ||
2083 | /* not "stable" */ | ||
2084 | return 0; | ||
2085 | } | ||
2086 | |||
2087 | switch ((enum drbd_disk_state)s.disk) { | ||
2088 | case D_DISKLESS: | ||
2089 | case D_INCONSISTENT: | ||
2090 | case D_OUTDATED: | ||
2091 | case D_CONSISTENT: | ||
2092 | case D_UP_TO_DATE: | ||
2093 | /* disk state is stable as well. */ | ||
2094 | break; | ||
2095 | |||
2096 | /* no new io accepted during tansitional states */ | ||
2097 | case D_ATTACHING: | ||
2098 | case D_FAILED: | ||
2099 | case D_NEGOTIATING: | ||
2100 | case D_UNKNOWN: | ||
2101 | case D_MASK: | ||
2102 | /* not "stable" */ | ||
2103 | return 0; | ||
2104 | } | ||
2105 | |||
2106 | return 1; | ||
2107 | } | ||
2108 | |||
2109 | static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) | ||
2110 | { | ||
2111 | int mxb = drbd_get_max_buffers(mdev); | ||
2112 | |||
2113 | if (mdev->state.susp) | ||
2114 | return 0; | ||
2115 | if (test_bit(SUSPEND_IO, &mdev->flags)) | ||
2116 | return 0; | ||
2117 | |||
2118 | /* to avoid potential deadlock or bitmap corruption, | ||
2119 | * in various places, we only allow new application io | ||
2120 | * to start during "stable" states. */ | ||
2121 | |||
2122 | /* no new io accepted when attaching or detaching the disk */ | ||
2123 | if (!drbd_state_is_stable(mdev->state)) | ||
2124 | return 0; | ||
2125 | |||
2126 | /* since some older kernels don't have atomic_add_unless, | ||
2127 | * and we are within the spinlock anyways, we have this workaround. */ | ||
2128 | if (atomic_read(&mdev->ap_bio_cnt) > mxb) | ||
2129 | return 0; | ||
2130 | if (test_bit(BITMAP_IO, &mdev->flags)) | ||
2131 | return 0; | ||
2132 | return 1; | ||
2133 | } | ||
2134 | |||
2135 | /* I'd like to use wait_event_lock_irq, | ||
2136 | * but I'm not sure when it got introduced, | ||
2137 | * and not sure when it has 3 or 4 arguments */ | ||
2138 | static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) | ||
2139 | { | ||
2140 | /* compare with after_state_ch, | ||
2141 | * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ | ||
2142 | DEFINE_WAIT(wait); | ||
2143 | |||
2144 | /* we wait here | ||
2145 | * as long as the device is suspended | ||
2146 | * until the bitmap is no longer on the fly during connection | ||
2147 | * handshake as long as we would exeed the max_buffer limit. | ||
2148 | * | ||
2149 | * to avoid races with the reconnect code, | ||
2150 | * we need to atomic_inc within the spinlock. */ | ||
2151 | |||
2152 | spin_lock_irq(&mdev->req_lock); | ||
2153 | while (!__inc_ap_bio_cond(mdev)) { | ||
2154 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
2155 | spin_unlock_irq(&mdev->req_lock); | ||
2156 | schedule(); | ||
2157 | finish_wait(&mdev->misc_wait, &wait); | ||
2158 | spin_lock_irq(&mdev->req_lock); | ||
2159 | } | ||
2160 | atomic_add(one_or_two, &mdev->ap_bio_cnt); | ||
2161 | spin_unlock_irq(&mdev->req_lock); | ||
2162 | } | ||
2163 | |||
2164 | static inline void dec_ap_bio(struct drbd_conf *mdev) | ||
2165 | { | ||
2166 | int mxb = drbd_get_max_buffers(mdev); | ||
2167 | int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); | ||
2168 | |||
2169 | D_ASSERT(ap_bio >= 0); | ||
2170 | /* this currently does wake_up for every dec_ap_bio! | ||
2171 | * maybe rather introduce some type of hysteresis? | ||
2172 | * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ | ||
2173 | if (ap_bio < mxb) | ||
2174 | wake_up(&mdev->misc_wait); | ||
2175 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { | ||
2176 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | ||
2177 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | ||
2178 | } | ||
2179 | } | ||
2180 | |||
2181 | static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | ||
2182 | { | ||
2183 | mdev->ed_uuid = val; | ||
2184 | } | ||
2185 | |||
/* Compare two sequence numbers that wrap around at 32 bit.
 *
 * Returns a value < 0 if @a is "before" @b, 0 if they are equal, and
 * > 0 if @a is "after" @b.
 *
 * The difference is taken in unsigned arithmetic, where wrap-around is
 * well defined, and only then reinterpreted as signed. Subtracting two
 * s32 values directly overflows for operands far apart (for example
 * 0x7fffffff and 0x80000000), which is undefined behaviour in C.
 *
 * For wrap around at 24bit (old atomic_t), we'd have to
 *  a <<= 8; b <<= 8;
 */
static inline int seq_cmp(u32 a, u32 b)
{
	return (s32)(a - b);
}
2195 | #define seq_lt(a, b) (seq_cmp((a), (b)) < 0) | ||
2196 | #define seq_gt(a, b) (seq_cmp((a), (b)) > 0) | ||
2197 | #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) | ||
2198 | #define seq_le(a, b) (seq_cmp((a), (b)) <= 0) | ||
2199 | /* CAUTION: please no side effects in arguments! */ | ||
2200 | #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) | ||
2201 | |||
2202 | static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) | ||
2203 | { | ||
2204 | unsigned int m; | ||
2205 | spin_lock(&mdev->peer_seq_lock); | ||
2206 | m = seq_max(mdev->peer_seq, new_seq); | ||
2207 | mdev->peer_seq = m; | ||
2208 | spin_unlock(&mdev->peer_seq_lock); | ||
2209 | if (m == new_seq) | ||
2210 | wake_up(&mdev->seq_wait); | ||
2211 | } | ||
2212 | |||
2213 | static inline void drbd_update_congested(struct drbd_conf *mdev) | ||
2214 | { | ||
2215 | struct sock *sk = mdev->data.socket->sk; | ||
2216 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
2217 | set_bit(NET_CONGESTED, &mdev->flags); | ||
2218 | } | ||
2219 | |||
/* Always reports QUEUE_ORDERED_NONE: sorry, there is currently no working
 * implementation of distributed TCQ stuff. @mdev is not used. */
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
	/* provide a fallback value if the constant is not defined */
#ifndef QUEUE_ORDERED_NONE
#define QUEUE_ORDERED_NONE 0
#endif

	return QUEUE_ORDERED_NONE;
}
2229 | |||
2230 | static inline void drbd_blk_run_queue(struct request_queue *q) | ||
2231 | { | ||
2232 | if (q && q->unplug_fn) | ||
2233 | q->unplug_fn(q); | ||
2234 | } | ||
2235 | |||
2236 | static inline void drbd_kick_lo(struct drbd_conf *mdev) | ||
2237 | { | ||
2238 | if (get_ldev(mdev)) { | ||
2239 | drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); | ||
2240 | put_ldev(mdev); | ||
2241 | } | ||
2242 | } | ||
2243 | |||
2244 | static inline void drbd_md_flush(struct drbd_conf *mdev) | ||
2245 | { | ||
2246 | int r; | ||
2247 | |||
2248 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) | ||
2249 | return; | ||
2250 | |||
2251 | r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); | ||
2252 | if (r) { | ||
2253 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
2254 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | ||
2255 | } | ||
2256 | } | ||
2257 | |||
2258 | #endif | ||
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 00000000000..edf0b8031e6 --- /dev/null +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -0,0 +1,3735 @@ | |||
1 | /* | ||
2 | drbd.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev | ||
11 | from Logicworks, Inc. for making SDP replication support possible. | ||
12 | |||
13 | drbd is free software; you can redistribute it and/or modify | ||
14 | it under the terms of the GNU General Public License as published by | ||
15 | the Free Software Foundation; either version 2, or (at your option) | ||
16 | any later version. | ||
17 | |||
18 | drbd is distributed in the hope that it will be useful, | ||
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
21 | GNU General Public License for more details. | ||
22 | |||
23 | You should have received a copy of the GNU General Public License | ||
24 | along with drbd; see the file COPYING. If not, write to | ||
25 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | |||
27 | */ | ||
28 | |||
29 | #include <linux/autoconf.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/version.h> | ||
32 | #include <linux/drbd.h> | ||
33 | #include <asm/uaccess.h> | ||
34 | #include <asm/types.h> | ||
35 | #include <net/sock.h> | ||
36 | #include <linux/ctype.h> | ||
37 | #include <linux/smp_lock.h> | ||
38 | #include <linux/fs.h> | ||
39 | #include <linux/file.h> | ||
40 | #include <linux/proc_fs.h> | ||
41 | #include <linux/init.h> | ||
42 | #include <linux/mm.h> | ||
43 | #include <linux/memcontrol.h> | ||
44 | #include <linux/mm_inline.h> | ||
45 | #include <linux/slab.h> | ||
46 | #include <linux/random.h> | ||
47 | #include <linux/reboot.h> | ||
48 | #include <linux/notifier.h> | ||
49 | #include <linux/kthread.h> | ||
50 | |||
51 | #define __KERNEL_SYSCALLS__ | ||
52 | #include <linux/unistd.h> | ||
53 | #include <linux/vmalloc.h> | ||
54 | |||
55 | #include <linux/drbd_limits.h> | ||
56 | #include "drbd_int.h" | ||
57 | #include "drbd_tracing.h" | ||
58 | #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ | ||
59 | |||
60 | #include "drbd_vli.h" | ||
61 | |||
62 | struct after_state_chg_work { | ||
63 | struct drbd_work w; | ||
64 | union drbd_state os; | ||
65 | union drbd_state ns; | ||
66 | enum chg_state_flags flags; | ||
67 | struct completion *done; | ||
68 | }; | ||
69 | |||
70 | int drbdd_init(struct drbd_thread *); | ||
71 | int drbd_worker(struct drbd_thread *); | ||
72 | int drbd_asender(struct drbd_thread *); | ||
73 | |||
74 | int drbd_init(void); | ||
75 | static int drbd_open(struct block_device *bdev, fmode_t mode); | ||
76 | static int drbd_release(struct gendisk *gd, fmode_t mode); | ||
77 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
78 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
79 | union drbd_state ns, enum chg_state_flags flags); | ||
80 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
81 | static void md_sync_timer_fn(unsigned long data); | ||
82 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
83 | |||
84 | DEFINE_TRACE(drbd_unplug); | ||
85 | DEFINE_TRACE(drbd_uuid); | ||
86 | DEFINE_TRACE(drbd_ee); | ||
87 | DEFINE_TRACE(drbd_packet); | ||
88 | DEFINE_TRACE(drbd_md_io); | ||
89 | DEFINE_TRACE(drbd_epoch); | ||
90 | DEFINE_TRACE(drbd_netlink); | ||
91 | DEFINE_TRACE(drbd_actlog); | ||
92 | DEFINE_TRACE(drbd_bio); | ||
93 | DEFINE_TRACE(_drbd_resync); | ||
94 | DEFINE_TRACE(drbd_req); | ||
95 | |||
96 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | ||
97 | "Lars Ellenberg <lars@linbit.com>"); | ||
98 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); | ||
99 | MODULE_VERSION(REL_VERSION); | ||
100 | MODULE_LICENSE("GPL"); | ||
101 | MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); | ||
102 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); | ||
103 | |||
104 | #include <linux/moduleparam.h> | ||
105 | /* allow_open_on_secondary */ | ||
106 | MODULE_PARM_DESC(allow_oos, "DONT USE!"); | ||
107 | /* thanks to these macros, if compiled into the kernel (not-module), | ||
108 | * this becomes the boot parameter drbd.minor_count */ | ||
109 | module_param(minor_count, uint, 0444); | ||
110 | module_param(disable_sendpage, bool, 0644); | ||
111 | module_param(allow_oos, bool, 0); | ||
112 | module_param(cn_idx, uint, 0444); | ||
113 | module_param(proc_details, int, 0644); | ||
114 | |||
115 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
116 | int enable_faults; | ||
117 | int fault_rate; | ||
118 | static int fault_count; | ||
119 | int fault_devs; | ||
120 | /* bitmap of enabled faults */ | ||
121 | module_param(enable_faults, int, 0664); | ||
122 | /* fault rate % value - applies to all enabled faults */ | ||
123 | module_param(fault_rate, int, 0664); | ||
124 | /* count of faults inserted */ | ||
125 | module_param(fault_count, int, 0664); | ||
126 | /* bitmap of devices to insert faults on */ | ||
127 | module_param(fault_devs, int, 0644); | ||
128 | #endif | ||
129 | |||
130 | /* module parameter, defined */ | ||
131 | unsigned int minor_count = 32; | ||
132 | int disable_sendpage; | ||
133 | int allow_oos; | ||
134 | unsigned int cn_idx = CN_IDX_DRBD; | ||
135 | int proc_details; /* Detail level in proc drbd*/ | ||
136 | |||
137 | /* Module parameter for setting the user mode helper program | ||
138 | * to run. Default is /sbin/drbdadm */ | ||
139 | char usermode_helper[80] = "/sbin/drbdadm"; | ||
140 | |||
141 | module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); | ||
142 | |||
143 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks | ||
144 | * as member "struct gendisk *vdisk;" | ||
145 | */ | ||
146 | struct drbd_conf **minor_table; | ||
147 | |||
148 | struct kmem_cache *drbd_request_cache; | ||
149 | struct kmem_cache *drbd_ee_cache; /* epoch entries */ | ||
150 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | ||
151 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | ||
152 | mempool_t *drbd_request_mempool; | ||
153 | mempool_t *drbd_ee_mempool; | ||
154 | |||
155 | /* I do not use a standard mempool, because: | ||
156 | 1) I want to hand out the pre-allocated objects first. | ||
157 | 2) I want to be able to interrupt sleeping allocation with a signal. | ||
158 | Note: This is a single linked list, the next pointer is the private | ||
159 | member of struct page. | ||
160 | */ | ||
161 | struct page *drbd_pp_pool; | ||
162 | spinlock_t drbd_pp_lock; | ||
163 | int drbd_pp_vacant; | ||
164 | wait_queue_head_t drbd_pp_wait; | ||
165 | |||
166 | DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); | ||
167 | |||
168 | static struct block_device_operations drbd_ops = { | ||
169 | .owner = THIS_MODULE, | ||
170 | .open = drbd_open, | ||
171 | .release = drbd_release, | ||
172 | }; | ||
173 | |||
/* Number of elements in the array A. Only valid for real arrays, not for
 * pointers. The argument is parenthesized so that any array-valued
 * expression can be passed safely. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
175 | |||
176 | #ifdef __CHECKER__ | ||
177 | /* When checking with sparse, and this is an inline function, sparse will | ||
178 | give tons of false positives. When this is a real function, sparse works. | ||
179 | */ | ||
180 | int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | ||
181 | { | ||
182 | int io_allowed; | ||
183 | |||
184 | atomic_inc(&mdev->local_cnt); | ||
185 | io_allowed = (mdev->state.disk >= mins); | ||
186 | if (!io_allowed) { | ||
187 | if (atomic_dec_and_test(&mdev->local_cnt)) | ||
188 | wake_up(&mdev->misc_wait); | ||
189 | } | ||
190 | return io_allowed; | ||
191 | } | ||
192 | |||
193 | #endif | ||
194 | |||
195 | /** | ||
196 | * DOC: The transfer log | ||
197 | * | ||
198 | * The transfer log is a single linked list of &struct drbd_tl_epoch objects. | ||
199 | * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail | ||
200 | * of the list. There is always at least one &struct drbd_tl_epoch object. | ||
201 | * | ||
202 | * Each &struct drbd_tl_epoch has a circular double linked list of requests | ||
203 | * attached. | ||
204 | */ | ||
205 | static int tl_init(struct drbd_conf *mdev) | ||
206 | { | ||
207 | struct drbd_tl_epoch *b; | ||
208 | |||
209 | /* during device minor initialization, we may well use GFP_KERNEL */ | ||
210 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); | ||
211 | if (!b) | ||
212 | return 0; | ||
213 | INIT_LIST_HEAD(&b->requests); | ||
214 | INIT_LIST_HEAD(&b->w.list); | ||
215 | b->next = NULL; | ||
216 | b->br_number = 4711; | ||
217 | b->n_req = 0; | ||
218 | b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
219 | |||
220 | mdev->oldest_tle = b; | ||
221 | mdev->newest_tle = b; | ||
222 | INIT_LIST_HEAD(&mdev->out_of_sequence_requests); | ||
223 | |||
224 | mdev->tl_hash = NULL; | ||
225 | mdev->tl_hash_s = 0; | ||
226 | |||
227 | return 1; | ||
228 | } | ||
229 | |||
230 | static void tl_cleanup(struct drbd_conf *mdev) | ||
231 | { | ||
232 | D_ASSERT(mdev->oldest_tle == mdev->newest_tle); | ||
233 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
234 | kfree(mdev->oldest_tle); | ||
235 | mdev->oldest_tle = NULL; | ||
236 | kfree(mdev->unused_spare_tle); | ||
237 | mdev->unused_spare_tle = NULL; | ||
238 | kfree(mdev->tl_hash); | ||
239 | mdev->tl_hash = NULL; | ||
240 | mdev->tl_hash_s = 0; | ||
241 | } | ||
242 | |||
243 | /** | ||
244 | * _tl_add_barrier() - Adds a barrier to the transfer log | ||
245 | * @mdev: DRBD device. | ||
246 | * @new: Barrier to be added before the current head of the TL. | ||
247 | * | ||
248 | * The caller must hold the req_lock. | ||
249 | */ | ||
250 | void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) | ||
251 | { | ||
252 | struct drbd_tl_epoch *newest_before; | ||
253 | |||
254 | INIT_LIST_HEAD(&new->requests); | ||
255 | INIT_LIST_HEAD(&new->w.list); | ||
256 | new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
257 | new->next = NULL; | ||
258 | new->n_req = 0; | ||
259 | |||
260 | newest_before = mdev->newest_tle; | ||
261 | /* never send a barrier number == 0, because that is special-cased | ||
262 | * when using TCQ for our write ordering code */ | ||
263 | new->br_number = (newest_before->br_number+1) ?: 1; | ||
264 | if (mdev->newest_tle != new) { | ||
265 | mdev->newest_tle->next = new; | ||
266 | mdev->newest_tle = new; | ||
267 | } | ||
268 | } | ||
269 | |||
270 | /** | ||
271 | * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL | ||
272 | * @mdev: DRBD device. | ||
273 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. | ||
274 | * @set_size: Expected number of requests before that barrier. | ||
275 | * | ||
276 | * In case the passed barrier_nr or set_size does not match the oldest | ||
277 | * &struct drbd_tl_epoch objects this function will cause a termination | ||
278 | * of the connection. | ||
279 | */ | ||
280 | void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
281 | unsigned int set_size) | ||
282 | { | ||
283 | struct drbd_tl_epoch *b, *nob; /* next old barrier */ | ||
284 | struct list_head *le, *tle; | ||
285 | struct drbd_request *r; | ||
286 | |||
287 | spin_lock_irq(&mdev->req_lock); | ||
288 | |||
289 | b = mdev->oldest_tle; | ||
290 | |||
291 | /* first some paranoia code */ | ||
292 | if (b == NULL) { | ||
293 | dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", | ||
294 | barrier_nr); | ||
295 | goto bail; | ||
296 | } | ||
297 | if (b->br_number != barrier_nr) { | ||
298 | dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", | ||
299 | barrier_nr, b->br_number); | ||
300 | goto bail; | ||
301 | } | ||
302 | if (b->n_req != set_size) { | ||
303 | dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", | ||
304 | barrier_nr, set_size, b->n_req); | ||
305 | goto bail; | ||
306 | } | ||
307 | |||
308 | /* Clean up list of requests processed during current epoch */ | ||
309 | list_for_each_safe(le, tle, &b->requests) { | ||
310 | r = list_entry(le, struct drbd_request, tl_requests); | ||
311 | _req_mod(r, barrier_acked); | ||
312 | } | ||
313 | /* There could be requests on the list waiting for completion | ||
314 | of the write to the local disk. To avoid corruptions of | ||
315 | slab's data structures we have to remove the lists head. | ||
316 | |||
317 | Also there could have been a barrier ack out of sequence, overtaking | ||
318 | the write acks - which would be a bug and violating write ordering. | ||
319 | To not deadlock in case we lose connection while such requests are | ||
320 | still pending, we need some way to find them for the | ||
321 | _req_mode(connection_lost_while_pending). | ||
322 | |||
323 | These have been list_move'd to the out_of_sequence_requests list in | ||
324 | _req_mod(, barrier_acked) above. | ||
325 | */ | ||
326 | list_del_init(&b->requests); | ||
327 | |||
328 | nob = b->next; | ||
329 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
330 | _tl_add_barrier(mdev, b); | ||
331 | if (nob) | ||
332 | mdev->oldest_tle = nob; | ||
333 | /* if nob == NULL b was the only barrier, and becomes the new | ||
334 | barrier. Therefore mdev->oldest_tle points already to b */ | ||
335 | } else { | ||
336 | D_ASSERT(nob != NULL); | ||
337 | mdev->oldest_tle = nob; | ||
338 | kfree(b); | ||
339 | } | ||
340 | |||
341 | spin_unlock_irq(&mdev->req_lock); | ||
342 | dec_ap_pending(mdev); | ||
343 | |||
344 | return; | ||
345 | |||
346 | bail: | ||
347 | spin_unlock_irq(&mdev->req_lock); | ||
348 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
349 | } | ||
350 | |||
351 | |||
352 | /** | ||
353 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL | ||
354 | * @mdev: DRBD device. | ||
355 | * | ||
356 | * This is called after the connection to the peer was lost. The storage covered | ||
357 | * by the requests on the transfer gets marked as out of sync. Called from the | ||
358 | * receiver thread and the worker thread. | ||
359 | */ | ||
360 | void tl_clear(struct drbd_conf *mdev) | ||
361 | { | ||
362 | struct drbd_tl_epoch *b, *tmp; | ||
363 | struct list_head *le, *tle; | ||
364 | struct drbd_request *r; | ||
365 | int new_initial_bnr = net_random(); | ||
366 | |||
367 | spin_lock_irq(&mdev->req_lock); | ||
368 | |||
369 | b = mdev->oldest_tle; | ||
370 | while (b) { | ||
371 | list_for_each_safe(le, tle, &b->requests) { | ||
372 | r = list_entry(le, struct drbd_request, tl_requests); | ||
373 | /* It would be nice to complete outside of spinlock. | ||
374 | * But this is easier for now. */ | ||
375 | _req_mod(r, connection_lost_while_pending); | ||
376 | } | ||
377 | tmp = b->next; | ||
378 | |||
379 | /* there could still be requests on that ring list, | ||
380 | * in case local io is still pending */ | ||
381 | list_del(&b->requests); | ||
382 | |||
383 | /* dec_ap_pending corresponding to queue_barrier. | ||
384 | * the newest barrier may not have been queued yet, | ||
385 | * in which case w.cb is still NULL. */ | ||
386 | if (b->w.cb != NULL) | ||
387 | dec_ap_pending(mdev); | ||
388 | |||
389 | if (b == mdev->newest_tle) { | ||
390 | /* recycle, but reinit! */ | ||
391 | D_ASSERT(tmp == NULL); | ||
392 | INIT_LIST_HEAD(&b->requests); | ||
393 | INIT_LIST_HEAD(&b->w.list); | ||
394 | b->w.cb = NULL; | ||
395 | b->br_number = new_initial_bnr; | ||
396 | b->n_req = 0; | ||
397 | |||
398 | mdev->oldest_tle = b; | ||
399 | break; | ||
400 | } | ||
401 | kfree(b); | ||
402 | b = tmp; | ||
403 | } | ||
404 | |||
405 | /* we expect this list to be empty. */ | ||
406 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
407 | |||
408 | /* but just in case, clean it up anyways! */ | ||
409 | list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { | ||
410 | r = list_entry(le, struct drbd_request, tl_requests); | ||
411 | /* It would be nice to complete outside of spinlock. | ||
412 | * But this is easier for now. */ | ||
413 | _req_mod(r, connection_lost_while_pending); | ||
414 | } | ||
415 | |||
416 | /* ensure bit indicating barrier is required is clear */ | ||
417 | clear_bit(CREATE_BARRIER, &mdev->flags); | ||
418 | |||
419 | spin_unlock_irq(&mdev->req_lock); | ||
420 | } | ||
421 | |||
422 | /** | ||
423 | * cl_wide_st_chg() - TRUE if the state change is a cluster wide one | ||
424 | * @mdev: DRBD device. | ||
425 | * @os: old (current) state. | ||
426 | * @ns: new (wanted) state. | ||
427 | */ | ||
428 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
429 | union drbd_state os, union drbd_state ns) | ||
430 | { | ||
431 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
432 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
433 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
434 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
435 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || | ||
436 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
437 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); | ||
438 | } | ||
439 | |||
440 | int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
441 | union drbd_state mask, union drbd_state val) | ||
442 | { | ||
443 | unsigned long flags; | ||
444 | union drbd_state os, ns; | ||
445 | int rv; | ||
446 | |||
447 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
448 | os = mdev->state; | ||
449 | ns.i = (os.i & ~mask.i) | val.i; | ||
450 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
451 | ns = mdev->state; | ||
452 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
453 | |||
454 | return rv; | ||
455 | } | ||
456 | |||
457 | /** | ||
458 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
459 | * @mdev: DRBD device. | ||
460 | * @mask: mask of state bits to change. | ||
461 | * @val: value of new state bits. | ||
462 | */ | ||
463 | void drbd_force_state(struct drbd_conf *mdev, | ||
464 | union drbd_state mask, union drbd_state val) | ||
465 | { | ||
466 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
467 | } | ||
468 | |||
/* Prototypes needed by the state change helpers that follow. */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *mdev,
				     union drbd_state ns, union drbd_state os);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, int *warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *mdev,
			union drbd_state mask, union drbd_state val);
476 | |||
477 | static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, | ||
478 | union drbd_state mask, union drbd_state val) | ||
479 | { | ||
480 | union drbd_state os, ns; | ||
481 | unsigned long flags; | ||
482 | int rv; | ||
483 | |||
484 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
485 | return SS_CW_SUCCESS; | ||
486 | |||
487 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
488 | return SS_CW_FAILED_BY_PEER; | ||
489 | |||
490 | rv = 0; | ||
491 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
492 | os = mdev->state; | ||
493 | ns.i = (os.i & ~mask.i) | val.i; | ||
494 | ns = sanitize_state(mdev, os, ns, NULL); | ||
495 | |||
496 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
497 | rv = SS_CW_NO_NEED; | ||
498 | if (!rv) { | ||
499 | rv = is_valid_state(mdev, ns); | ||
500 | if (rv == SS_SUCCESS) { | ||
501 | rv = is_valid_state_transition(mdev, ns, os); | ||
502 | if (rv == SS_SUCCESS) | ||
503 | rv = 0; /* cont waiting, otherwise fail. */ | ||
504 | } | ||
505 | } | ||
506 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
507 | |||
508 | return rv; | ||
509 | } | ||
510 | |||
511 | /** | ||
512 | * drbd_req_state() - Perform an eventually cluster wide state change | ||
513 | * @mdev: DRBD device. | ||
514 | * @mask: mask of state bits to change. | ||
515 | * @val: value of new state bits. | ||
516 | * @f: flags | ||
517 | * | ||
518 | * Should not be called directly, use drbd_request_state() or | ||
519 | * _drbd_request_state(). | ||
520 | */ | ||
521 | static int drbd_req_state(struct drbd_conf *mdev, | ||
522 | union drbd_state mask, union drbd_state val, | ||
523 | enum chg_state_flags f) | ||
524 | { | ||
525 | struct completion done; | ||
526 | unsigned long flags; | ||
527 | union drbd_state os, ns; | ||
528 | int rv; | ||
529 | |||
530 | init_completion(&done); | ||
531 | |||
532 | if (f & CS_SERIALIZE) | ||
533 | mutex_lock(&mdev->state_mutex); | ||
534 | |||
535 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
536 | os = mdev->state; | ||
537 | ns.i = (os.i & ~mask.i) | val.i; | ||
538 | ns = sanitize_state(mdev, os, ns, NULL); | ||
539 | |||
540 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
541 | rv = is_valid_state(mdev, ns); | ||
542 | if (rv == SS_SUCCESS) | ||
543 | rv = is_valid_state_transition(mdev, ns, os); | ||
544 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
545 | |||
546 | if (rv < SS_SUCCESS) { | ||
547 | if (f & CS_VERBOSE) | ||
548 | print_st_err(mdev, os, ns, rv); | ||
549 | goto abort; | ||
550 | } | ||
551 | |||
552 | drbd_state_lock(mdev); | ||
553 | if (!drbd_send_state_req(mdev, mask, val)) { | ||
554 | drbd_state_unlock(mdev); | ||
555 | rv = SS_CW_FAILED_BY_PEER; | ||
556 | if (f & CS_VERBOSE) | ||
557 | print_st_err(mdev, os, ns, rv); | ||
558 | goto abort; | ||
559 | } | ||
560 | |||
561 | wait_event(mdev->state_wait, | ||
562 | (rv = _req_st_cond(mdev, mask, val))); | ||
563 | |||
564 | if (rv < SS_SUCCESS) { | ||
565 | drbd_state_unlock(mdev); | ||
566 | if (f & CS_VERBOSE) | ||
567 | print_st_err(mdev, os, ns, rv); | ||
568 | goto abort; | ||
569 | } | ||
570 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
571 | os = mdev->state; | ||
572 | ns.i = (os.i & ~mask.i) | val.i; | ||
573 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
574 | drbd_state_unlock(mdev); | ||
575 | } else { | ||
576 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
577 | } | ||
578 | |||
579 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
580 | |||
581 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
582 | D_ASSERT(current != mdev->worker.task); | ||
583 | wait_for_completion(&done); | ||
584 | } | ||
585 | |||
586 | abort: | ||
587 | if (f & CS_SERIALIZE) | ||
588 | mutex_unlock(&mdev->state_mutex); | ||
589 | |||
590 | return rv; | ||
591 | } | ||
592 | |||
593 | /** | ||
594 | * _drbd_request_state() - Request a state change (with flags) | ||
595 | * @mdev: DRBD device. | ||
596 | * @mask: mask of state bits to change. | ||
597 | * @val: value of new state bits. | ||
598 | * @f: flags | ||
599 | * | ||
600 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
601 | * flag, or when logging of failed state change requests is not desired. | ||
602 | */ | ||
603 | int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
604 | union drbd_state val, enum chg_state_flags f) | ||
605 | { | ||
606 | int rv; | ||
607 | |||
608 | wait_event(mdev->state_wait, | ||
609 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
610 | |||
611 | return rv; | ||
612 | } | ||
613 | |||
614 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
615 | { | ||
616 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", | ||
617 | name, | ||
618 | drbd_conn_str(ns.conn), | ||
619 | drbd_role_str(ns.role), | ||
620 | drbd_role_str(ns.peer), | ||
621 | drbd_disk_str(ns.disk), | ||
622 | drbd_disk_str(ns.pdsk), | ||
623 | ns.susp ? 's' : 'r', | ||
624 | ns.aftr_isp ? 'a' : '-', | ||
625 | ns.peer_isp ? 'p' : '-', | ||
626 | ns.user_isp ? 'u' : '-' | ||
627 | ); | ||
628 | } | ||
629 | |||
630 | void print_st_err(struct drbd_conf *mdev, | ||
631 | union drbd_state os, union drbd_state ns, int err) | ||
632 | { | ||
633 | if (err == SS_IN_TRANSIENT_STATE) | ||
634 | return; | ||
635 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
636 | print_st(mdev, " state", os); | ||
637 | print_st(mdev, "wanted", ns); | ||
638 | } | ||
639 | |||
640 | |||
/*
 * PSC(field) expands to drbd_<field>_str() for every member of union
 * drbd_state it is used with, so each member needs a string helper of
 * that name.  peer and pdsk reuse the role and disk helpers; the
 * single-bit members print as "1" or "0".
 */
#define drbd_peer_str		drbd_role_str
#define drbd_pdsk_str		drbd_disk_str

#define drbd_susp_str(A)	((A) ? "1" : "0")
#define drbd_aftr_isp_str(A)	((A) ? "1" : "0")
#define drbd_peer_isp_str(A)	((A) ? "1" : "0")
#define drbd_user_isp_str(A)	((A) ? "1" : "0")

/*
 * PSC() - append "field( old -> new ) " to the buffer if the field changed
 *
 * Expects `os`, `ns` (union drbd_state) and the write cursor `pbp` (char *)
 * in the scope of the caller.  The write is not bounds checked; the caller
 * has to provide a buffer large enough for all fields it prints.
 */
#define PSC(A) \
	do { \
		if (ns.A != os.A) \
			pbp += sprintf(pbp, #A "( %s -> %s ) ", \
				       drbd_##A##_str(os.A), \
				       drbd_##A##_str(ns.A)); \
	} while (0)
655 | |||
656 | /** | ||
657 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
658 | * @mdev: DRBD device. | ||
659 | * @ns: State to consider. | ||
660 | */ | ||
661 | static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
662 | { | ||
663 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
664 | |||
665 | enum drbd_fencing_p fp; | ||
666 | int rv = SS_SUCCESS; | ||
667 | |||
668 | fp = FP_DONT_CARE; | ||
669 | if (get_ldev(mdev)) { | ||
670 | fp = mdev->ldev->dc.fencing; | ||
671 | put_ldev(mdev); | ||
672 | } | ||
673 | |||
674 | if (get_net_conf(mdev)) { | ||
675 | if (!mdev->net_conf->two_primaries && | ||
676 | ns.role == R_PRIMARY && ns.peer == R_PRIMARY) | ||
677 | rv = SS_TWO_PRIMARIES; | ||
678 | put_net_conf(mdev); | ||
679 | } | ||
680 | |||
681 | if (rv <= 0) | ||
682 | /* already found a reason to abort */; | ||
683 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
684 | rv = SS_DEVICE_IN_USE; | ||
685 | |||
686 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
687 | rv = SS_NO_UP_TO_DATE_DISK; | ||
688 | |||
689 | else if (fp >= FP_RESOURCE && | ||
690 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
691 | rv = SS_PRIMARY_NOP; | ||
692 | |||
693 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
694 | rv = SS_NO_UP_TO_DATE_DISK; | ||
695 | |||
696 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
697 | rv = SS_NO_LOCAL_DISK; | ||
698 | |||
699 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
700 | rv = SS_NO_REMOTE_DISK; | ||
701 | |||
702 | else if ((ns.conn == C_CONNECTED || | ||
703 | ns.conn == C_WF_BITMAP_S || | ||
704 | ns.conn == C_SYNC_SOURCE || | ||
705 | ns.conn == C_PAUSED_SYNC_S) && | ||
706 | ns.disk == D_OUTDATED) | ||
707 | rv = SS_CONNECTED_OUTDATES; | ||
708 | |||
709 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
710 | (mdev->sync_conf.verify_alg[0] == 0)) | ||
711 | rv = SS_NO_VERIFY_ALG; | ||
712 | |||
713 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
714 | mdev->agreed_pro_version < 88) | ||
715 | rv = SS_NOT_SUPPORTED; | ||
716 | |||
717 | return rv; | ||
718 | } | ||
719 | |||
720 | /** | ||
721 | * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible | ||
722 | * @mdev: DRBD device. | ||
723 | * @ns: new state. | ||
724 | * @os: old state. | ||
725 | */ | ||
726 | static int is_valid_state_transition(struct drbd_conf *mdev, | ||
727 | union drbd_state ns, union drbd_state os) | ||
728 | { | ||
729 | int rv = SS_SUCCESS; | ||
730 | |||
731 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
732 | os.conn > C_CONNECTED) | ||
733 | rv = SS_RESYNC_RUNNING; | ||
734 | |||
735 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
736 | rv = SS_ALREADY_STANDALONE; | ||
737 | |||
738 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
739 | rv = SS_IS_DISKLESS; | ||
740 | |||
741 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
742 | rv = SS_NO_NET_CONFIG; | ||
743 | |||
744 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
745 | rv = SS_LOWER_THAN_OUTDATED; | ||
746 | |||
747 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
748 | rv = SS_IN_TRANSIENT_STATE; | ||
749 | |||
750 | if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
751 | rv = SS_IN_TRANSIENT_STATE; | ||
752 | |||
753 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
754 | rv = SS_NEED_CONNECTION; | ||
755 | |||
756 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
757 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
758 | rv = SS_RESYNC_RUNNING; | ||
759 | |||
760 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
761 | os.conn < C_CONNECTED) | ||
762 | rv = SS_NEED_CONNECTION; | ||
763 | |||
764 | return rv; | ||
765 | } | ||
766 | |||
767 | /** | ||
768 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
769 | * @mdev: DRBD device. | ||
770 | * @os: old state. | ||
771 | * @ns: new state. | ||
772 | * @warn_sync_abort: | ||
773 | * | ||
774 | * When we loose connection, we have to set the state of the peers disk (pdsk) | ||
775 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
776 | */ | ||
777 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
778 | union drbd_state ns, int *warn_sync_abort) | ||
779 | { | ||
780 | enum drbd_fencing_p fp; | ||
781 | |||
782 | fp = FP_DONT_CARE; | ||
783 | if (get_ldev(mdev)) { | ||
784 | fp = mdev->ldev->dc.fencing; | ||
785 | put_ldev(mdev); | ||
786 | } | ||
787 | |||
788 | /* Disallow Network errors to configure a device's network part */ | ||
789 | if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && | ||
790 | os.conn <= C_DISCONNECTING) | ||
791 | ns.conn = os.conn; | ||
792 | |||
793 | /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ | ||
794 | if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && | ||
795 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) | ||
796 | ns.conn = os.conn; | ||
797 | |||
798 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
799 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | ||
800 | ns.conn = os.conn; | ||
801 | |||
802 | if (ns.conn < C_CONNECTED) { | ||
803 | ns.peer_isp = 0; | ||
804 | ns.peer = R_UNKNOWN; | ||
805 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
806 | ns.pdsk = D_UNKNOWN; | ||
807 | } | ||
808 | |||
809 | /* Clear the aftr_isp when becoming unconfigured */ | ||
810 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
811 | ns.aftr_isp = 0; | ||
812 | |||
813 | if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) | ||
814 | ns.pdsk = D_UNKNOWN; | ||
815 | |||
816 | /* Abort resync if a disk fails/detaches */ | ||
817 | if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && | ||
818 | (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
819 | if (warn_sync_abort) | ||
820 | *warn_sync_abort = 1; | ||
821 | ns.conn = C_CONNECTED; | ||
822 | } | ||
823 | |||
824 | if (ns.conn >= C_CONNECTED && | ||
825 | ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || | ||
826 | (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { | ||
827 | switch (ns.conn) { | ||
828 | case C_WF_BITMAP_T: | ||
829 | case C_PAUSED_SYNC_T: | ||
830 | ns.disk = D_OUTDATED; | ||
831 | break; | ||
832 | case C_CONNECTED: | ||
833 | case C_WF_BITMAP_S: | ||
834 | case C_SYNC_SOURCE: | ||
835 | case C_PAUSED_SYNC_S: | ||
836 | ns.disk = D_UP_TO_DATE; | ||
837 | break; | ||
838 | case C_SYNC_TARGET: | ||
839 | ns.disk = D_INCONSISTENT; | ||
840 | dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); | ||
841 | break; | ||
842 | } | ||
843 | if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE) | ||
844 | dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); | ||
845 | } | ||
846 | |||
847 | if (ns.conn >= C_CONNECTED && | ||
848 | (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { | ||
849 | switch (ns.conn) { | ||
850 | case C_CONNECTED: | ||
851 | case C_WF_BITMAP_T: | ||
852 | case C_PAUSED_SYNC_T: | ||
853 | case C_SYNC_TARGET: | ||
854 | ns.pdsk = D_UP_TO_DATE; | ||
855 | break; | ||
856 | case C_WF_BITMAP_S: | ||
857 | case C_PAUSED_SYNC_S: | ||
858 | ns.pdsk = D_OUTDATED; | ||
859 | break; | ||
860 | case C_SYNC_SOURCE: | ||
861 | ns.pdsk = D_INCONSISTENT; | ||
862 | dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); | ||
863 | break; | ||
864 | } | ||
865 | if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) | ||
866 | dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); | ||
867 | } | ||
868 | |||
869 | /* Connection breaks down before we finished "Negotiating" */ | ||
870 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
871 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
872 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
873 | ns.disk = mdev->new_state_tmp.disk; | ||
874 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
875 | } else { | ||
876 | dev_alert(DEV, "Connection lost while negotiating, no data!\n"); | ||
877 | ns.disk = D_DISKLESS; | ||
878 | ns.pdsk = D_UNKNOWN; | ||
879 | } | ||
880 | put_ldev(mdev); | ||
881 | } | ||
882 | |||
883 | if (fp == FP_STONITH && | ||
884 | (ns.role == R_PRIMARY && | ||
885 | ns.conn < C_CONNECTED && | ||
886 | ns.pdsk > D_OUTDATED)) | ||
887 | ns.susp = 1; | ||
888 | |||
889 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
890 | if (ns.conn == C_SYNC_SOURCE) | ||
891 | ns.conn = C_PAUSED_SYNC_S; | ||
892 | if (ns.conn == C_SYNC_TARGET) | ||
893 | ns.conn = C_PAUSED_SYNC_T; | ||
894 | } else { | ||
895 | if (ns.conn == C_PAUSED_SYNC_S) | ||
896 | ns.conn = C_SYNC_SOURCE; | ||
897 | if (ns.conn == C_PAUSED_SYNC_T) | ||
898 | ns.conn = C_SYNC_TARGET; | ||
899 | } | ||
900 | |||
901 | return ns; | ||
902 | } | ||
903 | |||
904 | /* helper for __drbd_set_state */ | ||
905 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
906 | { | ||
907 | if (cs == C_VERIFY_T) { | ||
908 | /* starting online verify from an arbitrary position | ||
909 | * does not fit well into the existing protocol. | ||
910 | * on C_VERIFY_T, we initialize ov_left and friends | ||
911 | * implicitly in receive_DataRequest once the | ||
912 | * first P_OV_REQUEST is received */ | ||
913 | mdev->ov_start_sector = ~(sector_t)0; | ||
914 | } else { | ||
915 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
916 | if (bit >= mdev->rs_total) | ||
917 | mdev->ov_start_sector = | ||
918 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
919 | mdev->ov_position = mdev->ov_start_sector; | ||
920 | } | ||
921 | } | ||
922 | |||
923 | /** | ||
924 | * __drbd_set_state() - Set a new DRBD state | ||
925 | * @mdev: DRBD device. | ||
926 | * @ns: new state. | ||
927 | * @flags: Flags | ||
928 | * @done: Optional completion, that will get completed after the after_state_ch() finished | ||
929 | * | ||
930 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
931 | */ | ||
932 | int __drbd_set_state(struct drbd_conf *mdev, | ||
933 | union drbd_state ns, enum chg_state_flags flags, | ||
934 | struct completion *done) | ||
935 | { | ||
936 | union drbd_state os; | ||
937 | int rv = SS_SUCCESS; | ||
938 | int warn_sync_abort = 0; | ||
939 | struct after_state_chg_work *ascw; | ||
940 | |||
941 | os = mdev->state; | ||
942 | |||
943 | ns = sanitize_state(mdev, os, ns, &warn_sync_abort); | ||
944 | |||
945 | if (ns.i == os.i) | ||
946 | return SS_NOTHING_TO_DO; | ||
947 | |||
948 | if (!(flags & CS_HARD)) { | ||
949 | /* pre-state-change checks ; only look at ns */ | ||
950 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
951 | |||
952 | rv = is_valid_state(mdev, ns); | ||
953 | if (rv < SS_SUCCESS) { | ||
954 | /* If the old state was illegal as well, then let | ||
955 | this happen...*/ | ||
956 | |||
957 | if (is_valid_state(mdev, os) == rv) { | ||
958 | dev_err(DEV, "Considering state change from bad state. " | ||
959 | "Error would be: '%s'\n", | ||
960 | drbd_set_st_err_str(rv)); | ||
961 | print_st(mdev, "old", os); | ||
962 | print_st(mdev, "new", ns); | ||
963 | rv = is_valid_state_transition(mdev, ns, os); | ||
964 | } | ||
965 | } else | ||
966 | rv = is_valid_state_transition(mdev, ns, os); | ||
967 | } | ||
968 | |||
969 | if (rv < SS_SUCCESS) { | ||
970 | if (flags & CS_VERBOSE) | ||
971 | print_st_err(mdev, os, ns, rv); | ||
972 | return rv; | ||
973 | } | ||
974 | |||
975 | if (warn_sync_abort) | ||
976 | dev_warn(DEV, "Resync aborted.\n"); | ||
977 | |||
978 | { | ||
979 | char *pbp, pb[300]; | ||
980 | pbp = pb; | ||
981 | *pbp = 0; | ||
982 | PSC(role); | ||
983 | PSC(peer); | ||
984 | PSC(conn); | ||
985 | PSC(disk); | ||
986 | PSC(pdsk); | ||
987 | PSC(susp); | ||
988 | PSC(aftr_isp); | ||
989 | PSC(peer_isp); | ||
990 | PSC(user_isp); | ||
991 | dev_info(DEV, "%s\n", pb); | ||
992 | } | ||
993 | |||
994 | /* solve the race between becoming unconfigured, | ||
995 | * worker doing the cleanup, and | ||
996 | * admin reconfiguring us: | ||
997 | * on (re)configure, first set CONFIG_PENDING, | ||
998 | * then wait for a potentially exiting worker, | ||
999 | * start the worker, and schedule one no_op. | ||
1000 | * then proceed with configuration. | ||
1001 | */ | ||
1002 | if (ns.disk == D_DISKLESS && | ||
1003 | ns.conn == C_STANDALONE && | ||
1004 | ns.role == R_SECONDARY && | ||
1005 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) | ||
1006 | set_bit(DEVICE_DYING, &mdev->flags); | ||
1007 | |||
1008 | mdev->state.i = ns.i; | ||
1009 | wake_up(&mdev->misc_wait); | ||
1010 | wake_up(&mdev->state_wait); | ||
1011 | |||
1012 | /* post-state-change actions */ | ||
1013 | if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { | ||
1014 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
1015 | mod_timer(&mdev->resync_timer, jiffies); | ||
1016 | } | ||
1017 | |||
1018 | /* aborted verify run. log the last position */ | ||
1019 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1020 | ns.conn < C_CONNECTED) { | ||
1021 | mdev->ov_start_sector = | ||
1022 | BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); | ||
1023 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1024 | (unsigned long long)mdev->ov_start_sector); | ||
1025 | } | ||
1026 | |||
1027 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1028 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1029 | dev_info(DEV, "Syncer continues.\n"); | ||
1030 | mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; | ||
1031 | if (ns.conn == C_SYNC_TARGET) { | ||
1032 | if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) | ||
1033 | mod_timer(&mdev->resync_timer, jiffies); | ||
1034 | /* This if (!test_bit) is only needed for the case | ||
1035 | that a device that has ceased to used its timer, | ||
1036 | i.e. it is already in drbd_resync_finished() gets | ||
1037 | paused and resumed. */ | ||
1038 | } | ||
1039 | } | ||
1040 | |||
1041 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1042 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1043 | dev_info(DEV, "Resync suspended\n"); | ||
1044 | mdev->rs_mark_time = jiffies; | ||
1045 | if (ns.conn == C_PAUSED_SYNC_T) | ||
1046 | set_bit(STOP_SYNC_TIMER, &mdev->flags); | ||
1047 | } | ||
1048 | |||
1049 | if (os.conn == C_CONNECTED && | ||
1050 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1051 | mdev->ov_position = 0; | ||
1052 | mdev->rs_total = | ||
1053 | mdev->rs_mark_left = drbd_bm_bits(mdev); | ||
1054 | if (mdev->agreed_pro_version >= 90) | ||
1055 | set_ov_position(mdev, ns.conn); | ||
1056 | else | ||
1057 | mdev->ov_start_sector = 0; | ||
1058 | mdev->ov_left = mdev->rs_total | ||
1059 | - BM_SECT_TO_BIT(mdev->ov_position); | ||
1060 | mdev->rs_start = | ||
1061 | mdev->rs_mark_time = jiffies; | ||
1062 | mdev->ov_last_oos_size = 0; | ||
1063 | mdev->ov_last_oos_start = 0; | ||
1064 | |||
1065 | if (ns.conn == C_VERIFY_S) { | ||
1066 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1067 | (unsigned long long)mdev->ov_position); | ||
1068 | mod_timer(&mdev->resync_timer, jiffies); | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | if (get_ldev(mdev)) { | ||
1073 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1074 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1075 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1076 | |||
1077 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1078 | mdf |= MDF_CRASHED_PRIMARY; | ||
1079 | if (mdev->state.role == R_PRIMARY || | ||
1080 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1081 | mdf |= MDF_PRIMARY_IND; | ||
1082 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1083 | mdf |= MDF_CONNECTED_IND; | ||
1084 | if (mdev->state.disk > D_INCONSISTENT) | ||
1085 | mdf |= MDF_CONSISTENT; | ||
1086 | if (mdev->state.disk > D_OUTDATED) | ||
1087 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1088 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1089 | mdf |= MDF_PEER_OUT_DATED; | ||
1090 | if (mdf != mdev->ldev->md.flags) { | ||
1091 | mdev->ldev->md.flags = mdf; | ||
1092 | drbd_md_mark_dirty(mdev); | ||
1093 | } | ||
1094 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1095 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1096 | put_ldev(mdev); | ||
1097 | } | ||
1098 | |||
1099 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1100 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1101 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1102 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1103 | |||
1104 | /* Receiver should clean up itself */ | ||
1105 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1106 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1107 | |||
1108 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1109 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1110 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1111 | |||
1112 | /* Upon network failure, we need to restart the receiver. */ | ||
1113 | if (os.conn > C_TEAR_DOWN && | ||
1114 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1115 | drbd_thread_restart_nowait(&mdev->receiver); | ||
1116 | |||
1117 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1118 | if (ascw) { | ||
1119 | ascw->os = os; | ||
1120 | ascw->ns = ns; | ||
1121 | ascw->flags = flags; | ||
1122 | ascw->w.cb = w_after_state_ch; | ||
1123 | ascw->done = done; | ||
1124 | drbd_queue_work(&mdev->data.work, &ascw->w); | ||
1125 | } else { | ||
1126 | dev_warn(DEV, "Could not kmalloc an ascw\n"); | ||
1127 | } | ||
1128 | |||
1129 | return rv; | ||
1130 | } | ||
1131 | |||
1132 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1133 | { | ||
1134 | struct after_state_chg_work *ascw = | ||
1135 | container_of(w, struct after_state_chg_work, w); | ||
1136 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1137 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1138 | D_ASSERT(ascw->done != NULL); | ||
1139 | complete(ascw->done); | ||
1140 | } | ||
1141 | kfree(ascw); | ||
1142 | |||
1143 | return 1; | ||
1144 | } | ||
1145 | |||
1146 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1147 | { | ||
1148 | if (rv) { | ||
1149 | dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); | ||
1150 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1151 | return; | ||
1152 | } | ||
1153 | |||
1154 | switch (mdev->state.conn) { | ||
1155 | case C_STARTING_SYNC_T: | ||
1156 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1157 | break; | ||
1158 | case C_STARTING_SYNC_S: | ||
1159 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1160 | break; | ||
1161 | } | ||
1162 | } | ||
1163 | |||
1164 | /** | ||
1165 | * after_state_ch() - Perform after state change actions that may sleep | ||
1166 | * @mdev: DRBD device. | ||
1167 | * @os: old state. | ||
1168 | * @ns: new state. | ||
1169 | * @flags: Flags | ||
1170 | */ | ||
1171 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1172 | union drbd_state ns, enum chg_state_flags flags) | ||
1173 | { | ||
1174 | enum drbd_fencing_p fp; | ||
1175 | |||
1176 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1177 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1178 | if (mdev->p_uuid) | ||
1179 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1180 | } | ||
1181 | |||
1182 | fp = FP_DONT_CARE; | ||
1183 | if (get_ldev(mdev)) { | ||
1184 | fp = mdev->ldev->dc.fencing; | ||
1185 | put_ldev(mdev); | ||
1186 | } | ||
1187 | |||
1188 | /* Inform userspace about the change... */ | ||
1189 | drbd_bcast_state(mdev, ns); | ||
1190 | |||
1191 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1192 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1193 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1194 | |||
1195 | /* Here we have the actions that are performed after a | ||
1196 | state change. This function might sleep */ | ||
1197 | |||
1198 | if (fp == FP_STONITH && ns.susp) { | ||
1199 | /* case1: The outdate peer handler is successful: | ||
1200 | * case2: The connection was established again: */ | ||
1201 | if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || | ||
1202 | (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { | ||
1203 | tl_clear(mdev); | ||
1204 | spin_lock_irq(&mdev->req_lock); | ||
1205 | _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); | ||
1206 | spin_unlock_irq(&mdev->req_lock); | ||
1207 | } | ||
1208 | } | ||
1209 | /* Do not change the order of the if above and the two below... */ | ||
1210 | if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ | ||
1211 | drbd_send_uuids(mdev); | ||
1212 | drbd_send_state(mdev); | ||
1213 | } | ||
1214 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) | ||
1215 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); | ||
1216 | |||
1217 | /* Lost contact to peer's copy of the data */ | ||
1218 | if ((os.pdsk >= D_INCONSISTENT && | ||
1219 | os.pdsk != D_UNKNOWN && | ||
1220 | os.pdsk != D_OUTDATED) | ||
1221 | && (ns.pdsk < D_INCONSISTENT || | ||
1222 | ns.pdsk == D_UNKNOWN || | ||
1223 | ns.pdsk == D_OUTDATED)) { | ||
1224 | kfree(mdev->p_uuid); | ||
1225 | mdev->p_uuid = NULL; | ||
1226 | if (get_ldev(mdev)) { | ||
1227 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1228 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1229 | drbd_uuid_new_current(mdev); | ||
1230 | drbd_send_uuids(mdev); | ||
1231 | } | ||
1232 | put_ldev(mdev); | ||
1233 | } | ||
1234 | } | ||
1235 | |||
1236 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1237 | if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) | ||
1238 | drbd_uuid_new_current(mdev); | ||
1239 | |||
1240 | /* D_DISKLESS Peer becomes secondary */ | ||
1241 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1242 | drbd_al_to_on_disk_bm(mdev); | ||
1243 | put_ldev(mdev); | ||
1244 | } | ||
1245 | |||
1246 | /* Last part of the attaching process ... */ | ||
1247 | if (ns.conn >= C_CONNECTED && | ||
1248 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1249 | kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ | ||
1250 | mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ | ||
1251 | drbd_send_sizes(mdev, 0); /* to start sync... */ | ||
1252 | drbd_send_uuids(mdev); | ||
1253 | drbd_send_state(mdev); | ||
1254 | } | ||
1255 | |||
1256 | /* We want to pause/continue resync, tell peer. */ | ||
1257 | if (ns.conn >= C_CONNECTED && | ||
1258 | ((os.aftr_isp != ns.aftr_isp) || | ||
1259 | (os.user_isp != ns.user_isp))) | ||
1260 | drbd_send_state(mdev); | ||
1261 | |||
1262 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1263 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1264 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1265 | suspend_other_sg(mdev); | ||
1266 | |||
1267 | /* Make sure the peer gets informed about eventual state | ||
1268 | changes (ISP bits) while we were in WFReportParams. */ | ||
1269 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1270 | drbd_send_state(mdev); | ||
1271 | |||
1272 | /* We are in the progress to start a full sync... */ | ||
1273 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1274 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1275 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); | ||
1276 | |||
1277 | /* We are invalidating our self... */ | ||
1278 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1279 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1280 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); | ||
1281 | |||
1282 | if (os.disk > D_FAILED && ns.disk == D_FAILED) { | ||
1283 | enum drbd_io_error_p eh; | ||
1284 | |||
1285 | eh = EP_PASS_ON; | ||
1286 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
1287 | eh = mdev->ldev->dc.on_io_error; | ||
1288 | put_ldev(mdev); | ||
1289 | } | ||
1290 | |||
1291 | drbd_rs_cancel_all(mdev); | ||
1292 | /* since get_ldev() only works as long as disk>=D_INCONSISTENT, | ||
1293 | and it is D_DISKLESS here, local_cnt can only go down, it can | ||
1294 | not increase... It will reach zero */ | ||
1295 | wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); | ||
1296 | mdev->rs_total = 0; | ||
1297 | mdev->rs_failed = 0; | ||
1298 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1299 | |||
1300 | spin_lock_irq(&mdev->req_lock); | ||
1301 | _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); | ||
1302 | spin_unlock_irq(&mdev->req_lock); | ||
1303 | |||
1304 | if (eh == EP_CALL_HELPER) | ||
1305 | drbd_khelper(mdev, "local-io-error"); | ||
1306 | } | ||
1307 | |||
1308 | if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1309 | |||
1310 | if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { | ||
1311 | if (drbd_send_state(mdev)) | ||
1312 | dev_warn(DEV, "Notified peer that my disk is broken.\n"); | ||
1313 | else | ||
1314 | dev_err(DEV, "Sending state in drbd_io_error() failed\n"); | ||
1315 | } | ||
1316 | |||
1317 | lc_destroy(mdev->resync); | ||
1318 | mdev->resync = NULL; | ||
1319 | lc_destroy(mdev->act_log); | ||
1320 | mdev->act_log = NULL; | ||
1321 | __no_warn(local, | ||
1322 | drbd_free_bc(mdev->ldev); | ||
1323 | mdev->ldev = NULL;); | ||
1324 | |||
1325 | if (mdev->md_io_tmpp) | ||
1326 | __free_page(mdev->md_io_tmpp); | ||
1327 | } | ||
1328 | |||
1329 | /* Disks got bigger while they were detached */ | ||
1330 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1331 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1332 | if (ns.conn == C_CONNECTED) | ||
1333 | resync_after_online_grow(mdev); | ||
1334 | } | ||
1335 | |||
1336 | /* A resync finished or aborted, wake paused devices... */ | ||
1337 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1338 | (os.peer_isp && !ns.peer_isp) || | ||
1339 | (os.user_isp && !ns.user_isp)) | ||
1340 | resume_next_sg(mdev); | ||
1341 | |||
1342 | /* Upon network connection, we need to start the receiver */ | ||
1343 | if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) | ||
1344 | drbd_thread_start(&mdev->receiver); | ||
1345 | |||
1346 | /* Terminate worker thread if we are unconfigured - it will be | ||
1347 | restarted as needed... */ | ||
1348 | if (ns.disk == D_DISKLESS && | ||
1349 | ns.conn == C_STANDALONE && | ||
1350 | ns.role == R_SECONDARY) { | ||
1351 | if (os.aftr_isp != ns.aftr_isp) | ||
1352 | resume_next_sg(mdev); | ||
1353 | /* set in __drbd_set_state, unless CONFIG_PENDING was set */ | ||
1354 | if (test_bit(DEVICE_DYING, &mdev->flags)) | ||
1355 | drbd_thread_stop_nowait(&mdev->worker); | ||
1356 | } | ||
1357 | |||
1358 | drbd_md_sync(mdev); | ||
1359 | } | ||
1360 | |||
1361 | |||
1362 | static int drbd_thread_setup(void *arg) | ||
1363 | { | ||
1364 | struct drbd_thread *thi = (struct drbd_thread *) arg; | ||
1365 | struct drbd_conf *mdev = thi->mdev; | ||
1366 | unsigned long flags; | ||
1367 | int retval; | ||
1368 | |||
1369 | restart: | ||
1370 | retval = thi->function(thi); | ||
1371 | |||
1372 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1373 | |||
1374 | /* if the receiver has been "Exiting", the last thing it did | ||
1375 | * was set the conn state to "StandAlone", | ||
1376 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, | ||
1377 | * and receiver thread will be "started". | ||
1378 | * drbd_thread_start needs to set "Restarting" in that case. | ||
1379 | * t_state check and assignment needs to be within the same spinlock, | ||
1380 | * so either thread_start sees Exiting, and can remap to Restarting, | ||
1381 | * or thread_start see None, and can proceed as normal. | ||
1382 | */ | ||
1383 | |||
1384 | if (thi->t_state == Restarting) { | ||
1385 | dev_info(DEV, "Restarting %s\n", current->comm); | ||
1386 | thi->t_state = Running; | ||
1387 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1388 | goto restart; | ||
1389 | } | ||
1390 | |||
1391 | thi->task = NULL; | ||
1392 | thi->t_state = None; | ||
1393 | smp_mb(); | ||
1394 | complete(&thi->stop); | ||
1395 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1396 | |||
1397 | dev_info(DEV, "Terminating %s\n", current->comm); | ||
1398 | |||
1399 | /* Release mod reference taken when thread was started */ | ||
1400 | module_put(THIS_MODULE); | ||
1401 | return retval; | ||
1402 | } | ||
1403 | |||
1404 | static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, | ||
1405 | int (*func) (struct drbd_thread *)) | ||
1406 | { | ||
1407 | spin_lock_init(&thi->t_lock); | ||
1408 | thi->task = NULL; | ||
1409 | thi->t_state = None; | ||
1410 | thi->function = func; | ||
1411 | thi->mdev = mdev; | ||
1412 | } | ||
1413 | |||
1414 | int drbd_thread_start(struct drbd_thread *thi) | ||
1415 | { | ||
1416 | struct drbd_conf *mdev = thi->mdev; | ||
1417 | struct task_struct *nt; | ||
1418 | unsigned long flags; | ||
1419 | |||
1420 | const char *me = | ||
1421 | thi == &mdev->receiver ? "receiver" : | ||
1422 | thi == &mdev->asender ? "asender" : | ||
1423 | thi == &mdev->worker ? "worker" : "NONSENSE"; | ||
1424 | |||
1425 | /* is used from state engine doing drbd_thread_stop_nowait, | ||
1426 | * while holding the req lock irqsave */ | ||
1427 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1428 | |||
1429 | switch (thi->t_state) { | ||
1430 | case None: | ||
1431 | dev_info(DEV, "Starting %s thread (from %s [%d])\n", | ||
1432 | me, current->comm, current->pid); | ||
1433 | |||
1434 | /* Get ref on module for thread - this is released when thread exits */ | ||
1435 | if (!try_module_get(THIS_MODULE)) { | ||
1436 | dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); | ||
1437 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1438 | return FALSE; | ||
1439 | } | ||
1440 | |||
1441 | init_completion(&thi->stop); | ||
1442 | D_ASSERT(thi->task == NULL); | ||
1443 | thi->reset_cpu_mask = 1; | ||
1444 | thi->t_state = Running; | ||
1445 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1446 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ | ||
1447 | |||
1448 | nt = kthread_create(drbd_thread_setup, (void *) thi, | ||
1449 | "drbd%d_%s", mdev_to_minor(mdev), me); | ||
1450 | |||
1451 | if (IS_ERR(nt)) { | ||
1452 | dev_err(DEV, "Couldn't start thread\n"); | ||
1453 | |||
1454 | module_put(THIS_MODULE); | ||
1455 | return FALSE; | ||
1456 | } | ||
1457 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1458 | thi->task = nt; | ||
1459 | thi->t_state = Running; | ||
1460 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1461 | wake_up_process(nt); | ||
1462 | break; | ||
1463 | case Exiting: | ||
1464 | thi->t_state = Restarting; | ||
1465 | dev_info(DEV, "Restarting %s thread (from %s [%d])\n", | ||
1466 | me, current->comm, current->pid); | ||
1467 | /* fall through */ | ||
1468 | case Running: | ||
1469 | case Restarting: | ||
1470 | default: | ||
1471 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1472 | break; | ||
1473 | } | ||
1474 | |||
1475 | return TRUE; | ||
1476 | } | ||
1477 | |||
1478 | |||
1479 | void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | ||
1480 | { | ||
1481 | unsigned long flags; | ||
1482 | |||
1483 | enum drbd_thread_state ns = restart ? Restarting : Exiting; | ||
1484 | |||
1485 | /* may be called from state engine, holding the req lock irqsave */ | ||
1486 | spin_lock_irqsave(&thi->t_lock, flags); | ||
1487 | |||
1488 | if (thi->t_state == None) { | ||
1489 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1490 | if (restart) | ||
1491 | drbd_thread_start(thi); | ||
1492 | return; | ||
1493 | } | ||
1494 | |||
1495 | if (thi->t_state != ns) { | ||
1496 | if (thi->task == NULL) { | ||
1497 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1498 | return; | ||
1499 | } | ||
1500 | |||
1501 | thi->t_state = ns; | ||
1502 | smp_mb(); | ||
1503 | init_completion(&thi->stop); | ||
1504 | if (thi->task != current) | ||
1505 | force_sig(DRBD_SIGKILL, thi->task); | ||
1506 | |||
1507 | } | ||
1508 | |||
1509 | spin_unlock_irqrestore(&thi->t_lock, flags); | ||
1510 | |||
1511 | if (wait) | ||
1512 | wait_for_completion(&thi->stop); | ||
1513 | } | ||
1514 | |||
1515 | #ifdef CONFIG_SMP | ||
1516 | /** | ||
1517 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs | ||
1518 | * @mdev: DRBD device. | ||
1519 | * | ||
1520 | * Forces all threads of a device onto the same CPU. This is beneficial for | ||
1521 | * DRBD's performance. May be overwritten by user's configuration. | ||
1522 | */ | ||
1523 | void drbd_calc_cpu_mask(struct drbd_conf *mdev) | ||
1524 | { | ||
1525 | int ord, cpu; | ||
1526 | |||
1527 | /* user override. */ | ||
1528 | if (cpumask_weight(mdev->cpu_mask)) | ||
1529 | return; | ||
1530 | |||
1531 | ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); | ||
1532 | for_each_online_cpu(cpu) { | ||
1533 | if (ord-- == 0) { | ||
1534 | cpumask_set_cpu(cpu, mdev->cpu_mask); | ||
1535 | return; | ||
1536 | } | ||
1537 | } | ||
1538 | /* should not be reached */ | ||
1539 | cpumask_setall(mdev->cpu_mask); | ||
1540 | } | ||
1541 | |||
1542 | /** | ||
1543 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread | ||
1544 | * @mdev: DRBD device. | ||
1545 | * | ||
1546 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die | ||
1547 | * prematurely. | ||
1548 | */ | ||
1549 | void drbd_thread_current_set_cpu(struct drbd_conf *mdev) | ||
1550 | { | ||
1551 | struct task_struct *p = current; | ||
1552 | struct drbd_thread *thi = | ||
1553 | p == mdev->asender.task ? &mdev->asender : | ||
1554 | p == mdev->receiver.task ? &mdev->receiver : | ||
1555 | p == mdev->worker.task ? &mdev->worker : | ||
1556 | NULL; | ||
1557 | ERR_IF(thi == NULL) | ||
1558 | return; | ||
1559 | if (!thi->reset_cpu_mask) | ||
1560 | return; | ||
1561 | thi->reset_cpu_mask = 0; | ||
1562 | set_cpus_allowed_ptr(p, mdev->cpu_mask); | ||
1563 | } | ||
1564 | #endif | ||
1565 | |||
1566 | /* the appropriate socket mutex must be held already */ | ||
1567 | int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | ||
1568 | enum drbd_packets cmd, struct p_header *h, | ||
1569 | size_t size, unsigned msg_flags) | ||
1570 | { | ||
1571 | int sent, ok; | ||
1572 | |||
1573 | ERR_IF(!h) return FALSE; | ||
1574 | ERR_IF(!size) return FALSE; | ||
1575 | |||
1576 | h->magic = BE_DRBD_MAGIC; | ||
1577 | h->command = cpu_to_be16(cmd); | ||
1578 | h->length = cpu_to_be16(size-sizeof(struct p_header)); | ||
1579 | |||
1580 | trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__); | ||
1581 | sent = drbd_send(mdev, sock, h, size, msg_flags); | ||
1582 | |||
1583 | ok = (sent == size); | ||
1584 | if (!ok) | ||
1585 | dev_err(DEV, "short sent %s size=%d sent=%d\n", | ||
1586 | cmdname(cmd), (int)size, sent); | ||
1587 | return ok; | ||
1588 | } | ||
1589 | |||
1590 | /* don't pass the socket. we may only look at it | ||
1591 | * when we hold the appropriate socket mutex. | ||
1592 | */ | ||
1593 | int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1594 | enum drbd_packets cmd, struct p_header *h, size_t size) | ||
1595 | { | ||
1596 | int ok = 0; | ||
1597 | struct socket *sock; | ||
1598 | |||
1599 | if (use_data_socket) { | ||
1600 | mutex_lock(&mdev->data.mutex); | ||
1601 | sock = mdev->data.socket; | ||
1602 | } else { | ||
1603 | mutex_lock(&mdev->meta.mutex); | ||
1604 | sock = mdev->meta.socket; | ||
1605 | } | ||
1606 | |||
1607 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1608 | * while we were waiting in down()... */ | ||
1609 | if (likely(sock != NULL)) | ||
1610 | ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); | ||
1611 | |||
1612 | if (use_data_socket) | ||
1613 | mutex_unlock(&mdev->data.mutex); | ||
1614 | else | ||
1615 | mutex_unlock(&mdev->meta.mutex); | ||
1616 | return ok; | ||
1617 | } | ||
1618 | |||
1619 | int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, | ||
1620 | size_t size) | ||
1621 | { | ||
1622 | struct p_header h; | ||
1623 | int ok; | ||
1624 | |||
1625 | h.magic = BE_DRBD_MAGIC; | ||
1626 | h.command = cpu_to_be16(cmd); | ||
1627 | h.length = cpu_to_be16(size); | ||
1628 | |||
1629 | if (!drbd_get_data_sock(mdev)) | ||
1630 | return 0; | ||
1631 | |||
1632 | trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__); | ||
1633 | |||
1634 | ok = (sizeof(h) == | ||
1635 | drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); | ||
1636 | ok = ok && (size == | ||
1637 | drbd_send(mdev, mdev->data.socket, data, size, 0)); | ||
1638 | |||
1639 | drbd_put_data_sock(mdev); | ||
1640 | |||
1641 | return ok; | ||
1642 | } | ||
1643 | |||
1644 | int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) | ||
1645 | { | ||
1646 | struct p_rs_param_89 *p; | ||
1647 | struct socket *sock; | ||
1648 | int size, rv; | ||
1649 | const int apv = mdev->agreed_pro_version; | ||
1650 | |||
1651 | size = apv <= 87 ? sizeof(struct p_rs_param) | ||
1652 | : apv == 88 ? sizeof(struct p_rs_param) | ||
1653 | + strlen(mdev->sync_conf.verify_alg) + 1 | ||
1654 | : /* 89 */ sizeof(struct p_rs_param_89); | ||
1655 | |||
1656 | /* used from admin command context and receiver/worker context. | ||
1657 | * to avoid kmalloc, grab the socket right here, | ||
1658 | * then use the pre-allocated sbuf there */ | ||
1659 | mutex_lock(&mdev->data.mutex); | ||
1660 | sock = mdev->data.socket; | ||
1661 | |||
1662 | if (likely(sock != NULL)) { | ||
1663 | enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; | ||
1664 | |||
1665 | p = &mdev->data.sbuf.rs_param_89; | ||
1666 | |||
1667 | /* initialize verify_alg and csums_alg */ | ||
1668 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
1669 | |||
1670 | p->rate = cpu_to_be32(sc->rate); | ||
1671 | |||
1672 | if (apv >= 88) | ||
1673 | strcpy(p->verify_alg, mdev->sync_conf.verify_alg); | ||
1674 | if (apv >= 89) | ||
1675 | strcpy(p->csums_alg, mdev->sync_conf.csums_alg); | ||
1676 | |||
1677 | rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); | ||
1678 | } else | ||
1679 | rv = 0; /* not ok */ | ||
1680 | |||
1681 | mutex_unlock(&mdev->data.mutex); | ||
1682 | |||
1683 | return rv; | ||
1684 | } | ||
1685 | |||
1686 | int drbd_send_protocol(struct drbd_conf *mdev) | ||
1687 | { | ||
1688 | struct p_protocol *p; | ||
1689 | int size, rv; | ||
1690 | |||
1691 | size = sizeof(struct p_protocol); | ||
1692 | |||
1693 | if (mdev->agreed_pro_version >= 87) | ||
1694 | size += strlen(mdev->net_conf->integrity_alg) + 1; | ||
1695 | |||
1696 | /* we must not recurse into our own queue, | ||
1697 | * as that is blocked during handshake */ | ||
1698 | p = kmalloc(size, GFP_NOIO); | ||
1699 | if (p == NULL) | ||
1700 | return 0; | ||
1701 | |||
1702 | p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); | ||
1703 | p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); | ||
1704 | p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); | ||
1705 | p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); | ||
1706 | p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); | ||
1707 | p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); | ||
1708 | |||
1709 | if (mdev->agreed_pro_version >= 87) | ||
1710 | strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); | ||
1711 | |||
1712 | rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, | ||
1713 | (struct p_header *)p, size); | ||
1714 | kfree(p); | ||
1715 | return rv; | ||
1716 | } | ||
1717 | |||
1718 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) | ||
1719 | { | ||
1720 | struct p_uuids p; | ||
1721 | int i; | ||
1722 | |||
1723 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | ||
1724 | return 1; | ||
1725 | |||
1726 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
1727 | p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; | ||
1728 | |||
1729 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); | ||
1730 | p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); | ||
1731 | uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; | ||
1732 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; | ||
1733 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; | ||
1734 | p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); | ||
1735 | |||
1736 | put_ldev(mdev); | ||
1737 | |||
1738 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, | ||
1739 | (struct p_header *)&p, sizeof(p)); | ||
1740 | } | ||
1741 | |||
1742 | int drbd_send_uuids(struct drbd_conf *mdev) | ||
1743 | { | ||
1744 | return _drbd_send_uuids(mdev, 0); | ||
1745 | } | ||
1746 | |||
1747 | int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev) | ||
1748 | { | ||
1749 | return _drbd_send_uuids(mdev, 8); | ||
1750 | } | ||
1751 | |||
1752 | |||
1753 | int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) | ||
1754 | { | ||
1755 | struct p_rs_uuid p; | ||
1756 | |||
1757 | p.uuid = cpu_to_be64(val); | ||
1758 | |||
1759 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, | ||
1760 | (struct p_header *)&p, sizeof(p)); | ||
1761 | } | ||
1762 | |||
1763 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) | ||
1764 | { | ||
1765 | struct p_sizes p; | ||
1766 | sector_t d_size, u_size; | ||
1767 | int q_order_type; | ||
1768 | int ok; | ||
1769 | |||
1770 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1771 | D_ASSERT(mdev->ldev->backing_bdev); | ||
1772 | d_size = drbd_get_max_capacity(mdev->ldev); | ||
1773 | u_size = mdev->ldev->dc.disk_size; | ||
1774 | q_order_type = drbd_queue_order_type(mdev); | ||
1775 | p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); | ||
1776 | put_ldev(mdev); | ||
1777 | } else { | ||
1778 | d_size = 0; | ||
1779 | u_size = 0; | ||
1780 | q_order_type = QUEUE_ORDERED_NONE; | ||
1781 | } | ||
1782 | |||
1783 | p.d_size = cpu_to_be64(d_size); | ||
1784 | p.u_size = cpu_to_be64(u_size); | ||
1785 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | ||
1786 | p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); | ||
1787 | p.queue_order_type = cpu_to_be32(q_order_type); | ||
1788 | |||
1789 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | ||
1790 | (struct p_header *)&p, sizeof(p)); | ||
1791 | return ok; | ||
1792 | } | ||
1793 | |||
1794 | /** | ||
1795 | * drbd_send_state() - Sends the drbd state to the peer | ||
1796 | * @mdev: DRBD device. | ||
1797 | */ | ||
1798 | int drbd_send_state(struct drbd_conf *mdev) | ||
1799 | { | ||
1800 | struct socket *sock; | ||
1801 | struct p_state p; | ||
1802 | int ok = 0; | ||
1803 | |||
1804 | /* Grab state lock so we wont send state if we're in the middle | ||
1805 | * of a cluster wide state change on another thread */ | ||
1806 | drbd_state_lock(mdev); | ||
1807 | |||
1808 | mutex_lock(&mdev->data.mutex); | ||
1809 | |||
1810 | p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | ||
1811 | sock = mdev->data.socket; | ||
1812 | |||
1813 | if (likely(sock != NULL)) { | ||
1814 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | ||
1815 | (struct p_header *)&p, sizeof(p), 0); | ||
1816 | } | ||
1817 | |||
1818 | mutex_unlock(&mdev->data.mutex); | ||
1819 | |||
1820 | drbd_state_unlock(mdev); | ||
1821 | return ok; | ||
1822 | } | ||
1823 | |||
1824 | int drbd_send_state_req(struct drbd_conf *mdev, | ||
1825 | union drbd_state mask, union drbd_state val) | ||
1826 | { | ||
1827 | struct p_req_state p; | ||
1828 | |||
1829 | p.mask = cpu_to_be32(mask.i); | ||
1830 | p.val = cpu_to_be32(val.i); | ||
1831 | |||
1832 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, | ||
1833 | (struct p_header *)&p, sizeof(p)); | ||
1834 | } | ||
1835 | |||
1836 | int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) | ||
1837 | { | ||
1838 | struct p_req_state_reply p; | ||
1839 | |||
1840 | p.retcode = cpu_to_be32(retcode); | ||
1841 | |||
1842 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, | ||
1843 | (struct p_header *)&p, sizeof(p)); | ||
1844 | } | ||
1845 | |||
/*
 * Encode as many bitmap bits as fit into one packet, as a sequence of
 * alternating run lengths written with vli_encode_bits() into p->code.
 *
 * @mdev: DRBD device whose bitmap is scanned
 * @p:    compressed bitmap packet to fill (code bytes, start value, pad bits)
 * @c:    transfer context; c->bit_offset is where scanning starts
 *
 * Returns
 *   > 0  number of code bytes used; c->bit_offset and c->word_offset have
 *        been advanced past the encoded bits
 *   0    nothing was encoded: RLE not enabled or agreed protocol version
 *        below 90, nothing left to do, an encoding error, or the result
 *        would not be smaller than the plain bits (offsets are rewound
 *        in that last case)
 *   -1   a zero run length was found after the first iteration
 */
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
	struct p_compressed_bm *p,
	struct bm_xfer_ctx *c)
{
	struct bitstream bs;
	unsigned long plain_bits;
	unsigned long tmp;
	unsigned long rl;
	unsigned len;
	unsigned toggle;
	int bits;

	/* may we use this feature? */
	if ((mdev->sync_conf.use_rle == 0) ||
		(mdev->agreed_pro_version < 90))
			return 0;

	if (c->bit_offset >= c->bm_bits)
		return 0; /* nothing to do. */

	/* use at most this many bytes */
	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
	/* plain bits covered in this code string */
	plain_bits = 0;

	/* p->encoding & 0x80 stores whether the first run length is set.
	 * bit offset is implicit.
	 * start with toggle == 2 to be able to tell the first iteration */
	toggle = 2;

	/* see how many plain bits we can stuff into one packet
	 * using RLE and VLI. */
	do {
		/* toggle == 0: measure a run of set bits (search the next
		 * zero bit); otherwise measure a run of clear bits */
		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
				    : _drbd_bm_find_next(mdev, c->bit_offset);
		/* no such bit found: the run extends to the end of the bitmap */
		if (tmp == -1UL)
			tmp = c->bm_bits;
		rl = tmp - c->bit_offset;

		if (toggle == 2) { /* first iteration */
			if (rl == 0) {
				/* the first checked bit was set,
				 * store start value, */
				DCBP_set_start(p, 1);
				/* but skip encoding of zero run length */
				toggle = !toggle; /* !2 == 0 */
				/* in a do-while, continue jumps to the loop
				 * condition; bit_offset is unchanged */
				continue;
			}
			DCBP_set_start(p, 0);
		}

		/* paranoia: catch zero runlength.
		 * can only happen if bitmap is modified while we scan it. */
		if (rl == 0) {
			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
			    "t:%u bo:%lu\n", toggle, c->bit_offset);
			return -1;
		}

		bits = vli_encode_bits(&bs, rl);
		if (bits == -ENOBUFS) /* buffer full */
			break;
		if (bits <= 0) {
			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
			return 0;
		}

		/* this run is in the buffer; account for it and move on */
		toggle = !toggle;
		plain_bits += rl;
		c->bit_offset = tmp;
	} while (c->bit_offset < c->bm_bits);

	/* bytes used: full bytes plus one for a partially filled byte */
	len = bs.cur.b - p->code + !!bs.cur.bit;

	if (plain_bits < (len << 3)) {
		/* incompressible with this method.
		 * we need to rewind both word and bit position. */
		c->bit_offset -= plain_bits;
		bm_xfer_ctx_bit_to_word_offset(c);
		c->bit_offset = c->word_offset * BITS_PER_LONG;
		return 0;
	}

	/* RLE + VLI was able to compress it just fine.
	 * update c->word_offset. */
	bm_xfer_ctx_bit_to_word_offset(c);

	/* store pad_bits */
	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);

	return len;
}
1939 | |||
/*
 * Send the next chunk of the bitmap, compressed if fill_bitmap_rle_bits()
 * produced something, as plain bitmap words otherwise.
 *
 * @mdev: DRBD device
 * @h:    packet buffer, used for the compressed as well as the plain packet
 * @c:    transfer context (offsets and statistics), updated here
 *
 * Sends on mdev->data.socket via _drbd_send_cmd().
 *
 * Returns
 *   OK     a chunk was sent and more remains
 *   DONE   the send succeeded and nothing remains; transfer statistics
 *          are logged
 *   FAILED the encoder reported an error, or the send failed
 */
enum { OK, FAILED, DONE }
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
	struct p_header *h, struct bm_xfer_ctx *c)
{
	struct p_compressed_bm *p = (void*)h;
	unsigned long num_words;
	int len;
	int ok;

	len = fill_bitmap_rle_bits(mdev, p, c);

	if (len < 0)
		return FAILED;

	if (len) {
		DCBP_set_code(p, RLE_VLI_Bits);
		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
			sizeof(*p) + len, 0);

		/* index 0 of the statistics counts compressed packets */
		c->packets[0]++;
		c->bytes[0] += sizeof(*p) + len;

		if (c->bit_offset >= c->bm_bits)
			len = 0; /* DONE */
	} else {
		/* was not compressible.
		 * send a buffer full of plain text bits instead. */
		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
		len = num_words * sizeof(long);
		if (len)
			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
		/* with no words left this sends a header-only P_BITMAP
		 * packet, and len == 0 turns into DONE below */
		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
				   h, sizeof(struct p_header) + len, 0);
		c->word_offset += num_words;
		c->bit_offset = c->word_offset * BITS_PER_LONG;

		/* index 1 of the statistics counts plain packets */
		c->packets[1]++;
		c->bytes[1] += sizeof(struct p_header) + len;

		if (c->bit_offset > c->bm_bits)
			c->bit_offset = c->bm_bits;
	}
	/* from here on, ok holds one of OK, FAILED, DONE */
	ok = ok ? ((len == 0) ? DONE : OK) : FAILED;

	if (ok == DONE)
		INFO_bm_xfer_stats(mdev, "send", c);
	return ok;
}
1988 | |||
1989 | /* See the comment at receive_bitmap() */ | ||
1990 | int _drbd_send_bitmap(struct drbd_conf *mdev) | ||
1991 | { | ||
1992 | struct bm_xfer_ctx c; | ||
1993 | struct p_header *p; | ||
1994 | int ret; | ||
1995 | |||
1996 | ERR_IF(!mdev->bitmap) return FALSE; | ||
1997 | |||
1998 | /* maybe we should use some per thread scratch page, | ||
1999 | * and allocate that during initial device creation? */ | ||
2000 | p = (struct p_header *) __get_free_page(GFP_NOIO); | ||
2001 | if (!p) { | ||
2002 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
2003 | return FALSE; | ||
2004 | } | ||
2005 | |||
2006 | if (get_ldev(mdev)) { | ||
2007 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | ||
2008 | dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n"); | ||
2009 | drbd_bm_set_all(mdev); | ||
2010 | if (drbd_bm_write(mdev)) { | ||
2011 | /* write_bm did fail! Leave full sync flag set in Meta P_DATA | ||
2012 | * but otherwise process as per normal - need to tell other | ||
2013 | * side that a full resync is required! */ | ||
2014 | dev_err(DEV, "Failed to write bitmap to disk!\n"); | ||
2015 | } else { | ||
2016 | drbd_md_clear_flag(mdev, MDF_FULL_SYNC); | ||
2017 | drbd_md_sync(mdev); | ||
2018 | } | ||
2019 | } | ||
2020 | put_ldev(mdev); | ||
2021 | } | ||
2022 | |||
2023 | c = (struct bm_xfer_ctx) { | ||
2024 | .bm_bits = drbd_bm_bits(mdev), | ||
2025 | .bm_words = drbd_bm_words(mdev), | ||
2026 | }; | ||
2027 | |||
2028 | do { | ||
2029 | ret = send_bitmap_rle_or_plain(mdev, p, &c); | ||
2030 | } while (ret == OK); | ||
2031 | |||
2032 | free_page((unsigned long) p); | ||
2033 | return (ret == DONE); | ||
2034 | } | ||
2035 | |||
/*
 * Send the bitmap to the peer, holding the data socket for the duration.
 *
 * Returns 0 on success, 1 if _drbd_send_bitmap() failed, and -1 if the
 * data socket could not be obtained.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int sent_ok;

	if (!drbd_get_data_sock(mdev))
		return -1;

	sent_ok = _drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	/* callers get an error indication, not a success flag */
	return !sent_ok;
}
2046 | |||
2047 | int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | ||
2048 | { | ||
2049 | int ok; | ||
2050 | struct p_barrier_ack p; | ||
2051 | |||
2052 | p.barrier = barrier_nr; | ||
2053 | p.set_size = cpu_to_be32(set_size); | ||
2054 | |||
2055 | if (mdev->state.conn < C_CONNECTED) | ||
2056 | return FALSE; | ||
2057 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, | ||
2058 | (struct p_header *)&p, sizeof(p)); | ||
2059 | return ok; | ||
2060 | } | ||
2061 | |||
2062 | /** | ||
2063 | * _drbd_send_ack() - Sends an ack packet | ||
2064 | * @mdev: DRBD device. | ||
2065 | * @cmd: Packet command code. | ||
2066 | * @sector: sector, needs to be in big endian byte order | ||
2067 | * @blksize: size in byte, needs to be in big endian byte order | ||
2068 | * @block_id: Id, big endian byte order | ||
2069 | */ | ||
2070 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2071 | u64 sector, | ||
2072 | u32 blksize, | ||
2073 | u64 block_id) | ||
2074 | { | ||
2075 | int ok; | ||
2076 | struct p_block_ack p; | ||
2077 | |||
2078 | p.sector = sector; | ||
2079 | p.block_id = block_id; | ||
2080 | p.blksize = blksize; | ||
2081 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2082 | |||
2083 | if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) | ||
2084 | return FALSE; | ||
2085 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, | ||
2086 | (struct p_header *)&p, sizeof(p)); | ||
2087 | return ok; | ||
2088 | } | ||
2089 | |||
2090 | int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2091 | struct p_data *dp) | ||
2092 | { | ||
2093 | const int header_size = sizeof(struct p_data) | ||
2094 | - sizeof(struct p_header); | ||
2095 | int data_size = ((struct p_header *)dp)->length - header_size; | ||
2096 | |||
2097 | return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), | ||
2098 | dp->block_id); | ||
2099 | } | ||
2100 | |||
2101 | int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2102 | struct p_block_req *rp) | ||
2103 | { | ||
2104 | return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); | ||
2105 | } | ||
2106 | |||
2107 | /** | ||
2108 | * drbd_send_ack() - Sends an ack packet | ||
2109 | * @mdev: DRBD device. | ||
2110 | * @cmd: Packet command code. | ||
2111 | * @e: Epoch entry. | ||
2112 | */ | ||
2113 | int drbd_send_ack(struct drbd_conf *mdev, | ||
2114 | enum drbd_packets cmd, struct drbd_epoch_entry *e) | ||
2115 | { | ||
2116 | return _drbd_send_ack(mdev, cmd, | ||
2117 | cpu_to_be64(e->sector), | ||
2118 | cpu_to_be32(e->size), | ||
2119 | e->block_id); | ||
2120 | } | ||
2121 | |||
2122 | /* This function misuses the block_id field to signal if the blocks | ||
2123 | * are is sync or not. */ | ||
2124 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2125 | sector_t sector, int blksize, u64 block_id) | ||
2126 | { | ||
2127 | return _drbd_send_ack(mdev, cmd, | ||
2128 | cpu_to_be64(sector), | ||
2129 | cpu_to_be32(blksize), | ||
2130 | cpu_to_be64(block_id)); | ||
2131 | } | ||
2132 | |||
2133 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | ||
2134 | sector_t sector, int size, u64 block_id) | ||
2135 | { | ||
2136 | int ok; | ||
2137 | struct p_block_req p; | ||
2138 | |||
2139 | p.sector = cpu_to_be64(sector); | ||
2140 | p.block_id = block_id; | ||
2141 | p.blksize = cpu_to_be32(size); | ||
2142 | |||
2143 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, | ||
2144 | (struct p_header *)&p, sizeof(p)); | ||
2145 | return ok; | ||
2146 | } | ||
2147 | |||
2148 | int drbd_send_drequest_csum(struct drbd_conf *mdev, | ||
2149 | sector_t sector, int size, | ||
2150 | void *digest, int digest_size, | ||
2151 | enum drbd_packets cmd) | ||
2152 | { | ||
2153 | int ok; | ||
2154 | struct p_block_req p; | ||
2155 | |||
2156 | p.sector = cpu_to_be64(sector); | ||
2157 | p.block_id = BE_DRBD_MAGIC + 0xbeef; | ||
2158 | p.blksize = cpu_to_be32(size); | ||
2159 | |||
2160 | p.head.magic = BE_DRBD_MAGIC; | ||
2161 | p.head.command = cpu_to_be16(cmd); | ||
2162 | p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); | ||
2163 | |||
2164 | mutex_lock(&mdev->data.mutex); | ||
2165 | |||
2166 | ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); | ||
2167 | ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); | ||
2168 | |||
2169 | mutex_unlock(&mdev->data.mutex); | ||
2170 | |||
2171 | return ok; | ||
2172 | } | ||
2173 | |||
2174 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | ||
2175 | { | ||
2176 | int ok; | ||
2177 | struct p_block_req p; | ||
2178 | |||
2179 | p.sector = cpu_to_be64(sector); | ||
2180 | p.block_id = BE_DRBD_MAGIC + 0xbabe; | ||
2181 | p.blksize = cpu_to_be32(size); | ||
2182 | |||
2183 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, | ||
2184 | (struct p_header *)&p, sizeof(p)); | ||
2185 | return ok; | ||
2186 | } | ||
2187 | |||
2188 | /* called on sndtimeo | ||
2189 | * returns FALSE if we should retry, | ||
2190 | * TRUE if we think connection is dead | ||
2191 | */ | ||
2192 | static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) | ||
2193 | { | ||
2194 | int drop_it; | ||
2195 | /* long elapsed = (long)(jiffies - mdev->last_received); */ | ||
2196 | |||
2197 | drop_it = mdev->meta.socket == sock | ||
2198 | || !mdev->asender.task | ||
2199 | || get_t_state(&mdev->asender) != Running | ||
2200 | || mdev->state.conn < C_CONNECTED; | ||
2201 | |||
2202 | if (drop_it) | ||
2203 | return TRUE; | ||
2204 | |||
2205 | drop_it = !--mdev->ko_count; | ||
2206 | if (!drop_it) { | ||
2207 | dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", | ||
2208 | current->comm, current->pid, mdev->ko_count); | ||
2209 | request_ping(mdev); | ||
2210 | } | ||
2211 | |||
2212 | return drop_it; /* && (mdev->state == R_PRIMARY) */; | ||
2213 | } | ||
2214 | |||
2215 | /* The idea of sendpage seems to be to put some kind of reference | ||
2216 | * to the page into the skb, and to hand it over to the NIC. In | ||
2217 | * this process get_page() gets called. | ||
2218 | * | ||
2219 | * As soon as the page was really sent over the network put_page() | ||
2220 | * gets called by some part of the network layer. [ NIC driver? ] | ||
2221 | * | ||
2222 | * [ get_page() / put_page() increment/decrement the count. If count | ||
2223 | * reaches 0 the page will be freed. ] | ||
2224 | * | ||
2225 | * This works nicely with pages from FSs. | ||
2226 | * But this means that in protocol A we might signal IO completion too early! | ||
2227 | * | ||
2228 | * In order not to corrupt data during a resync we must make sure | ||
2229 | * that we do not reuse our own buffer pages (EEs) to early, therefore | ||
2230 | * we have the net_ee list. | ||
2231 | * | ||
2232 | * XFS seems to have problems, still, it submits pages with page_count == 0! | ||
2233 | * As a workaround, we disable sendpage on pages | ||
2234 | * with page_count == 0 or PageSlab. | ||
2235 | */ | ||
2236 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, | ||
2237 | int offset, size_t size) | ||
2238 | { | ||
2239 | int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); | ||
2240 | kunmap(page); | ||
2241 | if (sent == size) | ||
2242 | mdev->send_cnt += size>>9; | ||
2243 | return sent == size; | ||
2244 | } | ||
2245 | |||
2246 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | ||
2247 | int offset, size_t size) | ||
2248 | { | ||
2249 | mm_segment_t oldfs = get_fs(); | ||
2250 | int sent, ok; | ||
2251 | int len = size; | ||
2252 | |||
2253 | /* e.g. XFS meta- & log-data is in slab pages, which have a | ||
2254 | * page_count of 0 and/or have PageSlab() set. | ||
2255 | * we cannot use send_page for those, as that does get_page(); | ||
2256 | * put_page(); and would cause either a VM_BUG directly, or | ||
2257 | * __page_cache_release a page that would actually still be referenced | ||
2258 | * by someone, leading to some obscure delayed Oops somewhere else. */ | ||
2259 | if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) | ||
2260 | return _drbd_no_send_page(mdev, page, offset, size); | ||
2261 | |||
2262 | drbd_update_congested(mdev); | ||
2263 | set_fs(KERNEL_DS); | ||
2264 | do { | ||
2265 | sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, | ||
2266 | offset, len, | ||
2267 | MSG_NOSIGNAL); | ||
2268 | if (sent == -EAGAIN) { | ||
2269 | if (we_should_drop_the_connection(mdev, | ||
2270 | mdev->data.socket)) | ||
2271 | break; | ||
2272 | else | ||
2273 | continue; | ||
2274 | } | ||
2275 | if (sent <= 0) { | ||
2276 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", | ||
2277 | __func__, (int)size, len, sent); | ||
2278 | break; | ||
2279 | } | ||
2280 | len -= sent; | ||
2281 | offset += sent; | ||
2282 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); | ||
2283 | set_fs(oldfs); | ||
2284 | clear_bit(NET_CONGESTED, &mdev->flags); | ||
2285 | |||
2286 | ok = (len == 0); | ||
2287 | if (likely(ok)) | ||
2288 | mdev->send_cnt += size>>9; | ||
2289 | return ok; | ||
2290 | } | ||
2291 | |||
2292 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | ||
2293 | { | ||
2294 | struct bio_vec *bvec; | ||
2295 | int i; | ||
2296 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
2297 | if (!_drbd_no_send_page(mdev, bvec->bv_page, | ||
2298 | bvec->bv_offset, bvec->bv_len)) | ||
2299 | return 0; | ||
2300 | } | ||
2301 | return 1; | ||
2302 | } | ||
2303 | |||
2304 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | ||
2305 | { | ||
2306 | struct bio_vec *bvec; | ||
2307 | int i; | ||
2308 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
2309 | if (!_drbd_send_page(mdev, bvec->bv_page, | ||
2310 | bvec->bv_offset, bvec->bv_len)) | ||
2311 | return 0; | ||
2312 | } | ||
2313 | |||
2314 | return 1; | ||
2315 | } | ||
2316 | |||
2317 | /* Used to send write requests | ||
2318 | * R_PRIMARY -> Peer (P_DATA) | ||
2319 | */ | ||
2320 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | ||
2321 | { | ||
2322 | int ok = 1; | ||
2323 | struct p_data p; | ||
2324 | unsigned int dp_flags = 0; | ||
2325 | void *dgb; | ||
2326 | int dgs; | ||
2327 | |||
2328 | if (!drbd_get_data_sock(mdev)) | ||
2329 | return 0; | ||
2330 | |||
2331 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2332 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2333 | |||
2334 | p.head.magic = BE_DRBD_MAGIC; | ||
2335 | p.head.command = cpu_to_be16(P_DATA); | ||
2336 | p.head.length = | ||
2337 | cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); | ||
2338 | |||
2339 | p.sector = cpu_to_be64(req->sector); | ||
2340 | p.block_id = (unsigned long)req; | ||
2341 | p.seq_num = cpu_to_be32(req->seq_num = | ||
2342 | atomic_add_return(1, &mdev->packet_seq)); | ||
2343 | dp_flags = 0; | ||
2344 | |||
2345 | /* NOTE: no need to check if barriers supported here as we would | ||
2346 | * not pass the test in make_request_common in that case | ||
2347 | */ | ||
2348 | if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { | ||
2349 | dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); | ||
2350 | /* dp_flags |= DP_HARDBARRIER; */ | ||
2351 | } | ||
2352 | if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) | ||
2353 | dp_flags |= DP_RW_SYNC; | ||
2354 | /* for now handle SYNCIO and UNPLUG | ||
2355 | * as if they still were one and the same flag */ | ||
2356 | if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) | ||
2357 | dp_flags |= DP_RW_SYNC; | ||
2358 | if (mdev->state.conn >= C_SYNC_SOURCE && | ||
2359 | mdev->state.conn <= C_PAUSED_SYNC_T) | ||
2360 | dp_flags |= DP_MAY_SET_IN_SYNC; | ||
2361 | |||
2362 | p.dp_flags = cpu_to_be32(dp_flags); | ||
2363 | trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); | ||
2364 | set_bit(UNPLUG_REMOTE, &mdev->flags); | ||
2365 | ok = (sizeof(p) == | ||
2366 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); | ||
2367 | if (ok && dgs) { | ||
2368 | dgb = mdev->int_dig_out; | ||
2369 | drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | ||
2370 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | ||
2371 | } | ||
2372 | if (ok) { | ||
2373 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A) | ||
2374 | ok = _drbd_send_bio(mdev, req->master_bio); | ||
2375 | else | ||
2376 | ok = _drbd_send_zc_bio(mdev, req->master_bio); | ||
2377 | } | ||
2378 | |||
2379 | drbd_put_data_sock(mdev); | ||
2380 | return ok; | ||
2381 | } | ||
2382 | |||
2383 | /* answer packet, used to send data back for read requests: | ||
2384 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) | ||
2385 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) | ||
2386 | */ | ||
2387 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
2388 | struct drbd_epoch_entry *e) | ||
2389 | { | ||
2390 | int ok; | ||
2391 | struct p_data p; | ||
2392 | void *dgb; | ||
2393 | int dgs; | ||
2394 | |||
2395 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2396 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2397 | |||
2398 | p.head.magic = BE_DRBD_MAGIC; | ||
2399 | p.head.command = cpu_to_be16(cmd); | ||
2400 | p.head.length = | ||
2401 | cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); | ||
2402 | |||
2403 | p.sector = cpu_to_be64(e->sector); | ||
2404 | p.block_id = e->block_id; | ||
2405 | /* p.seq_num = 0; No sequence numbers here.. */ | ||
2406 | |||
2407 | /* Only called by our kernel thread. | ||
2408 | * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL | ||
2409 | * in response to admin command or module unload. | ||
2410 | */ | ||
2411 | if (!drbd_get_data_sock(mdev)) | ||
2412 | return 0; | ||
2413 | |||
2414 | trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); | ||
2415 | ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, | ||
2416 | sizeof(p), MSG_MORE); | ||
2417 | if (ok && dgs) { | ||
2418 | dgb = mdev->int_dig_out; | ||
2419 | drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); | ||
2420 | ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); | ||
2421 | } | ||
2422 | if (ok) | ||
2423 | ok = _drbd_send_zc_bio(mdev, e->private_bio); | ||
2424 | |||
2425 | drbd_put_data_sock(mdev); | ||
2426 | return ok; | ||
2427 | } | ||
2428 | |||
2429 | /* | ||
2430 | drbd_send distinguishes two cases: | ||
2431 | |||
2432 | Packets sent via the data socket "sock" | ||
2433 | and packets sent via the meta data socket "msock" | ||
2434 | |||
2435 | sock msock | ||
2436 | -----------------+-------------------------+------------------------------ | ||
2437 | timeout conf.timeout / 2 conf.timeout / 2 | ||
2438 | timeout action send a ping via msock Abort communication | ||
2439 | and close all sockets | ||
2440 | */ | ||
2441 | |||
2442 | /* | ||
2443 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! | ||
2444 | */ | ||
2445 | int drbd_send(struct drbd_conf *mdev, struct socket *sock, | ||
2446 | void *buf, size_t size, unsigned msg_flags) | ||
2447 | { | ||
2448 | struct kvec iov; | ||
2449 | struct msghdr msg; | ||
2450 | int rv, sent = 0; | ||
2451 | |||
2452 | if (!sock) | ||
2453 | return -1000; | ||
2454 | |||
2455 | /* THINK if (signal_pending) return ... ? */ | ||
2456 | |||
2457 | iov.iov_base = buf; | ||
2458 | iov.iov_len = size; | ||
2459 | |||
2460 | msg.msg_name = NULL; | ||
2461 | msg.msg_namelen = 0; | ||
2462 | msg.msg_control = NULL; | ||
2463 | msg.msg_controllen = 0; | ||
2464 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; | ||
2465 | |||
2466 | if (sock == mdev->data.socket) { | ||
2467 | mdev->ko_count = mdev->net_conf->ko_count; | ||
2468 | drbd_update_congested(mdev); | ||
2469 | } | ||
2470 | do { | ||
2471 | /* STRANGE | ||
2472 | * tcp_sendmsg does _not_ use its size parameter at all ? | ||
2473 | * | ||
2474 | * -EAGAIN on timeout, -EINTR on signal. | ||
2475 | */ | ||
2476 | /* THINK | ||
2477 | * do we need to block DRBD_SIG if sock == &meta.socket ?? | ||
2478 | * otherwise wake_asender() might interrupt some send_*Ack ! | ||
2479 | */ | ||
2480 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); | ||
2481 | if (rv == -EAGAIN) { | ||
2482 | if (we_should_drop_the_connection(mdev, sock)) | ||
2483 | break; | ||
2484 | else | ||
2485 | continue; | ||
2486 | } | ||
2487 | D_ASSERT(rv != 0); | ||
2488 | if (rv == -EINTR) { | ||
2489 | flush_signals(current); | ||
2490 | rv = 0; | ||
2491 | } | ||
2492 | if (rv < 0) | ||
2493 | break; | ||
2494 | sent += rv; | ||
2495 | iov.iov_base += rv; | ||
2496 | iov.iov_len -= rv; | ||
2497 | } while (sent < size); | ||
2498 | |||
2499 | if (sock == mdev->data.socket) | ||
2500 | clear_bit(NET_CONGESTED, &mdev->flags); | ||
2501 | |||
2502 | if (rv <= 0) { | ||
2503 | if (rv != -EAGAIN) { | ||
2504 | dev_err(DEV, "%s_sendmsg returned %d\n", | ||
2505 | sock == mdev->meta.socket ? "msock" : "sock", | ||
2506 | rv); | ||
2507 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | ||
2508 | } else | ||
2509 | drbd_force_state(mdev, NS(conn, C_TIMEOUT)); | ||
2510 | } | ||
2511 | |||
2512 | return sent; | ||
2513 | } | ||
2514 | |||
2515 | static int drbd_open(struct block_device *bdev, fmode_t mode) | ||
2516 | { | ||
2517 | struct drbd_conf *mdev = bdev->bd_disk->private_data; | ||
2518 | unsigned long flags; | ||
2519 | int rv = 0; | ||
2520 | |||
2521 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
2522 | /* to have a stable mdev->state.role | ||
2523 | * and no race with updating open_cnt */ | ||
2524 | |||
2525 | if (mdev->state.role != R_PRIMARY) { | ||
2526 | if (mode & FMODE_WRITE) | ||
2527 | rv = -EROFS; | ||
2528 | else if (!allow_oos) | ||
2529 | rv = -EMEDIUMTYPE; | ||
2530 | } | ||
2531 | |||
2532 | if (!rv) | ||
2533 | mdev->open_cnt++; | ||
2534 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
2535 | |||
2536 | return rv; | ||
2537 | } | ||
2538 | |||
2539 | static int drbd_release(struct gendisk *gd, fmode_t mode) | ||
2540 | { | ||
2541 | struct drbd_conf *mdev = gd->private_data; | ||
2542 | mdev->open_cnt--; | ||
2543 | return 0; | ||
2544 | } | ||
2545 | |||
2546 | static void drbd_unplug_fn(struct request_queue *q) | ||
2547 | { | ||
2548 | struct drbd_conf *mdev = q->queuedata; | ||
2549 | |||
2550 | trace_drbd_unplug(mdev, "got unplugged"); | ||
2551 | |||
2552 | /* unplug FIRST */ | ||
2553 | spin_lock_irq(q->queue_lock); | ||
2554 | blk_remove_plug(q); | ||
2555 | spin_unlock_irq(q->queue_lock); | ||
2556 | |||
2557 | /* only if connected */ | ||
2558 | spin_lock_irq(&mdev->req_lock); | ||
2559 | if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { | ||
2560 | D_ASSERT(mdev->state.role == R_PRIMARY); | ||
2561 | if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { | ||
2562 | /* add to the data.work queue, | ||
2563 | * unless already queued. | ||
2564 | * XXX this might be a good addition to drbd_queue_work | ||
2565 | * anyways, to detect "double queuing" ... */ | ||
2566 | if (list_empty(&mdev->unplug_work.list)) | ||
2567 | drbd_queue_work(&mdev->data.work, | ||
2568 | &mdev->unplug_work); | ||
2569 | } | ||
2570 | } | ||
2571 | spin_unlock_irq(&mdev->req_lock); | ||
2572 | |||
2573 | if (mdev->state.disk >= D_INCONSISTENT) | ||
2574 | drbd_kick_lo(mdev); | ||
2575 | } | ||
2576 | |||
2577 | static void drbd_set_defaults(struct drbd_conf *mdev) | ||
2578 | { | ||
2579 | mdev->sync_conf.after = DRBD_AFTER_DEF; | ||
2580 | mdev->sync_conf.rate = DRBD_RATE_DEF; | ||
2581 | mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; | ||
2582 | mdev->state = (union drbd_state) { | ||
2583 | { .role = R_SECONDARY, | ||
2584 | .peer = R_UNKNOWN, | ||
2585 | .conn = C_STANDALONE, | ||
2586 | .disk = D_DISKLESS, | ||
2587 | .pdsk = D_UNKNOWN, | ||
2588 | .susp = 0 | ||
2589 | } }; | ||
2590 | } | ||
2591 | |||
2592 | void drbd_init_set_defaults(struct drbd_conf *mdev) | ||
2593 | { | ||
2594 | /* the memset(,0,) did most of this. | ||
2595 | * note: only assignments, no allocation in here */ | ||
2596 | |||
2597 | drbd_set_defaults(mdev); | ||
2598 | |||
2599 | /* for now, we do NOT yet support it, | ||
2600 | * even though we start some framework | ||
2601 | * to eventually support barriers */ | ||
2602 | set_bit(NO_BARRIER_SUPP, &mdev->flags); | ||
2603 | |||
2604 | atomic_set(&mdev->ap_bio_cnt, 0); | ||
2605 | atomic_set(&mdev->ap_pending_cnt, 0); | ||
2606 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
2607 | atomic_set(&mdev->unacked_cnt, 0); | ||
2608 | atomic_set(&mdev->local_cnt, 0); | ||
2609 | atomic_set(&mdev->net_cnt, 0); | ||
2610 | atomic_set(&mdev->packet_seq, 0); | ||
2611 | atomic_set(&mdev->pp_in_use, 0); | ||
2612 | |||
2613 | mutex_init(&mdev->md_io_mutex); | ||
2614 | mutex_init(&mdev->data.mutex); | ||
2615 | mutex_init(&mdev->meta.mutex); | ||
2616 | sema_init(&mdev->data.work.s, 0); | ||
2617 | sema_init(&mdev->meta.work.s, 0); | ||
2618 | mutex_init(&mdev->state_mutex); | ||
2619 | |||
2620 | spin_lock_init(&mdev->data.work.q_lock); | ||
2621 | spin_lock_init(&mdev->meta.work.q_lock); | ||
2622 | |||
2623 | spin_lock_init(&mdev->al_lock); | ||
2624 | spin_lock_init(&mdev->req_lock); | ||
2625 | spin_lock_init(&mdev->peer_seq_lock); | ||
2626 | spin_lock_init(&mdev->epoch_lock); | ||
2627 | |||
2628 | INIT_LIST_HEAD(&mdev->active_ee); | ||
2629 | INIT_LIST_HEAD(&mdev->sync_ee); | ||
2630 | INIT_LIST_HEAD(&mdev->done_ee); | ||
2631 | INIT_LIST_HEAD(&mdev->read_ee); | ||
2632 | INIT_LIST_HEAD(&mdev->net_ee); | ||
2633 | INIT_LIST_HEAD(&mdev->resync_reads); | ||
2634 | INIT_LIST_HEAD(&mdev->data.work.q); | ||
2635 | INIT_LIST_HEAD(&mdev->meta.work.q); | ||
2636 | INIT_LIST_HEAD(&mdev->resync_work.list); | ||
2637 | INIT_LIST_HEAD(&mdev->unplug_work.list); | ||
2638 | INIT_LIST_HEAD(&mdev->md_sync_work.list); | ||
2639 | INIT_LIST_HEAD(&mdev->bm_io_work.w.list); | ||
2640 | mdev->resync_work.cb = w_resync_inactive; | ||
2641 | mdev->unplug_work.cb = w_send_write_hint; | ||
2642 | mdev->md_sync_work.cb = w_md_sync; | ||
2643 | mdev->bm_io_work.w.cb = w_bitmap_io; | ||
2644 | init_timer(&mdev->resync_timer); | ||
2645 | init_timer(&mdev->md_sync_timer); | ||
2646 | mdev->resync_timer.function = resync_timer_fn; | ||
2647 | mdev->resync_timer.data = (unsigned long) mdev; | ||
2648 | mdev->md_sync_timer.function = md_sync_timer_fn; | ||
2649 | mdev->md_sync_timer.data = (unsigned long) mdev; | ||
2650 | |||
2651 | init_waitqueue_head(&mdev->misc_wait); | ||
2652 | init_waitqueue_head(&mdev->state_wait); | ||
2653 | init_waitqueue_head(&mdev->ee_wait); | ||
2654 | init_waitqueue_head(&mdev->al_wait); | ||
2655 | init_waitqueue_head(&mdev->seq_wait); | ||
2656 | |||
2657 | drbd_thread_init(mdev, &mdev->receiver, drbdd_init); | ||
2658 | drbd_thread_init(mdev, &mdev->worker, drbd_worker); | ||
2659 | drbd_thread_init(mdev, &mdev->asender, drbd_asender); | ||
2660 | |||
2661 | mdev->agreed_pro_version = PRO_VERSION_MAX; | ||
2662 | mdev->write_ordering = WO_bio_barrier; | ||
2663 | mdev->resync_wenr = LC_FREE; | ||
2664 | } | ||
2665 | |||
2666 | void drbd_mdev_cleanup(struct drbd_conf *mdev) | ||
2667 | { | ||
2668 | if (mdev->receiver.t_state != None) | ||
2669 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", | ||
2670 | mdev->receiver.t_state); | ||
2671 | |||
2672 | /* no need to lock it, I'm the only thread alive */ | ||
2673 | if (atomic_read(&mdev->current_epoch->epoch_size) != 0) | ||
2674 | dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); | ||
2675 | mdev->al_writ_cnt = | ||
2676 | mdev->bm_writ_cnt = | ||
2677 | mdev->read_cnt = | ||
2678 | mdev->recv_cnt = | ||
2679 | mdev->send_cnt = | ||
2680 | mdev->writ_cnt = | ||
2681 | mdev->p_size = | ||
2682 | mdev->rs_start = | ||
2683 | mdev->rs_total = | ||
2684 | mdev->rs_failed = | ||
2685 | mdev->rs_mark_left = | ||
2686 | mdev->rs_mark_time = 0; | ||
2687 | D_ASSERT(mdev->net_conf == NULL); | ||
2688 | |||
2689 | drbd_set_my_capacity(mdev, 0); | ||
2690 | if (mdev->bitmap) { | ||
2691 | /* maybe never allocated. */ | ||
2692 | drbd_bm_resize(mdev, 0); | ||
2693 | drbd_bm_cleanup(mdev); | ||
2694 | } | ||
2695 | |||
2696 | drbd_free_resources(mdev); | ||
2697 | |||
2698 | /* | ||
2699 | * currently we drbd_init_ee only on module load, so | ||
2700 | * we may do drbd_release_ee only on module unload! | ||
2701 | */ | ||
2702 | D_ASSERT(list_empty(&mdev->active_ee)); | ||
2703 | D_ASSERT(list_empty(&mdev->sync_ee)); | ||
2704 | D_ASSERT(list_empty(&mdev->done_ee)); | ||
2705 | D_ASSERT(list_empty(&mdev->read_ee)); | ||
2706 | D_ASSERT(list_empty(&mdev->net_ee)); | ||
2707 | D_ASSERT(list_empty(&mdev->resync_reads)); | ||
2708 | D_ASSERT(list_empty(&mdev->data.work.q)); | ||
2709 | D_ASSERT(list_empty(&mdev->meta.work.q)); | ||
2710 | D_ASSERT(list_empty(&mdev->resync_work.list)); | ||
2711 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | ||
2712 | |||
2713 | } | ||
2714 | |||
2715 | |||
2716 | static void drbd_destroy_mempools(void) | ||
2717 | { | ||
2718 | struct page *page; | ||
2719 | |||
2720 | while (drbd_pp_pool) { | ||
2721 | page = drbd_pp_pool; | ||
2722 | drbd_pp_pool = (struct page *)page_private(page); | ||
2723 | __free_page(page); | ||
2724 | drbd_pp_vacant--; | ||
2725 | } | ||
2726 | |||
2727 | /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ | ||
2728 | |||
2729 | if (drbd_ee_mempool) | ||
2730 | mempool_destroy(drbd_ee_mempool); | ||
2731 | if (drbd_request_mempool) | ||
2732 | mempool_destroy(drbd_request_mempool); | ||
2733 | if (drbd_ee_cache) | ||
2734 | kmem_cache_destroy(drbd_ee_cache); | ||
2735 | if (drbd_request_cache) | ||
2736 | kmem_cache_destroy(drbd_request_cache); | ||
2737 | if (drbd_bm_ext_cache) | ||
2738 | kmem_cache_destroy(drbd_bm_ext_cache); | ||
2739 | if (drbd_al_ext_cache) | ||
2740 | kmem_cache_destroy(drbd_al_ext_cache); | ||
2741 | |||
2742 | drbd_ee_mempool = NULL; | ||
2743 | drbd_request_mempool = NULL; | ||
2744 | drbd_ee_cache = NULL; | ||
2745 | drbd_request_cache = NULL; | ||
2746 | drbd_bm_ext_cache = NULL; | ||
2747 | drbd_al_ext_cache = NULL; | ||
2748 | |||
2749 | return; | ||
2750 | } | ||
2751 | |||
2752 | static int drbd_create_mempools(void) | ||
2753 | { | ||
2754 | struct page *page; | ||
2755 | const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; | ||
2756 | int i; | ||
2757 | |||
2758 | /* prepare our caches and mempools */ | ||
2759 | drbd_request_mempool = NULL; | ||
2760 | drbd_ee_cache = NULL; | ||
2761 | drbd_request_cache = NULL; | ||
2762 | drbd_bm_ext_cache = NULL; | ||
2763 | drbd_al_ext_cache = NULL; | ||
2764 | drbd_pp_pool = NULL; | ||
2765 | |||
2766 | /* caches */ | ||
2767 | drbd_request_cache = kmem_cache_create( | ||
2768 | "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); | ||
2769 | if (drbd_request_cache == NULL) | ||
2770 | goto Enomem; | ||
2771 | |||
2772 | drbd_ee_cache = kmem_cache_create( | ||
2773 | "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); | ||
2774 | if (drbd_ee_cache == NULL) | ||
2775 | goto Enomem; | ||
2776 | |||
2777 | drbd_bm_ext_cache = kmem_cache_create( | ||
2778 | "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); | ||
2779 | if (drbd_bm_ext_cache == NULL) | ||
2780 | goto Enomem; | ||
2781 | |||
2782 | drbd_al_ext_cache = kmem_cache_create( | ||
2783 | "drbd_al", sizeof(struct lc_element), 0, 0, NULL); | ||
2784 | if (drbd_al_ext_cache == NULL) | ||
2785 | goto Enomem; | ||
2786 | |||
2787 | /* mempools */ | ||
2788 | drbd_request_mempool = mempool_create(number, | ||
2789 | mempool_alloc_slab, mempool_free_slab, drbd_request_cache); | ||
2790 | if (drbd_request_mempool == NULL) | ||
2791 | goto Enomem; | ||
2792 | |||
2793 | drbd_ee_mempool = mempool_create(number, | ||
2794 | mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); | ||
2795 | if (drbd_request_mempool == NULL) | ||
2796 | goto Enomem; | ||
2797 | |||
2798 | /* drbd's page pool */ | ||
2799 | spin_lock_init(&drbd_pp_lock); | ||
2800 | |||
2801 | for (i = 0; i < number; i++) { | ||
2802 | page = alloc_page(GFP_HIGHUSER); | ||
2803 | if (!page) | ||
2804 | goto Enomem; | ||
2805 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
2806 | drbd_pp_pool = page; | ||
2807 | } | ||
2808 | drbd_pp_vacant = number; | ||
2809 | |||
2810 | return 0; | ||
2811 | |||
2812 | Enomem: | ||
2813 | drbd_destroy_mempools(); /* in case we allocated some */ | ||
2814 | return -ENOMEM; | ||
2815 | } | ||
2816 | |||
2817 | static int drbd_notify_sys(struct notifier_block *this, unsigned long code, | ||
2818 | void *unused) | ||
2819 | { | ||
2820 | /* just so we have it. you never know what interesting things we | ||
2821 | * might want to do here some day... | ||
2822 | */ | ||
2823 | |||
2824 | return NOTIFY_DONE; | ||
2825 | } | ||
2826 | |||
2827 | static struct notifier_block drbd_notifier = { | ||
2828 | .notifier_call = drbd_notify_sys, | ||
2829 | }; | ||
2830 | |||
2831 | static void drbd_release_ee_lists(struct drbd_conf *mdev) | ||
2832 | { | ||
2833 | int rr; | ||
2834 | |||
2835 | rr = drbd_release_ee(mdev, &mdev->active_ee); | ||
2836 | if (rr) | ||
2837 | dev_err(DEV, "%d EEs in active list found!\n", rr); | ||
2838 | |||
2839 | rr = drbd_release_ee(mdev, &mdev->sync_ee); | ||
2840 | if (rr) | ||
2841 | dev_err(DEV, "%d EEs in sync list found!\n", rr); | ||
2842 | |||
2843 | rr = drbd_release_ee(mdev, &mdev->read_ee); | ||
2844 | if (rr) | ||
2845 | dev_err(DEV, "%d EEs in read list found!\n", rr); | ||
2846 | |||
2847 | rr = drbd_release_ee(mdev, &mdev->done_ee); | ||
2848 | if (rr) | ||
2849 | dev_err(DEV, "%d EEs in done list found!\n", rr); | ||
2850 | |||
2851 | rr = drbd_release_ee(mdev, &mdev->net_ee); | ||
2852 | if (rr) | ||
2853 | dev_err(DEV, "%d EEs in net list found!\n", rr); | ||
2854 | } | ||
2855 | |||
2856 | /* caution. no locking. | ||
2857 | * currently only used from module cleanup code. */ | ||
2858 | static void drbd_delete_device(unsigned int minor) | ||
2859 | { | ||
2860 | struct drbd_conf *mdev = minor_to_mdev(minor); | ||
2861 | |||
2862 | if (!mdev) | ||
2863 | return; | ||
2864 | |||
2865 | /* paranoia asserts */ | ||
2866 | if (mdev->open_cnt != 0) | ||
2867 | dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, | ||
2868 | __FILE__ , __LINE__); | ||
2869 | |||
2870 | ERR_IF (!list_empty(&mdev->data.work.q)) { | ||
2871 | struct list_head *lp; | ||
2872 | list_for_each(lp, &mdev->data.work.q) { | ||
2873 | dev_err(DEV, "lp = %p\n", lp); | ||
2874 | } | ||
2875 | }; | ||
2876 | /* end paranoia asserts */ | ||
2877 | |||
2878 | del_gendisk(mdev->vdisk); | ||
2879 | |||
2880 | /* cleanup stuff that may have been allocated during | ||
2881 | * device (re-)configuration or state changes */ | ||
2882 | |||
2883 | if (mdev->this_bdev) | ||
2884 | bdput(mdev->this_bdev); | ||
2885 | |||
2886 | drbd_free_resources(mdev); | ||
2887 | |||
2888 | drbd_release_ee_lists(mdev); | ||
2889 | |||
2890 | /* should be free'd on disconnect? */ | ||
2891 | kfree(mdev->ee_hash); | ||
2892 | /* | ||
2893 | mdev->ee_hash_s = 0; | ||
2894 | mdev->ee_hash = NULL; | ||
2895 | */ | ||
2896 | |||
2897 | lc_destroy(mdev->act_log); | ||
2898 | lc_destroy(mdev->resync); | ||
2899 | |||
2900 | kfree(mdev->p_uuid); | ||
2901 | /* mdev->p_uuid = NULL; */ | ||
2902 | |||
2903 | kfree(mdev->int_dig_out); | ||
2904 | kfree(mdev->int_dig_in); | ||
2905 | kfree(mdev->int_dig_vv); | ||
2906 | |||
2907 | /* cleanup the rest that has been | ||
2908 | * allocated from drbd_new_device | ||
2909 | * and actually free the mdev itself */ | ||
2910 | drbd_free_mdev(mdev); | ||
2911 | } | ||
2912 | |||
2913 | static void drbd_cleanup(void) | ||
2914 | { | ||
2915 | unsigned int i; | ||
2916 | |||
2917 | unregister_reboot_notifier(&drbd_notifier); | ||
2918 | |||
2919 | drbd_nl_cleanup(); | ||
2920 | |||
2921 | if (minor_table) { | ||
2922 | if (drbd_proc) | ||
2923 | remove_proc_entry("drbd", NULL); | ||
2924 | i = minor_count; | ||
2925 | while (i--) | ||
2926 | drbd_delete_device(i); | ||
2927 | drbd_destroy_mempools(); | ||
2928 | } | ||
2929 | |||
2930 | kfree(minor_table); | ||
2931 | |||
2932 | unregister_blkdev(DRBD_MAJOR, "drbd"); | ||
2933 | |||
2934 | printk(KERN_INFO "drbd: module cleanup done.\n"); | ||
2935 | } | ||
2936 | |||
2937 | /** | ||
2938 | * drbd_congested() - Callback for pdflush | ||
2939 | * @congested_data: User data | ||
2940 | * @bdi_bits: Bits pdflush is currently interested in | ||
2941 | * | ||
2942 | * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. | ||
2943 | */ | ||
2944 | static int drbd_congested(void *congested_data, int bdi_bits) | ||
2945 | { | ||
2946 | struct drbd_conf *mdev = congested_data; | ||
2947 | struct request_queue *q; | ||
2948 | char reason = '-'; | ||
2949 | int r = 0; | ||
2950 | |||
2951 | if (!__inc_ap_bio_cond(mdev)) { | ||
2952 | /* DRBD has frozen IO */ | ||
2953 | r = bdi_bits; | ||
2954 | reason = 'd'; | ||
2955 | goto out; | ||
2956 | } | ||
2957 | |||
2958 | if (get_ldev(mdev)) { | ||
2959 | q = bdev_get_queue(mdev->ldev->backing_bdev); | ||
2960 | r = bdi_congested(&q->backing_dev_info, bdi_bits); | ||
2961 | put_ldev(mdev); | ||
2962 | if (r) | ||
2963 | reason = 'b'; | ||
2964 | } | ||
2965 | |||
2966 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { | ||
2967 | r |= (1 << BDI_async_congested); | ||
2968 | reason = reason == 'b' ? 'a' : 'n'; | ||
2969 | } | ||
2970 | |||
2971 | out: | ||
2972 | mdev->congestion_reason = reason; | ||
2973 | return r; | ||
2974 | } | ||
2975 | |||
2976 | struct drbd_conf *drbd_new_device(unsigned int minor) | ||
2977 | { | ||
2978 | struct drbd_conf *mdev; | ||
2979 | struct gendisk *disk; | ||
2980 | struct request_queue *q; | ||
2981 | |||
2982 | /* GFP_KERNEL, we are outside of all write-out paths */ | ||
2983 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); | ||
2984 | if (!mdev) | ||
2985 | return NULL; | ||
2986 | if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) | ||
2987 | goto out_no_cpumask; | ||
2988 | |||
2989 | mdev->minor = minor; | ||
2990 | |||
2991 | drbd_init_set_defaults(mdev); | ||
2992 | |||
2993 | q = blk_alloc_queue(GFP_KERNEL); | ||
2994 | if (!q) | ||
2995 | goto out_no_q; | ||
2996 | mdev->rq_queue = q; | ||
2997 | q->queuedata = mdev; | ||
2998 | blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); | ||
2999 | |||
3000 | disk = alloc_disk(1); | ||
3001 | if (!disk) | ||
3002 | goto out_no_disk; | ||
3003 | mdev->vdisk = disk; | ||
3004 | |||
3005 | set_disk_ro(disk, TRUE); | ||
3006 | |||
3007 | disk->queue = q; | ||
3008 | disk->major = DRBD_MAJOR; | ||
3009 | disk->first_minor = minor; | ||
3010 | disk->fops = &drbd_ops; | ||
3011 | sprintf(disk->disk_name, "drbd%d", minor); | ||
3012 | disk->private_data = mdev; | ||
3013 | |||
3014 | mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); | ||
3015 | /* we have no partitions. we contain only ourselves. */ | ||
3016 | mdev->this_bdev->bd_contains = mdev->this_bdev; | ||
3017 | |||
3018 | q->backing_dev_info.congested_fn = drbd_congested; | ||
3019 | q->backing_dev_info.congested_data = mdev; | ||
3020 | |||
3021 | blk_queue_make_request(q, drbd_make_request_26); | ||
3022 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); | ||
3023 | blk_queue_merge_bvec(q, drbd_merge_bvec); | ||
3024 | q->queue_lock = &mdev->req_lock; /* needed since we use */ | ||
3025 | /* plugging on a queue, that actually has no requests! */ | ||
3026 | q->unplug_fn = drbd_unplug_fn; | ||
3027 | |||
3028 | mdev->md_io_page = alloc_page(GFP_KERNEL); | ||
3029 | if (!mdev->md_io_page) | ||
3030 | goto out_no_io_page; | ||
3031 | |||
3032 | if (drbd_bm_init(mdev)) | ||
3033 | goto out_no_bitmap; | ||
3034 | /* no need to lock access, we are still initializing this minor device. */ | ||
3035 | if (!tl_init(mdev)) | ||
3036 | goto out_no_tl; | ||
3037 | |||
3038 | mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); | ||
3039 | if (!mdev->app_reads_hash) | ||
3040 | goto out_no_app_reads; | ||
3041 | |||
3042 | mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | ||
3043 | if (!mdev->current_epoch) | ||
3044 | goto out_no_epoch; | ||
3045 | |||
3046 | INIT_LIST_HEAD(&mdev->current_epoch->list); | ||
3047 | mdev->epochs = 1; | ||
3048 | |||
3049 | return mdev; | ||
3050 | |||
3051 | /* out_whatever_else: | ||
3052 | kfree(mdev->current_epoch); */ | ||
3053 | out_no_epoch: | ||
3054 | kfree(mdev->app_reads_hash); | ||
3055 | out_no_app_reads: | ||
3056 | tl_cleanup(mdev); | ||
3057 | out_no_tl: | ||
3058 | drbd_bm_cleanup(mdev); | ||
3059 | out_no_bitmap: | ||
3060 | __free_page(mdev->md_io_page); | ||
3061 | out_no_io_page: | ||
3062 | put_disk(disk); | ||
3063 | out_no_disk: | ||
3064 | blk_cleanup_queue(q); | ||
3065 | out_no_q: | ||
3066 | free_cpumask_var(mdev->cpu_mask); | ||
3067 | out_no_cpumask: | ||
3068 | kfree(mdev); | ||
3069 | return NULL; | ||
3070 | } | ||
3071 | |||
3072 | /* counterpart of drbd_new_device. | ||
3073 | * last part of drbd_delete_device. */ | ||
3074 | void drbd_free_mdev(struct drbd_conf *mdev) | ||
3075 | { | ||
3076 | kfree(mdev->current_epoch); | ||
3077 | kfree(mdev->app_reads_hash); | ||
3078 | tl_cleanup(mdev); | ||
3079 | if (mdev->bitmap) /* should no longer be there. */ | ||
3080 | drbd_bm_cleanup(mdev); | ||
3081 | __free_page(mdev->md_io_page); | ||
3082 | put_disk(mdev->vdisk); | ||
3083 | blk_cleanup_queue(mdev->rq_queue); | ||
3084 | free_cpumask_var(mdev->cpu_mask); | ||
3085 | kfree(mdev); | ||
3086 | } | ||
3087 | |||
3088 | |||
3089 | int __init drbd_init(void) | ||
3090 | { | ||
3091 | int err; | ||
3092 | |||
3093 | if (sizeof(struct p_handshake) != 80) { | ||
3094 | printk(KERN_ERR | ||
3095 | "drbd: never change the size or layout " | ||
3096 | "of the HandShake packet.\n"); | ||
3097 | return -EINVAL; | ||
3098 | } | ||
3099 | |||
3100 | if (1 > minor_count || minor_count > 255) { | ||
3101 | printk(KERN_ERR | ||
3102 | "drbd: invalid minor_count (%d)\n", minor_count); | ||
3103 | #ifdef MODULE | ||
3104 | return -EINVAL; | ||
3105 | #else | ||
3106 | minor_count = 8; | ||
3107 | #endif | ||
3108 | } | ||
3109 | |||
3110 | err = drbd_nl_init(); | ||
3111 | if (err) | ||
3112 | return err; | ||
3113 | |||
3114 | err = register_blkdev(DRBD_MAJOR, "drbd"); | ||
3115 | if (err) { | ||
3116 | printk(KERN_ERR | ||
3117 | "drbd: unable to register block device major %d\n", | ||
3118 | DRBD_MAJOR); | ||
3119 | return err; | ||
3120 | } | ||
3121 | |||
3122 | register_reboot_notifier(&drbd_notifier); | ||
3123 | |||
3124 | /* | ||
3125 | * allocate all necessary structs | ||
3126 | */ | ||
3127 | err = -ENOMEM; | ||
3128 | |||
3129 | init_waitqueue_head(&drbd_pp_wait); | ||
3130 | |||
3131 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | ||
3132 | minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, | ||
3133 | GFP_KERNEL); | ||
3134 | if (!minor_table) | ||
3135 | goto Enomem; | ||
3136 | |||
3137 | err = drbd_create_mempools(); | ||
3138 | if (err) | ||
3139 | goto Enomem; | ||
3140 | |||
3141 | drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); | ||
3142 | if (!drbd_proc) { | ||
3143 | printk(KERN_ERR "drbd: unable to register proc file\n"); | ||
3144 | goto Enomem; | ||
3145 | } | ||
3146 | |||
3147 | rwlock_init(&global_state_lock); | ||
3148 | |||
3149 | printk(KERN_INFO "drbd: initialized. " | ||
3150 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", | ||
3151 | API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); | ||
3152 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); | ||
3153 | printk(KERN_INFO "drbd: registered as block device major %d\n", | ||
3154 | DRBD_MAJOR); | ||
3155 | printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); | ||
3156 | |||
3157 | return 0; /* Success! */ | ||
3158 | |||
3159 | Enomem: | ||
3160 | drbd_cleanup(); | ||
3161 | if (err == -ENOMEM) | ||
3162 | /* currently always the case */ | ||
3163 | printk(KERN_ERR "drbd: ran out of memory\n"); | ||
3164 | else | ||
3165 | printk(KERN_ERR "drbd: initialization failure\n"); | ||
3166 | return err; | ||
3167 | } | ||
3168 | |||
3169 | void drbd_free_bc(struct drbd_backing_dev *ldev) | ||
3170 | { | ||
3171 | if (ldev == NULL) | ||
3172 | return; | ||
3173 | |||
3174 | bd_release(ldev->backing_bdev); | ||
3175 | bd_release(ldev->md_bdev); | ||
3176 | |||
3177 | fput(ldev->lo_file); | ||
3178 | fput(ldev->md_file); | ||
3179 | |||
3180 | kfree(ldev); | ||
3181 | } | ||
3182 | |||
3183 | void drbd_free_sock(struct drbd_conf *mdev) | ||
3184 | { | ||
3185 | if (mdev->data.socket) { | ||
3186 | kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); | ||
3187 | sock_release(mdev->data.socket); | ||
3188 | mdev->data.socket = NULL; | ||
3189 | } | ||
3190 | if (mdev->meta.socket) { | ||
3191 | kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); | ||
3192 | sock_release(mdev->meta.socket); | ||
3193 | mdev->meta.socket = NULL; | ||
3194 | } | ||
3195 | } | ||
3196 | |||
3197 | |||
3198 | void drbd_free_resources(struct drbd_conf *mdev) | ||
3199 | { | ||
3200 | crypto_free_hash(mdev->csums_tfm); | ||
3201 | mdev->csums_tfm = NULL; | ||
3202 | crypto_free_hash(mdev->verify_tfm); | ||
3203 | mdev->verify_tfm = NULL; | ||
3204 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3205 | mdev->cram_hmac_tfm = NULL; | ||
3206 | crypto_free_hash(mdev->integrity_w_tfm); | ||
3207 | mdev->integrity_w_tfm = NULL; | ||
3208 | crypto_free_hash(mdev->integrity_r_tfm); | ||
3209 | mdev->integrity_r_tfm = NULL; | ||
3210 | |||
3211 | drbd_free_sock(mdev); | ||
3212 | |||
3213 | __no_warn(local, | ||
3214 | drbd_free_bc(mdev->ldev); | ||
3215 | mdev->ldev = NULL;); | ||
3216 | } | ||
3217 | |||
3218 | /* meta data management */ | ||
3219 | |||
/*
 * Layout of the meta data super block as it is stored on disk.
 * drbd_md_sync() and drbd_md_read() convert every field they access
 * to/from big endian.  The structure is packed; do not reorder or resize
 * members, that would change the on-disk format.
 */
struct meta_data_on_disk {
	u64 la_size;           /* last agreed size. */
	u64 uuid[UI_SIZE];   /* UUIDs. */
	u64 device_uuid;
	u64 reserved_u64_1;    /* not assigned by drbd_md_sync(), stays zero */
	u32 flags;             /* MDF */
	u32 magic;             /* DRBD_MD_MAGIC, verified by drbd_md_read() */
	u32 md_size_sect;
	u32 al_offset;         /* offset to this block */
	u32 al_nr_extents;     /* important for restoring the AL */
	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
	u32 bm_offset;         /* offset to the bitmap, from here */
	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
	u32 reserved_u32[4];   /* not assigned by drbd_md_sync(), stays zero */

} __packed;
3236 | |||
3237 | /** | ||
3238 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | ||
3239 | * @mdev: DRBD device. | ||
3240 | */ | ||
3241 | void drbd_md_sync(struct drbd_conf *mdev) | ||
3242 | { | ||
3243 | struct meta_data_on_disk *buffer; | ||
3244 | sector_t sector; | ||
3245 | int i; | ||
3246 | |||
3247 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
3248 | return; | ||
3249 | del_timer(&mdev->md_sync_timer); | ||
3250 | |||
3251 | /* We use here D_FAILED and not D_ATTACHING because we try to write | ||
3252 | * metadata even if we detach due to a disk failure! */ | ||
3253 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
3254 | return; | ||
3255 | |||
3256 | trace_drbd_md_io(mdev, WRITE, mdev->ldev); | ||
3257 | |||
3258 | mutex_lock(&mdev->md_io_mutex); | ||
3259 | buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); | ||
3260 | memset(buffer, 0, 512); | ||
3261 | |||
3262 | buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | ||
3263 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3264 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | ||
3265 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | ||
3266 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); | ||
3267 | |||
3268 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); | ||
3269 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); | ||
3270 | buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); | ||
3271 | buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); | ||
3272 | buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); | ||
3273 | |||
3274 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); | ||
3275 | |||
3276 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | ||
3277 | sector = mdev->ldev->md.md_offset; | ||
3278 | |||
3279 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | ||
3280 | clear_bit(MD_DIRTY, &mdev->flags); | ||
3281 | } else { | ||
3282 | /* this was a try anyways ... */ | ||
3283 | dev_err(DEV, "meta data update failed!\n"); | ||
3284 | |||
3285 | drbd_chk_io_error(mdev, 1, TRUE); | ||
3286 | } | ||
3287 | |||
3288 | /* Update mdev->ldev->md.la_size_sect, | ||
3289 | * since we updated it on metadata. */ | ||
3290 | mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); | ||
3291 | |||
3292 | mutex_unlock(&mdev->md_io_mutex); | ||
3293 | put_ldev(mdev); | ||
3294 | } | ||
3295 | |||
3296 | /** | ||
3297 | * drbd_md_read() - Reads in the meta data super block | ||
3298 | * @mdev: DRBD device. | ||
3299 | * @bdev: Device from which the meta data should be read in. | ||
3300 | * | ||
3301 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case | ||
3302 | * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. | ||
3303 | */ | ||
3304 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
3305 | { | ||
3306 | struct meta_data_on_disk *buffer; | ||
3307 | int i, rv = NO_ERROR; | ||
3308 | |||
3309 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
3310 | return ERR_IO_MD_DISK; | ||
3311 | |||
3312 | trace_drbd_md_io(mdev, READ, bdev); | ||
3313 | |||
3314 | mutex_lock(&mdev->md_io_mutex); | ||
3315 | buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); | ||
3316 | |||
3317 | if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | ||
3318 | /* NOTE: cant do normal error processing here as this is | ||
3319 | called BEFORE disk is attached */ | ||
3320 | dev_err(DEV, "Error while reading metadata.\n"); | ||
3321 | rv = ERR_IO_MD_DISK; | ||
3322 | goto err; | ||
3323 | } | ||
3324 | |||
3325 | if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { | ||
3326 | dev_err(DEV, "Error while reading metadata, magic not found.\n"); | ||
3327 | rv = ERR_MD_INVALID; | ||
3328 | goto err; | ||
3329 | } | ||
3330 | if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { | ||
3331 | dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", | ||
3332 | be32_to_cpu(buffer->al_offset), bdev->md.al_offset); | ||
3333 | rv = ERR_MD_INVALID; | ||
3334 | goto err; | ||
3335 | } | ||
3336 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { | ||
3337 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", | ||
3338 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); | ||
3339 | rv = ERR_MD_INVALID; | ||
3340 | goto err; | ||
3341 | } | ||
3342 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { | ||
3343 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", | ||
3344 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); | ||
3345 | rv = ERR_MD_INVALID; | ||
3346 | goto err; | ||
3347 | } | ||
3348 | |||
3349 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { | ||
3350 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", | ||
3351 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); | ||
3352 | rv = ERR_MD_INVALID; | ||
3353 | goto err; | ||
3354 | } | ||
3355 | |||
3356 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); | ||
3357 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3358 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
3359 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
3360 | mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); | ||
3361 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
3362 | |||
3363 | if (mdev->sync_conf.al_extents < 7) | ||
3364 | mdev->sync_conf.al_extents = 127; | ||
3365 | |||
3366 | err: | ||
3367 | mutex_unlock(&mdev->md_io_mutex); | ||
3368 | put_ldev(mdev); | ||
3369 | |||
3370 | return rv; | ||
3371 | } | ||
3372 | |||
3373 | /** | ||
3374 | * drbd_md_mark_dirty() - Mark meta data super block as dirty | ||
3375 | * @mdev: DRBD device. | ||
3376 | * | ||
3377 | * Call this function if you change anything that should be written to | ||
3378 | * the meta-data super block. This function sets MD_DIRTY, and starts a | ||
3379 | * timer that ensures that within five seconds you have to call drbd_md_sync(). | ||
3380 | */ | ||
3381 | void drbd_md_mark_dirty(struct drbd_conf *mdev) | ||
3382 | { | ||
3383 | set_bit(MD_DIRTY, &mdev->flags); | ||
3384 | mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); | ||
3385 | } | ||
3386 | |||
3387 | |||
3388 | static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | ||
3389 | { | ||
3390 | int i; | ||
3391 | |||
3392 | for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { | ||
3393 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; | ||
3394 | |||
3395 | trace_drbd_uuid(mdev, i+1); | ||
3396 | } | ||
3397 | } | ||
3398 | |||
3399 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3400 | { | ||
3401 | if (idx == UI_CURRENT) { | ||
3402 | if (mdev->state.role == R_PRIMARY) | ||
3403 | val |= 1; | ||
3404 | else | ||
3405 | val &= ~((u64)1); | ||
3406 | |||
3407 | drbd_set_ed_uuid(mdev, val); | ||
3408 | } | ||
3409 | |||
3410 | mdev->ldev->md.uuid[idx] = val; | ||
3411 | trace_drbd_uuid(mdev, idx); | ||
3412 | drbd_md_mark_dirty(mdev); | ||
3413 | } | ||
3414 | |||
3415 | |||
3416 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3417 | { | ||
3418 | if (mdev->ldev->md.uuid[idx]) { | ||
3419 | drbd_uuid_move_history(mdev); | ||
3420 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; | ||
3421 | trace_drbd_uuid(mdev, UI_HISTORY_START); | ||
3422 | } | ||
3423 | _drbd_uuid_set(mdev, idx, val); | ||
3424 | } | ||
3425 | |||
3426 | /** | ||
3427 | * drbd_uuid_new_current() - Creates a new current UUID | ||
3428 | * @mdev: DRBD device. | ||
3429 | * | ||
3430 | * Creates a new current UUID, and rotates the old current UUID into | ||
3431 | * the bitmap slot. Causes an incremental resync upon next connect. | ||
3432 | */ | ||
3433 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | ||
3434 | { | ||
3435 | u64 val; | ||
3436 | |||
3437 | dev_info(DEV, "Creating new current UUID\n"); | ||
3438 | D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); | ||
3439 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; | ||
3440 | trace_drbd_uuid(mdev, UI_BITMAP); | ||
3441 | |||
3442 | get_random_bytes(&val, sizeof(u64)); | ||
3443 | _drbd_uuid_set(mdev, UI_CURRENT, val); | ||
3444 | } | ||
3445 | |||
3446 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | ||
3447 | { | ||
3448 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) | ||
3449 | return; | ||
3450 | |||
3451 | if (val == 0) { | ||
3452 | drbd_uuid_move_history(mdev); | ||
3453 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | ||
3454 | mdev->ldev->md.uuid[UI_BITMAP] = 0; | ||
3455 | trace_drbd_uuid(mdev, UI_HISTORY_START); | ||
3456 | trace_drbd_uuid(mdev, UI_BITMAP); | ||
3457 | } else { | ||
3458 | if (mdev->ldev->md.uuid[UI_BITMAP]) | ||
3459 | dev_warn(DEV, "bm UUID already set"); | ||
3460 | |||
3461 | mdev->ldev->md.uuid[UI_BITMAP] = val; | ||
3462 | mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); | ||
3463 | |||
3464 | trace_drbd_uuid(mdev, UI_BITMAP); | ||
3465 | } | ||
3466 | drbd_md_mark_dirty(mdev); | ||
3467 | } | ||
3468 | |||
3469 | /** | ||
3470 | * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() | ||
3471 | * @mdev: DRBD device. | ||
3472 | * | ||
3473 | * Sets all bits in the bitmap and writes the whole bitmap to stable storage. | ||
3474 | */ | ||
3475 | int drbd_bmio_set_n_write(struct drbd_conf *mdev) | ||
3476 | { | ||
3477 | int rv = -EIO; | ||
3478 | |||
3479 | if (get_ldev_if_state(mdev, D_ATTACHING)) { | ||
3480 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | ||
3481 | drbd_md_sync(mdev); | ||
3482 | drbd_bm_set_all(mdev); | ||
3483 | |||
3484 | rv = drbd_bm_write(mdev); | ||
3485 | |||
3486 | if (!rv) { | ||
3487 | drbd_md_clear_flag(mdev, MDF_FULL_SYNC); | ||
3488 | drbd_md_sync(mdev); | ||
3489 | } | ||
3490 | |||
3491 | put_ldev(mdev); | ||
3492 | } | ||
3493 | |||
3494 | return rv; | ||
3495 | } | ||
3496 | |||
3497 | /** | ||
3498 | * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() | ||
3499 | * @mdev: DRBD device. | ||
3500 | * | ||
3501 | * Clears all bits in the bitmap and writes the whole bitmap to stable storage. | ||
3502 | */ | ||
3503 | int drbd_bmio_clear_n_write(struct drbd_conf *mdev) | ||
3504 | { | ||
3505 | int rv = -EIO; | ||
3506 | |||
3507 | if (get_ldev_if_state(mdev, D_ATTACHING)) { | ||
3508 | drbd_bm_clear_all(mdev); | ||
3509 | rv = drbd_bm_write(mdev); | ||
3510 | put_ldev(mdev); | ||
3511 | } | ||
3512 | |||
3513 | return rv; | ||
3514 | } | ||
3515 | |||
3516 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
3517 | { | ||
3518 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); | ||
3519 | int rv; | ||
3520 | |||
3521 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); | ||
3522 | |||
3523 | drbd_bm_lock(mdev, work->why); | ||
3524 | rv = work->io_fn(mdev); | ||
3525 | drbd_bm_unlock(mdev); | ||
3526 | |||
3527 | clear_bit(BITMAP_IO, &mdev->flags); | ||
3528 | wake_up(&mdev->misc_wait); | ||
3529 | |||
3530 | if (work->done) | ||
3531 | work->done(mdev, rv); | ||
3532 | |||
3533 | clear_bit(BITMAP_IO_QUEUED, &mdev->flags); | ||
3534 | work->why = NULL; | ||
3535 | |||
3536 | return 1; | ||
3537 | } | ||
3538 | |||
3539 | /** | ||
3540 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap | ||
3541 | * @mdev: DRBD device. | ||
3542 | * @io_fn: IO callback to be called when bitmap IO is possible | ||
3543 | * @done: callback to be called after the bitmap IO was performed | ||
3544 | * @why: Descriptive text of the reason for doing the IO | ||
3545 | * | ||
3546 | * While IO on the bitmap happens we freeze application IO thus we ensure | ||
3547 | * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be | ||
3548 | * called from worker context. It MUST NOT be used while a previous such | ||
3549 | * work is still pending! | ||
3550 | */ | ||
3551 | void drbd_queue_bitmap_io(struct drbd_conf *mdev, | ||
3552 | int (*io_fn)(struct drbd_conf *), | ||
3553 | void (*done)(struct drbd_conf *, int), | ||
3554 | char *why) | ||
3555 | { | ||
3556 | D_ASSERT(current == mdev->worker.task); | ||
3557 | |||
3558 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); | ||
3559 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); | ||
3560 | D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); | ||
3561 | if (mdev->bm_io_work.why) | ||
3562 | dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", | ||
3563 | why, mdev->bm_io_work.why); | ||
3564 | |||
3565 | mdev->bm_io_work.io_fn = io_fn; | ||
3566 | mdev->bm_io_work.done = done; | ||
3567 | mdev->bm_io_work.why = why; | ||
3568 | |||
3569 | set_bit(BITMAP_IO, &mdev->flags); | ||
3570 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { | ||
3571 | if (list_empty(&mdev->bm_io_work.w.list)) { | ||
3572 | set_bit(BITMAP_IO_QUEUED, &mdev->flags); | ||
3573 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | ||
3574 | } else | ||
3575 | dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3579 | /** | ||
3580 | * drbd_bitmap_io() - Does an IO operation on the whole bitmap | ||
3581 | * @mdev: DRBD device. | ||
3582 | * @io_fn: IO callback to be called when bitmap IO is possible | ||
3583 | * @why: Descriptive text of the reason for doing the IO | ||
3584 | * | ||
3585 | * freezes application IO while that the actual IO operations runs. This | ||
3586 | * functions MAY NOT be called from worker context. | ||
3587 | */ | ||
3588 | int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) | ||
3589 | { | ||
3590 | int rv; | ||
3591 | |||
3592 | D_ASSERT(current != mdev->worker.task); | ||
3593 | |||
3594 | drbd_suspend_io(mdev); | ||
3595 | |||
3596 | drbd_bm_lock(mdev, why); | ||
3597 | rv = io_fn(mdev); | ||
3598 | drbd_bm_unlock(mdev); | ||
3599 | |||
3600 | drbd_resume_io(mdev); | ||
3601 | |||
3602 | return rv; | ||
3603 | } | ||
3604 | |||
3605 | void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local) | ||
3606 | { | ||
3607 | if ((mdev->ldev->md.flags & flag) != flag) { | ||
3608 | drbd_md_mark_dirty(mdev); | ||
3609 | mdev->ldev->md.flags |= flag; | ||
3610 | } | ||
3611 | } | ||
3612 | |||
3613 | void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local) | ||
3614 | { | ||
3615 | if ((mdev->ldev->md.flags & flag) != 0) { | ||
3616 | drbd_md_mark_dirty(mdev); | ||
3617 | mdev->ldev->md.flags &= ~flag; | ||
3618 | } | ||
3619 | } | ||
3620 | int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) | ||
3621 | { | ||
3622 | return (bdev->md.flags & flag) != 0; | ||
3623 | } | ||
3624 | |||
3625 | static void md_sync_timer_fn(unsigned long data) | ||
3626 | { | ||
3627 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
3628 | |||
3629 | drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); | ||
3630 | } | ||
3631 | |||
3632 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
3633 | { | ||
3634 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); | ||
3635 | drbd_md_sync(mdev); | ||
3636 | |||
3637 | return 1; | ||
3638 | } | ||
3639 | |||
3640 | #ifdef CONFIG_DRBD_FAULT_INJECTION | ||
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;	/* current state of the generator */
	unsigned long count;	/* calls left until the next reseed */
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 *
 * @rsp: generator state; an all-zero state is a valid starting point.
 *
 * Returns the next pseudo random number.
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	/* count is unsigned, so a "--count < 0" test can never be true and
	 * the state would never be reseeded.  Test for zero before
	 * decrementing instead: reseed on the first call and then once
	 * every FAULT_RANDOM_REFRESH + 1 calls. */
	if (rsp->count-- == 0) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
3669 | |||
3670 | static char * | ||
3671 | _drbd_fault_str(unsigned int type) { | ||
3672 | static char *_faults[] = { | ||
3673 | [DRBD_FAULT_MD_WR] = "Meta-data write", | ||
3674 | [DRBD_FAULT_MD_RD] = "Meta-data read", | ||
3675 | [DRBD_FAULT_RS_WR] = "Resync write", | ||
3676 | [DRBD_FAULT_RS_RD] = "Resync read", | ||
3677 | [DRBD_FAULT_DT_WR] = "Data write", | ||
3678 | [DRBD_FAULT_DT_RD] = "Data read", | ||
3679 | [DRBD_FAULT_DT_RA] = "Data read ahead", | ||
3680 | [DRBD_FAULT_BM_ALLOC] = "BM allocation", | ||
3681 | [DRBD_FAULT_AL_EE] = "EE allocation" | ||
3682 | }; | ||
3683 | |||
3684 | return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; | ||
3685 | } | ||
3686 | |||
3687 | unsigned int | ||
3688 | _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) | ||
3689 | { | ||
3690 | static struct fault_random_state rrs = {0, 0}; | ||
3691 | |||
3692 | unsigned int ret = ( | ||
3693 | (fault_devs == 0 || | ||
3694 | ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && | ||
3695 | (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); | ||
3696 | |||
3697 | if (ret) { | ||
3698 | fault_count++; | ||
3699 | |||
3700 | if (printk_ratelimit()) | ||
3701 | dev_warn(DEV, "***Simulating %s failure\n", | ||
3702 | _drbd_fault_str(type)); | ||
3703 | } | ||
3704 | |||
3705 | return ret; | ||
3706 | } | ||
3707 | #endif | ||
3708 | |||
/*
 * Returns a string identifying this build: "srcversion: ..." when
 * THIS_MODULE is available, "built-in" otherwise.  The string lives in a
 * static buffer that is filled in on the first call.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	/* A leading NUL marks the buffer as not yet initialized. */
	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: never write past the end of buildtag, no
			 * matter how long the srcversion string is */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3727 | |||
/* Module entry point (drbd_init) and exit point (drbd_cleanup). */
module_init(drbd_init)
module_exit(drbd_cleanup)

/* For drbd_tracing: */
EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 00000000000..1927acefe23 --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -0,0 +1,2365 @@ | |||
1 | /* | ||
2 | drbd_nl.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/autoconf.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/drbd.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/fs.h> | ||
31 | #include <linux/file.h> | ||
32 | #include <linux/slab.h> | ||
33 | #include <linux/connector.h> | ||
34 | #include <linux/blkpg.h> | ||
35 | #include <linux/cpumask.h> | ||
36 | #include "drbd_int.h" | ||
37 | #include "drbd_tracing.h" | ||
38 | #include "drbd_wrappers.h" | ||
39 | #include <asm/unaligned.h> | ||
40 | #include <linux/drbd_tag_magic.h> | ||
41 | #include <linux/drbd_limits.h> | ||
42 | |||
/* Forward declarations of the tag list writers; their definitions are further
 * down in this file.  Presumably each one appends a single tagged value at
 * the given position and returns the position behind it -- confirm against
 * the definitions. */
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);

/* see get_sb_bdev and bd_claim */
/* Holder cookie handed to bd_claim() for a meta data device that is
 * configured with a non-negative meta_dev_idx (see drbd_nl_disk_conf()). */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
49 | |||
/* Generate the tag_list to struct functions */
/*
 * NL_PACKET(name, ...) expands to
 *	static int <name>_from_tags(mdev, tags, arg)
 * which walks a tag list made of (16 bit tag, 16 bit length, payload)
 * records until it reads TT_END.  Every record whose tag number matches one
 * of the packet's fields is copied into *arg by the NL_INTEGER / NL_INT64 /
 * NL_BIT / NL_STRING case generated for that field.  Records with an unknown
 * tag number are skipped, unless the tag has T_MANDATORY set, in which case
 * the function logs an error and returns 0.  Returns 1 once TT_END is reached.
 *
 * All accesses go through get_unaligned() because the payload lengths are
 * arbitrary, so records are not naturally aligned.
 *
 * NOTE(review): the walk is terminated only by TT_END, and only NL_STRING
 * checks dlen against the destination; the other field types read their fixed
 * size without looking at dlen.
 */
#define NL_PACKET(name, number, fields) \
static int name ## _from_tags(struct drbd_conf *mdev, \
	unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
static int name ## _from_tags(struct drbd_conf *mdev, \
	unsigned short *tags, struct name *arg) \
{ \
	int tag; \
	int dlen; \
	\
	while ((tag = get_unaligned(tags++)) != TT_END) { \
		dlen = get_unaligned(tags++); \
		switch (tag_number(tag)) { \
		fields \
		default: \
			if (tag & T_MANDATORY) { \
				dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
				return 0; \
			} \
		} \
		tags = (unsigned short *)((char *)tags + dlen); \
	} \
	return 1; \
}
/* Field decoders: each expands to one "case <tag number>:" of the switch
 * above and stores the payload found at 'tags' into arg->member. */
#define NL_INTEGER(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
		arg->member = get_unaligned((int *)(tags)); \
		break;
#define NL_INT64(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
		arg->member = get_unaligned((u64 *)(tags)); \
		break;
#define NL_BIT(pn, pr, member) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
		arg->member = *(char *)(tags) ? 1 : 0; \
		break;
/* Strings are rejected (return 0) when longer than the destination; the
 * received length is recorded in arg-><member>_len. */
#define NL_STRING(pn, pr, member, len) \
	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
		if (dlen > len) { \
			dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
				#member, dlen, (unsigned int)len); \
			return 0; \
		} \
		arg->member ## _len = dlen; \
		memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
		break;
/* Presumably this header invokes NL_PACKET() once per packet type and
 * undefines the helper macros again, since they are redefined below without
 * an #undef -- confirm in linux/drbd_nl.h. */
#include "linux/drbd_nl.h"
97 | |||
/* Generate the struct to tag_list functions */
/*
 * NL_PACKET(name, ...) expands to
 *	static unsigned short *<name>_to_tags(mdev, arg, tags)
 * which serializes every field of *arg as a (16 bit tag, 16 bit length,
 * payload) record starting at 'tags' and returns the position behind the
 * last record written.  No TT_END terminator is written by these functions,
 * and the generated code does not check the space available at 'tags'.
 */
#define NL_PACKET(name, number, fields) \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
	struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
	struct name *arg, unsigned short *tags) \
{ \
	fields \
	return tags; \
}

/* Field encoders: the tag word is the field number or-ed with its flags (pr)
 * and its type, followed by the payload length and the payload itself.
 * put_unaligned() is used because records are not naturally aligned. */
#define NL_INTEGER(pn, pr, member) \
	put_unaligned(pn | pr | TT_INTEGER, tags++); \
	put_unaligned(sizeof(int), tags++); \
	put_unaligned(arg->member, (int *)tags); \
	tags = (unsigned short *)((char *)tags+sizeof(int));
#define NL_INT64(pn, pr, member) \
	put_unaligned(pn | pr | TT_INT64, tags++); \
	put_unaligned(sizeof(u64), tags++); \
	put_unaligned(arg->member, (u64 *)tags); \
	tags = (unsigned short *)((char *)tags+sizeof(u64));
#define NL_BIT(pn, pr, member) \
	put_unaligned(pn | pr | TT_BIT, tags++); \
	put_unaligned(sizeof(char), tags++); \
	*(char *)tags = arg->member; \
	tags = (unsigned short *)((char *)tags+sizeof(char));
/* Strings are emitted with the length stored in arg-><member>_len. */
#define NL_STRING(pn, pr, member, len) \
	put_unaligned(pn | pr | TT_STRING, tags++); \
	put_unaligned(arg->member ## _len, tags++); \
	memcpy(tags, arg->member, arg->member ## _len); \
	tags = (unsigned short *)((char *)tags + arg->member ## _len);
#include "linux/drbd_nl.h"
132 | |||
/* Forward declarations; both are used before their definitions in this file
 * (drbd_bcast_ev_helper() is called from drbd_khelper() just below). */
void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
void drbd_nl_send_reply(struct cn_msg *, int);
135 | |||
136 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) | ||
137 | { | ||
138 | char *envp[] = { "HOME=/", | ||
139 | "TERM=linux", | ||
140 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
141 | NULL, /* Will be set to address family */ | ||
142 | NULL, /* Will be set to address */ | ||
143 | NULL }; | ||
144 | |||
145 | char mb[12], af[20], ad[60], *afs; | ||
146 | char *argv[] = {usermode_helper, cmd, mb, NULL }; | ||
147 | int ret; | ||
148 | |||
149 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); | ||
150 | |||
151 | if (get_net_conf(mdev)) { | ||
152 | switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { | ||
153 | case AF_INET6: | ||
154 | afs = "ipv6"; | ||
155 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", | ||
156 | &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); | ||
157 | break; | ||
158 | case AF_INET: | ||
159 | afs = "ipv4"; | ||
160 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
161 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
162 | break; | ||
163 | default: | ||
164 | afs = "ssocks"; | ||
165 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
166 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
167 | } | ||
168 | snprintf(af, 20, "DRBD_PEER_AF=%s", afs); | ||
169 | envp[3]=af; | ||
170 | envp[4]=ad; | ||
171 | put_net_conf(mdev); | ||
172 | } | ||
173 | |||
174 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); | ||
175 | |||
176 | drbd_bcast_ev_helper(mdev, cmd); | ||
177 | ret = call_usermodehelper(usermode_helper, argv, envp, 1); | ||
178 | if (ret) | ||
179 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
180 | usermode_helper, cmd, mb, | ||
181 | (ret >> 8) & 0xff, ret); | ||
182 | else | ||
183 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
184 | usermode_helper, cmd, mb, | ||
185 | (ret >> 8) & 0xff, ret); | ||
186 | |||
187 | if (ret < 0) /* Ignore any ERRNOs we got. */ | ||
188 | ret = 0; | ||
189 | |||
190 | return ret; | ||
191 | } | ||
192 | |||
193 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) | ||
194 | { | ||
195 | char *ex_to_string; | ||
196 | int r; | ||
197 | enum drbd_disk_state nps; | ||
198 | enum drbd_fencing_p fp; | ||
199 | |||
200 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | ||
201 | |||
202 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | ||
203 | fp = mdev->ldev->dc.fencing; | ||
204 | put_ldev(mdev); | ||
205 | } else { | ||
206 | dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); | ||
207 | return mdev->state.pdsk; | ||
208 | } | ||
209 | |||
210 | if (fp == FP_STONITH) | ||
211 | _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); | ||
212 | |||
213 | r = drbd_khelper(mdev, "fence-peer"); | ||
214 | |||
215 | switch ((r>>8) & 0xff) { | ||
216 | case 3: /* peer is inconsistent */ | ||
217 | ex_to_string = "peer is inconsistent or worse"; | ||
218 | nps = D_INCONSISTENT; | ||
219 | break; | ||
220 | case 4: /* peer got outdated, or was already outdated */ | ||
221 | ex_to_string = "peer was fenced"; | ||
222 | nps = D_OUTDATED; | ||
223 | break; | ||
224 | case 5: /* peer was down */ | ||
225 | if (mdev->state.disk == D_UP_TO_DATE) { | ||
226 | /* we will(have) create(d) a new UUID anyways... */ | ||
227 | ex_to_string = "peer is unreachable, assumed to be dead"; | ||
228 | nps = D_OUTDATED; | ||
229 | } else { | ||
230 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; | ||
231 | nps = mdev->state.pdsk; | ||
232 | } | ||
233 | break; | ||
234 | case 6: /* Peer is primary, voluntarily outdate myself. | ||
235 | * This is useful when an unconnected R_SECONDARY is asked to | ||
236 | * become R_PRIMARY, but finds the other peer being active. */ | ||
237 | ex_to_string = "peer is active"; | ||
238 | dev_warn(DEV, "Peer is primary, outdating myself.\n"); | ||
239 | nps = D_UNKNOWN; | ||
240 | _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); | ||
241 | break; | ||
242 | case 7: | ||
243 | if (fp != FP_STONITH) | ||
244 | dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); | ||
245 | ex_to_string = "peer was stonithed"; | ||
246 | nps = D_OUTDATED; | ||
247 | break; | ||
248 | default: | ||
249 | /* The script is broken ... */ | ||
250 | nps = D_UNKNOWN; | ||
251 | dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); | ||
252 | return nps; | ||
253 | } | ||
254 | |||
255 | dev_info(DEV, "fence-peer helper returned %d (%s)\n", | ||
256 | (r>>8) & 0xff, ex_to_string); | ||
257 | return nps; | ||
258 | } | ||
259 | |||
260 | |||
261 | int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | ||
262 | { | ||
263 | const int max_tries = 4; | ||
264 | int r = 0; | ||
265 | int try = 0; | ||
266 | int forced = 0; | ||
267 | union drbd_state mask, val; | ||
268 | enum drbd_disk_state nps; | ||
269 | |||
270 | if (new_role == R_PRIMARY) | ||
271 | request_ping(mdev); /* Detect a dead peer ASAP */ | ||
272 | |||
273 | mutex_lock(&mdev->state_mutex); | ||
274 | |||
275 | mask.i = 0; mask.role = R_MASK; | ||
276 | val.i = 0; val.role = new_role; | ||
277 | |||
278 | while (try++ < max_tries) { | ||
279 | r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); | ||
280 | |||
281 | /* in case we first succeeded to outdate, | ||
282 | * but now suddenly could establish a connection */ | ||
283 | if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { | ||
284 | val.pdsk = 0; | ||
285 | mask.pdsk = 0; | ||
286 | continue; | ||
287 | } | ||
288 | |||
289 | if (r == SS_NO_UP_TO_DATE_DISK && force && | ||
290 | (mdev->state.disk == D_INCONSISTENT || | ||
291 | mdev->state.disk == D_OUTDATED)) { | ||
292 | mask.disk = D_MASK; | ||
293 | val.disk = D_UP_TO_DATE; | ||
294 | forced = 1; | ||
295 | continue; | ||
296 | } | ||
297 | |||
298 | if (r == SS_NO_UP_TO_DATE_DISK && | ||
299 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { | ||
300 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | ||
301 | nps = drbd_try_outdate_peer(mdev); | ||
302 | |||
303 | if (nps == D_OUTDATED || nps == D_INCONSISTENT) { | ||
304 | val.disk = D_UP_TO_DATE; | ||
305 | mask.disk = D_MASK; | ||
306 | } | ||
307 | |||
308 | val.pdsk = nps; | ||
309 | mask.pdsk = D_MASK; | ||
310 | |||
311 | continue; | ||
312 | } | ||
313 | |||
314 | if (r == SS_NOTHING_TO_DO) | ||
315 | goto fail; | ||
316 | if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { | ||
317 | nps = drbd_try_outdate_peer(mdev); | ||
318 | |||
319 | if (force && nps > D_OUTDATED) { | ||
320 | dev_warn(DEV, "Forced into split brain situation!\n"); | ||
321 | nps = D_OUTDATED; | ||
322 | } | ||
323 | |||
324 | mask.pdsk = D_MASK; | ||
325 | val.pdsk = nps; | ||
326 | |||
327 | continue; | ||
328 | } | ||
329 | if (r == SS_TWO_PRIMARIES) { | ||
330 | /* Maybe the peer is detected as dead very soon... | ||
331 | retry at most once more in this case. */ | ||
332 | __set_current_state(TASK_INTERRUPTIBLE); | ||
333 | schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); | ||
334 | if (try < max_tries) | ||
335 | try = max_tries - 1; | ||
336 | continue; | ||
337 | } | ||
338 | if (r < SS_SUCCESS) { | ||
339 | r = _drbd_request_state(mdev, mask, val, | ||
340 | CS_VERBOSE + CS_WAIT_COMPLETE); | ||
341 | if (r < SS_SUCCESS) | ||
342 | goto fail; | ||
343 | } | ||
344 | break; | ||
345 | } | ||
346 | |||
347 | if (r < SS_SUCCESS) | ||
348 | goto fail; | ||
349 | |||
350 | if (forced) | ||
351 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); | ||
352 | |||
353 | /* Wait until nothing is on the fly :) */ | ||
354 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); | ||
355 | |||
356 | if (new_role == R_SECONDARY) { | ||
357 | set_disk_ro(mdev->vdisk, TRUE); | ||
358 | if (get_ldev(mdev)) { | ||
359 | mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; | ||
360 | put_ldev(mdev); | ||
361 | } | ||
362 | } else { | ||
363 | if (get_net_conf(mdev)) { | ||
364 | mdev->net_conf->want_lose = 0; | ||
365 | put_net_conf(mdev); | ||
366 | } | ||
367 | set_disk_ro(mdev->vdisk, FALSE); | ||
368 | if (get_ldev(mdev)) { | ||
369 | if (((mdev->state.conn < C_CONNECTED || | ||
370 | mdev->state.pdsk <= D_FAILED) | ||
371 | && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced) | ||
372 | drbd_uuid_new_current(mdev); | ||
373 | |||
374 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | ||
375 | put_ldev(mdev); | ||
376 | } | ||
377 | } | ||
378 | |||
379 | if ((new_role == R_SECONDARY) && get_ldev(mdev)) { | ||
380 | drbd_al_to_on_disk_bm(mdev); | ||
381 | put_ldev(mdev); | ||
382 | } | ||
383 | |||
384 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) { | ||
385 | /* if this was forced, we should consider sync */ | ||
386 | if (forced) | ||
387 | drbd_send_uuids(mdev); | ||
388 | drbd_send_state(mdev); | ||
389 | } | ||
390 | |||
391 | drbd_md_sync(mdev); | ||
392 | |||
393 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
394 | fail: | ||
395 | mutex_unlock(&mdev->state_mutex); | ||
396 | return r; | ||
397 | } | ||
398 | |||
399 | |||
400 | static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
401 | struct drbd_nl_cfg_reply *reply) | ||
402 | { | ||
403 | struct primary primary_args; | ||
404 | |||
405 | memset(&primary_args, 0, sizeof(struct primary)); | ||
406 | if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { | ||
407 | reply->ret_code = ERR_MANDATORY_TAG; | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | reply->ret_code = | ||
412 | drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); | ||
413 | |||
414 | return 0; | ||
415 | } | ||
416 | |||
417 | static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
418 | struct drbd_nl_cfg_reply *reply) | ||
419 | { | ||
420 | reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); | ||
421 | |||
422 | return 0; | ||
423 | } | ||
424 | |||
425 | /* initializes the md.*_offset members, so we are able to find | ||
426 | * the on disk meta data */ | ||
427 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | ||
428 | struct drbd_backing_dev *bdev) | ||
429 | { | ||
430 | sector_t md_size_sect = 0; | ||
431 | switch (bdev->dc.meta_dev_idx) { | ||
432 | default: | ||
433 | /* v07 style fixed size indexed meta data */ | ||
434 | bdev->md.md_size_sect = MD_RESERVED_SECT; | ||
435 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
436 | bdev->md.al_offset = MD_AL_OFFSET; | ||
437 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
438 | break; | ||
439 | case DRBD_MD_INDEX_FLEX_EXT: | ||
440 | /* just occupy the full device; unit: sectors */ | ||
441 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); | ||
442 | bdev->md.md_offset = 0; | ||
443 | bdev->md.al_offset = MD_AL_OFFSET; | ||
444 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
445 | break; | ||
446 | case DRBD_MD_INDEX_INTERNAL: | ||
447 | case DRBD_MD_INDEX_FLEX_INT: | ||
448 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
449 | /* al size is still fixed */ | ||
450 | bdev->md.al_offset = -MD_AL_MAX_SIZE; | ||
451 | /* we need (slightly less than) ~ this much bitmap sectors: */ | ||
452 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | ||
453 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | ||
454 | md_size_sect = BM_SECT_TO_EXT(md_size_sect); | ||
455 | md_size_sect = ALIGN(md_size_sect, 8); | ||
456 | |||
457 | /* plus the "drbd meta data super block", | ||
458 | * and the activity log; */ | ||
459 | md_size_sect += MD_BM_OFFSET; | ||
460 | |||
461 | bdev->md.md_size_sect = md_size_sect; | ||
462 | /* bitmap offset is adjusted by 'super' block size */ | ||
463 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | ||
464 | break; | ||
465 | } | ||
466 | } | ||
467 | |||
/*
 * ppsize() - pretty print a size given in KB
 * @buf:	destination, must hold at least 9 bytes.
 * @size:	size in units of KB.
 *
 * Scales @size by factors of 1024 (rounding to nearest) until it has at most
 * four digits or the largest known unit is reached, then formats it as
 * "<number> <unit>B" into @buf.
 *
 * Returns @buf.
 */
char *ppsize(char *buf, unsigned long long size)
{
	/* Needs 9 bytes at max including trailing NUL:
	 * -1ULL ==> "16384 EB" */
	static const char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	const int last_unit = (int)sizeof(units) - 1;
	int base = 0;

	/* Stop at the last unit: without this bound, sizes of about
	 * 10000 EB and above would index one past the end of units[]. */
	while (size >= 10000 && base < last_unit) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	sprintf(buf, "%llu %cB", size, units[base]);

	return buf;
}
482 | |||
483 | /* there is still a theoretical deadlock when called from receiver | ||
484 | * on an D_INCONSISTENT R_PRIMARY: | ||
485 | * remote READ does inc_ap_bio, receiver would need to receive answer | ||
486 | * packet from remote to dec_ap_bio again. | ||
487 | * receiver receive_sizes(), comes here, | ||
488 | * waits for ap_bio_cnt == 0. -> deadlock. | ||
489 | * but this cannot happen, actually, because: | ||
490 | * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable | ||
491 | * (not connected, or bad/no disk on peer): | ||
492 | * see drbd_fail_request_early, ap_bio_cnt is zero. | ||
493 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: | ||
494 | * peer may not initiate a resize. | ||
495 | */ | ||
496 | void drbd_suspend_io(struct drbd_conf *mdev) | ||
497 | { | ||
498 | set_bit(SUSPEND_IO, &mdev->flags); | ||
499 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | ||
500 | } | ||
501 | |||
502 | void drbd_resume_io(struct drbd_conf *mdev) | ||
503 | { | ||
504 | clear_bit(SUSPEND_IO, &mdev->flags); | ||
505 | wake_up(&mdev->misc_wait); | ||
506 | } | ||
507 | |||
508 | /** | ||
509 | * drbd_determine_dev_size() - Sets the right device size obeying all constraints | ||
510 | * @mdev: DRBD device. | ||
511 | * | ||
512 | * Returns 0 on success, negative return values indicate errors. | ||
513 | * You should call drbd_md_sync() after calling this function. | ||
514 | */ | ||
515 | enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local) | ||
516 | { | ||
517 | sector_t prev_first_sect, prev_size; /* previous meta location */ | ||
518 | sector_t la_size; | ||
519 | sector_t size; | ||
520 | char ppb[10]; | ||
521 | |||
522 | int md_moved, la_size_changed; | ||
523 | enum determine_dev_size rv = unchanged; | ||
524 | |||
525 | /* race: | ||
526 | * application request passes inc_ap_bio, | ||
527 | * but then cannot get an AL-reference. | ||
528 | * this function later may wait on ap_bio_cnt == 0. -> deadlock. | ||
529 | * | ||
530 | * to avoid that: | ||
531 | * Suspend IO right here. | ||
532 | * still lock the act_log to not trigger ASSERTs there. | ||
533 | */ | ||
534 | drbd_suspend_io(mdev); | ||
535 | |||
536 | /* no wait necessary anymore, actually we could assert that */ | ||
537 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
538 | |||
539 | prev_first_sect = drbd_md_first_sector(mdev->ldev); | ||
540 | prev_size = mdev->ldev->md.md_size_sect; | ||
541 | la_size = mdev->ldev->md.la_size_sect; | ||
542 | |||
543 | /* TODO: should only be some assert here, not (re)init... */ | ||
544 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | ||
545 | |||
546 | size = drbd_new_dev_size(mdev, mdev->ldev); | ||
547 | |||
548 | if (drbd_get_capacity(mdev->this_bdev) != size || | ||
549 | drbd_bm_capacity(mdev) != size) { | ||
550 | int err; | ||
551 | err = drbd_bm_resize(mdev, size); | ||
552 | if (unlikely(err)) { | ||
553 | /* currently there is only one error: ENOMEM! */ | ||
554 | size = drbd_bm_capacity(mdev)>>1; | ||
555 | if (size == 0) { | ||
556 | dev_err(DEV, "OUT OF MEMORY! " | ||
557 | "Could not allocate bitmap!\n"); | ||
558 | } else { | ||
559 | dev_err(DEV, "BM resizing failed. " | ||
560 | "Leaving size unchanged at size = %lu KB\n", | ||
561 | (unsigned long)size); | ||
562 | } | ||
563 | rv = dev_size_error; | ||
564 | } | ||
565 | /* racy, see comments above. */ | ||
566 | drbd_set_my_capacity(mdev, size); | ||
567 | mdev->ldev->md.la_size_sect = size; | ||
568 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), | ||
569 | (unsigned long long)size>>1); | ||
570 | } | ||
571 | if (rv == dev_size_error) | ||
572 | goto out; | ||
573 | |||
574 | la_size_changed = (la_size != mdev->ldev->md.la_size_sect); | ||
575 | |||
576 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | ||
577 | || prev_size != mdev->ldev->md.md_size_sect; | ||
578 | |||
579 | if (la_size_changed || md_moved) { | ||
580 | drbd_al_shrink(mdev); /* All extents inactive. */ | ||
581 | dev_info(DEV, "Writing the whole bitmap, %s\n", | ||
582 | la_size_changed && md_moved ? "size changed and md moved" : | ||
583 | la_size_changed ? "size changed" : "md moved"); | ||
584 | rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ | ||
585 | drbd_md_mark_dirty(mdev); | ||
586 | } | ||
587 | |||
588 | if (size > la_size) | ||
589 | rv = grew; | ||
590 | if (size < la_size) | ||
591 | rv = shrunk; | ||
592 | out: | ||
593 | lc_unlock(mdev->act_log); | ||
594 | wake_up(&mdev->al_wait); | ||
595 | drbd_resume_io(mdev); | ||
596 | |||
597 | return rv; | ||
598 | } | ||
599 | |||
600 | sector_t | ||
601 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
602 | { | ||
603 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | ||
604 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | ||
605 | sector_t m_size; /* my size */ | ||
606 | sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ | ||
607 | sector_t size = 0; | ||
608 | |||
609 | m_size = drbd_get_max_capacity(bdev); | ||
610 | |||
611 | if (p_size && m_size) { | ||
612 | size = min_t(sector_t, p_size, m_size); | ||
613 | } else { | ||
614 | if (la_size) { | ||
615 | size = la_size; | ||
616 | if (m_size && m_size < size) | ||
617 | size = m_size; | ||
618 | if (p_size && p_size < size) | ||
619 | size = p_size; | ||
620 | } else { | ||
621 | if (m_size) | ||
622 | size = m_size; | ||
623 | if (p_size) | ||
624 | size = p_size; | ||
625 | } | ||
626 | } | ||
627 | |||
628 | if (size == 0) | ||
629 | dev_err(DEV, "Both nodes diskless!\n"); | ||
630 | |||
631 | if (u_size) { | ||
632 | if (u_size > size) | ||
633 | dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n", | ||
634 | (unsigned long)u_size>>1, (unsigned long)size>>1); | ||
635 | else | ||
636 | size = u_size; | ||
637 | } | ||
638 | |||
639 | return size; | ||
640 | } | ||
641 | |||
642 | /** | ||
643 | * drbd_check_al_size() - Ensures that the AL is of the right size | ||
644 | * @mdev: DRBD device. | ||
645 | * | ||
646 | * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation | ||
647 | * failed, and 0 on success. You should call drbd_md_sync() after you called | ||
648 | * this function. | ||
649 | */ | ||
650 | static int drbd_check_al_size(struct drbd_conf *mdev) | ||
651 | { | ||
652 | struct lru_cache *n, *t; | ||
653 | struct lc_element *e; | ||
654 | unsigned int in_use; | ||
655 | int i; | ||
656 | |||
657 | ERR_IF(mdev->sync_conf.al_extents < 7) | ||
658 | mdev->sync_conf.al_extents = 127; | ||
659 | |||
660 | if (mdev->act_log && | ||
661 | mdev->act_log->nr_elements == mdev->sync_conf.al_extents) | ||
662 | return 0; | ||
663 | |||
664 | in_use = 0; | ||
665 | t = mdev->act_log; | ||
666 | n = lc_create("act_log", drbd_al_ext_cache, | ||
667 | mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); | ||
668 | |||
669 | if (n == NULL) { | ||
670 | dev_err(DEV, "Cannot allocate act_log lru!\n"); | ||
671 | return -ENOMEM; | ||
672 | } | ||
673 | spin_lock_irq(&mdev->al_lock); | ||
674 | if (t) { | ||
675 | for (i = 0; i < t->nr_elements; i++) { | ||
676 | e = lc_element_by_index(t, i); | ||
677 | if (e->refcnt) | ||
678 | dev_err(DEV, "refcnt(%d)==%d\n", | ||
679 | e->lc_number, e->refcnt); | ||
680 | in_use += e->refcnt; | ||
681 | } | ||
682 | } | ||
683 | if (!in_use) | ||
684 | mdev->act_log = n; | ||
685 | spin_unlock_irq(&mdev->al_lock); | ||
686 | if (in_use) { | ||
687 | dev_err(DEV, "Activity log still in use!\n"); | ||
688 | lc_destroy(n); | ||
689 | return -EBUSY; | ||
690 | } else { | ||
691 | if (t) | ||
692 | lc_destroy(t); | ||
693 | } | ||
694 | drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */ | ||
695 | return 0; | ||
696 | } | ||
697 | |||
698 | void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) | ||
699 | { | ||
700 | struct request_queue * const q = mdev->rq_queue; | ||
701 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | ||
702 | int max_segments = mdev->ldev->dc.max_bio_bvecs; | ||
703 | |||
704 | if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) | ||
705 | max_seg_s = PAGE_SIZE; | ||
706 | |||
707 | max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); | ||
708 | |||
709 | blk_queue_max_sectors(q, max_seg_s >> 9); | ||
710 | blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); | ||
711 | blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); | ||
712 | blk_queue_max_segment_size(q, max_seg_s); | ||
713 | blk_queue_logical_block_size(q, 512); | ||
714 | blk_queue_segment_boundary(q, PAGE_SIZE-1); | ||
715 | blk_stack_limits(&q->limits, &b->limits, 0); | ||
716 | |||
717 | if (b->merge_bvec_fn) | ||
718 | dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", | ||
719 | b->merge_bvec_fn); | ||
720 | dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); | ||
721 | |||
722 | if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { | ||
723 | dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", | ||
724 | q->backing_dev_info.ra_pages, | ||
725 | b->backing_dev_info.ra_pages); | ||
726 | q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; | ||
727 | } | ||
728 | } | ||
729 | |||
730 | /* serialize deconfig (worker exiting, doing cleanup) | ||
731 | * and reconfig (drbdsetup disk, drbdsetup net) | ||
732 | * | ||
733 | * wait for a potentially exiting worker, then restart it, | ||
734 | * or start a new one. | ||
735 | */ | ||
736 | static void drbd_reconfig_start(struct drbd_conf *mdev) | ||
737 | { | ||
738 | wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags)); | ||
739 | wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); | ||
740 | drbd_thread_start(&mdev->worker); | ||
741 | } | ||
742 | |||
743 | /* if still unconfigured, stops worker again. | ||
744 | * if configured now, clears CONFIG_PENDING. | ||
745 | * wakes potential waiters */ | ||
746 | static void drbd_reconfig_done(struct drbd_conf *mdev) | ||
747 | { | ||
748 | spin_lock_irq(&mdev->req_lock); | ||
749 | if (mdev->state.disk == D_DISKLESS && | ||
750 | mdev->state.conn == C_STANDALONE && | ||
751 | mdev->state.role == R_SECONDARY) { | ||
752 | set_bit(DEVICE_DYING, &mdev->flags); | ||
753 | drbd_thread_stop_nowait(&mdev->worker); | ||
754 | } else | ||
755 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
756 | spin_unlock_irq(&mdev->req_lock); | ||
757 | wake_up(&mdev->state_wait); | ||
758 | } | ||
759 | |||
760 | /* does always return 0; | ||
761 | * interesting return code is in reply->ret_code */ | ||
762 | static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
763 | struct drbd_nl_cfg_reply *reply) | ||
764 | { | ||
765 | enum drbd_ret_codes retcode; | ||
766 | enum determine_dev_size dd; | ||
767 | sector_t max_possible_sectors; | ||
768 | sector_t min_md_device_sectors; | ||
769 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ | ||
770 | struct inode *inode, *inode2; | ||
771 | struct lru_cache *resync_lru = NULL; | ||
772 | union drbd_state ns, os; | ||
773 | int rv; | ||
774 | int cp_discovered = 0; | ||
775 | int logical_block_size; | ||
776 | |||
777 | drbd_reconfig_start(mdev); | ||
778 | |||
779 | /* if you want to reconfigure, please tear down first */ | ||
780 | if (mdev->state.disk > D_DISKLESS) { | ||
781 | retcode = ERR_DISK_CONFIGURED; | ||
782 | goto fail; | ||
783 | } | ||
784 | |||
785 | /* allocation not in the IO path, cqueue thread context */ | ||
786 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | ||
787 | if (!nbc) { | ||
788 | retcode = ERR_NOMEM; | ||
789 | goto fail; | ||
790 | } | ||
791 | |||
792 | nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; | ||
793 | nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; | ||
794 | nbc->dc.fencing = DRBD_FENCING_DEF; | ||
795 | nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; | ||
796 | |||
797 | if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { | ||
798 | retcode = ERR_MANDATORY_TAG; | ||
799 | goto fail; | ||
800 | } | ||
801 | |||
802 | if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | ||
803 | retcode = ERR_MD_IDX_INVALID; | ||
804 | goto fail; | ||
805 | } | ||
806 | |||
807 | nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); | ||
808 | if (IS_ERR(nbc->lo_file)) { | ||
809 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, | ||
810 | PTR_ERR(nbc->lo_file)); | ||
811 | nbc->lo_file = NULL; | ||
812 | retcode = ERR_OPEN_DISK; | ||
813 | goto fail; | ||
814 | } | ||
815 | |||
816 | inode = nbc->lo_file->f_dentry->d_inode; | ||
817 | |||
818 | if (!S_ISBLK(inode->i_mode)) { | ||
819 | retcode = ERR_DISK_NOT_BDEV; | ||
820 | goto fail; | ||
821 | } | ||
822 | |||
823 | nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); | ||
824 | if (IS_ERR(nbc->md_file)) { | ||
825 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, | ||
826 | PTR_ERR(nbc->md_file)); | ||
827 | nbc->md_file = NULL; | ||
828 | retcode = ERR_OPEN_MD_DISK; | ||
829 | goto fail; | ||
830 | } | ||
831 | |||
832 | inode2 = nbc->md_file->f_dentry->d_inode; | ||
833 | |||
834 | if (!S_ISBLK(inode2->i_mode)) { | ||
835 | retcode = ERR_MD_NOT_BDEV; | ||
836 | goto fail; | ||
837 | } | ||
838 | |||
839 | nbc->backing_bdev = inode->i_bdev; | ||
840 | if (bd_claim(nbc->backing_bdev, mdev)) { | ||
841 | printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", | ||
842 | nbc->backing_bdev, mdev, | ||
843 | nbc->backing_bdev->bd_holder, | ||
844 | nbc->backing_bdev->bd_contains->bd_holder, | ||
845 | nbc->backing_bdev->bd_holders); | ||
846 | retcode = ERR_BDCLAIM_DISK; | ||
847 | goto fail; | ||
848 | } | ||
849 | |||
850 | resync_lru = lc_create("resync", drbd_bm_ext_cache, | ||
851 | 61, sizeof(struct bm_extent), | ||
852 | offsetof(struct bm_extent, lce)); | ||
853 | if (!resync_lru) { | ||
854 | retcode = ERR_NOMEM; | ||
855 | goto release_bdev_fail; | ||
856 | } | ||
857 | |||
858 | /* meta_dev_idx >= 0: external fixed size, | ||
859 | * possibly multiple drbd sharing one meta device. | ||
860 | * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is | ||
861 | * not yet used by some other drbd minor! | ||
862 | * (if you use drbd.conf + drbdadm, | ||
863 | * that should check it for you already; but if you don't, or someone | ||
864 | * fooled it, we need to double check here) */ | ||
865 | nbc->md_bdev = inode2->i_bdev; | ||
866 | if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev | ||
867 | : (void *) drbd_m_holder)) { | ||
868 | retcode = ERR_BDCLAIM_MD_DISK; | ||
869 | goto release_bdev_fail; | ||
870 | } | ||
871 | |||
872 | if ((nbc->backing_bdev == nbc->md_bdev) != | ||
873 | (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | ||
874 | nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { | ||
875 | retcode = ERR_MD_IDX_INVALID; | ||
876 | goto release_bdev2_fail; | ||
877 | } | ||
878 | |||
879 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | ||
880 | drbd_md_set_sector_offsets(mdev, nbc); | ||
881 | |||
882 | if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { | ||
883 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | ||
884 | (unsigned long long) drbd_get_max_capacity(nbc), | ||
885 | (unsigned long long) nbc->dc.disk_size); | ||
886 | retcode = ERR_DISK_TO_SMALL; | ||
887 | goto release_bdev2_fail; | ||
888 | } | ||
889 | |||
890 | if (nbc->dc.meta_dev_idx < 0) { | ||
891 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; | ||
892 | /* at least one MB, otherwise it does not make sense */ | ||
893 | min_md_device_sectors = (2<<10); | ||
894 | } else { | ||
895 | max_possible_sectors = DRBD_MAX_SECTORS; | ||
896 | min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); | ||
897 | } | ||
898 | |||
899 | if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors) | ||
900 | dev_warn(DEV, "truncating very big lower level device " | ||
901 | "to currently maximum possible %llu sectors\n", | ||
902 | (unsigned long long) max_possible_sectors); | ||
903 | |||
904 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | ||
905 | retcode = ERR_MD_DISK_TO_SMALL; | ||
906 | dev_warn(DEV, "refusing attach: md-device too small, " | ||
907 | "at least %llu sectors needed for this meta-disk type\n", | ||
908 | (unsigned long long) min_md_device_sectors); | ||
909 | goto release_bdev2_fail; | ||
910 | } | ||
911 | |||
912 | /* Make sure the new disk is big enough | ||
913 | * (we may currently be R_PRIMARY with no local disk...) */ | ||
914 | if (drbd_get_max_capacity(nbc) < | ||
915 | drbd_get_capacity(mdev->this_bdev)) { | ||
916 | retcode = ERR_DISK_TO_SMALL; | ||
917 | goto release_bdev2_fail; | ||
918 | } | ||
919 | |||
920 | nbc->known_size = drbd_get_capacity(nbc->backing_bdev); | ||
921 | |||
922 | drbd_suspend_io(mdev); | ||
923 | /* also wait for the last barrier ack. */ | ||
924 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); | ||
925 | /* and for any other previously queued work */ | ||
926 | drbd_flush_workqueue(mdev); | ||
927 | |||
928 | retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); | ||
929 | drbd_resume_io(mdev); | ||
930 | if (retcode < SS_SUCCESS) | ||
931 | goto release_bdev2_fail; | ||
932 | |||
933 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | ||
934 | goto force_diskless; | ||
935 | |||
936 | drbd_md_set_sector_offsets(mdev, nbc); | ||
937 | |||
938 | if (!mdev->bitmap) { | ||
939 | if (drbd_bm_init(mdev)) { | ||
940 | retcode = ERR_NOMEM; | ||
941 | goto force_diskless_dec; | ||
942 | } | ||
943 | } | ||
944 | |||
945 | retcode = drbd_md_read(mdev, nbc); | ||
946 | if (retcode != NO_ERROR) | ||
947 | goto force_diskless_dec; | ||
948 | |||
949 | if (mdev->state.conn < C_CONNECTED && | ||
950 | mdev->state.role == R_PRIMARY && | ||
951 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { | ||
952 | dev_err(DEV, "Can only attach to data with current UUID=%016llX\n", | ||
953 | (unsigned long long)mdev->ed_uuid); | ||
954 | retcode = ERR_DATA_NOT_CURRENT; | ||
955 | goto force_diskless_dec; | ||
956 | } | ||
957 | |||
958 | /* Since we are diskless, fix the activity log first... */ | ||
959 | if (drbd_check_al_size(mdev)) { | ||
960 | retcode = ERR_NOMEM; | ||
961 | goto force_diskless_dec; | ||
962 | } | ||
963 | |||
964 | /* Prevent shrinking of consistent devices ! */ | ||
965 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && | ||
966 | drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) { | ||
967 | dev_warn(DEV, "refusing to truncate a consistent device\n"); | ||
968 | retcode = ERR_DISK_TO_SMALL; | ||
969 | goto force_diskless_dec; | ||
970 | } | ||
971 | |||
972 | if (!drbd_al_read_log(mdev, nbc)) { | ||
973 | retcode = ERR_IO_MD_DISK; | ||
974 | goto force_diskless_dec; | ||
975 | } | ||
976 | |||
977 | /* allocate a second IO page if logical_block_size != 512 */ | ||
978 | logical_block_size = bdev_logical_block_size(nbc->md_bdev); | ||
979 | if (logical_block_size == 0) | ||
980 | logical_block_size = MD_SECTOR_SIZE; | ||
981 | |||
982 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
983 | if (!mdev->md_io_tmpp) { | ||
984 | struct page *page = alloc_page(GFP_NOIO); | ||
985 | if (!page) | ||
986 | goto force_diskless_dec; | ||
987 | |||
988 | dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", | ||
989 | logical_block_size, MD_SECTOR_SIZE); | ||
990 | dev_warn(DEV, "Workaround engaged (has performance impact).\n"); | ||
991 | |||
992 | mdev->md_io_tmpp = page; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* Reset the "barriers don't work" bits here, then force meta data to | ||
997 | * be written, to ensure we determine if barriers are supported. */ | ||
998 | if (nbc->dc.no_md_flush) | ||
999 | set_bit(MD_NO_BARRIER, &mdev->flags); | ||
1000 | else | ||
1001 | clear_bit(MD_NO_BARRIER, &mdev->flags); | ||
1002 | |||
1003 | /* Point of no return reached. | ||
1004 | * Devices and memory are no longer released by error cleanup below. | ||
1005 | * now mdev takes over responsibility, and the state engine should | ||
1006 | * clean it up somewhere. */ | ||
1007 | D_ASSERT(mdev->ldev == NULL); | ||
1008 | mdev->ldev = nbc; | ||
1009 | mdev->resync = resync_lru; | ||
1010 | nbc = NULL; | ||
1011 | resync_lru = NULL; | ||
1012 | |||
1013 | mdev->write_ordering = WO_bio_barrier; | ||
1014 | drbd_bump_write_ordering(mdev, WO_bio_barrier); | ||
1015 | |||
1016 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) | ||
1017 | set_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1018 | else | ||
1019 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1020 | |||
1021 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { | ||
1022 | set_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1023 | cp_discovered = 1; | ||
1024 | } | ||
1025 | |||
1026 | mdev->send_cnt = 0; | ||
1027 | mdev->recv_cnt = 0; | ||
1028 | mdev->read_cnt = 0; | ||
1029 | mdev->writ_cnt = 0; | ||
1030 | |||
1031 | drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); | ||
1032 | |||
1033 | /* If I am currently not R_PRIMARY, | ||
1034 | * but meta data primary indicator is set, | ||
1035 | * I just now recover from a hard crash, | ||
1036 | * and have been R_PRIMARY before that crash. | ||
1037 | * | ||
1038 | * Now, if I had no connection before that crash | ||
1039 | * (have been degraded R_PRIMARY), chances are that | ||
1040 | * I won't find my peer now either. | ||
1041 | * | ||
1042 | * In that case, and _only_ in that case, | ||
1043 | * we use the degr-wfc-timeout instead of the default, | ||
1044 | * so we can automatically recover from a crash of a | ||
1045 | * degraded but active "cluster" after a certain timeout. | ||
1046 | */ | ||
1047 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
1048 | if (mdev->state.role != R_PRIMARY && | ||
1049 | drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | ||
1050 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) | ||
1051 | set_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
1052 | |||
1053 | dd = drbd_determin_dev_size(mdev); | ||
1054 | if (dd == dev_size_error) { | ||
1055 | retcode = ERR_NOMEM_BITMAP; | ||
1056 | goto force_diskless_dec; | ||
1057 | } else if (dd == grew) | ||
1058 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | ||
1059 | |||
1060 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | ||
1061 | dev_info(DEV, "Assuming that all blocks are out of sync " | ||
1062 | "(aka FullSync)\n"); | ||
1063 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { | ||
1064 | retcode = ERR_IO_MD_DISK; | ||
1065 | goto force_diskless_dec; | ||
1066 | } | ||
1067 | } else { | ||
1068 | if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { | ||
1069 | retcode = ERR_IO_MD_DISK; | ||
1070 | goto force_diskless_dec; | ||
1071 | } | ||
1072 | } | ||
1073 | |||
1074 | if (cp_discovered) { | ||
1075 | drbd_al_apply_to_bm(mdev); | ||
1076 | drbd_al_to_on_disk_bm(mdev); | ||
1077 | } | ||
1078 | |||
1079 | spin_lock_irq(&mdev->req_lock); | ||
1080 | os = mdev->state; | ||
1081 | ns.i = os.i; | ||
1082 | /* If MDF_CONSISTENT is not set go into inconsistent state, | ||
1083 | otherwise investigate MDF_WasUpToDate... | ||
1084 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, | ||
1085 | otherwise into D_CONSISTENT state. | ||
1086 | */ | ||
1087 | if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) { | ||
1088 | if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE)) | ||
1089 | ns.disk = D_CONSISTENT; | ||
1090 | else | ||
1091 | ns.disk = D_OUTDATED; | ||
1092 | } else { | ||
1093 | ns.disk = D_INCONSISTENT; | ||
1094 | } | ||
1095 | |||
1096 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) | ||
1097 | ns.pdsk = D_OUTDATED; | ||
1098 | |||
1099 | if ( ns.disk == D_CONSISTENT && | ||
1100 | (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) | ||
1101 | ns.disk = D_UP_TO_DATE; | ||
1102 | |||
1103 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, | ||
1104 | MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before | ||
1105 | this point, because drbd_request_state() modifies these | ||
1106 | flags. */ | ||
1107 | |||
1108 | /* In case we are C_CONNECTED postpone any decision on the new disk | ||
1109 | state after the negotiation phase. */ | ||
1110 | if (mdev->state.conn == C_CONNECTED) { | ||
1111 | mdev->new_state_tmp.i = ns.i; | ||
1112 | ns.i = os.i; | ||
1113 | ns.disk = D_NEGOTIATING; | ||
1114 | } | ||
1115 | |||
1116 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
1117 | ns = mdev->state; | ||
1118 | spin_unlock_irq(&mdev->req_lock); | ||
1119 | |||
1120 | if (rv < SS_SUCCESS) | ||
1121 | goto force_diskless_dec; | ||
1122 | |||
1123 | if (mdev->state.role == R_PRIMARY) | ||
1124 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | ||
1125 | else | ||
1126 | mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; | ||
1127 | |||
1128 | drbd_md_mark_dirty(mdev); | ||
1129 | drbd_md_sync(mdev); | ||
1130 | |||
1131 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1132 | put_ldev(mdev); | ||
1133 | reply->ret_code = retcode; | ||
1134 | drbd_reconfig_done(mdev); | ||
1135 | return 0; | ||
1136 | |||
1137 | force_diskless_dec: | ||
1138 | put_ldev(mdev); | ||
1139 | force_diskless: | ||
1140 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | ||
1141 | drbd_md_sync(mdev); | ||
1142 | release_bdev2_fail: | ||
1143 | if (nbc) | ||
1144 | bd_release(nbc->md_bdev); | ||
1145 | release_bdev_fail: | ||
1146 | if (nbc) | ||
1147 | bd_release(nbc->backing_bdev); | ||
1148 | fail: | ||
1149 | if (nbc) { | ||
1150 | if (nbc->lo_file) | ||
1151 | fput(nbc->lo_file); | ||
1152 | if (nbc->md_file) | ||
1153 | fput(nbc->md_file); | ||
1154 | kfree(nbc); | ||
1155 | } | ||
1156 | lc_destroy(resync_lru); | ||
1157 | |||
1158 | reply->ret_code = retcode; | ||
1159 | drbd_reconfig_done(mdev); | ||
1160 | return 0; | ||
1161 | } | ||
1162 | |||
1163 | static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1164 | struct drbd_nl_cfg_reply *reply) | ||
1165 | { | ||
1166 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); | ||
1167 | return 0; | ||
1168 | } | ||
1169 | |||
1170 | static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1171 | struct drbd_nl_cfg_reply *reply) | ||
1172 | { | ||
1173 | int i, ns; | ||
1174 | enum drbd_ret_codes retcode; | ||
1175 | struct net_conf *new_conf = NULL; | ||
1176 | struct crypto_hash *tfm = NULL; | ||
1177 | struct crypto_hash *integrity_w_tfm = NULL; | ||
1178 | struct crypto_hash *integrity_r_tfm = NULL; | ||
1179 | struct hlist_head *new_tl_hash = NULL; | ||
1180 | struct hlist_head *new_ee_hash = NULL; | ||
1181 | struct drbd_conf *odev; | ||
1182 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1183 | void *int_dig_out = NULL; | ||
1184 | void *int_dig_in = NULL; | ||
1185 | void *int_dig_vv = NULL; | ||
1186 | struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; | ||
1187 | |||
1188 | drbd_reconfig_start(mdev); | ||
1189 | |||
1190 | if (mdev->state.conn > C_STANDALONE) { | ||
1191 | retcode = ERR_NET_CONFIGURED; | ||
1192 | goto fail; | ||
1193 | } | ||
1194 | |||
1195 | /* allocation not in the IO path, cqueue thread context */ | ||
1196 | new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); | ||
1197 | if (!new_conf) { | ||
1198 | retcode = ERR_NOMEM; | ||
1199 | goto fail; | ||
1200 | } | ||
1201 | |||
1202 | memset(new_conf, 0, sizeof(struct net_conf)); | ||
1203 | new_conf->timeout = DRBD_TIMEOUT_DEF; | ||
1204 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | ||
1205 | new_conf->ping_int = DRBD_PING_INT_DEF; | ||
1206 | new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; | ||
1207 | new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; | ||
1208 | new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; | ||
1209 | new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; | ||
1210 | new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; | ||
1211 | new_conf->ko_count = DRBD_KO_COUNT_DEF; | ||
1212 | new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; | ||
1213 | new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; | ||
1214 | new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; | ||
1215 | new_conf->want_lose = 0; | ||
1216 | new_conf->two_primaries = 0; | ||
1217 | new_conf->wire_protocol = DRBD_PROT_C; | ||
1218 | new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; | ||
1219 | new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; | ||
1220 | |||
1221 | if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { | ||
1222 | retcode = ERR_MANDATORY_TAG; | ||
1223 | goto fail; | ||
1224 | } | ||
1225 | |||
1226 | if (new_conf->two_primaries | ||
1227 | && (new_conf->wire_protocol != DRBD_PROT_C)) { | ||
1228 | retcode = ERR_NOT_PROTO_C; | ||
1229 | goto fail; | ||
1230 | }; | ||
1231 | |||
1232 | if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { | ||
1233 | retcode = ERR_DISCARD; | ||
1234 | goto fail; | ||
1235 | } | ||
1236 | |||
1237 | retcode = NO_ERROR; | ||
1238 | |||
1239 | new_my_addr = (struct sockaddr *)&new_conf->my_addr; | ||
1240 | new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; | ||
1241 | for (i = 0; i < minor_count; i++) { | ||
1242 | odev = minor_to_mdev(i); | ||
1243 | if (!odev || odev == mdev) | ||
1244 | continue; | ||
1245 | if (get_net_conf(odev)) { | ||
1246 | taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; | ||
1247 | if (new_conf->my_addr_len == odev->net_conf->my_addr_len && | ||
1248 | !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) | ||
1249 | retcode = ERR_LOCAL_ADDR; | ||
1250 | |||
1251 | taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; | ||
1252 | if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && | ||
1253 | !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) | ||
1254 | retcode = ERR_PEER_ADDR; | ||
1255 | |||
1256 | put_net_conf(odev); | ||
1257 | if (retcode != NO_ERROR) | ||
1258 | goto fail; | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | if (new_conf->cram_hmac_alg[0] != 0) { | ||
1263 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | ||
1264 | new_conf->cram_hmac_alg); | ||
1265 | tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); | ||
1266 | if (IS_ERR(tfm)) { | ||
1267 | tfm = NULL; | ||
1268 | retcode = ERR_AUTH_ALG; | ||
1269 | goto fail; | ||
1270 | } | ||
1271 | |||
1272 | if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) | ||
1273 | != CRYPTO_ALG_TYPE_HASH) { | ||
1274 | retcode = ERR_AUTH_ALG_ND; | ||
1275 | goto fail; | ||
1276 | } | ||
1277 | } | ||
1278 | |||
1279 | if (new_conf->integrity_alg[0]) { | ||
1280 | integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | ||
1281 | if (IS_ERR(integrity_w_tfm)) { | ||
1282 | integrity_w_tfm = NULL; | ||
1283 | retcode=ERR_INTEGRITY_ALG; | ||
1284 | goto fail; | ||
1285 | } | ||
1286 | |||
1287 | if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { | ||
1288 | retcode=ERR_INTEGRITY_ALG_ND; | ||
1289 | goto fail; | ||
1290 | } | ||
1291 | |||
1292 | integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | ||
1293 | if (IS_ERR(integrity_r_tfm)) { | ||
1294 | integrity_r_tfm = NULL; | ||
1295 | retcode=ERR_INTEGRITY_ALG; | ||
1296 | goto fail; | ||
1297 | } | ||
1298 | } | ||
1299 | |||
1300 | ns = new_conf->max_epoch_size/8; | ||
1301 | if (mdev->tl_hash_s != ns) { | ||
1302 | new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | ||
1303 | if (!new_tl_hash) { | ||
1304 | retcode = ERR_NOMEM; | ||
1305 | goto fail; | ||
1306 | } | ||
1307 | } | ||
1308 | |||
1309 | ns = new_conf->max_buffers/8; | ||
1310 | if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { | ||
1311 | new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | ||
1312 | if (!new_ee_hash) { | ||
1313 | retcode = ERR_NOMEM; | ||
1314 | goto fail; | ||
1315 | } | ||
1316 | } | ||
1317 | |||
1318 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; | ||
1319 | |||
1320 | if (integrity_w_tfm) { | ||
1321 | i = crypto_hash_digestsize(integrity_w_tfm); | ||
1322 | int_dig_out = kmalloc(i, GFP_KERNEL); | ||
1323 | if (!int_dig_out) { | ||
1324 | retcode = ERR_NOMEM; | ||
1325 | goto fail; | ||
1326 | } | ||
1327 | int_dig_in = kmalloc(i, GFP_KERNEL); | ||
1328 | if (!int_dig_in) { | ||
1329 | retcode = ERR_NOMEM; | ||
1330 | goto fail; | ||
1331 | } | ||
1332 | int_dig_vv = kmalloc(i, GFP_KERNEL); | ||
1333 | if (!int_dig_vv) { | ||
1334 | retcode = ERR_NOMEM; | ||
1335 | goto fail; | ||
1336 | } | ||
1337 | } | ||
1338 | |||
1339 | if (!mdev->bitmap) { | ||
1340 | if(drbd_bm_init(mdev)) { | ||
1341 | retcode = ERR_NOMEM; | ||
1342 | goto fail; | ||
1343 | } | ||
1344 | } | ||
1345 | |||
1346 | spin_lock_irq(&mdev->req_lock); | ||
1347 | if (mdev->net_conf != NULL) { | ||
1348 | retcode = ERR_NET_CONFIGURED; | ||
1349 | spin_unlock_irq(&mdev->req_lock); | ||
1350 | goto fail; | ||
1351 | } | ||
1352 | mdev->net_conf = new_conf; | ||
1353 | |||
1354 | mdev->send_cnt = 0; | ||
1355 | mdev->recv_cnt = 0; | ||
1356 | |||
1357 | if (new_tl_hash) { | ||
1358 | kfree(mdev->tl_hash); | ||
1359 | mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; | ||
1360 | mdev->tl_hash = new_tl_hash; | ||
1361 | } | ||
1362 | |||
1363 | if (new_ee_hash) { | ||
1364 | kfree(mdev->ee_hash); | ||
1365 | mdev->ee_hash_s = mdev->net_conf->max_buffers/8; | ||
1366 | mdev->ee_hash = new_ee_hash; | ||
1367 | } | ||
1368 | |||
1369 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
1370 | mdev->cram_hmac_tfm = tfm; | ||
1371 | |||
1372 | crypto_free_hash(mdev->integrity_w_tfm); | ||
1373 | mdev->integrity_w_tfm = integrity_w_tfm; | ||
1374 | |||
1375 | crypto_free_hash(mdev->integrity_r_tfm); | ||
1376 | mdev->integrity_r_tfm = integrity_r_tfm; | ||
1377 | |||
1378 | kfree(mdev->int_dig_out); | ||
1379 | kfree(mdev->int_dig_in); | ||
1380 | kfree(mdev->int_dig_vv); | ||
1381 | mdev->int_dig_out=int_dig_out; | ||
1382 | mdev->int_dig_in=int_dig_in; | ||
1383 | mdev->int_dig_vv=int_dig_vv; | ||
1384 | spin_unlock_irq(&mdev->req_lock); | ||
1385 | |||
1386 | retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ||
1387 | |||
1388 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1389 | reply->ret_code = retcode; | ||
1390 | drbd_reconfig_done(mdev); | ||
1391 | return 0; | ||
1392 | |||
1393 | fail: | ||
1394 | kfree(int_dig_out); | ||
1395 | kfree(int_dig_in); | ||
1396 | kfree(int_dig_vv); | ||
1397 | crypto_free_hash(tfm); | ||
1398 | crypto_free_hash(integrity_w_tfm); | ||
1399 | crypto_free_hash(integrity_r_tfm); | ||
1400 | kfree(new_tl_hash); | ||
1401 | kfree(new_ee_hash); | ||
1402 | kfree(new_conf); | ||
1403 | |||
1404 | reply->ret_code = retcode; | ||
1405 | drbd_reconfig_done(mdev); | ||
1406 | return 0; | ||
1407 | } | ||
1408 | |||
1409 | static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1410 | struct drbd_nl_cfg_reply *reply) | ||
1411 | { | ||
1412 | int retcode; | ||
1413 | |||
1414 | retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); | ||
1415 | |||
1416 | if (retcode == SS_NOTHING_TO_DO) | ||
1417 | goto done; | ||
1418 | else if (retcode == SS_ALREADY_STANDALONE) | ||
1419 | goto done; | ||
1420 | else if (retcode == SS_PRIMARY_NOP) { | ||
1421 | /* Our statche checking code wants to see the peer outdated. */ | ||
1422 | retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | ||
1423 | pdsk, D_OUTDATED)); | ||
1424 | } else if (retcode == SS_CW_FAILED_BY_PEER) { | ||
1425 | /* The peer probably wants to see us outdated. */ | ||
1426 | retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | ||
1427 | disk, D_OUTDATED), | ||
1428 | CS_ORDERED); | ||
1429 | if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { | ||
1430 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1431 | retcode = SS_SUCCESS; | ||
1432 | } | ||
1433 | } | ||
1434 | |||
1435 | if (retcode < SS_SUCCESS) | ||
1436 | goto fail; | ||
1437 | |||
1438 | if (wait_event_interruptible(mdev->state_wait, | ||
1439 | mdev->state.conn != C_DISCONNECTING)) { | ||
1440 | /* Do not test for mdev->state.conn == C_STANDALONE, since | ||
1441 | someone else might connect us in the mean time! */ | ||
1442 | retcode = ERR_INTR; | ||
1443 | goto fail; | ||
1444 | } | ||
1445 | |||
1446 | done: | ||
1447 | retcode = NO_ERROR; | ||
1448 | fail: | ||
1449 | drbd_md_sync(mdev); | ||
1450 | reply->ret_code = retcode; | ||
1451 | return 0; | ||
1452 | } | ||
1453 | |||
1454 | void resync_after_online_grow(struct drbd_conf *mdev) | ||
1455 | { | ||
1456 | int iass; /* I am sync source */ | ||
1457 | |||
1458 | dev_info(DEV, "Resync of new storage after online grow\n"); | ||
1459 | if (mdev->state.role != mdev->state.peer) | ||
1460 | iass = (mdev->state.role == R_PRIMARY); | ||
1461 | else | ||
1462 | iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1463 | |||
1464 | if (iass) | ||
1465 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1466 | else | ||
1467 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); | ||
1468 | } | ||
1469 | |||
1470 | static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1471 | struct drbd_nl_cfg_reply *reply) | ||
1472 | { | ||
1473 | struct resize rs; | ||
1474 | int retcode = NO_ERROR; | ||
1475 | int ldsc = 0; /* local disk size changed */ | ||
1476 | enum determine_dev_size dd; | ||
1477 | |||
1478 | memset(&rs, 0, sizeof(struct resize)); | ||
1479 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | ||
1480 | retcode = ERR_MANDATORY_TAG; | ||
1481 | goto fail; | ||
1482 | } | ||
1483 | |||
1484 | if (mdev->state.conn > C_CONNECTED) { | ||
1485 | retcode = ERR_RESIZE_RESYNC; | ||
1486 | goto fail; | ||
1487 | } | ||
1488 | |||
1489 | if (mdev->state.role == R_SECONDARY && | ||
1490 | mdev->state.peer == R_SECONDARY) { | ||
1491 | retcode = ERR_NO_PRIMARY; | ||
1492 | goto fail; | ||
1493 | } | ||
1494 | |||
1495 | if (!get_ldev(mdev)) { | ||
1496 | retcode = ERR_NO_DISK; | ||
1497 | goto fail; | ||
1498 | } | ||
1499 | |||
1500 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | ||
1501 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
1502 | ldsc = 1; | ||
1503 | } | ||
1504 | |||
1505 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | ||
1506 | dd = drbd_determin_dev_size(mdev); | ||
1507 | drbd_md_sync(mdev); | ||
1508 | put_ldev(mdev); | ||
1509 | if (dd == dev_size_error) { | ||
1510 | retcode = ERR_NOMEM_BITMAP; | ||
1511 | goto fail; | ||
1512 | } | ||
1513 | |||
1514 | if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { | ||
1515 | if (dd == grew) | ||
1516 | set_bit(RESIZE_PENDING, &mdev->flags); | ||
1517 | |||
1518 | drbd_send_uuids(mdev); | ||
1519 | drbd_send_sizes(mdev, 1); | ||
1520 | } | ||
1521 | |||
1522 | fail: | ||
1523 | reply->ret_code = retcode; | ||
1524 | return 0; | ||
1525 | } | ||
1526 | |||
1527 | static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1528 | struct drbd_nl_cfg_reply *reply) | ||
1529 | { | ||
1530 | int retcode = NO_ERROR; | ||
1531 | int err; | ||
1532 | int ovr; /* online verify running */ | ||
1533 | int rsr; /* re-sync running */ | ||
1534 | struct crypto_hash *verify_tfm = NULL; | ||
1535 | struct crypto_hash *csums_tfm = NULL; | ||
1536 | struct syncer_conf sc; | ||
1537 | cpumask_var_t new_cpu_mask; | ||
1538 | |||
1539 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { | ||
1540 | retcode = ERR_NOMEM; | ||
1541 | goto fail; | ||
1542 | } | ||
1543 | |||
1544 | if (nlp->flags & DRBD_NL_SET_DEFAULTS) { | ||
1545 | memset(&sc, 0, sizeof(struct syncer_conf)); | ||
1546 | sc.rate = DRBD_RATE_DEF; | ||
1547 | sc.after = DRBD_AFTER_DEF; | ||
1548 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | ||
1549 | } else | ||
1550 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | ||
1551 | |||
1552 | if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { | ||
1553 | retcode = ERR_MANDATORY_TAG; | ||
1554 | goto fail; | ||
1555 | } | ||
1556 | |||
1557 | /* re-sync running */ | ||
1558 | rsr = ( mdev->state.conn == C_SYNC_SOURCE || | ||
1559 | mdev->state.conn == C_SYNC_TARGET || | ||
1560 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1561 | mdev->state.conn == C_PAUSED_SYNC_T ); | ||
1562 | |||
1563 | if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { | ||
1564 | retcode = ERR_CSUMS_RESYNC_RUNNING; | ||
1565 | goto fail; | ||
1566 | } | ||
1567 | |||
1568 | if (!rsr && sc.csums_alg[0]) { | ||
1569 | csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); | ||
1570 | if (IS_ERR(csums_tfm)) { | ||
1571 | csums_tfm = NULL; | ||
1572 | retcode = ERR_CSUMS_ALG; | ||
1573 | goto fail; | ||
1574 | } | ||
1575 | |||
1576 | if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { | ||
1577 | retcode = ERR_CSUMS_ALG_ND; | ||
1578 | goto fail; | ||
1579 | } | ||
1580 | } | ||
1581 | |||
1582 | /* online verify running */ | ||
1583 | ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); | ||
1584 | |||
1585 | if (ovr) { | ||
1586 | if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { | ||
1587 | retcode = ERR_VERIFY_RUNNING; | ||
1588 | goto fail; | ||
1589 | } | ||
1590 | } | ||
1591 | |||
1592 | if (!ovr && sc.verify_alg[0]) { | ||
1593 | verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); | ||
1594 | if (IS_ERR(verify_tfm)) { | ||
1595 | verify_tfm = NULL; | ||
1596 | retcode = ERR_VERIFY_ALG; | ||
1597 | goto fail; | ||
1598 | } | ||
1599 | |||
1600 | if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { | ||
1601 | retcode = ERR_VERIFY_ALG_ND; | ||
1602 | goto fail; | ||
1603 | } | ||
1604 | } | ||
1605 | |||
1606 | /* silently ignore cpu mask on UP kernel */ | ||
1607 | if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { | ||
1608 | err = __bitmap_parse(sc.cpu_mask, 32, 0, | ||
1609 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
1610 | if (err) { | ||
1611 | dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); | ||
1612 | retcode = ERR_CPU_MASK_PARSE; | ||
1613 | goto fail; | ||
1614 | } | ||
1615 | } | ||
1616 | |||
1617 | ERR_IF (sc.rate < 1) sc.rate = 1; | ||
1618 | ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ | ||
1619 | #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) | ||
1620 | if (sc.al_extents > AL_MAX) { | ||
1621 | dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); | ||
1622 | sc.al_extents = AL_MAX; | ||
1623 | } | ||
1624 | #undef AL_MAX | ||
1625 | |||
1626 | /* most sanity checks done, try to assign the new sync-after | ||
1627 | * dependency. need to hold the global lock in there, | ||
1628 | * to avoid a race in the dependency loop check. */ | ||
1629 | retcode = drbd_alter_sa(mdev, sc.after); | ||
1630 | if (retcode != NO_ERROR) | ||
1631 | goto fail; | ||
1632 | |||
1633 | /* ok, assign the rest of it as well. | ||
1634 | * lock against receive_SyncParam() */ | ||
1635 | spin_lock(&mdev->peer_seq_lock); | ||
1636 | mdev->sync_conf = sc; | ||
1637 | |||
1638 | if (!rsr) { | ||
1639 | crypto_free_hash(mdev->csums_tfm); | ||
1640 | mdev->csums_tfm = csums_tfm; | ||
1641 | csums_tfm = NULL; | ||
1642 | } | ||
1643 | |||
1644 | if (!ovr) { | ||
1645 | crypto_free_hash(mdev->verify_tfm); | ||
1646 | mdev->verify_tfm = verify_tfm; | ||
1647 | verify_tfm = NULL; | ||
1648 | } | ||
1649 | spin_unlock(&mdev->peer_seq_lock); | ||
1650 | |||
1651 | if (get_ldev(mdev)) { | ||
1652 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
1653 | drbd_al_shrink(mdev); | ||
1654 | err = drbd_check_al_size(mdev); | ||
1655 | lc_unlock(mdev->act_log); | ||
1656 | wake_up(&mdev->al_wait); | ||
1657 | |||
1658 | put_ldev(mdev); | ||
1659 | drbd_md_sync(mdev); | ||
1660 | |||
1661 | if (err) { | ||
1662 | retcode = ERR_NOMEM; | ||
1663 | goto fail; | ||
1664 | } | ||
1665 | } | ||
1666 | |||
1667 | if (mdev->state.conn >= C_CONNECTED) | ||
1668 | drbd_send_sync_param(mdev, &sc); | ||
1669 | |||
1670 | if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { | ||
1671 | cpumask_copy(mdev->cpu_mask, new_cpu_mask); | ||
1672 | drbd_calc_cpu_mask(mdev); | ||
1673 | mdev->receiver.reset_cpu_mask = 1; | ||
1674 | mdev->asender.reset_cpu_mask = 1; | ||
1675 | mdev->worker.reset_cpu_mask = 1; | ||
1676 | } | ||
1677 | |||
1678 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1679 | fail: | ||
1680 | free_cpumask_var(new_cpu_mask); | ||
1681 | crypto_free_hash(csums_tfm); | ||
1682 | crypto_free_hash(verify_tfm); | ||
1683 | reply->ret_code = retcode; | ||
1684 | return 0; | ||
1685 | } | ||
1686 | |||
1687 | static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1688 | struct drbd_nl_cfg_reply *reply) | ||
1689 | { | ||
1690 | int retcode; | ||
1691 | |||
1692 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); | ||
1693 | |||
1694 | if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) | ||
1695 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | ||
1696 | |||
1697 | while (retcode == SS_NEED_CONNECTION) { | ||
1698 | spin_lock_irq(&mdev->req_lock); | ||
1699 | if (mdev->state.conn < C_CONNECTED) | ||
1700 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | ||
1701 | spin_unlock_irq(&mdev->req_lock); | ||
1702 | |||
1703 | if (retcode != SS_NEED_CONNECTION) | ||
1704 | break; | ||
1705 | |||
1706 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | ||
1707 | } | ||
1708 | |||
1709 | reply->ret_code = retcode; | ||
1710 | return 0; | ||
1711 | } | ||
1712 | |||
1713 | static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1714 | struct drbd_nl_cfg_reply *reply) | ||
1715 | { | ||
1716 | |||
1717 | reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | ||
1718 | |||
1719 | return 0; | ||
1720 | } | ||
1721 | |||
1722 | static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1723 | struct drbd_nl_cfg_reply *reply) | ||
1724 | { | ||
1725 | int retcode = NO_ERROR; | ||
1726 | |||
1727 | if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) | ||
1728 | retcode = ERR_PAUSE_IS_SET; | ||
1729 | |||
1730 | reply->ret_code = retcode; | ||
1731 | return 0; | ||
1732 | } | ||
1733 | |||
1734 | static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1735 | struct drbd_nl_cfg_reply *reply) | ||
1736 | { | ||
1737 | int retcode = NO_ERROR; | ||
1738 | |||
1739 | if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) | ||
1740 | retcode = ERR_PAUSE_IS_CLEAR; | ||
1741 | |||
1742 | reply->ret_code = retcode; | ||
1743 | return 0; | ||
1744 | } | ||
1745 | |||
1746 | static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1747 | struct drbd_nl_cfg_reply *reply) | ||
1748 | { | ||
1749 | reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); | ||
1750 | |||
1751 | return 0; | ||
1752 | } | ||
1753 | |||
1754 | static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1755 | struct drbd_nl_cfg_reply *reply) | ||
1756 | { | ||
1757 | reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); | ||
1758 | return 0; | ||
1759 | } | ||
1760 | |||
1761 | static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1762 | struct drbd_nl_cfg_reply *reply) | ||
1763 | { | ||
1764 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); | ||
1765 | return 0; | ||
1766 | } | ||
1767 | |||
1768 | static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1769 | struct drbd_nl_cfg_reply *reply) | ||
1770 | { | ||
1771 | unsigned short *tl; | ||
1772 | |||
1773 | tl = reply->tag_list; | ||
1774 | |||
1775 | if (get_ldev(mdev)) { | ||
1776 | tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); | ||
1777 | put_ldev(mdev); | ||
1778 | } | ||
1779 | |||
1780 | if (get_net_conf(mdev)) { | ||
1781 | tl = net_conf_to_tags(mdev, mdev->net_conf, tl); | ||
1782 | put_net_conf(mdev); | ||
1783 | } | ||
1784 | tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); | ||
1785 | |||
1786 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1787 | |||
1788 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1789 | } | ||
1790 | |||
1791 | static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1792 | struct drbd_nl_cfg_reply *reply) | ||
1793 | { | ||
1794 | unsigned short *tl = reply->tag_list; | ||
1795 | union drbd_state s = mdev->state; | ||
1796 | unsigned long rs_left; | ||
1797 | unsigned int res; | ||
1798 | |||
1799 | tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); | ||
1800 | |||
1801 | /* no local ref, no bitmap, no syncer progress. */ | ||
1802 | if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { | ||
1803 | if (get_ldev(mdev)) { | ||
1804 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
1805 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
1806 | put_ldev(mdev); | ||
1807 | } | ||
1808 | } | ||
1809 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1810 | |||
1811 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1812 | } | ||
1813 | |||
1814 | static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1815 | struct drbd_nl_cfg_reply *reply) | ||
1816 | { | ||
1817 | unsigned short *tl; | ||
1818 | |||
1819 | tl = reply->tag_list; | ||
1820 | |||
1821 | if (get_ldev(mdev)) { | ||
1822 | tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); | ||
1823 | tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); | ||
1824 | put_ldev(mdev); | ||
1825 | } | ||
1826 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1827 | |||
1828 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1829 | } | ||
1830 | |||
1831 | /** | ||
1832 | * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use | ||
1833 | * @mdev: DRBD device. | ||
1834 | * @nlp: Netlink/connector packet from drbdsetup | ||
1835 | * @reply: Reply packet for drbdsetup | ||
1836 | */ | ||
1837 | static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1838 | struct drbd_nl_cfg_reply *reply) | ||
1839 | { | ||
1840 | unsigned short *tl; | ||
1841 | char rv; | ||
1842 | |||
1843 | tl = reply->tag_list; | ||
1844 | |||
1845 | rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : | ||
1846 | test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; | ||
1847 | |||
1848 | tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); | ||
1849 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
1850 | |||
1851 | return (int)((char *)tl - (char *)reply->tag_list); | ||
1852 | } | ||
1853 | |||
1854 | static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1855 | struct drbd_nl_cfg_reply *reply) | ||
1856 | { | ||
1857 | /* default to resume from last known position, if possible */ | ||
1858 | struct start_ov args = | ||
1859 | { .start_sector = mdev->ov_start_sector }; | ||
1860 | |||
1861 | if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { | ||
1862 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1863 | return 0; | ||
1864 | } | ||
1865 | /* w_make_ov_request expects position to be aligned */ | ||
1866 | mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; | ||
1867 | reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | ||
1868 | return 0; | ||
1869 | } | ||
1870 | |||
1871 | |||
1872 | static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1873 | struct drbd_nl_cfg_reply *reply) | ||
1874 | { | ||
1875 | int retcode = NO_ERROR; | ||
1876 | int skip_initial_sync = 0; | ||
1877 | int err; | ||
1878 | |||
1879 | struct new_c_uuid args; | ||
1880 | |||
1881 | memset(&args, 0, sizeof(struct new_c_uuid)); | ||
1882 | if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { | ||
1883 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1884 | return 0; | ||
1885 | } | ||
1886 | |||
1887 | mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ | ||
1888 | |||
1889 | if (!get_ldev(mdev)) { | ||
1890 | retcode = ERR_NO_DISK; | ||
1891 | goto out; | ||
1892 | } | ||
1893 | |||
1894 | /* this is "skip initial sync", assume to be clean */ | ||
1895 | if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && | ||
1896 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { | ||
1897 | dev_info(DEV, "Preparing to skip initial sync\n"); | ||
1898 | skip_initial_sync = 1; | ||
1899 | } else if (mdev->state.conn != C_STANDALONE) { | ||
1900 | retcode = ERR_CONNECTED; | ||
1901 | goto out_dec; | ||
1902 | } | ||
1903 | |||
1904 | drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ | ||
1905 | drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ | ||
1906 | |||
1907 | if (args.clear_bm) { | ||
1908 | err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); | ||
1909 | if (err) { | ||
1910 | dev_err(DEV, "Writing bitmap failed with %d\n",err); | ||
1911 | retcode = ERR_IO_MD_DISK; | ||
1912 | } | ||
1913 | if (skip_initial_sync) { | ||
1914 | drbd_send_uuids_skip_initial_sync(mdev); | ||
1915 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | ||
1916 | spin_lock_irq(&mdev->req_lock); | ||
1917 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | ||
1918 | CS_VERBOSE, NULL); | ||
1919 | spin_unlock_irq(&mdev->req_lock); | ||
1920 | } | ||
1921 | } | ||
1922 | |||
1923 | drbd_md_sync(mdev); | ||
1924 | out_dec: | ||
1925 | put_ldev(mdev); | ||
1926 | out: | ||
1927 | mutex_unlock(&mdev->state_mutex); | ||
1928 | |||
1929 | reply->ret_code = retcode; | ||
1930 | return 0; | ||
1931 | } | ||
1932 | |||
1933 | static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) | ||
1934 | { | ||
1935 | struct drbd_conf *mdev; | ||
1936 | |||
1937 | if (nlp->drbd_minor >= minor_count) | ||
1938 | return NULL; | ||
1939 | |||
1940 | mdev = minor_to_mdev(nlp->drbd_minor); | ||
1941 | |||
1942 | if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { | ||
1943 | struct gendisk *disk = NULL; | ||
1944 | mdev = drbd_new_device(nlp->drbd_minor); | ||
1945 | |||
1946 | spin_lock_irq(&drbd_pp_lock); | ||
1947 | if (minor_table[nlp->drbd_minor] == NULL) { | ||
1948 | minor_table[nlp->drbd_minor] = mdev; | ||
1949 | disk = mdev->vdisk; | ||
1950 | mdev = NULL; | ||
1951 | } /* else: we lost the race */ | ||
1952 | spin_unlock_irq(&drbd_pp_lock); | ||
1953 | |||
1954 | if (disk) /* we won the race above */ | ||
1955 | /* in case we ever add a drbd_delete_device(), | ||
1956 | * don't forget the del_gendisk! */ | ||
1957 | add_disk(disk); | ||
1958 | else /* we lost the race above */ | ||
1959 | drbd_free_mdev(mdev); | ||
1960 | |||
1961 | mdev = minor_to_mdev(nlp->drbd_minor); | ||
1962 | } | ||
1963 | |||
1964 | return mdev; | ||
1965 | } | ||
1966 | |||
/*
 * One entry of the netlink packet dispatch table.
 *
 * function:        handler for the packet type; its return value is added
 *                  to the reply length by the dispatcher.
 * reply_body_size: extra bytes the dispatcher reserves for the handler's
 *                  tag list; 0 for handlers that only set a return code.
 */
struct cn_handler_struct {
	int (*function)(struct drbd_conf *mdev,
			struct drbd_nl_cfg_req *nlp,
			struct drbd_nl_cfg_reply *reply);
	int reply_body_size;
};
1973 | |||
1974 | static struct cn_handler_struct cnd_table[] = { | ||
1975 | [ P_primary ] = { &drbd_nl_primary, 0 }, | ||
1976 | [ P_secondary ] = { &drbd_nl_secondary, 0 }, | ||
1977 | [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, | ||
1978 | [ P_detach ] = { &drbd_nl_detach, 0 }, | ||
1979 | [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, | ||
1980 | [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, | ||
1981 | [ P_resize ] = { &drbd_nl_resize, 0 }, | ||
1982 | [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, | ||
1983 | [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, | ||
1984 | [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, | ||
1985 | [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, | ||
1986 | [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, | ||
1987 | [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, | ||
1988 | [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, | ||
1989 | [ P_outdate ] = { &drbd_nl_outdate, 0 }, | ||
1990 | [ P_get_config ] = { &drbd_nl_get_config, | ||
1991 | sizeof(struct syncer_conf_tag_len_struct) + | ||
1992 | sizeof(struct disk_conf_tag_len_struct) + | ||
1993 | sizeof(struct net_conf_tag_len_struct) }, | ||
1994 | [ P_get_state ] = { &drbd_nl_get_state, | ||
1995 | sizeof(struct get_state_tag_len_struct) + | ||
1996 | sizeof(struct sync_progress_tag_len_struct) }, | ||
1997 | [ P_get_uuids ] = { &drbd_nl_get_uuids, | ||
1998 | sizeof(struct get_uuids_tag_len_struct) }, | ||
1999 | [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, | ||
2000 | sizeof(struct get_timeout_flag_tag_len_struct)}, | ||
2001 | [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, | ||
2002 | [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, | ||
2003 | }; | ||
2004 | |||
2005 | static void drbd_connector_callback(struct cn_msg *req) | ||
2006 | { | ||
2007 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; | ||
2008 | struct cn_handler_struct *cm; | ||
2009 | struct cn_msg *cn_reply; | ||
2010 | struct drbd_nl_cfg_reply *reply; | ||
2011 | struct drbd_conf *mdev; | ||
2012 | int retcode, rr; | ||
2013 | int reply_size = sizeof(struct cn_msg) | ||
2014 | + sizeof(struct drbd_nl_cfg_reply) | ||
2015 | + sizeof(short int); | ||
2016 | |||
2017 | if (!try_module_get(THIS_MODULE)) { | ||
2018 | printk(KERN_ERR "drbd: try_module_get() failed!\n"); | ||
2019 | return; | ||
2020 | } | ||
2021 | |||
2022 | mdev = ensure_mdev(nlp); | ||
2023 | if (!mdev) { | ||
2024 | retcode = ERR_MINOR_INVALID; | ||
2025 | goto fail; | ||
2026 | } | ||
2027 | |||
2028 | trace_drbd_netlink(req, 1); | ||
2029 | |||
2030 | if (nlp->packet_type >= P_nl_after_last_packet) { | ||
2031 | retcode = ERR_PACKET_NR; | ||
2032 | goto fail; | ||
2033 | } | ||
2034 | |||
2035 | cm = cnd_table + nlp->packet_type; | ||
2036 | |||
2037 | /* This may happen if packet number is 0: */ | ||
2038 | if (cm->function == NULL) { | ||
2039 | retcode = ERR_PACKET_NR; | ||
2040 | goto fail; | ||
2041 | } | ||
2042 | |||
2043 | reply_size += cm->reply_body_size; | ||
2044 | |||
2045 | /* allocation not in the IO path, cqueue thread context */ | ||
2046 | cn_reply = kmalloc(reply_size, GFP_KERNEL); | ||
2047 | if (!cn_reply) { | ||
2048 | retcode = ERR_NOMEM; | ||
2049 | goto fail; | ||
2050 | } | ||
2051 | reply = (struct drbd_nl_cfg_reply *) cn_reply->data; | ||
2052 | |||
2053 | reply->packet_type = | ||
2054 | cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; | ||
2055 | reply->minor = nlp->drbd_minor; | ||
2056 | reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ | ||
2057 | /* reply->tag_list; might be modified by cm->function. */ | ||
2058 | |||
2059 | rr = cm->function(mdev, nlp, reply); | ||
2060 | |||
2061 | cn_reply->id = req->id; | ||
2062 | cn_reply->seq = req->seq; | ||
2063 | cn_reply->ack = req->ack + 1; | ||
2064 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; | ||
2065 | cn_reply->flags = 0; | ||
2066 | |||
2067 | trace_drbd_netlink(cn_reply, 0); | ||
2068 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); | ||
2069 | if (rr && rr != -ESRCH) | ||
2070 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2071 | |||
2072 | kfree(cn_reply); | ||
2073 | module_put(THIS_MODULE); | ||
2074 | return; | ||
2075 | fail: | ||
2076 | drbd_nl_send_reply(req, retcode); | ||
2077 | module_put(THIS_MODULE); | ||
2078 | } | ||
2079 | |||
2080 | static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ | ||
2081 | |||
2082 | static unsigned short * | ||
2083 | __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, | ||
2084 | unsigned short len, int nul_terminated) | ||
2085 | { | ||
2086 | unsigned short l = tag_descriptions[tag_number(tag)].max_len; | ||
2087 | len = (len < l) ? len : l; | ||
2088 | put_unaligned(tag, tl++); | ||
2089 | put_unaligned(len, tl++); | ||
2090 | memcpy(tl, data, len); | ||
2091 | tl = (unsigned short*)((char*)tl + len); | ||
2092 | if (nul_terminated) | ||
2093 | *((char*)tl - 1) = 0; | ||
2094 | return tl; | ||
2095 | } | ||
2096 | |||
2097 | static unsigned short * | ||
2098 | tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) | ||
2099 | { | ||
2100 | return __tl_add_blob(tl, tag, data, len, 0); | ||
2101 | } | ||
2102 | |||
2103 | static unsigned short * | ||
2104 | tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) | ||
2105 | { | ||
2106 | return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); | ||
2107 | } | ||
2108 | |||
2109 | static unsigned short * | ||
2110 | tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) | ||
2111 | { | ||
2112 | put_unaligned(tag, tl++); | ||
2113 | switch(tag_type(tag)) { | ||
2114 | case TT_INTEGER: | ||
2115 | put_unaligned(sizeof(int), tl++); | ||
2116 | put_unaligned(*(int *)val, (int *)tl); | ||
2117 | tl = (unsigned short*)((char*)tl+sizeof(int)); | ||
2118 | break; | ||
2119 | case TT_INT64: | ||
2120 | put_unaligned(sizeof(u64), tl++); | ||
2121 | put_unaligned(*(u64 *)val, (u64 *)tl); | ||
2122 | tl = (unsigned short*)((char*)tl+sizeof(u64)); | ||
2123 | break; | ||
2124 | default: | ||
2125 | /* someone did something stupid. */ | ||
2126 | ; | ||
2127 | } | ||
2128 | return tl; | ||
2129 | } | ||
2130 | |||
2131 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) | ||
2132 | { | ||
2133 | char buffer[sizeof(struct cn_msg)+ | ||
2134 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2135 | sizeof(struct get_state_tag_len_struct)+ | ||
2136 | sizeof(short int)]; | ||
2137 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2138 | struct drbd_nl_cfg_reply *reply = | ||
2139 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2140 | unsigned short *tl = reply->tag_list; | ||
2141 | |||
2142 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2143 | |||
2144 | tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); | ||
2145 | |||
2146 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2147 | |||
2148 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2149 | cn_reply->id.val = CN_VAL_DRBD; | ||
2150 | |||
2151 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2152 | cn_reply->ack = 0; /* not used here. */ | ||
2153 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2154 | (int)((char *)tl - (char *)reply->tag_list); | ||
2155 | cn_reply->flags = 0; | ||
2156 | |||
2157 | reply->packet_type = P_get_state; | ||
2158 | reply->minor = mdev_to_minor(mdev); | ||
2159 | reply->ret_code = NO_ERROR; | ||
2160 | |||
2161 | trace_drbd_netlink(cn_reply, 0); | ||
2162 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2163 | } | ||
2164 | |||
2165 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) | ||
2166 | { | ||
2167 | char buffer[sizeof(struct cn_msg)+ | ||
2168 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2169 | sizeof(struct call_helper_tag_len_struct)+ | ||
2170 | sizeof(short int)]; | ||
2171 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2172 | struct drbd_nl_cfg_reply *reply = | ||
2173 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2174 | unsigned short *tl = reply->tag_list; | ||
2175 | |||
2176 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2177 | |||
2178 | tl = tl_add_str(tl, T_helper, helper_name); | ||
2179 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2180 | |||
2181 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2182 | cn_reply->id.val = CN_VAL_DRBD; | ||
2183 | |||
2184 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2185 | cn_reply->ack = 0; /* not used here. */ | ||
2186 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2187 | (int)((char *)tl - (char *)reply->tag_list); | ||
2188 | cn_reply->flags = 0; | ||
2189 | |||
2190 | reply->packet_type = P_call_helper; | ||
2191 | reply->minor = mdev_to_minor(mdev); | ||
2192 | reply->ret_code = NO_ERROR; | ||
2193 | |||
2194 | trace_drbd_netlink(cn_reply, 0); | ||
2195 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2196 | } | ||
2197 | |||
2198 | void drbd_bcast_ee(struct drbd_conf *mdev, | ||
2199 | const char *reason, const int dgs, | ||
2200 | const char* seen_hash, const char* calc_hash, | ||
2201 | const struct drbd_epoch_entry* e) | ||
2202 | { | ||
2203 | struct cn_msg *cn_reply; | ||
2204 | struct drbd_nl_cfg_reply *reply; | ||
2205 | struct bio_vec *bvec; | ||
2206 | unsigned short *tl; | ||
2207 | int i; | ||
2208 | |||
2209 | if (!e) | ||
2210 | return; | ||
2211 | if (!reason || !reason[0]) | ||
2212 | return; | ||
2213 | |||
2214 | /* apparently we have to memcpy twice, first to prepare the data for the | ||
2215 | * struct cn_msg, then within cn_netlink_send from the cn_msg to the | ||
2216 | * netlink skb. */ | ||
2217 | /* receiver thread context, which is not in the writeout path (of this node), | ||
2218 | * but may be in the writeout path of the _other_ node. | ||
2219 | * GFP_NOIO to avoid potential "distributed deadlock". */ | ||
2220 | cn_reply = kmalloc( | ||
2221 | sizeof(struct cn_msg)+ | ||
2222 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2223 | sizeof(struct dump_ee_tag_len_struct)+ | ||
2224 | sizeof(short int), | ||
2225 | GFP_NOIO); | ||
2226 | |||
2227 | if (!cn_reply) { | ||
2228 | dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", | ||
2229 | (unsigned long long)e->sector, e->size); | ||
2230 | return; | ||
2231 | } | ||
2232 | |||
2233 | reply = (struct drbd_nl_cfg_reply*)cn_reply->data; | ||
2234 | tl = reply->tag_list; | ||
2235 | |||
2236 | tl = tl_add_str(tl, T_dump_ee_reason, reason); | ||
2237 | tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); | ||
2238 | tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); | ||
2239 | tl = tl_add_int(tl, T_ee_sector, &e->sector); | ||
2240 | tl = tl_add_int(tl, T_ee_block_id, &e->block_id); | ||
2241 | |||
2242 | put_unaligned(T_ee_data, tl++); | ||
2243 | put_unaligned(e->size, tl++); | ||
2244 | |||
2245 | __bio_for_each_segment(bvec, e->private_bio, i, 0) { | ||
2246 | void *d = kmap(bvec->bv_page); | ||
2247 | memcpy(tl, d + bvec->bv_offset, bvec->bv_len); | ||
2248 | kunmap(bvec->bv_page); | ||
2249 | tl=(unsigned short*)((char*)tl + bvec->bv_len); | ||
2250 | } | ||
2251 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2252 | |||
2253 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2254 | cn_reply->id.val = CN_VAL_DRBD; | ||
2255 | |||
2256 | cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); | ||
2257 | cn_reply->ack = 0; // not used here. | ||
2258 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2259 | (int)((char*)tl - (char*)reply->tag_list); | ||
2260 | cn_reply->flags = 0; | ||
2261 | |||
2262 | reply->packet_type = P_dump_ee; | ||
2263 | reply->minor = mdev_to_minor(mdev); | ||
2264 | reply->ret_code = NO_ERROR; | ||
2265 | |||
2266 | trace_drbd_netlink(cn_reply, 0); | ||
2267 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2268 | kfree(cn_reply); | ||
2269 | } | ||
2270 | |||
2271 | void drbd_bcast_sync_progress(struct drbd_conf *mdev) | ||
2272 | { | ||
2273 | char buffer[sizeof(struct cn_msg)+ | ||
2274 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2275 | sizeof(struct sync_progress_tag_len_struct)+ | ||
2276 | sizeof(short int)]; | ||
2277 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2278 | struct drbd_nl_cfg_reply *reply = | ||
2279 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2280 | unsigned short *tl = reply->tag_list; | ||
2281 | unsigned long rs_left; | ||
2282 | unsigned int res; | ||
2283 | |||
2284 | /* no local ref, no bitmap, no syncer progress, no broadcast. */ | ||
2285 | if (!get_ldev(mdev)) | ||
2286 | return; | ||
2287 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
2288 | put_ldev(mdev); | ||
2289 | |||
2290 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
2291 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2292 | |||
2293 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2294 | cn_reply->id.val = CN_VAL_DRBD; | ||
2295 | |||
2296 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2297 | cn_reply->ack = 0; /* not used here. */ | ||
2298 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2299 | (int)((char *)tl - (char *)reply->tag_list); | ||
2300 | cn_reply->flags = 0; | ||
2301 | |||
2302 | reply->packet_type = P_sync_progress; | ||
2303 | reply->minor = mdev_to_minor(mdev); | ||
2304 | reply->ret_code = NO_ERROR; | ||
2305 | |||
2306 | trace_drbd_netlink(cn_reply, 0); | ||
2307 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2308 | } | ||
2309 | |||
2310 | int __init drbd_nl_init(void) | ||
2311 | { | ||
2312 | static struct cb_id cn_id_drbd; | ||
2313 | int err, try=10; | ||
2314 | |||
2315 | cn_id_drbd.val = CN_VAL_DRBD; | ||
2316 | do { | ||
2317 | cn_id_drbd.idx = cn_idx; | ||
2318 | err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); | ||
2319 | if (!err) | ||
2320 | break; | ||
2321 | cn_idx = (cn_idx + CN_IDX_STEP); | ||
2322 | } while (try--); | ||
2323 | |||
2324 | if (err) { | ||
2325 | printk(KERN_ERR "drbd: cn_drbd failed to register\n"); | ||
2326 | return err; | ||
2327 | } | ||
2328 | |||
2329 | return 0; | ||
2330 | } | ||
2331 | |||
2332 | void drbd_nl_cleanup(void) | ||
2333 | { | ||
2334 | static struct cb_id cn_id_drbd; | ||
2335 | |||
2336 | cn_id_drbd.idx = cn_idx; | ||
2337 | cn_id_drbd.val = CN_VAL_DRBD; | ||
2338 | |||
2339 | cn_del_callback(&cn_id_drbd); | ||
2340 | } | ||
2341 | |||
2342 | void drbd_nl_send_reply(struct cn_msg *req, int ret_code) | ||
2343 | { | ||
2344 | char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; | ||
2345 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2346 | struct drbd_nl_cfg_reply *reply = | ||
2347 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2348 | int rr; | ||
2349 | |||
2350 | cn_reply->id = req->id; | ||
2351 | |||
2352 | cn_reply->seq = req->seq; | ||
2353 | cn_reply->ack = req->ack + 1; | ||
2354 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply); | ||
2355 | cn_reply->flags = 0; | ||
2356 | |||
2357 | reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; | ||
2358 | reply->ret_code = ret_code; | ||
2359 | |||
2360 | trace_drbd_netlink(cn_reply, 0); | ||
2361 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2362 | if (rr && rr != -ESRCH) | ||
2363 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2364 | } | ||
2365 | |||
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 00000000000..98fcb7450c7 --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -0,0 +1,266 @@ | |||
1 | /* | ||
2 | drbd_proc.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/autoconf.h> | ||
27 | #include <linux/module.h> | ||
28 | |||
29 | #include <asm/uaccess.h> | ||
30 | #include <linux/fs.h> | ||
31 | #include <linux/file.h> | ||
32 | #include <linux/slab.h> | ||
33 | #include <linux/proc_fs.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/drbd.h> | ||
36 | #include "drbd_int.h" | ||
37 | |||
38 | static int drbd_proc_open(struct inode *inode, struct file *file); | ||
39 | |||
40 | |||
41 | struct proc_dir_entry *drbd_proc; | ||
42 | struct file_operations drbd_proc_fops = { | ||
43 | .owner = THIS_MODULE, | ||
44 | .open = drbd_proc_open, | ||
45 | .read = seq_read, | ||
46 | .llseek = seq_lseek, | ||
47 | .release = single_release, | ||
48 | }; | ||
49 | |||
50 | |||
51 | /*lge | ||
52 | * progress bars shamelessly adapted from driver/md/md.c | ||
53 | * output looks like | ||
54 | * [=====>..............] 33.5% (23456/123456) | ||
55 | * finish: 2:20:20 speed: 6,345 (6,456) K/sec | ||
56 | */ | ||
57 | static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | ||
58 | { | ||
59 | unsigned long db, dt, dbdt, rt, rs_left; | ||
60 | unsigned int res; | ||
61 | int i, x, y; | ||
62 | |||
63 | drbd_get_syncer_progress(mdev, &rs_left, &res); | ||
64 | |||
65 | x = res/50; | ||
66 | y = 20-x; | ||
67 | seq_printf(seq, "\t["); | ||
68 | for (i = 1; i < x; i++) | ||
69 | seq_printf(seq, "="); | ||
70 | seq_printf(seq, ">"); | ||
71 | for (i = 0; i < y; i++) | ||
72 | seq_printf(seq, "."); | ||
73 | seq_printf(seq, "] "); | ||
74 | |||
75 | seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); | ||
76 | /* if more than 1 GB display in MB */ | ||
77 | if (mdev->rs_total > 0x100000L) | ||
78 | seq_printf(seq, "(%lu/%lu)M\n\t", | ||
79 | (unsigned long) Bit2KB(rs_left >> 10), | ||
80 | (unsigned long) Bit2KB(mdev->rs_total >> 10)); | ||
81 | else | ||
82 | seq_printf(seq, "(%lu/%lu)K\n\t", | ||
83 | (unsigned long) Bit2KB(rs_left), | ||
84 | (unsigned long) Bit2KB(mdev->rs_total)); | ||
85 | |||
86 | /* see drivers/md/md.c | ||
87 | * We do not want to overflow, so the order of operands and | ||
88 | * the * 100 / 100 trick are important. We do a +1 to be | ||
89 | * safe against division by zero. We only estimate anyway. | ||
90 | * | ||
91 | * dt: time from mark until now | ||
92 | * db: blocks written from mark until now | ||
93 | * rt: remaining time | ||
94 | */ | ||
95 | dt = (jiffies - mdev->rs_mark_time) / HZ; | ||
96 | |||
97 | if (dt > 20) { | ||
98 | /* if we made no update to rs_mark_time for too long, | ||
99 | * we are stalled. show that. */ | ||
100 | seq_printf(seq, "stalled\n"); | ||
101 | return; | ||
102 | } | ||
103 | |||
104 | if (!dt) | ||
105 | dt++; | ||
106 | db = mdev->rs_mark_left - rs_left; | ||
107 | rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ | ||
108 | |||
109 | seq_printf(seq, "finish: %lu:%02lu:%02lu", | ||
110 | rt / 3600, (rt % 3600) / 60, rt % 60); | ||
111 | |||
112 | /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ | ||
113 | dbdt = Bit2KB(db/dt); | ||
114 | if (dbdt > 1000) | ||
115 | seq_printf(seq, " speed: %ld,%03ld", | ||
116 | dbdt/1000, dbdt % 1000); | ||
117 | else | ||
118 | seq_printf(seq, " speed: %ld", dbdt); | ||
119 | |||
120 | /* mean speed since syncer started | ||
121 | * we do account for PausedSync periods */ | ||
122 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | ||
123 | if (dt <= 0) | ||
124 | dt = 1; | ||
125 | db = mdev->rs_total - rs_left; | ||
126 | dbdt = Bit2KB(db/dt); | ||
127 | if (dbdt > 1000) | ||
128 | seq_printf(seq, " (%ld,%03ld)", | ||
129 | dbdt/1000, dbdt % 1000); | ||
130 | else | ||
131 | seq_printf(seq, " (%ld)", dbdt); | ||
132 | |||
133 | seq_printf(seq, " K/sec\n"); | ||
134 | } | ||
135 | |||
136 | static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) | ||
137 | { | ||
138 | struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); | ||
139 | |||
140 | seq_printf(seq, "%5d %s %s\n", bme->rs_left, | ||
141 | bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", | ||
142 | bme->flags & BME_LOCKED ? "LOCKED" : "------" | ||
143 | ); | ||
144 | } | ||
145 | |||
146 | static int drbd_seq_show(struct seq_file *seq, void *v) | ||
147 | { | ||
148 | int i, hole = 0; | ||
149 | const char *sn; | ||
150 | struct drbd_conf *mdev; | ||
151 | |||
152 | static char write_ordering_chars[] = { | ||
153 | [WO_none] = 'n', | ||
154 | [WO_drain_io] = 'd', | ||
155 | [WO_bdev_flush] = 'f', | ||
156 | [WO_bio_barrier] = 'b', | ||
157 | }; | ||
158 | |||
159 | seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", | ||
160 | API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); | ||
161 | |||
162 | /* | ||
163 | cs .. connection state | ||
164 | ro .. node role (local/remote) | ||
165 | ds .. disk state (local/remote) | ||
166 | protocol | ||
167 | various flags | ||
168 | ns .. network send | ||
169 | nr .. network receive | ||
170 | dw .. disk write | ||
171 | dr .. disk read | ||
172 | al .. activity log write count | ||
173 | bm .. bitmap update write count | ||
174 | pe .. pending (waiting for ack or data reply) | ||
175 | ua .. unack'd (still need to send ack or data reply) | ||
176 | ap .. application requests accepted, but not yet completed | ||
177 | ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending | ||
178 | wo .. write ordering mode currently in use | ||
179 | oos .. known out-of-sync kB | ||
180 | */ | ||
181 | |||
182 | for (i = 0; i < minor_count; i++) { | ||
183 | mdev = minor_to_mdev(i); | ||
184 | if (!mdev) { | ||
185 | hole = 1; | ||
186 | continue; | ||
187 | } | ||
188 | if (hole) { | ||
189 | hole = 0; | ||
190 | seq_printf(seq, "\n"); | ||
191 | } | ||
192 | |||
193 | sn = drbd_conn_str(mdev->state.conn); | ||
194 | |||
195 | if (mdev->state.conn == C_STANDALONE && | ||
196 | mdev->state.disk == D_DISKLESS && | ||
197 | mdev->state.role == R_SECONDARY) { | ||
198 | seq_printf(seq, "%2d: cs:Unconfigured\n", i); | ||
199 | } else { | ||
200 | seq_printf(seq, | ||
201 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" | ||
202 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " | ||
203 | "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", | ||
204 | i, sn, | ||
205 | drbd_role_str(mdev->state.role), | ||
206 | drbd_role_str(mdev->state.peer), | ||
207 | drbd_disk_str(mdev->state.disk), | ||
208 | drbd_disk_str(mdev->state.pdsk), | ||
209 | (mdev->net_conf == NULL ? ' ' : | ||
210 | (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), | ||
211 | mdev->state.susp ? 's' : 'r', | ||
212 | mdev->state.aftr_isp ? 'a' : '-', | ||
213 | mdev->state.peer_isp ? 'p' : '-', | ||
214 | mdev->state.user_isp ? 'u' : '-', | ||
215 | mdev->congestion_reason ?: '-', | ||
216 | mdev->send_cnt/2, | ||
217 | mdev->recv_cnt/2, | ||
218 | mdev->writ_cnt/2, | ||
219 | mdev->read_cnt/2, | ||
220 | mdev->al_writ_cnt, | ||
221 | mdev->bm_writ_cnt, | ||
222 | atomic_read(&mdev->local_cnt), | ||
223 | atomic_read(&mdev->ap_pending_cnt) + | ||
224 | atomic_read(&mdev->rs_pending_cnt), | ||
225 | atomic_read(&mdev->unacked_cnt), | ||
226 | atomic_read(&mdev->ap_bio_cnt), | ||
227 | mdev->epochs, | ||
228 | write_ordering_chars[mdev->write_ordering] | ||
229 | ); | ||
230 | seq_printf(seq, " oos:%lu\n", | ||
231 | Bit2KB(drbd_bm_total_weight(mdev))); | ||
232 | } | ||
233 | if (mdev->state.conn == C_SYNC_SOURCE || | ||
234 | mdev->state.conn == C_SYNC_TARGET) | ||
235 | drbd_syncer_progress(mdev, seq); | ||
236 | |||
237 | if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) | ||
238 | seq_printf(seq, "\t%3d%% %lu/%lu\n", | ||
239 | (int)((mdev->rs_total-mdev->ov_left) / | ||
240 | (mdev->rs_total/100+1)), | ||
241 | mdev->rs_total - mdev->ov_left, | ||
242 | mdev->rs_total); | ||
243 | |||
244 | if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { | ||
245 | lc_seq_printf_stats(seq, mdev->resync); | ||
246 | lc_seq_printf_stats(seq, mdev->act_log); | ||
247 | put_ldev(mdev); | ||
248 | } | ||
249 | |||
250 | if (proc_details >= 2) { | ||
251 | if (mdev->resync) { | ||
252 | lc_seq_dump_details(seq, mdev->resync, "rs_left", | ||
253 | resync_dump_detail); | ||
254 | } | ||
255 | } | ||
256 | } | ||
257 | |||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | static int drbd_proc_open(struct inode *inode, struct file *file) | ||
262 | { | ||
263 | return single_open(file, drbd_seq_show, PDE(inode)->data); | ||
264 | } | ||
265 | |||
266 | /* PROC FS stuff end */ | ||
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 00000000000..63686c4d85c --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -0,0 +1,4456 @@ | |||
1 | /* | ||
2 | drbd_receiver.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | |||
26 | #include <linux/autoconf.h> | ||
27 | #include <linux/module.h> | ||
28 | |||
29 | #include <asm/uaccess.h> | ||
30 | #include <net/sock.h> | ||
31 | |||
32 | #include <linux/version.h> | ||
33 | #include <linux/drbd.h> | ||
34 | #include <linux/fs.h> | ||
35 | #include <linux/file.h> | ||
36 | #include <linux/in.h> | ||
37 | #include <linux/mm.h> | ||
38 | #include <linux/memcontrol.h> | ||
39 | #include <linux/mm_inline.h> | ||
40 | #include <linux/slab.h> | ||
41 | #include <linux/smp_lock.h> | ||
42 | #include <linux/pkt_sched.h> | ||
43 | #define __KERNEL_SYSCALLS__ | ||
44 | #include <linux/unistd.h> | ||
45 | #include <linux/vmalloc.h> | ||
46 | #include <linux/random.h> | ||
47 | #include <linux/mm.h> | ||
48 | #include <linux/string.h> | ||
49 | #include <linux/scatterlist.h> | ||
50 | #include "drbd_int.h" | ||
51 | #include "drbd_tracing.h" | ||
52 | #include "drbd_req.h" | ||
53 | |||
54 | #include "drbd_vli.h" | ||
55 | |||
56 | struct flush_work { | ||
57 | struct drbd_work w; | ||
58 | struct drbd_epoch *epoch; | ||
59 | }; | ||
60 | |||
/* Return type of drbd_may_finish_epoch() and drbd_flush_after_epoch().
 * NOTE(review): presumably tells the caller what became of the epoch
 * object (still in use / freed / reused) -- confirm against
 * drbd_may_finish_epoch(). */
enum finish_epoch {
	FE_STILL_LIVE,
	FE_DESTROYED,
	FE_RECYCLED,
};
66 | |||
67 | static int drbd_do_handshake(struct drbd_conf *mdev); | ||
68 | static int drbd_do_auth(struct drbd_conf *mdev); | ||
69 | |||
70 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); | ||
71 | static int e_end_block(struct drbd_conf *, struct drbd_work *, int); | ||
72 | |||
73 | static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) | ||
74 | { | ||
75 | struct drbd_epoch *prev; | ||
76 | spin_lock(&mdev->epoch_lock); | ||
77 | prev = list_entry(epoch->list.prev, struct drbd_epoch, list); | ||
78 | if (prev == epoch || prev == mdev->current_epoch) | ||
79 | prev = NULL; | ||
80 | spin_unlock(&mdev->epoch_lock); | ||
81 | return prev; | ||
82 | } | ||
83 | |||
84 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | ||
85 | |||
86 | static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) | ||
87 | { | ||
88 | struct page *page = NULL; | ||
89 | |||
90 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | ||
91 | * So what. It saves a spin_lock. */ | ||
92 | if (drbd_pp_vacant > 0) { | ||
93 | spin_lock(&drbd_pp_lock); | ||
94 | page = drbd_pp_pool; | ||
95 | if (page) { | ||
96 | drbd_pp_pool = (struct page *)page_private(page); | ||
97 | set_page_private(page, 0); /* just to be polite */ | ||
98 | drbd_pp_vacant--; | ||
99 | } | ||
100 | spin_unlock(&drbd_pp_lock); | ||
101 | } | ||
102 | /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD | ||
103 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
104 | * which in turn might block on the other node at this very place. */ | ||
105 | if (!page) | ||
106 | page = alloc_page(GFP_TRY); | ||
107 | if (page) | ||
108 | atomic_inc(&mdev->pp_in_use); | ||
109 | return page; | ||
110 | } | ||
111 | |||
112 | /* kick lower level device, if we have more than (arbitrary number) | ||
113 | * reference counts on it, which typically are locally submitted io | ||
114 | * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ | ||
115 | static void maybe_kick_lo(struct drbd_conf *mdev) | ||
116 | { | ||
117 | if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) | ||
118 | drbd_kick_lo(mdev); | ||
119 | } | ||
120 | |||
121 | static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) | ||
122 | { | ||
123 | struct drbd_epoch_entry *e; | ||
124 | struct list_head *le, *tle; | ||
125 | |||
126 | /* The EEs are always appended to the end of the list. Since | ||
127 | they are sent in order over the wire, they have to finish | ||
128 | in order. As soon as we see the first not finished we can | ||
129 | stop to examine the list... */ | ||
130 | |||
131 | list_for_each_safe(le, tle, &mdev->net_ee) { | ||
132 | e = list_entry(le, struct drbd_epoch_entry, w.list); | ||
133 | if (drbd_bio_has_active_page(e->private_bio)) | ||
134 | break; | ||
135 | list_move(le, to_be_freed); | ||
136 | } | ||
137 | } | ||
138 | |||
139 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | ||
140 | { | ||
141 | LIST_HEAD(reclaimed); | ||
142 | struct drbd_epoch_entry *e, *t; | ||
143 | |||
144 | maybe_kick_lo(mdev); | ||
145 | spin_lock_irq(&mdev->req_lock); | ||
146 | reclaim_net_ee(mdev, &reclaimed); | ||
147 | spin_unlock_irq(&mdev->req_lock); | ||
148 | |||
149 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | ||
150 | drbd_free_ee(mdev, e); | ||
151 | } | ||
152 | |||
153 | /** | ||
154 | * drbd_pp_alloc() - Returns a page, fails only if a signal comes in | ||
155 | * @mdev: DRBD device. | ||
156 | * @retry: whether or not to retry allocation forever (or until signalled) | ||
157 | * | ||
158 | * Tries to allocate a page, first from our own page pool, then from the | ||
159 | * kernel, unless this allocation would exceed the max_buffers setting. | ||
160 | * If @retry is non-zero, retry until DRBD frees a page somewhere else. | ||
161 | */ | ||
162 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) | ||
163 | { | ||
164 | struct page *page = NULL; | ||
165 | DEFINE_WAIT(wait); | ||
166 | |||
167 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | ||
168 | page = drbd_pp_first_page_or_try_alloc(mdev); | ||
169 | if (page) | ||
170 | return page; | ||
171 | } | ||
172 | |||
173 | for (;;) { | ||
174 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | ||
175 | |||
176 | drbd_kick_lo_and_reclaim_net(mdev); | ||
177 | |||
178 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | ||
179 | page = drbd_pp_first_page_or_try_alloc(mdev); | ||
180 | if (page) | ||
181 | break; | ||
182 | } | ||
183 | |||
184 | if (!retry) | ||
185 | break; | ||
186 | |||
187 | if (signal_pending(current)) { | ||
188 | dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); | ||
189 | break; | ||
190 | } | ||
191 | |||
192 | schedule(); | ||
193 | } | ||
194 | finish_wait(&drbd_pp_wait, &wait); | ||
195 | |||
196 | return page; | ||
197 | } | ||
198 | |||
199 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | ||
200 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ | ||
201 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) | ||
202 | { | ||
203 | int free_it; | ||
204 | |||
205 | spin_lock(&drbd_pp_lock); | ||
206 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
207 | free_it = 1; | ||
208 | } else { | ||
209 | set_page_private(page, (unsigned long)drbd_pp_pool); | ||
210 | drbd_pp_pool = page; | ||
211 | drbd_pp_vacant++; | ||
212 | free_it = 0; | ||
213 | } | ||
214 | spin_unlock(&drbd_pp_lock); | ||
215 | |||
216 | atomic_dec(&mdev->pp_in_use); | ||
217 | |||
218 | if (free_it) | ||
219 | __free_page(page); | ||
220 | |||
221 | wake_up(&drbd_pp_wait); | ||
222 | } | ||
223 | |||
224 | static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) | ||
225 | { | ||
226 | struct page *p_to_be_freed = NULL; | ||
227 | struct page *page; | ||
228 | struct bio_vec *bvec; | ||
229 | int i; | ||
230 | |||
231 | spin_lock(&drbd_pp_lock); | ||
232 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
233 | if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { | ||
234 | set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); | ||
235 | p_to_be_freed = bvec->bv_page; | ||
236 | } else { | ||
237 | set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); | ||
238 | drbd_pp_pool = bvec->bv_page; | ||
239 | drbd_pp_vacant++; | ||
240 | } | ||
241 | } | ||
242 | spin_unlock(&drbd_pp_lock); | ||
243 | atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); | ||
244 | |||
245 | while (p_to_be_freed) { | ||
246 | page = p_to_be_freed; | ||
247 | p_to_be_freed = (struct page *)page_private(page); | ||
248 | set_page_private(page, 0); /* just to be polite */ | ||
249 | put_page(page); | ||
250 | } | ||
251 | |||
252 | wake_up(&drbd_pp_wait); | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | You need to hold the req_lock: | ||
257 | _drbd_wait_ee_list_empty() | ||
258 | |||
259 | You must not have the req_lock: | ||
260 | drbd_free_ee() | ||
261 | drbd_alloc_ee() | ||
262 | drbd_init_ee() | ||
263 | drbd_release_ee() | ||
264 | drbd_ee_fix_bhs() | ||
265 | drbd_process_done_ee() | ||
266 | drbd_clear_done_ee() | ||
267 | drbd_wait_ee_list_empty() | ||
268 | */ | ||
269 | |||
270 | struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | ||
271 | u64 id, | ||
272 | sector_t sector, | ||
273 | unsigned int data_size, | ||
274 | gfp_t gfp_mask) __must_hold(local) | ||
275 | { | ||
276 | struct request_queue *q; | ||
277 | struct drbd_epoch_entry *e; | ||
278 | struct page *page; | ||
279 | struct bio *bio; | ||
280 | unsigned int ds; | ||
281 | |||
282 | if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) | ||
283 | return NULL; | ||
284 | |||
285 | e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); | ||
286 | if (!e) { | ||
287 | if (!(gfp_mask & __GFP_NOWARN)) | ||
288 | dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); | ||
289 | return NULL; | ||
290 | } | ||
291 | |||
292 | bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); | ||
293 | if (!bio) { | ||
294 | if (!(gfp_mask & __GFP_NOWARN)) | ||
295 | dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); | ||
296 | goto fail1; | ||
297 | } | ||
298 | |||
299 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
300 | bio->bi_sector = sector; | ||
301 | |||
302 | ds = data_size; | ||
303 | while (ds) { | ||
304 | page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); | ||
305 | if (!page) { | ||
306 | if (!(gfp_mask & __GFP_NOWARN)) | ||
307 | dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); | ||
308 | goto fail2; | ||
309 | } | ||
310 | if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { | ||
311 | drbd_pp_free(mdev, page); | ||
312 | dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," | ||
313 | "data_size=%u,ds=%u) failed\n", | ||
314 | (unsigned long long)sector, data_size, ds); | ||
315 | |||
316 | q = bdev_get_queue(bio->bi_bdev); | ||
317 | if (q->merge_bvec_fn) { | ||
318 | struct bvec_merge_data bvm = { | ||
319 | .bi_bdev = bio->bi_bdev, | ||
320 | .bi_sector = bio->bi_sector, | ||
321 | .bi_size = bio->bi_size, | ||
322 | .bi_rw = bio->bi_rw, | ||
323 | }; | ||
324 | int l = q->merge_bvec_fn(q, &bvm, | ||
325 | &bio->bi_io_vec[bio->bi_vcnt]); | ||
326 | dev_err(DEV, "merge_bvec_fn() = %d\n", l); | ||
327 | } | ||
328 | |||
329 | /* dump more of the bio. */ | ||
330 | dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); | ||
331 | dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); | ||
332 | dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); | ||
333 | dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); | ||
334 | |||
335 | goto fail2; | ||
336 | break; | ||
337 | } | ||
338 | ds -= min_t(int, ds, PAGE_SIZE); | ||
339 | } | ||
340 | |||
341 | D_ASSERT(data_size == bio->bi_size); | ||
342 | |||
343 | bio->bi_private = e; | ||
344 | e->mdev = mdev; | ||
345 | e->sector = sector; | ||
346 | e->size = bio->bi_size; | ||
347 | |||
348 | e->private_bio = bio; | ||
349 | e->block_id = id; | ||
350 | INIT_HLIST_NODE(&e->colision); | ||
351 | e->epoch = NULL; | ||
352 | e->flags = 0; | ||
353 | |||
354 | trace_drbd_ee(mdev, e, "allocated"); | ||
355 | |||
356 | return e; | ||
357 | |||
358 | fail2: | ||
359 | drbd_pp_free_bio_pages(mdev, bio); | ||
360 | bio_put(bio); | ||
361 | fail1: | ||
362 | mempool_free(e, drbd_ee_mempool); | ||
363 | |||
364 | return NULL; | ||
365 | } | ||
366 | |||
367 | void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
368 | { | ||
369 | struct bio *bio = e->private_bio; | ||
370 | trace_drbd_ee(mdev, e, "freed"); | ||
371 | drbd_pp_free_bio_pages(mdev, bio); | ||
372 | bio_put(bio); | ||
373 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
374 | mempool_free(e, drbd_ee_mempool); | ||
375 | } | ||
376 | |||
377 | int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) | ||
378 | { | ||
379 | LIST_HEAD(work_list); | ||
380 | struct drbd_epoch_entry *e, *t; | ||
381 | int count = 0; | ||
382 | |||
383 | spin_lock_irq(&mdev->req_lock); | ||
384 | list_splice_init(list, &work_list); | ||
385 | spin_unlock_irq(&mdev->req_lock); | ||
386 | |||
387 | list_for_each_entry_safe(e, t, &work_list, w.list) { | ||
388 | drbd_free_ee(mdev, e); | ||
389 | count++; | ||
390 | } | ||
391 | return count; | ||
392 | } | ||
393 | |||
394 | |||
395 | /* | ||
396 | * This function is called from _asender only_ | ||
397 | * but see also comments in _req_mod(,barrier_acked) | ||
398 | * and receive_Barrier. | ||
399 | * | ||
400 | * Move entries from net_ee to done_ee, if ready. | ||
401 | * Grab done_ee, call all callbacks, free the entries. | ||
402 | * The callbacks typically send out ACKs. | ||
403 | */ | ||
404 | static int drbd_process_done_ee(struct drbd_conf *mdev) | ||
405 | { | ||
406 | LIST_HEAD(work_list); | ||
407 | LIST_HEAD(reclaimed); | ||
408 | struct drbd_epoch_entry *e, *t; | ||
409 | int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); | ||
410 | |||
411 | spin_lock_irq(&mdev->req_lock); | ||
412 | reclaim_net_ee(mdev, &reclaimed); | ||
413 | list_splice_init(&mdev->done_ee, &work_list); | ||
414 | spin_unlock_irq(&mdev->req_lock); | ||
415 | |||
416 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | ||
417 | drbd_free_ee(mdev, e); | ||
418 | |||
419 | /* possible callbacks here: | ||
420 | * e_end_block, and e_end_resync_block, e_send_discard_ack. | ||
421 | * all ignore the last argument. | ||
422 | */ | ||
423 | list_for_each_entry_safe(e, t, &work_list, w.list) { | ||
424 | trace_drbd_ee(mdev, e, "process_done_ee"); | ||
425 | /* list_del not necessary, next/prev members not touched */ | ||
426 | ok = e->w.cb(mdev, &e->w, !ok) && ok; | ||
427 | drbd_free_ee(mdev, e); | ||
428 | } | ||
429 | wake_up(&mdev->ee_wait); | ||
430 | |||
431 | return ok; | ||
432 | } | ||
433 | |||
434 | void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | ||
435 | { | ||
436 | DEFINE_WAIT(wait); | ||
437 | |||
438 | /* avoids spin_lock/unlock | ||
439 | * and calling prepare_to_wait in the fast path */ | ||
440 | while (!list_empty(head)) { | ||
441 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
442 | spin_unlock_irq(&mdev->req_lock); | ||
443 | drbd_kick_lo(mdev); | ||
444 | schedule(); | ||
445 | finish_wait(&mdev->ee_wait, &wait); | ||
446 | spin_lock_irq(&mdev->req_lock); | ||
447 | } | ||
448 | } | ||
449 | |||
450 | void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | ||
451 | { | ||
452 | spin_lock_irq(&mdev->req_lock); | ||
453 | _drbd_wait_ee_list_empty(mdev, head); | ||
454 | spin_unlock_irq(&mdev->req_lock); | ||
455 | } | ||
456 | |||
457 | /* see also kernel_accept; which is only present since 2.6.18. | ||
458 | * also we want to log which part of it failed, exactly */ | ||
459 | static int drbd_accept(struct drbd_conf *mdev, const char **what, | ||
460 | struct socket *sock, struct socket **newsock) | ||
461 | { | ||
462 | struct sock *sk = sock->sk; | ||
463 | int err = 0; | ||
464 | |||
465 | *what = "listen"; | ||
466 | err = sock->ops->listen(sock, 5); | ||
467 | if (err < 0) | ||
468 | goto out; | ||
469 | |||
470 | *what = "sock_create_lite"; | ||
471 | err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, | ||
472 | newsock); | ||
473 | if (err < 0) | ||
474 | goto out; | ||
475 | |||
476 | *what = "accept"; | ||
477 | err = sock->ops->accept(sock, *newsock, 0); | ||
478 | if (err < 0) { | ||
479 | sock_release(*newsock); | ||
480 | *newsock = NULL; | ||
481 | goto out; | ||
482 | } | ||
483 | (*newsock)->ops = sock->ops; | ||
484 | |||
485 | out: | ||
486 | return err; | ||
487 | } | ||
488 | |||
489 | static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | ||
490 | void *buf, size_t size, int flags) | ||
491 | { | ||
492 | mm_segment_t oldfs; | ||
493 | struct kvec iov = { | ||
494 | .iov_base = buf, | ||
495 | .iov_len = size, | ||
496 | }; | ||
497 | struct msghdr msg = { | ||
498 | .msg_iovlen = 1, | ||
499 | .msg_iov = (struct iovec *)&iov, | ||
500 | .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) | ||
501 | }; | ||
502 | int rv; | ||
503 | |||
504 | oldfs = get_fs(); | ||
505 | set_fs(KERNEL_DS); | ||
506 | rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); | ||
507 | set_fs(oldfs); | ||
508 | |||
509 | return rv; | ||
510 | } | ||
511 | |||
512 | static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) | ||
513 | { | ||
514 | mm_segment_t oldfs; | ||
515 | struct kvec iov = { | ||
516 | .iov_base = buf, | ||
517 | .iov_len = size, | ||
518 | }; | ||
519 | struct msghdr msg = { | ||
520 | .msg_iovlen = 1, | ||
521 | .msg_iov = (struct iovec *)&iov, | ||
522 | .msg_flags = MSG_WAITALL | MSG_NOSIGNAL | ||
523 | }; | ||
524 | int rv; | ||
525 | |||
526 | oldfs = get_fs(); | ||
527 | set_fs(KERNEL_DS); | ||
528 | |||
529 | for (;;) { | ||
530 | rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); | ||
531 | if (rv == size) | ||
532 | break; | ||
533 | |||
534 | /* Note: | ||
535 | * ECONNRESET other side closed the connection | ||
536 | * ERESTARTSYS (on sock) we got a signal | ||
537 | */ | ||
538 | |||
539 | if (rv < 0) { | ||
540 | if (rv == -ECONNRESET) | ||
541 | dev_info(DEV, "sock was reset by peer\n"); | ||
542 | else if (rv != -ERESTARTSYS) | ||
543 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
544 | break; | ||
545 | } else if (rv == 0) { | ||
546 | dev_info(DEV, "sock was shut down by peer\n"); | ||
547 | break; | ||
548 | } else { | ||
549 | /* signal came in, or peer/link went down, | ||
550 | * after we read a partial message | ||
551 | */ | ||
552 | /* D_ASSERT(signal_pending(current)); */ | ||
553 | break; | ||
554 | } | ||
555 | }; | ||
556 | |||
557 | set_fs(oldfs); | ||
558 | |||
559 | if (rv != size) | ||
560 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | ||
561 | |||
562 | return rv; | ||
563 | } | ||
564 | |||
/*
 * drbd_try_connect() - one active connection attempt towards the peer
 * @mdev:	DRBD device.
 *
 * Creates a TCP socket, binds it to the configured local address (with
 * the port forced to 0) and connects to the configured peer address.
 * Send and receive timeouts are set to try_connect_int seconds.
 *
 * Returns the connected socket, or NULL on failure or if there is no
 * network configuration.  A failure before the connect step that is not
 * one of the transient errors listed below forces the connection state
 * to C_DISCONNECTING; a failing connect never does.
 */
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	const char *what;
	struct socket *sock;
	struct sockaddr_in6 src_in6;
	int err;
	int my_addr_len;
	int disconnect_on_error = 1;

	if (!get_net_conf(mdev))
		return NULL;

	what = "sock_create_kern";
	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
		SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_rcvtimeo =
	sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	/* Clamp the length once and use the clamped value for the copy AND
	 * for bind(): handing bind() a length larger than sizeof(src_in6)
	 * would let the socket layer read beyond this stack variable. */
	my_addr_len = min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6));
	memcpy(&src_in6, mdev->net_conf->my_addr, my_addr_len);
	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
		src_in6.sin6_port = 0;
	else
		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

	what = "bind before connect";
	err = sock->ops->bind(sock,
			      (struct sockaddr *) &src_in6,
			      my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	disconnect_on_error = 0;
	what = "connect";
	err = sock->ops->connect(sock,
				 (struct sockaddr *)mdev->net_conf->peer_addr,
				 mdev->net_conf->peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN:    case EHOSTUNREACH:
			disconnect_on_error = 0;
			break;
		default:
			dev_err(DEV, "%s failed, err = %d\n", what, err);
		}
		if (disconnect_on_error)
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}
	put_net_conf(mdev);
	return sock;
}
640 | |||
641 | static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) | ||
642 | { | ||
643 | int timeo, err; | ||
644 | struct socket *s_estab = NULL, *s_listen; | ||
645 | const char *what; | ||
646 | |||
647 | if (!get_net_conf(mdev)) | ||
648 | return NULL; | ||
649 | |||
650 | what = "sock_create_kern"; | ||
651 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | ||
652 | SOCK_STREAM, IPPROTO_TCP, &s_listen); | ||
653 | if (err) { | ||
654 | s_listen = NULL; | ||
655 | goto out; | ||
656 | } | ||
657 | |||
658 | timeo = mdev->net_conf->try_connect_int * HZ; | ||
659 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | ||
660 | |||
661 | s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
662 | s_listen->sk->sk_rcvtimeo = timeo; | ||
663 | s_listen->sk->sk_sndtimeo = timeo; | ||
664 | |||
665 | what = "bind before listen"; | ||
666 | err = s_listen->ops->bind(s_listen, | ||
667 | (struct sockaddr *) mdev->net_conf->my_addr, | ||
668 | mdev->net_conf->my_addr_len); | ||
669 | if (err < 0) | ||
670 | goto out; | ||
671 | |||
672 | err = drbd_accept(mdev, &what, s_listen, &s_estab); | ||
673 | |||
674 | out: | ||
675 | if (s_listen) | ||
676 | sock_release(s_listen); | ||
677 | if (err < 0) { | ||
678 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | ||
679 | dev_err(DEV, "%s failed, err = %d\n", what, err); | ||
680 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
681 | } | ||
682 | } | ||
683 | put_net_conf(mdev); | ||
684 | |||
685 | return s_estab; | ||
686 | } | ||
687 | |||
688 | static int drbd_send_fp(struct drbd_conf *mdev, | ||
689 | struct socket *sock, enum drbd_packets cmd) | ||
690 | { | ||
691 | struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; | ||
692 | |||
693 | return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); | ||
694 | } | ||
695 | |||
696 | static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) | ||
697 | { | ||
698 | struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; | ||
699 | int rr; | ||
700 | |||
701 | rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); | ||
702 | |||
703 | if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) | ||
704 | return be16_to_cpu(h->command); | ||
705 | |||
706 | return 0xffff; | ||
707 | } | ||
708 | |||
709 | /** | ||
710 | * drbd_socket_okay() - Free the socket if its connection is not okay | ||
711 | * @mdev: DRBD device. | ||
712 | * @sock: pointer to the pointer to the socket. | ||
713 | */ | ||
714 | static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | ||
715 | { | ||
716 | int rr; | ||
717 | char tb[4]; | ||
718 | |||
719 | if (!*sock) | ||
720 | return FALSE; | ||
721 | |||
722 | rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); | ||
723 | |||
724 | if (rr > 0 || rr == -EAGAIN) { | ||
725 | return TRUE; | ||
726 | } else { | ||
727 | sock_release(*sock); | ||
728 | *sock = NULL; | ||
729 | return FALSE; | ||
730 | } | ||
731 | } | ||
732 | |||
733 | /* | ||
734 | * return values: | ||
735 | * 1 yes, we have a valid connection | ||
736 | * 0 oops, did not work out, please try again | ||
737 | * -1 peer talks different language, | ||
738 | * no point in trying again, please go standalone. | ||
739 | * -2 We do not have a network config... | ||
740 | */ | ||
741 | static int drbd_connect(struct drbd_conf *mdev) | ||
742 | { | ||
743 | struct socket *s, *sock, *msock; | ||
744 | int try, h, ok; | ||
745 | |||
746 | D_ASSERT(!mdev->data.socket); | ||
747 | |||
748 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) | ||
749 | dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); | ||
750 | |||
751 | if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) | ||
752 | return -2; | ||
753 | |||
754 | clear_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
755 | |||
756 | sock = NULL; | ||
757 | msock = NULL; | ||
758 | |||
759 | do { | ||
760 | for (try = 0;;) { | ||
761 | /* 3 tries, this should take less than a second! */ | ||
762 | s = drbd_try_connect(mdev); | ||
763 | if (s || ++try >= 3) | ||
764 | break; | ||
765 | /* give the other side time to call bind() & listen() */ | ||
766 | __set_current_state(TASK_INTERRUPTIBLE); | ||
767 | schedule_timeout(HZ / 10); | ||
768 | } | ||
769 | |||
770 | if (s) { | ||
771 | if (!sock) { | ||
772 | drbd_send_fp(mdev, s, P_HAND_SHAKE_S); | ||
773 | sock = s; | ||
774 | s = NULL; | ||
775 | } else if (!msock) { | ||
776 | drbd_send_fp(mdev, s, P_HAND_SHAKE_M); | ||
777 | msock = s; | ||
778 | s = NULL; | ||
779 | } else { | ||
780 | dev_err(DEV, "Logic error in drbd_connect()\n"); | ||
781 | goto out_release_sockets; | ||
782 | } | ||
783 | } | ||
784 | |||
785 | if (sock && msock) { | ||
786 | __set_current_state(TASK_INTERRUPTIBLE); | ||
787 | schedule_timeout(HZ / 10); | ||
788 | ok = drbd_socket_okay(mdev, &sock); | ||
789 | ok = drbd_socket_okay(mdev, &msock) && ok; | ||
790 | if (ok) | ||
791 | break; | ||
792 | } | ||
793 | |||
794 | retry: | ||
795 | s = drbd_wait_for_connect(mdev); | ||
796 | if (s) { | ||
797 | try = drbd_recv_fp(mdev, s); | ||
798 | drbd_socket_okay(mdev, &sock); | ||
799 | drbd_socket_okay(mdev, &msock); | ||
800 | switch (try) { | ||
801 | case P_HAND_SHAKE_S: | ||
802 | if (sock) { | ||
803 | dev_warn(DEV, "initial packet S crossed\n"); | ||
804 | sock_release(sock); | ||
805 | } | ||
806 | sock = s; | ||
807 | break; | ||
808 | case P_HAND_SHAKE_M: | ||
809 | if (msock) { | ||
810 | dev_warn(DEV, "initial packet M crossed\n"); | ||
811 | sock_release(msock); | ||
812 | } | ||
813 | msock = s; | ||
814 | set_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
815 | break; | ||
816 | default: | ||
817 | dev_warn(DEV, "Error receiving initial packet\n"); | ||
818 | sock_release(s); | ||
819 | if (random32() & 1) | ||
820 | goto retry; | ||
821 | } | ||
822 | } | ||
823 | |||
824 | if (mdev->state.conn <= C_DISCONNECTING) | ||
825 | goto out_release_sockets; | ||
826 | if (signal_pending(current)) { | ||
827 | flush_signals(current); | ||
828 | smp_rmb(); | ||
829 | if (get_t_state(&mdev->receiver) == Exiting) | ||
830 | goto out_release_sockets; | ||
831 | } | ||
832 | |||
833 | if (sock && msock) { | ||
834 | ok = drbd_socket_okay(mdev, &sock); | ||
835 | ok = drbd_socket_okay(mdev, &msock) && ok; | ||
836 | if (ok) | ||
837 | break; | ||
838 | } | ||
839 | } while (1); | ||
840 | |||
841 | msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
842 | sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ | ||
843 | |||
844 | sock->sk->sk_allocation = GFP_NOIO; | ||
845 | msock->sk->sk_allocation = GFP_NOIO; | ||
846 | |||
847 | sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | ||
848 | msock->sk->sk_priority = TC_PRIO_INTERACTIVE; | ||
849 | |||
850 | if (mdev->net_conf->sndbuf_size) { | ||
851 | sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; | ||
852 | sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; | ||
853 | } | ||
854 | |||
855 | if (mdev->net_conf->rcvbuf_size) { | ||
856 | sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; | ||
857 | sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; | ||
858 | } | ||
859 | |||
860 | /* NOT YET ... | ||
861 | * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
862 | * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | ||
863 | * first set it to the P_HAND_SHAKE timeout, | ||
864 | * which we set to 4x the configured ping_timeout. */ | ||
865 | sock->sk->sk_sndtimeo = | ||
866 | sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; | ||
867 | |||
868 | msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
869 | msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
870 | |||
871 | /* we don't want delays. | ||
872 | * we use TCP_CORK where apropriate, though */ | ||
873 | drbd_tcp_nodelay(sock); | ||
874 | drbd_tcp_nodelay(msock); | ||
875 | |||
876 | mdev->data.socket = sock; | ||
877 | mdev->meta.socket = msock; | ||
878 | mdev->last_received = jiffies; | ||
879 | |||
880 | D_ASSERT(mdev->asender.task == NULL); | ||
881 | |||
882 | h = drbd_do_handshake(mdev); | ||
883 | if (h <= 0) | ||
884 | return h; | ||
885 | |||
886 | if (mdev->cram_hmac_tfm) { | ||
887 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ | ||
888 | if (!drbd_do_auth(mdev)) { | ||
889 | dev_err(DEV, "Authentication of peer failed\n"); | ||
890 | return -1; | ||
891 | } | ||
892 | } | ||
893 | |||
894 | if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) | ||
895 | return 0; | ||
896 | |||
897 | sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | ||
898 | sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | ||
899 | |||
900 | atomic_set(&mdev->packet_seq, 0); | ||
901 | mdev->peer_seq = 0; | ||
902 | |||
903 | drbd_thread_start(&mdev->asender); | ||
904 | |||
905 | drbd_send_protocol(mdev); | ||
906 | drbd_send_sync_param(mdev, &mdev->sync_conf); | ||
907 | drbd_send_sizes(mdev, 0); | ||
908 | drbd_send_uuids(mdev); | ||
909 | drbd_send_state(mdev); | ||
910 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
911 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
912 | |||
913 | return 1; | ||
914 | |||
915 | out_release_sockets: | ||
916 | if (sock) | ||
917 | sock_release(sock); | ||
918 | if (msock) | ||
919 | sock_release(msock); | ||
920 | return -1; | ||
921 | } | ||
922 | |||
923 | static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) | ||
924 | { | ||
925 | int r; | ||
926 | |||
927 | r = drbd_recv(mdev, h, sizeof(*h)); | ||
928 | |||
929 | if (unlikely(r != sizeof(*h))) { | ||
930 | dev_err(DEV, "short read expecting header on sock: r=%d\n", r); | ||
931 | return FALSE; | ||
932 | }; | ||
933 | h->command = be16_to_cpu(h->command); | ||
934 | h->length = be16_to_cpu(h->length); | ||
935 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | ||
936 | dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", | ||
937 | (long)be32_to_cpu(h->magic), | ||
938 | h->command, h->length); | ||
939 | return FALSE; | ||
940 | } | ||
941 | mdev->last_received = jiffies; | ||
942 | |||
943 | return TRUE; | ||
944 | } | ||
945 | |||
946 | static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) | ||
947 | { | ||
948 | int rv; | ||
949 | |||
950 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | ||
951 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); | ||
952 | if (rv) { | ||
953 | dev_err(DEV, "local disk flush failed with status %d\n", rv); | ||
954 | /* would rather check on EOPNOTSUPP, but that is not reliable. | ||
955 | * don't try again for ANY return value != 0 | ||
956 | * if (rv == -EOPNOTSUPP) */ | ||
957 | drbd_bump_write_ordering(mdev, WO_drain_io); | ||
958 | } | ||
959 | put_ldev(mdev); | ||
960 | } | ||
961 | |||
962 | return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); | ||
963 | } | ||
964 | |||
965 | static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
966 | { | ||
967 | struct flush_work *fw = (struct flush_work *)w; | ||
968 | struct drbd_epoch *epoch = fw->epoch; | ||
969 | |||
970 | kfree(w); | ||
971 | |||
972 | if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) | ||
973 | drbd_flush_after_epoch(mdev, epoch); | ||
974 | |||
975 | drbd_may_finish_epoch(mdev, epoch, EV_PUT | | ||
976 | (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); | ||
977 | |||
978 | return 1; | ||
979 | } | ||
980 | |||
981 | /** | ||
982 | * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. | ||
983 | * @mdev: DRBD device. | ||
984 | * @epoch: Epoch object. | ||
985 | * @ev: Epoch event. | ||
986 | */ | ||
987 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | ||
988 | struct drbd_epoch *epoch, | ||
989 | enum epoch_event ev) | ||
990 | { | ||
991 | int finish, epoch_size; | ||
992 | struct drbd_epoch *next_epoch; | ||
993 | int schedule_flush = 0; | ||
994 | enum finish_epoch rv = FE_STILL_LIVE; | ||
995 | |||
996 | spin_lock(&mdev->epoch_lock); | ||
997 | do { | ||
998 | next_epoch = NULL; | ||
999 | finish = 0; | ||
1000 | |||
1001 | epoch_size = atomic_read(&epoch->epoch_size); | ||
1002 | |||
1003 | switch (ev & ~EV_CLEANUP) { | ||
1004 | case EV_PUT: | ||
1005 | atomic_dec(&epoch->active); | ||
1006 | break; | ||
1007 | case EV_GOT_BARRIER_NR: | ||
1008 | set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); | ||
1009 | |||
1010 | /* Special case: If we just switched from WO_bio_barrier to | ||
1011 | WO_bdev_flush we should not finish the current epoch */ | ||
1012 | if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && | ||
1013 | mdev->write_ordering != WO_bio_barrier && | ||
1014 | epoch == mdev->current_epoch) | ||
1015 | clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); | ||
1016 | break; | ||
1017 | case EV_BARRIER_DONE: | ||
1018 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); | ||
1019 | break; | ||
1020 | case EV_BECAME_LAST: | ||
1021 | /* nothing to do*/ | ||
1022 | break; | ||
1023 | } | ||
1024 | |||
1025 | trace_drbd_epoch(mdev, epoch, ev); | ||
1026 | |||
1027 | if (epoch_size != 0 && | ||
1028 | atomic_read(&epoch->active) == 0 && | ||
1029 | test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && | ||
1030 | epoch->list.prev == &mdev->current_epoch->list && | ||
1031 | !test_bit(DE_IS_FINISHING, &epoch->flags)) { | ||
1032 | /* Nearly all conditions are met to finish that epoch... */ | ||
1033 | if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || | ||
1034 | mdev->write_ordering == WO_none || | ||
1035 | (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || | ||
1036 | ev & EV_CLEANUP) { | ||
1037 | finish = 1; | ||
1038 | set_bit(DE_IS_FINISHING, &epoch->flags); | ||
1039 | } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && | ||
1040 | mdev->write_ordering == WO_bio_barrier) { | ||
1041 | atomic_inc(&epoch->active); | ||
1042 | schedule_flush = 1; | ||
1043 | } | ||
1044 | } | ||
1045 | if (finish) { | ||
1046 | if (!(ev & EV_CLEANUP)) { | ||
1047 | spin_unlock(&mdev->epoch_lock); | ||
1048 | drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); | ||
1049 | spin_lock(&mdev->epoch_lock); | ||
1050 | } | ||
1051 | dec_unacked(mdev); | ||
1052 | |||
1053 | if (mdev->current_epoch != epoch) { | ||
1054 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); | ||
1055 | list_del(&epoch->list); | ||
1056 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); | ||
1057 | mdev->epochs--; | ||
1058 | trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE); | ||
1059 | kfree(epoch); | ||
1060 | |||
1061 | if (rv == FE_STILL_LIVE) | ||
1062 | rv = FE_DESTROYED; | ||
1063 | } else { | ||
1064 | epoch->flags = 0; | ||
1065 | atomic_set(&epoch->epoch_size, 0); | ||
1066 | /* atomic_set(&epoch->active, 0); is alrady zero */ | ||
1067 | if (rv == FE_STILL_LIVE) | ||
1068 | rv = FE_RECYCLED; | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | if (!next_epoch) | ||
1073 | break; | ||
1074 | |||
1075 | epoch = next_epoch; | ||
1076 | } while (1); | ||
1077 | |||
1078 | spin_unlock(&mdev->epoch_lock); | ||
1079 | |||
1080 | if (schedule_flush) { | ||
1081 | struct flush_work *fw; | ||
1082 | fw = kmalloc(sizeof(*fw), GFP_ATOMIC); | ||
1083 | if (fw) { | ||
1084 | trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH); | ||
1085 | fw->w.cb = w_flush; | ||
1086 | fw->epoch = epoch; | ||
1087 | drbd_queue_work(&mdev->data.work, &fw->w); | ||
1088 | } else { | ||
1089 | dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); | ||
1090 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1091 | /* That is not a recursion, only one level */ | ||
1092 | drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); | ||
1093 | drbd_may_finish_epoch(mdev, epoch, EV_PUT); | ||
1094 | } | ||
1095 | } | ||
1096 | |||
1097 | return rv; | ||
1098 | } | ||
1099 | |||
1100 | /** | ||
1101 | * drbd_bump_write_ordering() - Fall back to an other write ordering method | ||
1102 | * @mdev: DRBD device. | ||
1103 | * @wo: Write ordering method to try. | ||
1104 | */ | ||
1105 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) | ||
1106 | { | ||
1107 | enum write_ordering_e pwo; | ||
1108 | static char *write_ordering_str[] = { | ||
1109 | [WO_none] = "none", | ||
1110 | [WO_drain_io] = "drain", | ||
1111 | [WO_bdev_flush] = "flush", | ||
1112 | [WO_bio_barrier] = "barrier", | ||
1113 | }; | ||
1114 | |||
1115 | pwo = mdev->write_ordering; | ||
1116 | wo = min(pwo, wo); | ||
1117 | if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) | ||
1118 | wo = WO_bdev_flush; | ||
1119 | if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) | ||
1120 | wo = WO_drain_io; | ||
1121 | if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) | ||
1122 | wo = WO_none; | ||
1123 | mdev->write_ordering = wo; | ||
1124 | if (pwo != mdev->write_ordering || wo == WO_bio_barrier) | ||
1125 | dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); | ||
1126 | } | ||
1127 | |||
1128 | /** | ||
1129 | * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set | ||
1130 | * @mdev: DRBD device. | ||
1131 | * @w: work object. | ||
1132 | * @cancel: The connection will be closed anyways (unused in this callback) | ||
1133 | */ | ||
1134 | int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) | ||
1135 | { | ||
1136 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1137 | struct bio *bio = e->private_bio; | ||
1138 | |||
1139 | /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, | ||
1140 | (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) | ||
1141 | so that we can finish that epoch in drbd_may_finish_epoch(). | ||
1142 | That is necessary if we already have a long chain of Epochs, before | ||
1143 | we realize that BIO_RW_BARRIER is actually not supported */ | ||
1144 | |||
1145 | /* As long as the -ENOTSUPP on the barrier is reported immediately | ||
1146 | that will never trigger. If it is reported late, we will just | ||
1147 | print that warning and continue correctly for all future requests | ||
1148 | with WO_bdev_flush */ | ||
1149 | if (previous_epoch(mdev, e->epoch)) | ||
1150 | dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); | ||
1151 | |||
1152 | /* prepare bio for re-submit, | ||
1153 | * re-init volatile members */ | ||
1154 | /* we still have a local reference, | ||
1155 | * get_ldev was done in receive_Data. */ | ||
1156 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1157 | bio->bi_sector = e->sector; | ||
1158 | bio->bi_size = e->size; | ||
1159 | bio->bi_idx = 0; | ||
1160 | |||
1161 | bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1162 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1163 | |||
1164 | /* don't know whether this is necessary: */ | ||
1165 | bio->bi_phys_segments = 0; | ||
1166 | bio->bi_next = NULL; | ||
1167 | |||
1168 | /* these should be unchanged: */ | ||
1169 | /* bio->bi_end_io = drbd_endio_write_sec; */ | ||
1170 | /* bio->bi_vcnt = whatever; */ | ||
1171 | |||
1172 | e->w.cb = e_end_block; | ||
1173 | |||
1174 | /* This is no longer a barrier request. */ | ||
1175 | bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); | ||
1176 | |||
1177 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); | ||
1178 | |||
1179 | return 1; | ||
1180 | } | ||
1181 | |||
1182 | static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) | ||
1183 | { | ||
1184 | int rv, issue_flush; | ||
1185 | struct p_barrier *p = (struct p_barrier *)h; | ||
1186 | struct drbd_epoch *epoch; | ||
1187 | |||
1188 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
1189 | |||
1190 | rv = drbd_recv(mdev, h->payload, h->length); | ||
1191 | ERR_IF(rv != h->length) return FALSE; | ||
1192 | |||
1193 | inc_unacked(mdev); | ||
1194 | |||
1195 | if (mdev->net_conf->wire_protocol != DRBD_PROT_C) | ||
1196 | drbd_kick_lo(mdev); | ||
1197 | |||
1198 | mdev->current_epoch->barrier_nr = p->barrier; | ||
1199 | rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); | ||
1200 | |||
1201 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from | ||
1202 | * the activity log, which means it would not be resynced in case the | ||
1203 | * R_PRIMARY crashes now. | ||
1204 | * Therefore we must send the barrier_ack after the barrier request was | ||
1205 | * completed. */ | ||
1206 | switch (mdev->write_ordering) { | ||
1207 | case WO_bio_barrier: | ||
1208 | case WO_none: | ||
1209 | if (rv == FE_RECYCLED) | ||
1210 | return TRUE; | ||
1211 | break; | ||
1212 | |||
1213 | case WO_bdev_flush: | ||
1214 | case WO_drain_io: | ||
1215 | D_ASSERT(rv == FE_STILL_LIVE); | ||
1216 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); | ||
1217 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1218 | rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); | ||
1219 | if (rv == FE_RECYCLED) | ||
1220 | return TRUE; | ||
1221 | |||
1222 | /* The asender will send all the ACKs and barrier ACKs out, since | ||
1223 | all EEs moved from the active_ee to the done_ee. We need to | ||
1224 | provide a new epoch object for the EEs that come in soon */ | ||
1225 | break; | ||
1226 | } | ||
1227 | |||
1228 | /* receiver context, in the writeout path of the other node. | ||
1229 | * avoid potential distributed deadlock */ | ||
1230 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); | ||
1231 | if (!epoch) { | ||
1232 | dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); | ||
1233 | issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1234 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1235 | if (issue_flush) { | ||
1236 | rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); | ||
1237 | if (rv == FE_RECYCLED) | ||
1238 | return TRUE; | ||
1239 | } | ||
1240 | |||
1241 | drbd_wait_ee_list_empty(mdev, &mdev->done_ee); | ||
1242 | |||
1243 | return TRUE; | ||
1244 | } | ||
1245 | |||
1246 | epoch->flags = 0; | ||
1247 | atomic_set(&epoch->epoch_size, 0); | ||
1248 | atomic_set(&epoch->active, 0); | ||
1249 | |||
1250 | spin_lock(&mdev->epoch_lock); | ||
1251 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | ||
1252 | list_add(&epoch->list, &mdev->current_epoch->list); | ||
1253 | mdev->current_epoch = epoch; | ||
1254 | mdev->epochs++; | ||
1255 | trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC); | ||
1256 | } else { | ||
1257 | /* The current_epoch got recycled while we allocated this one... */ | ||
1258 | kfree(epoch); | ||
1259 | } | ||
1260 | spin_unlock(&mdev->epoch_lock); | ||
1261 | |||
1262 | return TRUE; | ||
1263 | } | ||
1264 | |||
1265 | /* used from receive_RSDataReply (recv_resync_read) | ||
1266 | * and from receive_Data */ | ||
1267 | static struct drbd_epoch_entry * | ||
1268 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | ||
1269 | { | ||
1270 | struct drbd_epoch_entry *e; | ||
1271 | struct bio_vec *bvec; | ||
1272 | struct page *page; | ||
1273 | struct bio *bio; | ||
1274 | int dgs, ds, i, rr; | ||
1275 | void *dig_in = mdev->int_dig_in; | ||
1276 | void *dig_vv = mdev->int_dig_vv; | ||
1277 | |||
1278 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1279 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1280 | |||
1281 | if (dgs) { | ||
1282 | rr = drbd_recv(mdev, dig_in, dgs); | ||
1283 | if (rr != dgs) { | ||
1284 | dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", | ||
1285 | rr, dgs); | ||
1286 | return NULL; | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | data_size -= dgs; | ||
1291 | |||
1292 | ERR_IF(data_size & 0x1ff) return NULL; | ||
1293 | ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; | ||
1294 | |||
1295 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | ||
1296 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
1297 | * which in turn might block on the other node at this very place. */ | ||
1298 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | ||
1299 | if (!e) | ||
1300 | return NULL; | ||
1301 | bio = e->private_bio; | ||
1302 | ds = data_size; | ||
1303 | bio_for_each_segment(bvec, bio, i) { | ||
1304 | page = bvec->bv_page; | ||
1305 | rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); | ||
1306 | kunmap(page); | ||
1307 | if (rr != min_t(int, ds, PAGE_SIZE)) { | ||
1308 | drbd_free_ee(mdev, e); | ||
1309 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1310 | rr, min_t(int, ds, PAGE_SIZE)); | ||
1311 | return NULL; | ||
1312 | } | ||
1313 | ds -= rr; | ||
1314 | } | ||
1315 | |||
1316 | if (dgs) { | ||
1317 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | ||
1318 | if (memcmp(dig_in, dig_vv, dgs)) { | ||
1319 | dev_err(DEV, "Digest integrity check FAILED.\n"); | ||
1320 | drbd_bcast_ee(mdev, "digest failed", | ||
1321 | dgs, dig_in, dig_vv, e); | ||
1322 | drbd_free_ee(mdev, e); | ||
1323 | return NULL; | ||
1324 | } | ||
1325 | } | ||
1326 | mdev->recv_cnt += data_size>>9; | ||
1327 | return e; | ||
1328 | } | ||
1329 | |||
1330 | /* drbd_drain_block() just takes a data block | ||
1331 | * out of the socket input buffer, and discards it. | ||
1332 | */ | ||
1333 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | ||
1334 | { | ||
1335 | struct page *page; | ||
1336 | int rr, rv = 1; | ||
1337 | void *data; | ||
1338 | |||
1339 | page = drbd_pp_alloc(mdev, 1); | ||
1340 | |||
1341 | data = kmap(page); | ||
1342 | while (data_size) { | ||
1343 | rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); | ||
1344 | if (rr != min_t(int, data_size, PAGE_SIZE)) { | ||
1345 | rv = 0; | ||
1346 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1347 | rr, min_t(int, data_size, PAGE_SIZE)); | ||
1348 | break; | ||
1349 | } | ||
1350 | data_size -= rr; | ||
1351 | } | ||
1352 | kunmap(page); | ||
1353 | drbd_pp_free(mdev, page); | ||
1354 | return rv; | ||
1355 | } | ||
1356 | |||
1357 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | ||
1358 | sector_t sector, int data_size) | ||
1359 | { | ||
1360 | struct bio_vec *bvec; | ||
1361 | struct bio *bio; | ||
1362 | int dgs, rr, i, expect; | ||
1363 | void *dig_in = mdev->int_dig_in; | ||
1364 | void *dig_vv = mdev->int_dig_vv; | ||
1365 | |||
1366 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1367 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1368 | |||
1369 | if (dgs) { | ||
1370 | rr = drbd_recv(mdev, dig_in, dgs); | ||
1371 | if (rr != dgs) { | ||
1372 | dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", | ||
1373 | rr, dgs); | ||
1374 | return 0; | ||
1375 | } | ||
1376 | } | ||
1377 | |||
1378 | data_size -= dgs; | ||
1379 | |||
1380 | /* optimistically update recv_cnt. if receiving fails below, | ||
1381 | * we disconnect anyways, and counters will be reset. */ | ||
1382 | mdev->recv_cnt += data_size>>9; | ||
1383 | |||
1384 | bio = req->master_bio; | ||
1385 | D_ASSERT(sector == bio->bi_sector); | ||
1386 | |||
1387 | bio_for_each_segment(bvec, bio, i) { | ||
1388 | expect = min_t(int, data_size, bvec->bv_len); | ||
1389 | rr = drbd_recv(mdev, | ||
1390 | kmap(bvec->bv_page)+bvec->bv_offset, | ||
1391 | expect); | ||
1392 | kunmap(bvec->bv_page); | ||
1393 | if (rr != expect) { | ||
1394 | dev_warn(DEV, "short read receiving data reply: " | ||
1395 | "read %d expected %d\n", | ||
1396 | rr, expect); | ||
1397 | return 0; | ||
1398 | } | ||
1399 | data_size -= rr; | ||
1400 | } | ||
1401 | |||
1402 | if (dgs) { | ||
1403 | drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); | ||
1404 | if (memcmp(dig_in, dig_vv, dgs)) { | ||
1405 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | ||
1406 | return 0; | ||
1407 | } | ||
1408 | } | ||
1409 | |||
1410 | D_ASSERT(data_size == 0); | ||
1411 | return 1; | ||
1412 | } | ||
1413 | |||
1414 | /* e_end_resync_block() is called via | ||
1415 | * drbd_process_done_ee() by asender only */ | ||
1416 | static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1417 | { | ||
1418 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1419 | sector_t sector = e->sector; | ||
1420 | int ok; | ||
1421 | |||
1422 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
1423 | |||
1424 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1425 | drbd_set_in_sync(mdev, sector, e->size); | ||
1426 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | ||
1427 | } else { | ||
1428 | /* Record failure to sync */ | ||
1429 | drbd_rs_failed_io(mdev, sector, e->size); | ||
1430 | |||
1431 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | ||
1432 | } | ||
1433 | dec_unacked(mdev); | ||
1434 | |||
1435 | return ok; | ||
1436 | } | ||
1437 | |||
1438 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) | ||
1439 | { | ||
1440 | struct drbd_epoch_entry *e; | ||
1441 | |||
1442 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | ||
1443 | if (!e) { | ||
1444 | put_ldev(mdev); | ||
1445 | return FALSE; | ||
1446 | } | ||
1447 | |||
1448 | dec_rs_pending(mdev); | ||
1449 | |||
1450 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1451 | e->private_bio->bi_rw = WRITE; | ||
1452 | e->w.cb = e_end_resync_block; | ||
1453 | |||
1454 | inc_unacked(mdev); | ||
1455 | /* corresponding dec_unacked() in e_end_resync_block() | ||
1456 | * respective _drbd_clear_done_ee */ | ||
1457 | |||
1458 | spin_lock_irq(&mdev->req_lock); | ||
1459 | list_add(&e->w.list, &mdev->sync_ee); | ||
1460 | spin_unlock_irq(&mdev->req_lock); | ||
1461 | |||
1462 | trace_drbd_ee(mdev, e, "submitting for (rs)write"); | ||
1463 | trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); | ||
1464 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); | ||
1465 | /* accounting done in endio */ | ||
1466 | |||
1467 | maybe_kick_lo(mdev); | ||
1468 | return TRUE; | ||
1469 | } | ||
1470 | |||
1471 | static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) | ||
1472 | { | ||
1473 | struct drbd_request *req; | ||
1474 | sector_t sector; | ||
1475 | unsigned int header_size, data_size; | ||
1476 | int ok; | ||
1477 | struct p_data *p = (struct p_data *)h; | ||
1478 | |||
1479 | header_size = sizeof(*p) - sizeof(*h); | ||
1480 | data_size = h->length - header_size; | ||
1481 | |||
1482 | ERR_IF(data_size == 0) return FALSE; | ||
1483 | |||
1484 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1485 | return FALSE; | ||
1486 | |||
1487 | sector = be64_to_cpu(p->sector); | ||
1488 | |||
1489 | spin_lock_irq(&mdev->req_lock); | ||
1490 | req = _ar_id_to_req(mdev, p->block_id, sector); | ||
1491 | spin_unlock_irq(&mdev->req_lock); | ||
1492 | if (unlikely(!req)) { | ||
1493 | dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); | ||
1494 | return FALSE; | ||
1495 | } | ||
1496 | |||
1497 | /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid | ||
1498 | * special casing it there for the various failure cases. | ||
1499 | * still no race with drbd_fail_pending_reads */ | ||
1500 | ok = recv_dless_read(mdev, req, sector, data_size); | ||
1501 | |||
1502 | if (ok) | ||
1503 | req_mod(req, data_received); | ||
1504 | /* else: nothing. handled from drbd_disconnect... | ||
1505 | * I don't think we may complete this just yet | ||
1506 | * in case we are "on-disconnect: freeze" */ | ||
1507 | |||
1508 | return ok; | ||
1509 | } | ||
1510 | |||
1511 | static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) | ||
1512 | { | ||
1513 | sector_t sector; | ||
1514 | unsigned int header_size, data_size; | ||
1515 | int ok; | ||
1516 | struct p_data *p = (struct p_data *)h; | ||
1517 | |||
1518 | header_size = sizeof(*p) - sizeof(*h); | ||
1519 | data_size = h->length - header_size; | ||
1520 | |||
1521 | ERR_IF(data_size == 0) return FALSE; | ||
1522 | |||
1523 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1524 | return FALSE; | ||
1525 | |||
1526 | sector = be64_to_cpu(p->sector); | ||
1527 | D_ASSERT(p->block_id == ID_SYNCER); | ||
1528 | |||
1529 | if (get_ldev(mdev)) { | ||
1530 | /* data is submitted to disk within recv_resync_read. | ||
1531 | * corresponding put_ldev done below on error, | ||
1532 | * or in drbd_endio_write_sec. */ | ||
1533 | ok = recv_resync_read(mdev, sector, data_size); | ||
1534 | } else { | ||
1535 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1536 | dev_err(DEV, "Can not write resync data to local disk.\n"); | ||
1537 | |||
1538 | ok = drbd_drain_block(mdev, data_size); | ||
1539 | |||
1540 | drbd_send_ack_dp(mdev, P_NEG_ACK, p); | ||
1541 | } | ||
1542 | |||
1543 | return ok; | ||
1544 | } | ||
1545 | |||
1546 | /* e_end_block() is called via drbd_process_done_ee(). | ||
1547 | * this means this function only runs in the asender thread | ||
1548 | */ | ||
1549 | static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1550 | { | ||
1551 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1552 | sector_t sector = e->sector; | ||
1553 | struct drbd_epoch *epoch; | ||
1554 | int ok = 1, pcmd; | ||
1555 | |||
1556 | if (e->flags & EE_IS_BARRIER) { | ||
1557 | epoch = previous_epoch(mdev, e->epoch); | ||
1558 | if (epoch) | ||
1559 | drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); | ||
1560 | } | ||
1561 | |||
1562 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | ||
1563 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1564 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | ||
1565 | mdev->state.conn <= C_PAUSED_SYNC_T && | ||
1566 | e->flags & EE_MAY_SET_IN_SYNC) ? | ||
1567 | P_RS_WRITE_ACK : P_WRITE_ACK; | ||
1568 | ok &= drbd_send_ack(mdev, pcmd, e); | ||
1569 | if (pcmd == P_RS_WRITE_ACK) | ||
1570 | drbd_set_in_sync(mdev, sector, e->size); | ||
1571 | } else { | ||
1572 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | ||
1573 | /* we expect it to be marked out of sync anyways... | ||
1574 | * maybe assert this? */ | ||
1575 | } | ||
1576 | dec_unacked(mdev); | ||
1577 | } | ||
1578 | /* we delete from the conflict detection hash _after_ we sent out the | ||
1579 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ | ||
1580 | if (mdev->net_conf->two_primaries) { | ||
1581 | spin_lock_irq(&mdev->req_lock); | ||
1582 | D_ASSERT(!hlist_unhashed(&e->colision)); | ||
1583 | hlist_del_init(&e->colision); | ||
1584 | spin_unlock_irq(&mdev->req_lock); | ||
1585 | } else { | ||
1586 | D_ASSERT(hlist_unhashed(&e->colision)); | ||
1587 | } | ||
1588 | |||
1589 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); | ||
1590 | |||
1591 | return ok; | ||
1592 | } | ||
1593 | |||
1594 | static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1595 | { | ||
1596 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | ||
1597 | int ok = 1; | ||
1598 | |||
1599 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1600 | ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); | ||
1601 | |||
1602 | spin_lock_irq(&mdev->req_lock); | ||
1603 | D_ASSERT(!hlist_unhashed(&e->colision)); | ||
1604 | hlist_del_init(&e->colision); | ||
1605 | spin_unlock_irq(&mdev->req_lock); | ||
1606 | |||
1607 | dec_unacked(mdev); | ||
1608 | |||
1609 | return ok; | ||
1610 | } | ||
1611 | |||
1612 | /* Called from receive_Data. | ||
1613 | * Synchronize packets on sock with packets on msock. | ||
1614 | * | ||
1615 | * This is here so even when a P_DATA packet traveling via sock overtook an Ack | ||
1616 | * packet traveling on msock, they are still processed in the order they have | ||
1617 | * been sent. | ||
1618 | * | ||
1619 | * Note: we don't care for Ack packets overtaking P_DATA packets. | ||
1620 | * | ||
1621 | * In case packet_seq is larger than mdev->peer_seq number, there are | ||
1622 | * outstanding packets on the msock. We wait for them to arrive. | ||
1623 | * In case we are the logically next packet, we update mdev->peer_seq | ||
1624 | * ourselves. Correctly handles 32bit wrap around. | ||
1625 | * | ||
1626 | * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, | ||
1627 | * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds | ||
1628 | * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have | ||
1629 | * 1<<9 == 512 seconds aka ages for the 32bit wrap around... | ||
1630 | * | ||
1631 | * returns 0 if we may process the packet, | ||
1632 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ | ||
1633 | static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) | ||
1634 | { | ||
1635 | DEFINE_WAIT(wait); | ||
1636 | unsigned int p_seq; | ||
1637 | long timeout; | ||
1638 | int ret = 0; | ||
1639 | spin_lock(&mdev->peer_seq_lock); | ||
1640 | for (;;) { | ||
1641 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | ||
1642 | if (seq_le(packet_seq, mdev->peer_seq+1)) | ||
1643 | break; | ||
1644 | if (signal_pending(current)) { | ||
1645 | ret = -ERESTARTSYS; | ||
1646 | break; | ||
1647 | } | ||
1648 | p_seq = mdev->peer_seq; | ||
1649 | spin_unlock(&mdev->peer_seq_lock); | ||
1650 | timeout = schedule_timeout(30*HZ); | ||
1651 | spin_lock(&mdev->peer_seq_lock); | ||
1652 | if (timeout == 0 && p_seq == mdev->peer_seq) { | ||
1653 | ret = -ETIMEDOUT; | ||
1654 | dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); | ||
1655 | break; | ||
1656 | } | ||
1657 | } | ||
1658 | finish_wait(&mdev->seq_wait, &wait); | ||
1659 | if (mdev->peer_seq+1 == packet_seq) | ||
1660 | mdev->peer_seq++; | ||
1661 | spin_unlock(&mdev->peer_seq_lock); | ||
1662 | return ret; | ||
1663 | } | ||
1664 | |||
1665 | /* mirrored write */ | ||
1666 | static int receive_Data(struct drbd_conf *mdev, struct p_header *h) | ||
1667 | { | ||
1668 | sector_t sector; | ||
1669 | struct drbd_epoch_entry *e; | ||
1670 | struct p_data *p = (struct p_data *)h; | ||
1671 | int header_size, data_size; | ||
1672 | int rw = WRITE; | ||
1673 | u32 dp_flags; | ||
1674 | |||
1675 | header_size = sizeof(*p) - sizeof(*h); | ||
1676 | data_size = h->length - header_size; | ||
1677 | |||
1678 | ERR_IF(data_size == 0) return FALSE; | ||
1679 | |||
1680 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
1681 | return FALSE; | ||
1682 | |||
1683 | if (!get_ldev(mdev)) { | ||
1684 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1685 | dev_err(DEV, "Can not write mirrored data block " | ||
1686 | "to local disk.\n"); | ||
1687 | spin_lock(&mdev->peer_seq_lock); | ||
1688 | if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) | ||
1689 | mdev->peer_seq++; | ||
1690 | spin_unlock(&mdev->peer_seq_lock); | ||
1691 | |||
1692 | drbd_send_ack_dp(mdev, P_NEG_ACK, p); | ||
1693 | atomic_inc(&mdev->current_epoch->epoch_size); | ||
1694 | return drbd_drain_block(mdev, data_size); | ||
1695 | } | ||
1696 | |||
1697 | /* get_ldev(mdev) successful. | ||
1698 | * Corresponding put_ldev done either below (on various errors), | ||
1699 | * or in drbd_endio_write_sec, if we successfully submit the data at | ||
1700 | * the end of this function. */ | ||
1701 | |||
1702 | sector = be64_to_cpu(p->sector); | ||
1703 | e = read_in_block(mdev, p->block_id, sector, data_size); | ||
1704 | if (!e) { | ||
1705 | put_ldev(mdev); | ||
1706 | return FALSE; | ||
1707 | } | ||
1708 | |||
1709 | e->private_bio->bi_end_io = drbd_endio_write_sec; | ||
1710 | e->w.cb = e_end_block; | ||
1711 | |||
1712 | spin_lock(&mdev->epoch_lock); | ||
1713 | e->epoch = mdev->current_epoch; | ||
1714 | atomic_inc(&e->epoch->epoch_size); | ||
1715 | atomic_inc(&e->epoch->active); | ||
1716 | |||
1717 | if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { | ||
1718 | struct drbd_epoch *epoch; | ||
1719 | /* Issue a barrier if we start a new epoch, and the previous epoch | ||
1720 | was not a epoch containing a single request which already was | ||
1721 | a Barrier. */ | ||
1722 | epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); | ||
1723 | if (epoch == e->epoch) { | ||
1724 | set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); | ||
1725 | trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); | ||
1726 | rw |= (1<<BIO_RW_BARRIER); | ||
1727 | e->flags |= EE_IS_BARRIER; | ||
1728 | } else { | ||
1729 | if (atomic_read(&epoch->epoch_size) > 1 || | ||
1730 | !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { | ||
1731 | set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); | ||
1732 | trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI); | ||
1733 | set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); | ||
1734 | trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); | ||
1735 | rw |= (1<<BIO_RW_BARRIER); | ||
1736 | e->flags |= EE_IS_BARRIER; | ||
1737 | } | ||
1738 | } | ||
1739 | } | ||
1740 | spin_unlock(&mdev->epoch_lock); | ||
1741 | |||
1742 | dp_flags = be32_to_cpu(p->dp_flags); | ||
1743 | if (dp_flags & DP_HARDBARRIER) { | ||
1744 | dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); | ||
1745 | /* rw |= (1<<BIO_RW_BARRIER); */ | ||
1746 | } | ||
1747 | if (dp_flags & DP_RW_SYNC) | ||
1748 | rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); | ||
1749 | if (dp_flags & DP_MAY_SET_IN_SYNC) | ||
1750 | e->flags |= EE_MAY_SET_IN_SYNC; | ||
1751 | |||
1752 | /* I'm the receiver, I do hold a net_cnt reference. */ | ||
1753 | if (!mdev->net_conf->two_primaries) { | ||
1754 | spin_lock_irq(&mdev->req_lock); | ||
1755 | } else { | ||
1756 | /* don't get the req_lock yet, | ||
1757 | * we may sleep in drbd_wait_peer_seq */ | ||
1758 | const int size = e->size; | ||
1759 | const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1760 | DEFINE_WAIT(wait); | ||
1761 | struct drbd_request *i; | ||
1762 | struct hlist_node *n; | ||
1763 | struct hlist_head *slot; | ||
1764 | int first; | ||
1765 | |||
1766 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1767 | BUG_ON(mdev->ee_hash == NULL); | ||
1768 | BUG_ON(mdev->tl_hash == NULL); | ||
1769 | |||
1770 | /* conflict detection and handling: | ||
1771 | * 1. wait on the sequence number, | ||
1772 | * in case this data packet overtook ACK packets. | ||
1773 | * 2. check our hash tables for conflicting requests. | ||
1774 | * we only need to walk the tl_hash, since an ee can not | ||
1775 | * have a conflict with an other ee: on the submitting | ||
1776 | * node, the corresponding req had already been conflicting, | ||
1777 | * and a conflicting req is never sent. | ||
1778 | * | ||
1779 | * Note: for two_primaries, we are protocol C, | ||
1780 | * so there cannot be any request that is DONE | ||
1781 | * but still on the transfer log. | ||
1782 | * | ||
1783 | * unconditionally add to the ee_hash. | ||
1784 | * | ||
1785 | * if no conflicting request is found: | ||
1786 | * submit. | ||
1787 | * | ||
1788 | * if any conflicting request is found | ||
1789 | * that has not yet been acked, | ||
1790 | * AND I have the "discard concurrent writes" flag: | ||
1791 | * queue (via done_ee) the P_DISCARD_ACK; OUT. | ||
1792 | * | ||
1793 | * if any conflicting request is found: | ||
1794 | * block the receiver, waiting on misc_wait | ||
1795 | * until no more conflicting requests are there, | ||
1796 | * or we get interrupted (disconnect). | ||
1797 | * | ||
1798 | * we do not just write after local io completion of those | ||
1799 | * requests, but only after req is done completely, i.e. | ||
1800 | * we wait for the P_DISCARD_ACK to arrive! | ||
1801 | * | ||
1802 | * then proceed normally, i.e. submit. | ||
1803 | */ | ||
1804 | if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) | ||
1805 | goto out_interrupted; | ||
1806 | |||
1807 | spin_lock_irq(&mdev->req_lock); | ||
1808 | |||
1809 | hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); | ||
1810 | |||
1811 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
1812 | slot = tl_hash_slot(mdev, sector); | ||
1813 | first = 1; | ||
1814 | for (;;) { | ||
1815 | int have_unacked = 0; | ||
1816 | int have_conflict = 0; | ||
1817 | prepare_to_wait(&mdev->misc_wait, &wait, | ||
1818 | TASK_INTERRUPTIBLE); | ||
1819 | hlist_for_each_entry(i, n, slot, colision) { | ||
1820 | if (OVERLAPS) { | ||
1821 | /* only ALERT on first iteration, | ||
1822 | * we may be woken up early... */ | ||
1823 | if (first) | ||
1824 | dev_alert(DEV, "%s[%u] Concurrent local write detected!" | ||
1825 | " new: %llus +%u; pending: %llus +%u\n", | ||
1826 | current->comm, current->pid, | ||
1827 | (unsigned long long)sector, size, | ||
1828 | (unsigned long long)i->sector, i->size); | ||
1829 | if (i->rq_state & RQ_NET_PENDING) | ||
1830 | ++have_unacked; | ||
1831 | ++have_conflict; | ||
1832 | } | ||
1833 | } | ||
1834 | #undef OVERLAPS | ||
1835 | if (!have_conflict) | ||
1836 | break; | ||
1837 | |||
1838 | /* Discard Ack only for the _first_ iteration */ | ||
1839 | if (first && discard && have_unacked) { | ||
1840 | dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", | ||
1841 | (unsigned long long)sector); | ||
1842 | inc_unacked(mdev); | ||
1843 | e->w.cb = e_send_discard_ack; | ||
1844 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
1845 | |||
1846 | spin_unlock_irq(&mdev->req_lock); | ||
1847 | |||
1848 | /* we could probably send that P_DISCARD_ACK ourselves, | ||
1849 | * but I don't like the receiver using the msock */ | ||
1850 | |||
1851 | put_ldev(mdev); | ||
1852 | wake_asender(mdev); | ||
1853 | finish_wait(&mdev->misc_wait, &wait); | ||
1854 | return TRUE; | ||
1855 | } | ||
1856 | |||
1857 | if (signal_pending(current)) { | ||
1858 | hlist_del_init(&e->colision); | ||
1859 | |||
1860 | spin_unlock_irq(&mdev->req_lock); | ||
1861 | |||
1862 | finish_wait(&mdev->misc_wait, &wait); | ||
1863 | goto out_interrupted; | ||
1864 | } | ||
1865 | |||
1866 | spin_unlock_irq(&mdev->req_lock); | ||
1867 | if (first) { | ||
1868 | first = 0; | ||
1869 | dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " | ||
1870 | "sec=%llus\n", (unsigned long long)sector); | ||
1871 | } else if (discard) { | ||
1872 | /* we had none on the first iteration. | ||
1873 | * there must be none now. */ | ||
1874 | D_ASSERT(have_unacked == 0); | ||
1875 | } | ||
1876 | schedule(); | ||
1877 | spin_lock_irq(&mdev->req_lock); | ||
1878 | } | ||
1879 | finish_wait(&mdev->misc_wait, &wait); | ||
1880 | } | ||
1881 | |||
1882 | list_add(&e->w.list, &mdev->active_ee); | ||
1883 | spin_unlock_irq(&mdev->req_lock); | ||
1884 | |||
1885 | switch (mdev->net_conf->wire_protocol) { | ||
1886 | case DRBD_PROT_C: | ||
1887 | inc_unacked(mdev); | ||
1888 | /* corresponding dec_unacked() in e_end_block() | ||
1889 | * respective _drbd_clear_done_ee */ | ||
1890 | break; | ||
1891 | case DRBD_PROT_B: | ||
1892 | /* I really don't like it that the receiver thread | ||
1893 | * sends on the msock, but anyways */ | ||
1894 | drbd_send_ack(mdev, P_RECV_ACK, e); | ||
1895 | break; | ||
1896 | case DRBD_PROT_A: | ||
1897 | /* nothing to do */ | ||
1898 | break; | ||
1899 | } | ||
1900 | |||
1901 | if (mdev->state.pdsk == D_DISKLESS) { | ||
1902 | /* In case we have the only disk of the cluster, */ | ||
1903 | drbd_set_out_of_sync(mdev, e->sector, e->size); | ||
1904 | e->flags |= EE_CALL_AL_COMPLETE_IO; | ||
1905 | drbd_al_begin_io(mdev, e->sector); | ||
1906 | } | ||
1907 | |||
1908 | e->private_bio->bi_rw = rw; | ||
1909 | trace_drbd_ee(mdev, e, "submitting for (data)write"); | ||
1910 | trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); | ||
1911 | drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); | ||
1912 | /* accounting done in endio */ | ||
1913 | |||
1914 | maybe_kick_lo(mdev); | ||
1915 | return TRUE; | ||
1916 | |||
1917 | out_interrupted: | ||
1918 | /* yes, the epoch_size now is imbalanced. | ||
1919 | * but we drop the connection anyways, so we don't have a chance to | ||
1920 | * receive a barrier... atomic_inc(&mdev->epoch_size); */ | ||
1921 | put_ldev(mdev); | ||
1922 | drbd_free_ee(mdev, e); | ||
1923 | return FALSE; | ||
1924 | } | ||
1925 | |||
1926 | static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) | ||
1927 | { | ||
1928 | sector_t sector; | ||
1929 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
1930 | struct drbd_epoch_entry *e; | ||
1931 | struct digest_info *di = NULL; | ||
1932 | int size, digest_size; | ||
1933 | unsigned int fault_type; | ||
1934 | struct p_block_req *p = | ||
1935 | (struct p_block_req *)h; | ||
1936 | const int brps = sizeof(*p)-sizeof(*h); | ||
1937 | |||
1938 | if (drbd_recv(mdev, h->payload, brps) != brps) | ||
1939 | return FALSE; | ||
1940 | |||
1941 | sector = be64_to_cpu(p->sector); | ||
1942 | size = be32_to_cpu(p->blksize); | ||
1943 | |||
1944 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { | ||
1945 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | ||
1946 | (unsigned long long)sector, size); | ||
1947 | return FALSE; | ||
1948 | } | ||
1949 | if (sector + (size>>9) > capacity) { | ||
1950 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | ||
1951 | (unsigned long long)sector, size); | ||
1952 | return FALSE; | ||
1953 | } | ||
1954 | |||
1955 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { | ||
1956 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1957 | dev_err(DEV, "Can not satisfy peer's read request, " | ||
1958 | "no local data.\n"); | ||
1959 | drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : | ||
1960 | P_NEG_RS_DREPLY , p); | ||
1961 | return TRUE; | ||
1962 | } | ||
1963 | |||
1964 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | ||
1965 | * "criss-cross" setup, that might cause write-out on some other DRBD, | ||
1966 | * which in turn might block on the other node at this very place. */ | ||
1967 | e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); | ||
1968 | if (!e) { | ||
1969 | put_ldev(mdev); | ||
1970 | return FALSE; | ||
1971 | } | ||
1972 | |||
1973 | e->private_bio->bi_rw = READ; | ||
1974 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
1975 | |||
1976 | switch (h->command) { | ||
1977 | case P_DATA_REQUEST: | ||
1978 | e->w.cb = w_e_end_data_req; | ||
1979 | fault_type = DRBD_FAULT_DT_RD; | ||
1980 | break; | ||
1981 | case P_RS_DATA_REQUEST: | ||
1982 | e->w.cb = w_e_end_rsdata_req; | ||
1983 | fault_type = DRBD_FAULT_RS_RD; | ||
1984 | /* Eventually this should become asynchronously. Currently it | ||
1985 | * blocks the whole receiver just to delay the reading of a | ||
1986 | * resync data block. | ||
1987 | * the drbd_work_queue mechanism is made for this... | ||
1988 | */ | ||
1989 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
1990 | /* we have been interrupted, | ||
1991 | * probably connection lost! */ | ||
1992 | D_ASSERT(signal_pending(current)); | ||
1993 | goto out_free_e; | ||
1994 | } | ||
1995 | break; | ||
1996 | |||
1997 | case P_OV_REPLY: | ||
1998 | case P_CSUM_RS_REQUEST: | ||
1999 | fault_type = DRBD_FAULT_RS_RD; | ||
2000 | digest_size = h->length - brps ; | ||
2001 | di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); | ||
2002 | if (!di) | ||
2003 | goto out_free_e; | ||
2004 | |||
2005 | di->digest_size = digest_size; | ||
2006 | di->digest = (((char *)di)+sizeof(struct digest_info)); | ||
2007 | |||
2008 | if (drbd_recv(mdev, di->digest, digest_size) != digest_size) | ||
2009 | goto out_free_e; | ||
2010 | |||
2011 | e->block_id = (u64)(unsigned long)di; | ||
2012 | if (h->command == P_CSUM_RS_REQUEST) { | ||
2013 | D_ASSERT(mdev->agreed_pro_version >= 89); | ||
2014 | e->w.cb = w_e_end_csum_rs_req; | ||
2015 | } else if (h->command == P_OV_REPLY) { | ||
2016 | e->w.cb = w_e_end_ov_reply; | ||
2017 | dec_rs_pending(mdev); | ||
2018 | break; | ||
2019 | } | ||
2020 | |||
2021 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
2022 | /* we have been interrupted, probably connection lost! */ | ||
2023 | D_ASSERT(signal_pending(current)); | ||
2024 | goto out_free_e; | ||
2025 | } | ||
2026 | break; | ||
2027 | |||
2028 | case P_OV_REQUEST: | ||
2029 | if (mdev->state.conn >= C_CONNECTED && | ||
2030 | mdev->state.conn != C_VERIFY_T) | ||
2031 | dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", | ||
2032 | drbd_conn_str(mdev->state.conn)); | ||
2033 | if (mdev->ov_start_sector == ~(sector_t)0 && | ||
2034 | mdev->agreed_pro_version >= 90) { | ||
2035 | mdev->ov_start_sector = sector; | ||
2036 | mdev->ov_position = sector; | ||
2037 | mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); | ||
2038 | dev_info(DEV, "Online Verify start sector: %llu\n", | ||
2039 | (unsigned long long)sector); | ||
2040 | } | ||
2041 | e->w.cb = w_e_end_ov_req; | ||
2042 | fault_type = DRBD_FAULT_RS_RD; | ||
2043 | /* Eventually this should become asynchronous. Currently it | ||
2044 | * blocks the whole receiver just to delay the reading of a | ||
2045 | * resync data block. | ||
2046 | * the drbd_work_queue mechanism is made for this... | ||
2047 | */ | ||
2048 | if (!drbd_rs_begin_io(mdev, sector)) { | ||
2049 | /* we have been interrupted, | ||
2050 | * probably connection lost! */ | ||
2051 | D_ASSERT(signal_pending(current)); | ||
2052 | goto out_free_e; | ||
2053 | } | ||
2054 | break; | ||
2055 | |||
2056 | |||
2057 | default: | ||
2058 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | ||
2059 | cmdname(h->command)); | ||
2060 | fault_type = DRBD_FAULT_MAX; | ||
2061 | } | ||
2062 | |||
2063 | spin_lock_irq(&mdev->req_lock); | ||
2064 | list_add(&e->w.list, &mdev->read_ee); | ||
2065 | spin_unlock_irq(&mdev->req_lock); | ||
2066 | |||
2067 | inc_unacked(mdev); | ||
2068 | |||
2069 | trace_drbd_ee(mdev, e, "submitting for read"); | ||
2070 | trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); | ||
2071 | drbd_generic_make_request(mdev, fault_type, e->private_bio); | ||
2072 | maybe_kick_lo(mdev); | ||
2073 | |||
2074 | return TRUE; | ||
2075 | |||
2076 | out_free_e: | ||
2077 | kfree(di); | ||
2078 | put_ldev(mdev); | ||
2079 | drbd_free_ee(mdev, e); | ||
2080 | return FALSE; | ||
2081 | } | ||
2082 | |||
2083 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | ||
2084 | { | ||
2085 | int self, peer, rv = -100; | ||
2086 | unsigned long ch_self, ch_peer; | ||
2087 | |||
2088 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2089 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2090 | |||
2091 | ch_peer = mdev->p_uuid[UI_SIZE]; | ||
2092 | ch_self = mdev->comm_bm_set; | ||
2093 | |||
2094 | switch (mdev->net_conf->after_sb_0p) { | ||
2095 | case ASB_CONSENSUS: | ||
2096 | case ASB_DISCARD_SECONDARY: | ||
2097 | case ASB_CALL_HELPER: | ||
2098 | dev_err(DEV, "Configuration error.\n"); | ||
2099 | break; | ||
2100 | case ASB_DISCONNECT: | ||
2101 | break; | ||
2102 | case ASB_DISCARD_YOUNGER_PRI: | ||
2103 | if (self == 0 && peer == 1) { | ||
2104 | rv = -1; | ||
2105 | break; | ||
2106 | } | ||
2107 | if (self == 1 && peer == 0) { | ||
2108 | rv = 1; | ||
2109 | break; | ||
2110 | } | ||
2111 | /* Else fall through to one of the other strategies... */ | ||
2112 | case ASB_DISCARD_OLDER_PRI: | ||
2113 | if (self == 0 && peer == 1) { | ||
2114 | rv = 1; | ||
2115 | break; | ||
2116 | } | ||
2117 | if (self == 1 && peer == 0) { | ||
2118 | rv = -1; | ||
2119 | break; | ||
2120 | } | ||
2121 | /* Else fall through to one of the other strategies... */ | ||
2122 | dev_warn(DEV, "Discard younger/older primary did not found a decision\n" | ||
2123 | "Using discard-least-changes instead\n"); | ||
2124 | case ASB_DISCARD_ZERO_CHG: | ||
2125 | if (ch_peer == 0 && ch_self == 0) { | ||
2126 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | ||
2127 | ? -1 : 1; | ||
2128 | break; | ||
2129 | } else { | ||
2130 | if (ch_peer == 0) { rv = 1; break; } | ||
2131 | if (ch_self == 0) { rv = -1; break; } | ||
2132 | } | ||
2133 | if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) | ||
2134 | break; | ||
2135 | case ASB_DISCARD_LEAST_CHG: | ||
2136 | if (ch_self < ch_peer) | ||
2137 | rv = -1; | ||
2138 | else if (ch_self > ch_peer) | ||
2139 | rv = 1; | ||
2140 | else /* ( ch_self == ch_peer ) */ | ||
2141 | /* Well, then use something else. */ | ||
2142 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | ||
2143 | ? -1 : 1; | ||
2144 | break; | ||
2145 | case ASB_DISCARD_LOCAL: | ||
2146 | rv = -1; | ||
2147 | break; | ||
2148 | case ASB_DISCARD_REMOTE: | ||
2149 | rv = 1; | ||
2150 | } | ||
2151 | |||
2152 | return rv; | ||
2153 | } | ||
2154 | |||
2155 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | ||
2156 | { | ||
2157 | int self, peer, hg, rv = -100; | ||
2158 | |||
2159 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2160 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2161 | |||
2162 | switch (mdev->net_conf->after_sb_1p) { | ||
2163 | case ASB_DISCARD_YOUNGER_PRI: | ||
2164 | case ASB_DISCARD_OLDER_PRI: | ||
2165 | case ASB_DISCARD_LEAST_CHG: | ||
2166 | case ASB_DISCARD_LOCAL: | ||
2167 | case ASB_DISCARD_REMOTE: | ||
2168 | dev_err(DEV, "Configuration error.\n"); | ||
2169 | break; | ||
2170 | case ASB_DISCONNECT: | ||
2171 | break; | ||
2172 | case ASB_CONSENSUS: | ||
2173 | hg = drbd_asb_recover_0p(mdev); | ||
2174 | if (hg == -1 && mdev->state.role == R_SECONDARY) | ||
2175 | rv = hg; | ||
2176 | if (hg == 1 && mdev->state.role == R_PRIMARY) | ||
2177 | rv = hg; | ||
2178 | break; | ||
2179 | case ASB_VIOLENTLY: | ||
2180 | rv = drbd_asb_recover_0p(mdev); | ||
2181 | break; | ||
2182 | case ASB_DISCARD_SECONDARY: | ||
2183 | return mdev->state.role == R_PRIMARY ? 1 : -1; | ||
2184 | case ASB_CALL_HELPER: | ||
2185 | hg = drbd_asb_recover_0p(mdev); | ||
2186 | if (hg == -1 && mdev->state.role == R_PRIMARY) { | ||
2187 | self = drbd_set_role(mdev, R_SECONDARY, 0); | ||
2188 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | ||
2189 | * we might be here in C_WF_REPORT_PARAMS which is transient. | ||
2190 | * we do not need to wait for the after state change work either. */ | ||
2191 | self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); | ||
2192 | if (self != SS_SUCCESS) { | ||
2193 | drbd_khelper(mdev, "pri-lost-after-sb"); | ||
2194 | } else { | ||
2195 | dev_warn(DEV, "Successfully gave up primary role.\n"); | ||
2196 | rv = hg; | ||
2197 | } | ||
2198 | } else | ||
2199 | rv = hg; | ||
2200 | } | ||
2201 | |||
2202 | return rv; | ||
2203 | } | ||
2204 | |||
2205 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | ||
2206 | { | ||
2207 | int self, peer, hg, rv = -100; | ||
2208 | |||
2209 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | ||
2210 | peer = mdev->p_uuid[UI_BITMAP] & 1; | ||
2211 | |||
2212 | switch (mdev->net_conf->after_sb_2p) { | ||
2213 | case ASB_DISCARD_YOUNGER_PRI: | ||
2214 | case ASB_DISCARD_OLDER_PRI: | ||
2215 | case ASB_DISCARD_LEAST_CHG: | ||
2216 | case ASB_DISCARD_LOCAL: | ||
2217 | case ASB_DISCARD_REMOTE: | ||
2218 | case ASB_CONSENSUS: | ||
2219 | case ASB_DISCARD_SECONDARY: | ||
2220 | dev_err(DEV, "Configuration error.\n"); | ||
2221 | break; | ||
2222 | case ASB_VIOLENTLY: | ||
2223 | rv = drbd_asb_recover_0p(mdev); | ||
2224 | break; | ||
2225 | case ASB_DISCONNECT: | ||
2226 | break; | ||
2227 | case ASB_CALL_HELPER: | ||
2228 | hg = drbd_asb_recover_0p(mdev); | ||
2229 | if (hg == -1) { | ||
2230 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | ||
2231 | * we might be here in C_WF_REPORT_PARAMS which is transient. | ||
2232 | * we do not need to wait for the after state change work either. */ | ||
2233 | self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); | ||
2234 | if (self != SS_SUCCESS) { | ||
2235 | drbd_khelper(mdev, "pri-lost-after-sb"); | ||
2236 | } else { | ||
2237 | dev_warn(DEV, "Successfully gave up primary role.\n"); | ||
2238 | rv = hg; | ||
2239 | } | ||
2240 | } else | ||
2241 | rv = hg; | ||
2242 | } | ||
2243 | |||
2244 | return rv; | ||
2245 | } | ||
2246 | |||
2247 | static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid, | ||
2248 | u64 bits, u64 flags) | ||
2249 | { | ||
2250 | if (!uuid) { | ||
2251 | dev_info(DEV, "%s uuid info vanished while I was looking!\n", text); | ||
2252 | return; | ||
2253 | } | ||
2254 | dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", | ||
2255 | text, | ||
2256 | (unsigned long long)uuid[UI_CURRENT], | ||
2257 | (unsigned long long)uuid[UI_BITMAP], | ||
2258 | (unsigned long long)uuid[UI_HISTORY_START], | ||
2259 | (unsigned long long)uuid[UI_HISTORY_END], | ||
2260 | (unsigned long long)bits, | ||
2261 | (unsigned long long)flags); | ||
2262 | } | ||
2263 | |||
2264 | /* | ||
2265 | 100 after split brain try auto recover | ||
2266 | 2 C_SYNC_SOURCE set BitMap | ||
2267 | 1 C_SYNC_SOURCE use BitMap | ||
2268 | 0 no Sync | ||
2269 | -1 C_SYNC_TARGET use BitMap | ||
2270 | -2 C_SYNC_TARGET set BitMap | ||
2271 | -100 after split brain, disconnect | ||
2272 | -1000 unrelated data | ||
2273 | */ | ||
2274 | static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) | ||
2275 | { | ||
2276 | u64 self, peer; | ||
2277 | int i, j; | ||
2278 | |||
2279 | self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); | ||
2280 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2281 | |||
2282 | *rule_nr = 10; | ||
2283 | if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) | ||
2284 | return 0; | ||
2285 | |||
2286 | *rule_nr = 20; | ||
2287 | if ((self == UUID_JUST_CREATED || self == (u64)0) && | ||
2288 | peer != UUID_JUST_CREATED) | ||
2289 | return -2; | ||
2290 | |||
2291 | *rule_nr = 30; | ||
2292 | if (self != UUID_JUST_CREATED && | ||
2293 | (peer == UUID_JUST_CREATED || peer == (u64)0)) | ||
2294 | return 2; | ||
2295 | |||
2296 | if (self == peer) { | ||
2297 | int rct, dc; /* roles at crash time */ | ||
2298 | |||
2299 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { | ||
2300 | |||
2301 | if (mdev->agreed_pro_version < 91) | ||
2302 | return -1001; | ||
2303 | |||
2304 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && | ||
2305 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { | ||
2306 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); | ||
2307 | drbd_uuid_set_bm(mdev, 0UL); | ||
2308 | |||
2309 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | ||
2310 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | ||
2311 | *rule_nr = 34; | ||
2312 | } else { | ||
2313 | dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n"); | ||
2314 | *rule_nr = 36; | ||
2315 | } | ||
2316 | |||
2317 | return 1; | ||
2318 | } | ||
2319 | |||
2320 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { | ||
2321 | |||
2322 | if (mdev->agreed_pro_version < 91) | ||
2323 | return -1001; | ||
2324 | |||
2325 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && | ||
2326 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { | ||
2327 | dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); | ||
2328 | |||
2329 | mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START]; | ||
2330 | mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP]; | ||
2331 | mdev->p_uuid[UI_BITMAP] = 0UL; | ||
2332 | |||
2333 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | ||
2334 | *rule_nr = 35; | ||
2335 | } else { | ||
2336 | dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n"); | ||
2337 | *rule_nr = 37; | ||
2338 | } | ||
2339 | |||
2340 | return -1; | ||
2341 | } | ||
2342 | |||
2343 | /* Common power [off|failure] */ | ||
2344 | rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + | ||
2345 | (mdev->p_uuid[UI_FLAGS] & 2); | ||
2346 | /* lowest bit is set when we were primary, | ||
2347 | * next bit (weight 2) is set when peer was primary */ | ||
2348 | *rule_nr = 40; | ||
2349 | |||
2350 | switch (rct) { | ||
2351 | case 0: /* !self_pri && !peer_pri */ return 0; | ||
2352 | case 1: /* self_pri && !peer_pri */ return 1; | ||
2353 | case 2: /* !self_pri && peer_pri */ return -1; | ||
2354 | case 3: /* self_pri && peer_pri */ | ||
2355 | dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
2356 | return dc ? -1 : 1; | ||
2357 | } | ||
2358 | } | ||
2359 | |||
2360 | *rule_nr = 50; | ||
2361 | peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); | ||
2362 | if (self == peer) | ||
2363 | return -1; | ||
2364 | |||
2365 | *rule_nr = 51; | ||
2366 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | ||
2367 | if (self == peer) { | ||
2368 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | ||
2369 | peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); | ||
2370 | if (self == peer) { | ||
2371 | /* The last P_SYNC_UUID did not get though. Undo the last start of | ||
2372 | resync as sync source modifications of the peer's UUIDs. */ | ||
2373 | |||
2374 | if (mdev->agreed_pro_version < 91) | ||
2375 | return -1001; | ||
2376 | |||
2377 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; | ||
2378 | mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; | ||
2379 | return -1; | ||
2380 | } | ||
2381 | } | ||
2382 | |||
2383 | *rule_nr = 60; | ||
2384 | self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); | ||
2385 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2386 | peer = mdev->p_uuid[i] & ~((u64)1); | ||
2387 | if (self == peer) | ||
2388 | return -2; | ||
2389 | } | ||
2390 | |||
2391 | *rule_nr = 70; | ||
2392 | self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); | ||
2393 | peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); | ||
2394 | if (self == peer) | ||
2395 | return 1; | ||
2396 | |||
2397 | *rule_nr = 71; | ||
2398 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | ||
2399 | if (self == peer) { | ||
2400 | self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); | ||
2401 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | ||
2402 | if (self == peer) { | ||
2403 | /* The last P_SYNC_UUID did not get though. Undo the last start of | ||
2404 | resync as sync source modifications of our UUIDs. */ | ||
2405 | |||
2406 | if (mdev->agreed_pro_version < 91) | ||
2407 | return -1001; | ||
2408 | |||
2409 | _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); | ||
2410 | _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); | ||
2411 | |||
2412 | dev_info(DEV, "Undid last start of resync:\n"); | ||
2413 | |||
2414 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | ||
2415 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | ||
2416 | |||
2417 | return 1; | ||
2418 | } | ||
2419 | } | ||
2420 | |||
2421 | |||
2422 | *rule_nr = 80; | ||
2423 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2424 | self = mdev->ldev->md.uuid[i] & ~((u64)1); | ||
2425 | if (self == peer) | ||
2426 | return 2; | ||
2427 | } | ||
2428 | |||
2429 | *rule_nr = 90; | ||
2430 | self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); | ||
2431 | peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); | ||
2432 | if (self == peer && self != ((u64)0)) | ||
2433 | return 100; | ||
2434 | |||
2435 | *rule_nr = 100; | ||
2436 | for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { | ||
2437 | self = mdev->ldev->md.uuid[i] & ~((u64)1); | ||
2438 | for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { | ||
2439 | peer = mdev->p_uuid[j] & ~((u64)1); | ||
2440 | if (self == peer) | ||
2441 | return -100; | ||
2442 | } | ||
2443 | } | ||
2444 | |||
2445 | return -1000; | ||
2446 | } | ||
2447 | |||
2448 | /* drbd_sync_handshake() returns the new conn state on success, or | ||
2449 | CONN_MASK (-1) on failure. | ||
2450 | */ | ||
2451 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, | ||
2452 | enum drbd_disk_state peer_disk) __must_hold(local) | ||
2453 | { | ||
2454 | int hg, rule_nr; | ||
2455 | enum drbd_conns rv = C_MASK; | ||
2456 | enum drbd_disk_state mydisk; | ||
2457 | |||
2458 | mydisk = mdev->state.disk; | ||
2459 | if (mydisk == D_NEGOTIATING) | ||
2460 | mydisk = mdev->new_state_tmp.disk; | ||
2461 | |||
2462 | dev_info(DEV, "drbd_sync_handshake:\n"); | ||
2463 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); | ||
2464 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, | ||
2465 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | ||
2466 | |||
2467 | hg = drbd_uuid_compare(mdev, &rule_nr); | ||
2468 | |||
2469 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); | ||
2470 | |||
2471 | if (hg == -1000) { | ||
2472 | dev_alert(DEV, "Unrelated data, aborting!\n"); | ||
2473 | return C_MASK; | ||
2474 | } | ||
2475 | if (hg == -1001) { | ||
2476 | dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); | ||
2477 | return C_MASK; | ||
2478 | } | ||
2479 | |||
2480 | if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || | ||
2481 | (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { | ||
2482 | int f = (hg == -100) || abs(hg) == 2; | ||
2483 | hg = mydisk > D_INCONSISTENT ? 1 : -1; | ||
2484 | if (f) | ||
2485 | hg = hg*2; | ||
2486 | dev_info(DEV, "Becoming sync %s due to disk states.\n", | ||
2487 | hg > 0 ? "source" : "target"); | ||
2488 | } | ||
2489 | |||
2490 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | ||
2491 | int pcount = (mdev->state.role == R_PRIMARY) | ||
2492 | + (peer_role == R_PRIMARY); | ||
2493 | int forced = (hg == -100); | ||
2494 | |||
2495 | switch (pcount) { | ||
2496 | case 0: | ||
2497 | hg = drbd_asb_recover_0p(mdev); | ||
2498 | break; | ||
2499 | case 1: | ||
2500 | hg = drbd_asb_recover_1p(mdev); | ||
2501 | break; | ||
2502 | case 2: | ||
2503 | hg = drbd_asb_recover_2p(mdev); | ||
2504 | break; | ||
2505 | } | ||
2506 | if (abs(hg) < 100) { | ||
2507 | dev_warn(DEV, "Split-Brain detected, %d primaries, " | ||
2508 | "automatically solved. Sync from %s node\n", | ||
2509 | pcount, (hg < 0) ? "peer" : "this"); | ||
2510 | if (forced) { | ||
2511 | dev_warn(DEV, "Doing a full sync, since" | ||
2512 | " UUIDs where ambiguous.\n"); | ||
2513 | hg = hg*2; | ||
2514 | } | ||
2515 | } | ||
2516 | } | ||
2517 | |||
2518 | if (hg == -100) { | ||
2519 | if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) | ||
2520 | hg = -1; | ||
2521 | if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) | ||
2522 | hg = 1; | ||
2523 | |||
2524 | if (abs(hg) < 100) | ||
2525 | dev_warn(DEV, "Split-Brain detected, manually solved. " | ||
2526 | "Sync from %s node\n", | ||
2527 | (hg < 0) ? "peer" : "this"); | ||
2528 | } | ||
2529 | |||
2530 | if (hg == -100) { | ||
2531 | dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); | ||
2532 | drbd_khelper(mdev, "split-brain"); | ||
2533 | return C_MASK; | ||
2534 | } | ||
2535 | |||
2536 | if (hg > 0 && mydisk <= D_INCONSISTENT) { | ||
2537 | dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n"); | ||
2538 | return C_MASK; | ||
2539 | } | ||
2540 | |||
2541 | if (hg < 0 && /* by intention we do not use mydisk here. */ | ||
2542 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { | ||
2543 | switch (mdev->net_conf->rr_conflict) { | ||
2544 | case ASB_CALL_HELPER: | ||
2545 | drbd_khelper(mdev, "pri-lost"); | ||
2546 | /* fall through */ | ||
2547 | case ASB_DISCONNECT: | ||
2548 | dev_err(DEV, "I shall become SyncTarget, but I am primary!\n"); | ||
2549 | return C_MASK; | ||
2550 | case ASB_VIOLENTLY: | ||
2551 | dev_warn(DEV, "Becoming SyncTarget, violating the stable-data" | ||
2552 | "assumption\n"); | ||
2553 | } | ||
2554 | } | ||
2555 | |||
2556 | if (abs(hg) >= 2) { | ||
2557 | dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); | ||
2558 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) | ||
2559 | return C_MASK; | ||
2560 | } | ||
2561 | |||
2562 | if (hg > 0) { /* become sync source. */ | ||
2563 | rv = C_WF_BITMAP_S; | ||
2564 | } else if (hg < 0) { /* become sync target */ | ||
2565 | rv = C_WF_BITMAP_T; | ||
2566 | } else { | ||
2567 | rv = C_CONNECTED; | ||
2568 | if (drbd_bm_total_weight(mdev)) { | ||
2569 | dev_info(DEV, "No resync, but %lu bits in bitmap!\n", | ||
2570 | drbd_bm_total_weight(mdev)); | ||
2571 | } | ||
2572 | } | ||
2573 | |||
2574 | return rv; | ||
2575 | } | ||
2576 | |||
2577 | /* returns 1 if invalid */ | ||
2578 | static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) | ||
2579 | { | ||
2580 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ | ||
2581 | if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || | ||
2582 | (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) | ||
2583 | return 0; | ||
2584 | |||
2585 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ | ||
2586 | if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || | ||
2587 | self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) | ||
2588 | return 1; | ||
2589 | |||
2590 | /* everything else is valid if they are equal on both sides. */ | ||
2591 | if (peer == self) | ||
2592 | return 0; | ||
2593 | |||
2594 | /* everything es is invalid. */ | ||
2595 | return 1; | ||
2596 | } | ||
2597 | |||
2598 | static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) | ||
2599 | { | ||
2600 | struct p_protocol *p = (struct p_protocol *)h; | ||
2601 | int header_size, data_size; | ||
2602 | int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; | ||
2603 | int p_want_lose, p_two_primaries; | ||
2604 | char p_integrity_alg[SHARED_SECRET_MAX] = ""; | ||
2605 | |||
2606 | header_size = sizeof(*p) - sizeof(*h); | ||
2607 | data_size = h->length - header_size; | ||
2608 | |||
2609 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
2610 | return FALSE; | ||
2611 | |||
2612 | p_proto = be32_to_cpu(p->protocol); | ||
2613 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); | ||
2614 | p_after_sb_1p = be32_to_cpu(p->after_sb_1p); | ||
2615 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); | ||
2616 | p_want_lose = be32_to_cpu(p->want_lose); | ||
2617 | p_two_primaries = be32_to_cpu(p->two_primaries); | ||
2618 | |||
2619 | if (p_proto != mdev->net_conf->wire_protocol) { | ||
2620 | dev_err(DEV, "incompatible communication protocols\n"); | ||
2621 | goto disconnect; | ||
2622 | } | ||
2623 | |||
2624 | if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { | ||
2625 | dev_err(DEV, "incompatible after-sb-0pri settings\n"); | ||
2626 | goto disconnect; | ||
2627 | } | ||
2628 | |||
2629 | if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { | ||
2630 | dev_err(DEV, "incompatible after-sb-1pri settings\n"); | ||
2631 | goto disconnect; | ||
2632 | } | ||
2633 | |||
2634 | if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { | ||
2635 | dev_err(DEV, "incompatible after-sb-2pri settings\n"); | ||
2636 | goto disconnect; | ||
2637 | } | ||
2638 | |||
2639 | if (p_want_lose && mdev->net_conf->want_lose) { | ||
2640 | dev_err(DEV, "both sides have the 'want_lose' flag set\n"); | ||
2641 | goto disconnect; | ||
2642 | } | ||
2643 | |||
2644 | if (p_two_primaries != mdev->net_conf->two_primaries) { | ||
2645 | dev_err(DEV, "incompatible setting of the two-primaries options\n"); | ||
2646 | goto disconnect; | ||
2647 | } | ||
2648 | |||
2649 | if (mdev->agreed_pro_version >= 87) { | ||
2650 | unsigned char *my_alg = mdev->net_conf->integrity_alg; | ||
2651 | |||
2652 | if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) | ||
2653 | return FALSE; | ||
2654 | |||
2655 | p_integrity_alg[SHARED_SECRET_MAX-1] = 0; | ||
2656 | if (strcmp(p_integrity_alg, my_alg)) { | ||
2657 | dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); | ||
2658 | goto disconnect; | ||
2659 | } | ||
2660 | dev_info(DEV, "data-integrity-alg: %s\n", | ||
2661 | my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); | ||
2662 | } | ||
2663 | |||
2664 | return TRUE; | ||
2665 | |||
2666 | disconnect: | ||
2667 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2668 | return FALSE; | ||
2669 | } | ||
2670 | |||
2671 | /* helper function | ||
2672 | * input: alg name, feature name | ||
2673 | * return: NULL (alg name was "") | ||
2674 | * ERR_PTR(error) if something goes wrong | ||
2675 | * or the crypto hash ptr, if it worked out ok. */ | ||
2676 | struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, | ||
2677 | const char *alg, const char *name) | ||
2678 | { | ||
2679 | struct crypto_hash *tfm; | ||
2680 | |||
2681 | if (!alg[0]) | ||
2682 | return NULL; | ||
2683 | |||
2684 | tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); | ||
2685 | if (IS_ERR(tfm)) { | ||
2686 | dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n", | ||
2687 | alg, name, PTR_ERR(tfm)); | ||
2688 | return tfm; | ||
2689 | } | ||
2690 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
2691 | crypto_free_hash(tfm); | ||
2692 | dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); | ||
2693 | return ERR_PTR(-EINVAL); | ||
2694 | } | ||
2695 | return tfm; | ||
2696 | } | ||
2697 | |||
2698 | static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) | ||
2699 | { | ||
2700 | int ok = TRUE; | ||
2701 | struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; | ||
2702 | unsigned int header_size, data_size, exp_max_sz; | ||
2703 | struct crypto_hash *verify_tfm = NULL; | ||
2704 | struct crypto_hash *csums_tfm = NULL; | ||
2705 | const int apv = mdev->agreed_pro_version; | ||
2706 | |||
2707 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) | ||
2708 | : apv == 88 ? sizeof(struct p_rs_param) | ||
2709 | + SHARED_SECRET_MAX | ||
2710 | : /* 89 */ sizeof(struct p_rs_param_89); | ||
2711 | |||
2712 | if (h->length > exp_max_sz) { | ||
2713 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", | ||
2714 | h->length, exp_max_sz); | ||
2715 | return FALSE; | ||
2716 | } | ||
2717 | |||
2718 | if (apv <= 88) { | ||
2719 | header_size = sizeof(struct p_rs_param) - sizeof(*h); | ||
2720 | data_size = h->length - header_size; | ||
2721 | } else /* apv >= 89 */ { | ||
2722 | header_size = sizeof(struct p_rs_param_89) - sizeof(*h); | ||
2723 | data_size = h->length - header_size; | ||
2724 | D_ASSERT(data_size == 0); | ||
2725 | } | ||
2726 | |||
2727 | /* initialize verify_alg and csums_alg */ | ||
2728 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
2729 | |||
2730 | if (drbd_recv(mdev, h->payload, header_size) != header_size) | ||
2731 | return FALSE; | ||
2732 | |||
2733 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | ||
2734 | |||
2735 | if (apv >= 88) { | ||
2736 | if (apv == 88) { | ||
2737 | if (data_size > SHARED_SECRET_MAX) { | ||
2738 | dev_err(DEV, "verify-alg too long, " | ||
2739 | "peer wants %u, accepting only %u byte\n", | ||
2740 | data_size, SHARED_SECRET_MAX); | ||
2741 | return FALSE; | ||
2742 | } | ||
2743 | |||
2744 | if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) | ||
2745 | return FALSE; | ||
2746 | |||
2747 | /* we expect NUL terminated string */ | ||
2748 | /* but just in case someone tries to be evil */ | ||
2749 | D_ASSERT(p->verify_alg[data_size-1] == 0); | ||
2750 | p->verify_alg[data_size-1] = 0; | ||
2751 | |||
2752 | } else /* apv >= 89 */ { | ||
2753 | /* we still expect NUL terminated strings */ | ||
2754 | /* but just in case someone tries to be evil */ | ||
2755 | D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0); | ||
2756 | D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0); | ||
2757 | p->verify_alg[SHARED_SECRET_MAX-1] = 0; | ||
2758 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; | ||
2759 | } | ||
2760 | |||
2761 | if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { | ||
2762 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
2763 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", | ||
2764 | mdev->sync_conf.verify_alg, p->verify_alg); | ||
2765 | goto disconnect; | ||
2766 | } | ||
2767 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, | ||
2768 | p->verify_alg, "verify-alg"); | ||
2769 | if (IS_ERR(verify_tfm)) { | ||
2770 | verify_tfm = NULL; | ||
2771 | goto disconnect; | ||
2772 | } | ||
2773 | } | ||
2774 | |||
2775 | if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { | ||
2776 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
2777 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", | ||
2778 | mdev->sync_conf.csums_alg, p->csums_alg); | ||
2779 | goto disconnect; | ||
2780 | } | ||
2781 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, | ||
2782 | p->csums_alg, "csums-alg"); | ||
2783 | if (IS_ERR(csums_tfm)) { | ||
2784 | csums_tfm = NULL; | ||
2785 | goto disconnect; | ||
2786 | } | ||
2787 | } | ||
2788 | |||
2789 | |||
2790 | spin_lock(&mdev->peer_seq_lock); | ||
2791 | /* lock against drbd_nl_syncer_conf() */ | ||
2792 | if (verify_tfm) { | ||
2793 | strcpy(mdev->sync_conf.verify_alg, p->verify_alg); | ||
2794 | mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; | ||
2795 | crypto_free_hash(mdev->verify_tfm); | ||
2796 | mdev->verify_tfm = verify_tfm; | ||
2797 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); | ||
2798 | } | ||
2799 | if (csums_tfm) { | ||
2800 | strcpy(mdev->sync_conf.csums_alg, p->csums_alg); | ||
2801 | mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; | ||
2802 | crypto_free_hash(mdev->csums_tfm); | ||
2803 | mdev->csums_tfm = csums_tfm; | ||
2804 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | ||
2805 | } | ||
2806 | spin_unlock(&mdev->peer_seq_lock); | ||
2807 | } | ||
2808 | |||
2809 | return ok; | ||
2810 | disconnect: | ||
2811 | /* just for completeness: actually not needed, | ||
2812 | * as this is not reached if csums_tfm was ok. */ | ||
2813 | crypto_free_hash(csums_tfm); | ||
2814 | /* but free the verify_tfm again, if csums_tfm did not work out */ | ||
2815 | crypto_free_hash(verify_tfm); | ||
2816 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2817 | return FALSE; | ||
2818 | } | ||
2819 | |||
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
	/* Intentionally empty: there is currently no working implementation
	 * of distributed TCQ, so both arguments are ignored. */
}
2825 | |||
2826 | /* warn if the arguments differ by more than 12.5% */ | ||
2827 | static void warn_if_differ_considerably(struct drbd_conf *mdev, | ||
2828 | const char *s, sector_t a, sector_t b) | ||
2829 | { | ||
2830 | sector_t d; | ||
2831 | if (a == 0 || b == 0) | ||
2832 | return; | ||
2833 | d = (a > b) ? (a - b) : (b - a); | ||
2834 | if (d > (a>>3) || d > (b>>3)) | ||
2835 | dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s, | ||
2836 | (unsigned long long)a, (unsigned long long)b); | ||
2837 | } | ||
2838 | |||
2839 | static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) | ||
2840 | { | ||
2841 | struct p_sizes *p = (struct p_sizes *)h; | ||
2842 | enum determine_dev_size dd = unchanged; | ||
2843 | unsigned int max_seg_s; | ||
2844 | sector_t p_size, p_usize, my_usize; | ||
2845 | int ldsc = 0; /* local disk size changed */ | ||
2846 | enum drbd_conns nconn; | ||
2847 | |||
2848 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
2849 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
2850 | return FALSE; | ||
2851 | |||
2852 | p_size = be64_to_cpu(p->d_size); | ||
2853 | p_usize = be64_to_cpu(p->u_size); | ||
2854 | |||
2855 | if (p_size == 0 && mdev->state.disk == D_DISKLESS) { | ||
2856 | dev_err(DEV, "some backing storage is needed\n"); | ||
2857 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2858 | return FALSE; | ||
2859 | } | ||
2860 | |||
2861 | /* just store the peer's disk size for now. | ||
2862 | * we still need to figure out whether we accept that. */ | ||
2863 | mdev->p_size = p_size; | ||
2864 | |||
2865 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
2866 | if (get_ldev(mdev)) { | ||
2867 | warn_if_differ_considerably(mdev, "lower level device sizes", | ||
2868 | p_size, drbd_get_max_capacity(mdev->ldev)); | ||
2869 | warn_if_differ_considerably(mdev, "user requested size", | ||
2870 | p_usize, mdev->ldev->dc.disk_size); | ||
2871 | |||
2872 | /* if this is the first connect, or an otherwise expected | ||
2873 | * param exchange, choose the minimum */ | ||
2874 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | ||
2875 | p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, | ||
2876 | p_usize); | ||
2877 | |||
2878 | my_usize = mdev->ldev->dc.disk_size; | ||
2879 | |||
2880 | if (mdev->ldev->dc.disk_size != p_usize) { | ||
2881 | mdev->ldev->dc.disk_size = p_usize; | ||
2882 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
2883 | (unsigned long)mdev->ldev->dc.disk_size); | ||
2884 | } | ||
2885 | |||
2886 | /* Never shrink a device with usable data during connect. | ||
2887 | But allow online shrinking if we are connected. */ | ||
2888 | if (drbd_new_dev_size(mdev, mdev->ldev) < | ||
2889 | drbd_get_capacity(mdev->this_bdev) && | ||
2890 | mdev->state.disk >= D_OUTDATED && | ||
2891 | mdev->state.conn < C_CONNECTED) { | ||
2892 | dev_err(DEV, "The peer's disk size is too small!\n"); | ||
2893 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2894 | mdev->ldev->dc.disk_size = my_usize; | ||
2895 | put_ldev(mdev); | ||
2896 | return FALSE; | ||
2897 | } | ||
2898 | put_ldev(mdev); | ||
2899 | } | ||
2900 | #undef min_not_zero | ||
2901 | |||
2902 | if (get_ldev(mdev)) { | ||
2903 | dd = drbd_determin_dev_size(mdev); | ||
2904 | put_ldev(mdev); | ||
2905 | if (dd == dev_size_error) | ||
2906 | return FALSE; | ||
2907 | drbd_md_sync(mdev); | ||
2908 | } else { | ||
2909 | /* I am diskless, need to accept the peer's size. */ | ||
2910 | drbd_set_my_capacity(mdev, p_size); | ||
2911 | } | ||
2912 | |||
2913 | if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
2914 | nconn = drbd_sync_handshake(mdev, | ||
2915 | mdev->state.peer, mdev->state.pdsk); | ||
2916 | put_ldev(mdev); | ||
2917 | |||
2918 | if (nconn == C_MASK) { | ||
2919 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2920 | return FALSE; | ||
2921 | } | ||
2922 | |||
2923 | if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { | ||
2924 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2925 | return FALSE; | ||
2926 | } | ||
2927 | } | ||
2928 | |||
2929 | if (get_ldev(mdev)) { | ||
2930 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { | ||
2931 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | ||
2932 | ldsc = 1; | ||
2933 | } | ||
2934 | |||
2935 | max_seg_s = be32_to_cpu(p->max_segment_size); | ||
2936 | if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) | ||
2937 | drbd_setup_queue_param(mdev, max_seg_s); | ||
2938 | |||
2939 | drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); | ||
2940 | put_ldev(mdev); | ||
2941 | } | ||
2942 | |||
2943 | if (mdev->state.conn > C_WF_REPORT_PARAMS) { | ||
2944 | if (be64_to_cpu(p->c_size) != | ||
2945 | drbd_get_capacity(mdev->this_bdev) || ldsc) { | ||
2946 | /* we have different sizes, probably peer | ||
2947 | * needs to know my new size... */ | ||
2948 | drbd_send_sizes(mdev, 0); | ||
2949 | } | ||
2950 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || | ||
2951 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | ||
2952 | if (mdev->state.pdsk >= D_INCONSISTENT && | ||
2953 | mdev->state.disk >= D_INCONSISTENT) | ||
2954 | resync_after_online_grow(mdev); | ||
2955 | else | ||
2956 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | ||
2957 | } | ||
2958 | } | ||
2959 | |||
2960 | return TRUE; | ||
2961 | } | ||
2962 | |||
2963 | static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) | ||
2964 | { | ||
2965 | struct p_uuids *p = (struct p_uuids *)h; | ||
2966 | u64 *p_uuid; | ||
2967 | int i; | ||
2968 | |||
2969 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
2970 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
2971 | return FALSE; | ||
2972 | |||
2973 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); | ||
2974 | |||
2975 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) | ||
2976 | p_uuid[i] = be64_to_cpu(p->uuid[i]); | ||
2977 | |||
2978 | kfree(mdev->p_uuid); | ||
2979 | mdev->p_uuid = p_uuid; | ||
2980 | |||
2981 | if (mdev->state.conn < C_CONNECTED && | ||
2982 | mdev->state.disk < D_INCONSISTENT && | ||
2983 | mdev->state.role == R_PRIMARY && | ||
2984 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { | ||
2985 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", | ||
2986 | (unsigned long long)mdev->ed_uuid); | ||
2987 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2988 | return FALSE; | ||
2989 | } | ||
2990 | |||
2991 | if (get_ldev(mdev)) { | ||
2992 | int skip_initial_sync = | ||
2993 | mdev->state.conn == C_CONNECTED && | ||
2994 | mdev->agreed_pro_version >= 90 && | ||
2995 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && | ||
2996 | (p_uuid[UI_FLAGS] & 8); | ||
2997 | if (skip_initial_sync) { | ||
2998 | dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); | ||
2999 | drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, | ||
3000 | "clear_n_write from receive_uuids"); | ||
3001 | _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); | ||
3002 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | ||
3003 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | ||
3004 | CS_VERBOSE, NULL); | ||
3005 | drbd_md_sync(mdev); | ||
3006 | } | ||
3007 | put_ldev(mdev); | ||
3008 | } | ||
3009 | |||
3010 | /* Before we test for the disk state, we should wait until an eventually | ||
3011 | ongoing cluster wide state change is finished. That is important if | ||
3012 | we are primary and are detaching from our disk. We need to see the | ||
3013 | new disk state... */ | ||
3014 | wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
3015 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) | ||
3016 | drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); | ||
3017 | |||
3018 | return TRUE; | ||
3019 | } | ||
3020 | |||
3021 | /** | ||
3022 | * convert_state() - Converts the peer's view of the cluster state to our point of view | ||
3023 | * @ps: The state as seen by the peer. | ||
3024 | */ | ||
3025 | static union drbd_state convert_state(union drbd_state ps) | ||
3026 | { | ||
3027 | union drbd_state ms; | ||
3028 | |||
3029 | static enum drbd_conns c_tab[] = { | ||
3030 | [C_CONNECTED] = C_CONNECTED, | ||
3031 | |||
3032 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, | ||
3033 | [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, | ||
3034 | [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ | ||
3035 | [C_VERIFY_S] = C_VERIFY_T, | ||
3036 | [C_MASK] = C_MASK, | ||
3037 | }; | ||
3038 | |||
3039 | ms.i = ps.i; | ||
3040 | |||
3041 | ms.conn = c_tab[ps.conn]; | ||
3042 | ms.peer = ps.role; | ||
3043 | ms.role = ps.peer; | ||
3044 | ms.pdsk = ps.disk; | ||
3045 | ms.disk = ps.pdsk; | ||
3046 | ms.peer_isp = (ps.aftr_isp | ps.user_isp); | ||
3047 | |||
3048 | return ms; | ||
3049 | } | ||
3050 | |||
3051 | static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) | ||
3052 | { | ||
3053 | struct p_req_state *p = (struct p_req_state *)h; | ||
3054 | union drbd_state mask, val; | ||
3055 | int rv; | ||
3056 | |||
3057 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3058 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3059 | return FALSE; | ||
3060 | |||
3061 | mask.i = be32_to_cpu(p->mask); | ||
3062 | val.i = be32_to_cpu(p->val); | ||
3063 | |||
3064 | if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && | ||
3065 | test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { | ||
3066 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); | ||
3067 | return TRUE; | ||
3068 | } | ||
3069 | |||
3070 | mask = convert_state(mask); | ||
3071 | val = convert_state(val); | ||
3072 | |||
3073 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); | ||
3074 | |||
3075 | drbd_send_sr_reply(mdev, rv); | ||
3076 | drbd_md_sync(mdev); | ||
3077 | |||
3078 | return TRUE; | ||
3079 | } | ||
3080 | |||
3081 | static int receive_state(struct drbd_conf *mdev, struct p_header *h) | ||
3082 | { | ||
3083 | struct p_state *p = (struct p_state *)h; | ||
3084 | enum drbd_conns nconn, oconn; | ||
3085 | union drbd_state ns, peer_state; | ||
3086 | enum drbd_disk_state real_peer_disk; | ||
3087 | int rv; | ||
3088 | |||
3089 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) | ||
3090 | return FALSE; | ||
3091 | |||
3092 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3093 | return FALSE; | ||
3094 | |||
3095 | peer_state.i = be32_to_cpu(p->state); | ||
3096 | |||
3097 | real_peer_disk = peer_state.disk; | ||
3098 | if (peer_state.disk == D_NEGOTIATING) { | ||
3099 | real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT; | ||
3100 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); | ||
3101 | } | ||
3102 | |||
3103 | spin_lock_irq(&mdev->req_lock); | ||
3104 | retry: | ||
3105 | oconn = nconn = mdev->state.conn; | ||
3106 | spin_unlock_irq(&mdev->req_lock); | ||
3107 | |||
3108 | if (nconn == C_WF_REPORT_PARAMS) | ||
3109 | nconn = C_CONNECTED; | ||
3110 | |||
3111 | if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && | ||
3112 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
3113 | int cr; /* consider resync */ | ||
3114 | |||
3115 | /* if we established a new connection */ | ||
3116 | cr = (oconn < C_CONNECTED); | ||
3117 | /* if we had an established connection | ||
3118 | * and one of the nodes newly attaches a disk */ | ||
3119 | cr |= (oconn == C_CONNECTED && | ||
3120 | (peer_state.disk == D_NEGOTIATING || | ||
3121 | mdev->state.disk == D_NEGOTIATING)); | ||
3122 | /* if we have both been inconsistent, and the peer has been | ||
3123 | * forced to be UpToDate with --overwrite-data */ | ||
3124 | cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); | ||
3125 | /* if we had been plain connected, and the admin requested to | ||
3126 | * start a sync by "invalidate" or "invalidate-remote" */ | ||
3127 | cr |= (oconn == C_CONNECTED && | ||
3128 | (peer_state.conn >= C_STARTING_SYNC_S && | ||
3129 | peer_state.conn <= C_WF_BITMAP_T)); | ||
3130 | |||
3131 | if (cr) | ||
3132 | nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); | ||
3133 | |||
3134 | put_ldev(mdev); | ||
3135 | if (nconn == C_MASK) { | ||
3136 | if (mdev->state.disk == D_NEGOTIATING) { | ||
3137 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | ||
3138 | nconn = C_CONNECTED; | ||
3139 | } else if (peer_state.disk == D_NEGOTIATING) { | ||
3140 | dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); | ||
3141 | peer_state.disk = D_DISKLESS; | ||
3142 | } else { | ||
3143 | D_ASSERT(oconn == C_WF_REPORT_PARAMS); | ||
3144 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
3145 | return FALSE; | ||
3146 | } | ||
3147 | } | ||
3148 | } | ||
3149 | |||
3150 | spin_lock_irq(&mdev->req_lock); | ||
3151 | if (mdev->state.conn != oconn) | ||
3152 | goto retry; | ||
3153 | clear_bit(CONSIDER_RESYNC, &mdev->flags); | ||
3154 | ns.i = mdev->state.i; | ||
3155 | ns.conn = nconn; | ||
3156 | ns.peer = peer_state.role; | ||
3157 | ns.pdsk = real_peer_disk; | ||
3158 | ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); | ||
3159 | if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) | ||
3160 | ns.disk = mdev->new_state_tmp.disk; | ||
3161 | |||
3162 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); | ||
3163 | ns = mdev->state; | ||
3164 | spin_unlock_irq(&mdev->req_lock); | ||
3165 | |||
3166 | if (rv < SS_SUCCESS) { | ||
3167 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
3168 | return FALSE; | ||
3169 | } | ||
3170 | |||
3171 | if (oconn > C_WF_REPORT_PARAMS) { | ||
3172 | if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && | ||
3173 | peer_state.disk != D_NEGOTIATING ) { | ||
3174 | /* we want resync, peer has not yet decided to sync... */ | ||
3175 | /* Nowadays only used when forcing a node into primary role and | ||
3176 | setting its disk to UpToDate with that */ | ||
3177 | drbd_send_uuids(mdev); | ||
3178 | drbd_send_state(mdev); | ||
3179 | } | ||
3180 | } | ||
3181 | |||
3182 | mdev->net_conf->want_lose = 0; | ||
3183 | |||
3184 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | ||
3185 | |||
3186 | return TRUE; | ||
3187 | } | ||
3188 | |||
3189 | static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) | ||
3190 | { | ||
3191 | struct p_rs_uuid *p = (struct p_rs_uuid *)h; | ||
3192 | |||
3193 | wait_event(mdev->misc_wait, | ||
3194 | mdev->state.conn == C_WF_SYNC_UUID || | ||
3195 | mdev->state.conn < C_CONNECTED || | ||
3196 | mdev->state.disk < D_NEGOTIATING); | ||
3197 | |||
3198 | /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ | ||
3199 | |||
3200 | ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; | ||
3201 | if (drbd_recv(mdev, h->payload, h->length) != h->length) | ||
3202 | return FALSE; | ||
3203 | |||
3204 | /* Here the _drbd_uuid_ functions are right, current should | ||
3205 | _not_ be rotated into the history */ | ||
3206 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
3207 | _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); | ||
3208 | _drbd_uuid_set(mdev, UI_BITMAP, 0UL); | ||
3209 | |||
3210 | drbd_start_resync(mdev, C_SYNC_TARGET); | ||
3211 | |||
3212 | put_ldev(mdev); | ||
3213 | } else | ||
3214 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); | ||
3215 | |||
3216 | return TRUE; | ||
3217 | } | ||
3218 | |||
/* Result of processing one bitmap packet, see receive_bitmap(). */
enum receive_bitmap_ret {
	OK,	/* packet processed, keep receiving */
	DONE,	/* packet processed, stop receiving */
	FAILED	/* error, abort the bitmap exchange */
};
3220 | |||
3221 | static enum receive_bitmap_ret | ||
3222 | receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, | ||
3223 | unsigned long *buffer, struct bm_xfer_ctx *c) | ||
3224 | { | ||
3225 | unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | ||
3226 | unsigned want = num_words * sizeof(long); | ||
3227 | |||
3228 | if (want != h->length) { | ||
3229 | dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); | ||
3230 | return FAILED; | ||
3231 | } | ||
3232 | if (want == 0) | ||
3233 | return DONE; | ||
3234 | if (drbd_recv(mdev, buffer, want) != want) | ||
3235 | return FAILED; | ||
3236 | |||
3237 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); | ||
3238 | |||
3239 | c->word_offset += num_words; | ||
3240 | c->bit_offset = c->word_offset * BITS_PER_LONG; | ||
3241 | if (c->bit_offset > c->bm_bits) | ||
3242 | c->bit_offset = c->bm_bits; | ||
3243 | |||
3244 | return OK; | ||
3245 | } | ||
3246 | |||
3247 | static enum receive_bitmap_ret | ||
3248 | recv_bm_rle_bits(struct drbd_conf *mdev, | ||
3249 | struct p_compressed_bm *p, | ||
3250 | struct bm_xfer_ctx *c) | ||
3251 | { | ||
3252 | struct bitstream bs; | ||
3253 | u64 look_ahead; | ||
3254 | u64 rl; | ||
3255 | u64 tmp; | ||
3256 | unsigned long s = c->bit_offset; | ||
3257 | unsigned long e; | ||
3258 | int len = p->head.length - (sizeof(*p) - sizeof(p->head)); | ||
3259 | int toggle = DCBP_get_start(p); | ||
3260 | int have; | ||
3261 | int bits; | ||
3262 | |||
3263 | bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); | ||
3264 | |||
3265 | bits = bitstream_get_bits(&bs, &look_ahead, 64); | ||
3266 | if (bits < 0) | ||
3267 | return FAILED; | ||
3268 | |||
3269 | for (have = bits; have > 0; s += rl, toggle = !toggle) { | ||
3270 | bits = vli_decode_bits(&rl, look_ahead); | ||
3271 | if (bits <= 0) | ||
3272 | return FAILED; | ||
3273 | |||
3274 | if (toggle) { | ||
3275 | e = s + rl -1; | ||
3276 | if (e >= c->bm_bits) { | ||
3277 | dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); | ||
3278 | return FAILED; | ||
3279 | } | ||
3280 | _drbd_bm_set_bits(mdev, s, e); | ||
3281 | } | ||
3282 | |||
3283 | if (have < bits) { | ||
3284 | dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", | ||
3285 | have, bits, look_ahead, | ||
3286 | (unsigned int)(bs.cur.b - p->code), | ||
3287 | (unsigned int)bs.buf_len); | ||
3288 | return FAILED; | ||
3289 | } | ||
3290 | look_ahead >>= bits; | ||
3291 | have -= bits; | ||
3292 | |||
3293 | bits = bitstream_get_bits(&bs, &tmp, 64 - have); | ||
3294 | if (bits < 0) | ||
3295 | return FAILED; | ||
3296 | look_ahead |= tmp << have; | ||
3297 | have += bits; | ||
3298 | } | ||
3299 | |||
3300 | c->bit_offset = s; | ||
3301 | bm_xfer_ctx_bit_to_word_offset(c); | ||
3302 | |||
3303 | return (s == c->bm_bits) ? DONE : OK; | ||
3304 | } | ||
3305 | |||
3306 | static enum receive_bitmap_ret | ||
3307 | decode_bitmap_c(struct drbd_conf *mdev, | ||
3308 | struct p_compressed_bm *p, | ||
3309 | struct bm_xfer_ctx *c) | ||
3310 | { | ||
3311 | if (DCBP_get_code(p) == RLE_VLI_Bits) | ||
3312 | return recv_bm_rle_bits(mdev, p, c); | ||
3313 | |||
3314 | /* other variants had been implemented for evaluation, | ||
3315 | * but have been dropped as this one turned out to be "best" | ||
3316 | * during all our tests. */ | ||
3317 | |||
3318 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); | ||
3319 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3320 | return FAILED; | ||
3321 | } | ||
3322 | |||
3323 | void INFO_bm_xfer_stats(struct drbd_conf *mdev, | ||
3324 | const char *direction, struct bm_xfer_ctx *c) | ||
3325 | { | ||
3326 | /* what would it take to transfer it "plaintext" */ | ||
3327 | unsigned plain = sizeof(struct p_header) * | ||
3328 | ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) | ||
3329 | + c->bm_words * sizeof(long); | ||
3330 | unsigned total = c->bytes[0] + c->bytes[1]; | ||
3331 | unsigned r; | ||
3332 | |||
3333 | /* total can not be zero. but just in case: */ | ||
3334 | if (total == 0) | ||
3335 | return; | ||
3336 | |||
3337 | /* don't report if not compressed */ | ||
3338 | if (total >= plain) | ||
3339 | return; | ||
3340 | |||
3341 | /* total < plain. check for overflow, still */ | ||
3342 | r = (total > UINT_MAX/1000) ? (total / (plain/1000)) | ||
3343 | : (1000 * total / plain); | ||
3344 | |||
3345 | if (r > 1000) | ||
3346 | r = 1000; | ||
3347 | |||
3348 | r = 1000 - r; | ||
3349 | dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " | ||
3350 | "total %u; compression: %u.%u%%\n", | ||
3351 | direction, | ||
3352 | c->bytes[1], c->packets[1], | ||
3353 | c->bytes[0], c->packets[0], | ||
3354 | total, r/10, r % 10); | ||
3355 | } | ||
3356 | |||
3357 | /* Since we are processing the bitfield from lower addresses to higher, | ||
3358 | it does not matter if the process it in 32 bit chunks or 64 bit | ||
3359 | chunks as long as it is little endian. (Understand it as byte stream, | ||
3360 | beginning with the lowest byte...) If we would use big endian | ||
3361 | we would need to process it from the highest address to the lowest, | ||
3362 | in order to be agnostic to the 32 vs 64 bits issue. | ||
3363 | |||
3364 | returns 0 on failure, 1 if we successfully received it. */ | ||
3365 | static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) | ||
3366 | { | ||
3367 | struct bm_xfer_ctx c; | ||
3368 | void *buffer; | ||
3369 | enum receive_bitmap_ret ret; | ||
3370 | int ok = FALSE; | ||
3371 | |||
3372 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | ||
3373 | |||
3374 | drbd_bm_lock(mdev, "receive bitmap"); | ||
3375 | |||
3376 | /* maybe we should use some per thread scratch page, | ||
3377 | * and allocate that during initial device creation? */ | ||
3378 | buffer = (unsigned long *) __get_free_page(GFP_NOIO); | ||
3379 | if (!buffer) { | ||
3380 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
3381 | goto out; | ||
3382 | } | ||
3383 | |||
3384 | c = (struct bm_xfer_ctx) { | ||
3385 | .bm_bits = drbd_bm_bits(mdev), | ||
3386 | .bm_words = drbd_bm_words(mdev), | ||
3387 | }; | ||
3388 | |||
3389 | do { | ||
3390 | if (h->command == P_BITMAP) { | ||
3391 | ret = receive_bitmap_plain(mdev, h, buffer, &c); | ||
3392 | } else if (h->command == P_COMPRESSED_BITMAP) { | ||
3393 | /* MAYBE: sanity check that we speak proto >= 90, | ||
3394 | * and the feature is enabled! */ | ||
3395 | struct p_compressed_bm *p; | ||
3396 | |||
3397 | if (h->length > BM_PACKET_PAYLOAD_BYTES) { | ||
3398 | dev_err(DEV, "ReportCBitmap packet too large\n"); | ||
3399 | goto out; | ||
3400 | } | ||
3401 | /* use the page buff */ | ||
3402 | p = buffer; | ||
3403 | memcpy(p, h, sizeof(*h)); | ||
3404 | if (drbd_recv(mdev, p->head.payload, h->length) != h->length) | ||
3405 | goto out; | ||
3406 | if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { | ||
3407 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); | ||
3408 | return FAILED; | ||
3409 | } | ||
3410 | ret = decode_bitmap_c(mdev, p, &c); | ||
3411 | } else { | ||
3412 | dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); | ||
3413 | goto out; | ||
3414 | } | ||
3415 | |||
3416 | c.packets[h->command == P_BITMAP]++; | ||
3417 | c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; | ||
3418 | |||
3419 | if (ret != OK) | ||
3420 | break; | ||
3421 | |||
3422 | if (!drbd_recv_header(mdev, h)) | ||
3423 | goto out; | ||
3424 | } while (ret == OK); | ||
3425 | if (ret == FAILED) | ||
3426 | goto out; | ||
3427 | |||
3428 | INFO_bm_xfer_stats(mdev, "receive", &c); | ||
3429 | |||
3430 | if (mdev->state.conn == C_WF_BITMAP_T) { | ||
3431 | ok = !drbd_send_bitmap(mdev); | ||
3432 | if (!ok) | ||
3433 | goto out; | ||
3434 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ | ||
3435 | ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
3436 | D_ASSERT(ok == SS_SUCCESS); | ||
3437 | } else if (mdev->state.conn != C_WF_BITMAP_S) { | ||
3438 | /* admin may have requested C_DISCONNECTING, | ||
3439 | * other threads may have noticed network errors */ | ||
3440 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", | ||
3441 | drbd_conn_str(mdev->state.conn)); | ||
3442 | } | ||
3443 | |||
3444 | ok = TRUE; | ||
3445 | out: | ||
3446 | drbd_bm_unlock(mdev); | ||
3447 | if (ok && mdev->state.conn == C_WF_BITMAP_S) | ||
3448 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
3449 | free_page((unsigned long) buffer); | ||
3450 | return ok; | ||
3451 | } | ||
3452 | |||
3453 | static int receive_skip(struct drbd_conf *mdev, struct p_header *h) | ||
3454 | { | ||
3455 | /* TODO zero copy sink :) */ | ||
3456 | static char sink[128]; | ||
3457 | int size, want, r; | ||
3458 | |||
3459 | dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", | ||
3460 | h->command, h->length); | ||
3461 | |||
3462 | size = h->length; | ||
3463 | while (size > 0) { | ||
3464 | want = min_t(int, size, sizeof(sink)); | ||
3465 | r = drbd_recv(mdev, sink, want); | ||
3466 | ERR_IF(r <= 0) break; | ||
3467 | size -= r; | ||
3468 | } | ||
3469 | return size == 0; | ||
3470 | } | ||
3471 | |||
3472 | static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) | ||
3473 | { | ||
3474 | if (mdev->state.disk >= D_INCONSISTENT) | ||
3475 | drbd_kick_lo(mdev); | ||
3476 | |||
3477 | /* Make sure we've acked all the TCP data associated | ||
3478 | * with the data requests being unplugged */ | ||
3479 | drbd_tcp_quickack(mdev->data.socket); | ||
3480 | |||
3481 | return TRUE; | ||
3482 | } | ||
3483 | |||
/* Signature of a receiver-side packet handler. drbdd() treats a zero
 * return value as a receive error and leaves its loop. */
typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);

/* Dispatch table indexed by packet command, used by drbdd() for commands
 * below P_MAX_CMD. Slots without an initializer are NULL; drbdd() reports
 * those as "unknown packet type". */
static drbd_cmd_handler_f drbd_default_handler[] = {
	[P_DATA]	    = receive_Data,
	[P_DATA_REPLY]	    = receive_DataReply,
	[P_RS_DATA_REPLY]   = receive_RSDataReply,
	[P_BARRIER]	    = receive_Barrier,
	/* plain and compressed bitmap packets share one handler */
	[P_BITMAP]	    = receive_bitmap,
	[P_COMPRESSED_BITMAP]    = receive_bitmap,
	[P_UNPLUG_REMOTE]   = receive_UnplugRemote,
	[P_DATA_REQUEST]    = receive_DataRequest,
	[P_RS_DATA_REQUEST] = receive_DataRequest,
	[P_SYNC_PARAM]	    = receive_SyncParam,
	[P_SYNC_PARAM89]	   = receive_SyncParam,
	[P_PROTOCOL]        = receive_protocol,
	[P_UUIDS]	    = receive_uuids,
	[P_SIZES]	    = receive_sizes,
	[P_STATE]	    = receive_state,
	[P_STATE_CHG_REQ]   = receive_req_state,
	[P_SYNC_UUID]       = receive_sync_uuid,
	[P_OV_REQUEST]      = receive_DataRequest,
	[P_OV_REPLY]        = receive_DataRequest,
	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
	/* anything missing from this table is in
	 * the asender_tbl, see get_asender_cmd */
	[P_MAX_CMD]	    = NULL,
};

/* Table used by drbdd() for commands below P_MAX_CMD. */
static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
/* Table drbdd() consults for commands strictly between P_MAY_IGNORE and
 * P_MAX_OPT_CMD, indexed by (command - P_MAY_IGNORE). It has no initializer
 * here. */
static drbd_cmd_handler_f *drbd_opt_cmd_handler;
3514 | |||
3515 | static void drbdd(struct drbd_conf *mdev) | ||
3516 | { | ||
3517 | drbd_cmd_handler_f handler; | ||
3518 | struct p_header *header = &mdev->data.rbuf.header; | ||
3519 | |||
3520 | while (get_t_state(&mdev->receiver) == Running) { | ||
3521 | drbd_thread_current_set_cpu(mdev); | ||
3522 | if (!drbd_recv_header(mdev, header)) | ||
3523 | break; | ||
3524 | |||
3525 | if (header->command < P_MAX_CMD) | ||
3526 | handler = drbd_cmd_handler[header->command]; | ||
3527 | else if (P_MAY_IGNORE < header->command | ||
3528 | && header->command < P_MAX_OPT_CMD) | ||
3529 | handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; | ||
3530 | else if (header->command > P_MAX_OPT_CMD) | ||
3531 | handler = receive_skip; | ||
3532 | else | ||
3533 | handler = NULL; | ||
3534 | |||
3535 | if (unlikely(!handler)) { | ||
3536 | dev_err(DEV, "unknown packet type %d, l: %d!\n", | ||
3537 | header->command, header->length); | ||
3538 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3539 | break; | ||
3540 | } | ||
3541 | if (unlikely(!handler(mdev, header))) { | ||
3542 | dev_err(DEV, "error receiving %s, l: %d!\n", | ||
3543 | cmdname(header->command), header->length); | ||
3544 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3545 | break; | ||
3546 | } | ||
3547 | |||
3548 | trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, | ||
3549 | __FILE__, __LINE__); | ||
3550 | } | ||
3551 | } | ||
3552 | |||
3553 | static void drbd_fail_pending_reads(struct drbd_conf *mdev) | ||
3554 | { | ||
3555 | struct hlist_head *slot; | ||
3556 | struct hlist_node *pos; | ||
3557 | struct hlist_node *tmp; | ||
3558 | struct drbd_request *req; | ||
3559 | int i; | ||
3560 | |||
3561 | /* | ||
3562 | * Application READ requests | ||
3563 | */ | ||
3564 | spin_lock_irq(&mdev->req_lock); | ||
3565 | for (i = 0; i < APP_R_HSIZE; i++) { | ||
3566 | slot = mdev->app_reads_hash+i; | ||
3567 | hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { | ||
3568 | /* it may (but should not any longer!) | ||
3569 | * be on the work queue; if that assert triggers, | ||
3570 | * we need to also grab the | ||
3571 | * spin_lock_irq(&mdev->data.work.q_lock); | ||
3572 | * and list_del_init here. */ | ||
3573 | D_ASSERT(list_empty(&req->w.list)); | ||
3574 | /* It would be nice to complete outside of spinlock. | ||
3575 | * But this is easier for now. */ | ||
3576 | _req_mod(req, connection_lost_while_pending); | ||
3577 | } | ||
3578 | } | ||
3579 | for (i = 0; i < APP_R_HSIZE; i++) | ||
3580 | if (!hlist_empty(mdev->app_reads_hash+i)) | ||
3581 | dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " | ||
3582 | "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); | ||
3583 | |||
3584 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | ||
3585 | spin_unlock_irq(&mdev->req_lock); | ||
3586 | } | ||
3587 | |||
3588 | void drbd_flush_workqueue(struct drbd_conf *mdev) | ||
3589 | { | ||
3590 | struct drbd_wq_barrier barr; | ||
3591 | |||
3592 | barr.w.cb = w_prev_work_done; | ||
3593 | init_completion(&barr.done); | ||
3594 | drbd_queue_work(&mdev->data.work, &barr.w); | ||
3595 | wait_for_completion(&barr.done); | ||
3596 | } | ||
3597 | |||
/*
 * drbd_disconnect() - tear down everything that belongs to the connection
 *
 * Does nothing when the connection state is already C_STANDALONE.
 * Otherwise it stops the asender, frees the sockets, waits for the ee
 * lists to drain, resets resync bookkeeping, flushes the worker queue,
 * fails pending application reads and finally moves the connection state
 * to C_UNCONNECTED. If the state found at that point is C_DISCONNECTING,
 * the hash tables, the HMAC transform and net_conf are freed as well and
 * C_STANDALONE is requested.
 *
 * The order of the steps matters; see the comments on the individual steps.
 */
static void drbd_disconnect(struct drbd_conf *mdev)
{
	enum drbd_fencing_p fp;
	union drbd_state os, ns;
	/* NOTE(review): rv is assigned below but never read in this function */
	int rv = SS_UNKNOWN_ERROR;
	unsigned int i;

	if (mdev->state.conn == C_STANDALONE)
		return;
	/* only logged, not enforced: teardown continues regardless */
	if (mdev->state.conn >= C_WF_CONNECTION)
		dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
				drbd_conn_str(mdev->state.conn));

	/* asender does not clean up anything. it must not interfere, either */
	drbd_thread_stop(&mdev->asender);

	/* sockets are released under the data mutex */
	mutex_lock(&mdev->data.mutex);
	drbd_free_sock(mdev);
	mutex_unlock(&mdev->data.mutex);

	/* wait until the active, sync and read ee lists are empty */
	spin_lock_irq(&mdev->req_lock);
	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
	_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	/* We do not have data structures that would allow us to
	 * get the rs_pending_cnt down to 0 again.
	 *  * On C_SYNC_TARGET we do not have any data structures describing
	 *    the pending RSDataRequest's we have sent.
	 *  * On C_SYNC_SOURCE there is no data structure that tracks
	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
	 *  And no, it is not the sum of the reference counts in the
	 *  resync_LRU. The resync_LRU tracks the whole operation including
	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
	 *  on the fly. */
	drbd_rs_cancel_all(mdev);
	mdev->rs_total = 0;
	mdev->rs_failed = 0;
	atomic_set(&mdev->rs_pending_cnt, 0);
	wake_up(&mdev->misc_wait);

	/* make sure syncer is stopped and w_resume_next_sg queued */
	del_timer_sync(&mdev->resync_timer);
	set_bit(STOP_SYNC_TIMER, &mdev->flags);
	/* the timer function is invoked directly, once, with the flag set */
	resync_timer_fn((unsigned long)mdev);

	/* so we can be sure that all remote or resync reads
	 * made it at least to net_ee */
	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));

	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
	 * w_make_resync_request etc. which may still be on the worker queue
	 * to be "canceled" */
	drbd_flush_workqueue(mdev);

	/* This also does reclaim_net_ee().  If we do this too early, we might
	 * miss some resync ee and pages.*/
	drbd_process_done_ee(mdev);

	/* drop the peer UUIDs remembered for this connection */
	kfree(mdev->p_uuid);
	mdev->p_uuid = NULL;

	/* the transfer log is only cleared when not suspended */
	if (!mdev->state.susp)
		tl_clear(mdev);

	drbd_fail_pending_reads(mdev);

	dev_info(DEV, "Connection closed\n");

	drbd_md_sync(mdev);

	/* fencing policy is read from the local disk config, if we can get
	 * a reference to it; otherwise FP_DONT_CARE is used */
	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (mdev->state.role == R_PRIMARY) {
		if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
			enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
			drbd_request_state(mdev, NS(pdsk, nps));
		}
	}

	/* snapshot the state under req_lock; os is evaluated again below */
	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;
	if (os.conn >= C_UNCONNECTED) {
		/* Do not restart in case we are C_DISCONNECTING */
		ns = os;
		ns.conn = C_UNCONNECTED;
		rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	}
	spin_unlock_irq(&mdev->req_lock);

	if (os.conn == C_DISCONNECTING) {
		struct hlist_head *h;
		/* wait until nobody holds a net_conf reference any more */
		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);

		/* we must not free the tl_hash
		 * while application io is still on the fly */
		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);

		spin_lock_irq(&mdev->req_lock);
		/* paranoia code */
		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
			if (h->first)
				dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
						(int)(h - mdev->ee_hash), h->first);
		kfree(mdev->ee_hash);
		mdev->ee_hash = NULL;
		mdev->ee_hash_s = 0;

		/* paranoia code */
		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
			if (h->first)
				dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
						(int)(h - mdev->tl_hash), h->first);
		kfree(mdev->tl_hash);
		mdev->tl_hash = NULL;
		mdev->tl_hash_s = 0;
		spin_unlock_irq(&mdev->req_lock);

		/* release the HMAC transform used for peer authentication */
		crypto_free_hash(mdev->cram_hmac_tfm);
		mdev->cram_hmac_tfm = NULL;

		kfree(mdev->net_conf);
		mdev->net_conf = NULL;
		drbd_request_state(mdev, NS(conn, C_STANDALONE));
	}

	/* tcp_close and release of sendpage pages can be deferred.  I don't
	 * want to use SO_LINGER, because apparently it can be deferred for
	 * more than 20 seconds (longest time I checked).
	 *
	 * Actually we don't care for exactly when the network stack does its
	 * put_page(), but release our reference on these pages right here.
	 */
	i = drbd_release_ee(mdev, &mdev->net_ee);
	if (i)
		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
	i = atomic_read(&mdev->pp_in_use);
	if (i)
		dev_info(DEV, "pp_in_use = %u, expected 0\n", i);

	/* all ee lists are expected to be empty at this point */
	D_ASSERT(list_empty(&mdev->read_ee));
	D_ASSERT(list_empty(&mdev->active_ee));
	D_ASSERT(list_empty(&mdev->sync_ee));
	D_ASSERT(list_empty(&mdev->done_ee));

	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
	atomic_set(&mdev->current_epoch->epoch_size, 0);
	D_ASSERT(list_empty(&mdev->current_epoch->list));
}
3752 | |||
3753 | /* | ||
3754 | * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version | ||
3755 | * we can agree on is stored in agreed_pro_version. | ||
3756 | * | ||
3757 | * feature flags and the reserved array should be enough room for future | ||
3758 | * enhancements of the handshake protocol, and possible plugins... | ||
3759 | * | ||
3760 | * for now, they are expected to be zero, but ignored. | ||
3761 | */ | ||
3762 | static int drbd_send_handshake(struct drbd_conf *mdev) | ||
3763 | { | ||
3764 | /* ASSERT current == mdev->receiver ... */ | ||
3765 | struct p_handshake *p = &mdev->data.sbuf.handshake; | ||
3766 | int ok; | ||
3767 | |||
3768 | if (mutex_lock_interruptible(&mdev->data.mutex)) { | ||
3769 | dev_err(DEV, "interrupted during initial handshake\n"); | ||
3770 | return 0; /* interrupted. not ok. */ | ||
3771 | } | ||
3772 | |||
3773 | if (mdev->data.socket == NULL) { | ||
3774 | mutex_unlock(&mdev->data.mutex); | ||
3775 | return 0; | ||
3776 | } | ||
3777 | |||
3778 | memset(p, 0, sizeof(*p)); | ||
3779 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); | ||
3780 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); | ||
3781 | ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, | ||
3782 | (struct p_header *)p, sizeof(*p), 0 ); | ||
3783 | mutex_unlock(&mdev->data.mutex); | ||
3784 | return ok; | ||
3785 | } | ||
3786 | |||
3787 | /* | ||
3788 | * return values: | ||
3789 | * 1 yes, we have a valid connection | ||
3790 | * 0 oops, did not work out, please try again | ||
3791 | * -1 peer talks different language, | ||
3792 | * no point in trying again, please go standalone. | ||
3793 | */ | ||
3794 | static int drbd_do_handshake(struct drbd_conf *mdev) | ||
3795 | { | ||
3796 | /* ASSERT current == mdev->receiver ... */ | ||
3797 | struct p_handshake *p = &mdev->data.rbuf.handshake; | ||
3798 | const int expect = sizeof(struct p_handshake) | ||
3799 | -sizeof(struct p_header); | ||
3800 | int rv; | ||
3801 | |||
3802 | rv = drbd_send_handshake(mdev); | ||
3803 | if (!rv) | ||
3804 | return 0; | ||
3805 | |||
3806 | rv = drbd_recv_header(mdev, &p->head); | ||
3807 | if (!rv) | ||
3808 | return 0; | ||
3809 | |||
3810 | if (p->head.command != P_HAND_SHAKE) { | ||
3811 | dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", | ||
3812 | cmdname(p->head.command), p->head.command); | ||
3813 | return -1; | ||
3814 | } | ||
3815 | |||
3816 | if (p->head.length != expect) { | ||
3817 | dev_err(DEV, "expected HandShake length: %u, received: %u\n", | ||
3818 | expect, p->head.length); | ||
3819 | return -1; | ||
3820 | } | ||
3821 | |||
3822 | rv = drbd_recv(mdev, &p->head.payload, expect); | ||
3823 | |||
3824 | if (rv != expect) { | ||
3825 | dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); | ||
3826 | return 0; | ||
3827 | } | ||
3828 | |||
3829 | trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, | ||
3830 | __FILE__, __LINE__); | ||
3831 | |||
3832 | p->protocol_min = be32_to_cpu(p->protocol_min); | ||
3833 | p->protocol_max = be32_to_cpu(p->protocol_max); | ||
3834 | if (p->protocol_max == 0) | ||
3835 | p->protocol_max = p->protocol_min; | ||
3836 | |||
3837 | if (PRO_VERSION_MAX < p->protocol_min || | ||
3838 | PRO_VERSION_MIN > p->protocol_max) | ||
3839 | goto incompat; | ||
3840 | |||
3841 | mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); | ||
3842 | |||
3843 | dev_info(DEV, "Handshake successful: " | ||
3844 | "Agreed network protocol version %d\n", mdev->agreed_pro_version); | ||
3845 | |||
3846 | return 1; | ||
3847 | |||
3848 | incompat: | ||
3849 | dev_err(DEV, "incompatible DRBD dialects: " | ||
3850 | "I support %d-%d, peer supports %d-%d\n", | ||
3851 | PRO_VERSION_MIN, PRO_VERSION_MAX, | ||
3852 | p->protocol_min, p->protocol_max); | ||
3853 | return -1; | ||
3854 | } | ||
3855 | |||
3856 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | ||
/* Variant compiled when neither CONFIG_CRYPTO_HMAC nor
 * CONFIG_CRYPTO_HMAC_MODULE is defined: logs why authentication is not
 * possible and always returns 0. */
static int drbd_do_auth(struct drbd_conf *mdev)
{
	dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
	dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
	return 0;
}
3863 | #else | ||
3864 | #define CHALLENGE_LEN 64 | ||
3865 | static int drbd_do_auth(struct drbd_conf *mdev) | ||
3866 | { | ||
3867 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ | ||
3868 | struct scatterlist sg; | ||
3869 | char *response = NULL; | ||
3870 | char *right_response = NULL; | ||
3871 | char *peers_ch = NULL; | ||
3872 | struct p_header p; | ||
3873 | unsigned int key_len = strlen(mdev->net_conf->shared_secret); | ||
3874 | unsigned int resp_size; | ||
3875 | struct hash_desc desc; | ||
3876 | int rv; | ||
3877 | |||
3878 | desc.tfm = mdev->cram_hmac_tfm; | ||
3879 | desc.flags = 0; | ||
3880 | |||
3881 | rv = crypto_hash_setkey(mdev->cram_hmac_tfm, | ||
3882 | (u8 *)mdev->net_conf->shared_secret, key_len); | ||
3883 | if (rv) { | ||
3884 | dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); | ||
3885 | rv = 0; | ||
3886 | goto fail; | ||
3887 | } | ||
3888 | |||
3889 | get_random_bytes(my_challenge, CHALLENGE_LEN); | ||
3890 | |||
3891 | rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); | ||
3892 | if (!rv) | ||
3893 | goto fail; | ||
3894 | |||
3895 | rv = drbd_recv_header(mdev, &p); | ||
3896 | if (!rv) | ||
3897 | goto fail; | ||
3898 | |||
3899 | if (p.command != P_AUTH_CHALLENGE) { | ||
3900 | dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", | ||
3901 | cmdname(p.command), p.command); | ||
3902 | rv = 0; | ||
3903 | goto fail; | ||
3904 | } | ||
3905 | |||
3906 | if (p.length > CHALLENGE_LEN*2) { | ||
3907 | dev_err(DEV, "expected AuthChallenge payload too big.\n"); | ||
3908 | rv = 0; | ||
3909 | goto fail; | ||
3910 | } | ||
3911 | |||
3912 | peers_ch = kmalloc(p.length, GFP_NOIO); | ||
3913 | if (peers_ch == NULL) { | ||
3914 | dev_err(DEV, "kmalloc of peers_ch failed\n"); | ||
3915 | rv = 0; | ||
3916 | goto fail; | ||
3917 | } | ||
3918 | |||
3919 | rv = drbd_recv(mdev, peers_ch, p.length); | ||
3920 | |||
3921 | if (rv != p.length) { | ||
3922 | dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); | ||
3923 | rv = 0; | ||
3924 | goto fail; | ||
3925 | } | ||
3926 | |||
3927 | resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); | ||
3928 | response = kmalloc(resp_size, GFP_NOIO); | ||
3929 | if (response == NULL) { | ||
3930 | dev_err(DEV, "kmalloc of response failed\n"); | ||
3931 | rv = 0; | ||
3932 | goto fail; | ||
3933 | } | ||
3934 | |||
3935 | sg_init_table(&sg, 1); | ||
3936 | sg_set_buf(&sg, peers_ch, p.length); | ||
3937 | |||
3938 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); | ||
3939 | if (rv) { | ||
3940 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | ||
3941 | rv = 0; | ||
3942 | goto fail; | ||
3943 | } | ||
3944 | |||
3945 | rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); | ||
3946 | if (!rv) | ||
3947 | goto fail; | ||
3948 | |||
3949 | rv = drbd_recv_header(mdev, &p); | ||
3950 | if (!rv) | ||
3951 | goto fail; | ||
3952 | |||
3953 | if (p.command != P_AUTH_RESPONSE) { | ||
3954 | dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", | ||
3955 | cmdname(p.command), p.command); | ||
3956 | rv = 0; | ||
3957 | goto fail; | ||
3958 | } | ||
3959 | |||
3960 | if (p.length != resp_size) { | ||
3961 | dev_err(DEV, "expected AuthResponse payload of wrong size\n"); | ||
3962 | rv = 0; | ||
3963 | goto fail; | ||
3964 | } | ||
3965 | |||
3966 | rv = drbd_recv(mdev, response , resp_size); | ||
3967 | |||
3968 | if (rv != resp_size) { | ||
3969 | dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); | ||
3970 | rv = 0; | ||
3971 | goto fail; | ||
3972 | } | ||
3973 | |||
3974 | right_response = kmalloc(resp_size, GFP_NOIO); | ||
3975 | if (response == NULL) { | ||
3976 | dev_err(DEV, "kmalloc of right_response failed\n"); | ||
3977 | rv = 0; | ||
3978 | goto fail; | ||
3979 | } | ||
3980 | |||
3981 | sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); | ||
3982 | |||
3983 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); | ||
3984 | if (rv) { | ||
3985 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | ||
3986 | rv = 0; | ||
3987 | goto fail; | ||
3988 | } | ||
3989 | |||
3990 | rv = !memcmp(response, right_response, resp_size); | ||
3991 | |||
3992 | if (rv) | ||
3993 | dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", | ||
3994 | resp_size, mdev->net_conf->cram_hmac_alg); | ||
3995 | |||
3996 | fail: | ||
3997 | kfree(peers_ch); | ||
3998 | kfree(response); | ||
3999 | kfree(right_response); | ||
4000 | |||
4001 | return rv; | ||
4002 | } | ||
4003 | #endif | ||
4004 | |||
4005 | int drbdd_init(struct drbd_thread *thi) | ||
4006 | { | ||
4007 | struct drbd_conf *mdev = thi->mdev; | ||
4008 | unsigned int minor = mdev_to_minor(mdev); | ||
4009 | int h; | ||
4010 | |||
4011 | sprintf(current->comm, "drbd%d_receiver", minor); | ||
4012 | |||
4013 | dev_info(DEV, "receiver (re)started\n"); | ||
4014 | |||
4015 | do { | ||
4016 | h = drbd_connect(mdev); | ||
4017 | if (h == 0) { | ||
4018 | drbd_disconnect(mdev); | ||
4019 | __set_current_state(TASK_INTERRUPTIBLE); | ||
4020 | schedule_timeout(HZ); | ||
4021 | } | ||
4022 | if (h == -1) { | ||
4023 | dev_warn(DEV, "Discarding network configuration.\n"); | ||
4024 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
4025 | } | ||
4026 | } while (h == 0); | ||
4027 | |||
4028 | if (h > 0) { | ||
4029 | if (get_net_conf(mdev)) { | ||
4030 | drbdd(mdev); | ||
4031 | put_net_conf(mdev); | ||
4032 | } | ||
4033 | } | ||
4034 | |||
4035 | drbd_disconnect(mdev); | ||
4036 | |||
4037 | dev_info(DEV, "receiver terminated\n"); | ||
4038 | return 0; | ||
4039 | } | ||
4040 | |||
4041 | /* ********* acknowledge sender ******** */ | ||
4042 | |||
4043 | static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) | ||
4044 | { | ||
4045 | struct p_req_state_reply *p = (struct p_req_state_reply *)h; | ||
4046 | |||
4047 | int retcode = be32_to_cpu(p->retcode); | ||
4048 | |||
4049 | if (retcode >= SS_SUCCESS) { | ||
4050 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); | ||
4051 | } else { | ||
4052 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); | ||
4053 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", | ||
4054 | drbd_set_st_err_str(retcode), retcode); | ||
4055 | } | ||
4056 | wake_up(&mdev->state_wait); | ||
4057 | |||
4058 | return TRUE; | ||
4059 | } | ||
4060 | |||
/* Answer a ping; the result of sending the ack is the handler's result. */
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
{
	int acked = drbd_send_ping_ack(mdev);

	return acked;
}
4066 | |||
4067 | static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) | ||
4068 | { | ||
4069 | /* restore idle timeout */ | ||
4070 | mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | ||
4071 | |||
4072 | return TRUE; | ||
4073 | } | ||
4074 | |||
4075 | static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) | ||
4076 | { | ||
4077 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4078 | sector_t sector = be64_to_cpu(p->sector); | ||
4079 | int blksize = be32_to_cpu(p->blksize); | ||
4080 | |||
4081 | D_ASSERT(mdev->agreed_pro_version >= 89); | ||
4082 | |||
4083 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4084 | |||
4085 | drbd_rs_complete_io(mdev, sector); | ||
4086 | drbd_set_in_sync(mdev, sector, blksize); | ||
4087 | /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ | ||
4088 | mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); | ||
4089 | dec_rs_pending(mdev); | ||
4090 | |||
4091 | return TRUE; | ||
4092 | } | ||
4093 | |||
4094 | /* when we receive the ACK for a write request, | ||
4095 | * verify that we actually know about it */ | ||
4096 | static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, | ||
4097 | u64 id, sector_t sector) | ||
4098 | { | ||
4099 | struct hlist_head *slot = tl_hash_slot(mdev, sector); | ||
4100 | struct hlist_node *n; | ||
4101 | struct drbd_request *req; | ||
4102 | |||
4103 | hlist_for_each_entry(req, n, slot, colision) { | ||
4104 | if ((unsigned long)req == (unsigned long)id) { | ||
4105 | if (req->sector != sector) { | ||
4106 | dev_err(DEV, "_ack_id_to_req: found req %p but it has " | ||
4107 | "wrong sector (%llus versus %llus)\n", req, | ||
4108 | (unsigned long long)req->sector, | ||
4109 | (unsigned long long)sector); | ||
4110 | break; | ||
4111 | } | ||
4112 | return req; | ||
4113 | } | ||
4114 | } | ||
4115 | dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n", | ||
4116 | (void *)(unsigned long)id, (unsigned long long)sector); | ||
4117 | return NULL; | ||
4118 | } | ||
4119 | |||
4120 | typedef struct drbd_request *(req_validator_fn) | ||
4121 | (struct drbd_conf *mdev, u64 id, sector_t sector); | ||
4122 | |||
4123 | static int validate_req_change_req_state(struct drbd_conf *mdev, | ||
4124 | u64 id, sector_t sector, req_validator_fn validator, | ||
4125 | const char *func, enum drbd_req_event what) | ||
4126 | { | ||
4127 | struct drbd_request *req; | ||
4128 | struct bio_and_error m; | ||
4129 | |||
4130 | spin_lock_irq(&mdev->req_lock); | ||
4131 | req = validator(mdev, id, sector); | ||
4132 | if (unlikely(!req)) { | ||
4133 | spin_unlock_irq(&mdev->req_lock); | ||
4134 | dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); | ||
4135 | return FALSE; | ||
4136 | } | ||
4137 | __req_mod(req, what, &m); | ||
4138 | spin_unlock_irq(&mdev->req_lock); | ||
4139 | |||
4140 | if (m.bio) | ||
4141 | complete_master_bio(mdev, &m); | ||
4142 | return TRUE; | ||
4143 | } | ||
4144 | |||
4145 | static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) | ||
4146 | { | ||
4147 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4148 | sector_t sector = be64_to_cpu(p->sector); | ||
4149 | int blksize = be32_to_cpu(p->blksize); | ||
4150 | enum drbd_req_event what; | ||
4151 | |||
4152 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4153 | |||
4154 | if (is_syncer_block_id(p->block_id)) { | ||
4155 | drbd_set_in_sync(mdev, sector, blksize); | ||
4156 | dec_rs_pending(mdev); | ||
4157 | return TRUE; | ||
4158 | } | ||
4159 | switch (be16_to_cpu(h->command)) { | ||
4160 | case P_RS_WRITE_ACK: | ||
4161 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4162 | what = write_acked_by_peer_and_sis; | ||
4163 | break; | ||
4164 | case P_WRITE_ACK: | ||
4165 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4166 | what = write_acked_by_peer; | ||
4167 | break; | ||
4168 | case P_RECV_ACK: | ||
4169 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); | ||
4170 | what = recv_acked_by_peer; | ||
4171 | break; | ||
4172 | case P_DISCARD_ACK: | ||
4173 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
4174 | what = conflict_discarded_by_peer; | ||
4175 | break; | ||
4176 | default: | ||
4177 | D_ASSERT(0); | ||
4178 | return FALSE; | ||
4179 | } | ||
4180 | |||
4181 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4182 | _ack_id_to_req, __func__ , what); | ||
4183 | } | ||
4184 | |||
4185 | static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) | ||
4186 | { | ||
4187 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4188 | sector_t sector = be64_to_cpu(p->sector); | ||
4189 | |||
4190 | if (__ratelimit(&drbd_ratelimit_state)) | ||
4191 | dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); | ||
4192 | |||
4193 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4194 | |||
4195 | if (is_syncer_block_id(p->block_id)) { | ||
4196 | int size = be32_to_cpu(p->blksize); | ||
4197 | dec_rs_pending(mdev); | ||
4198 | drbd_rs_failed_io(mdev, sector, size); | ||
4199 | return TRUE; | ||
4200 | } | ||
4201 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4202 | _ack_id_to_req, __func__ , neg_acked); | ||
4203 | } | ||
4204 | |||
4205 | static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) | ||
4206 | { | ||
4207 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4208 | sector_t sector = be64_to_cpu(p->sector); | ||
4209 | |||
4210 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4211 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", | ||
4212 | (unsigned long long)sector, be32_to_cpu(p->blksize)); | ||
4213 | |||
4214 | return validate_req_change_req_state(mdev, p->block_id, sector, | ||
4215 | _ar_id_to_req, __func__ , neg_acked); | ||
4216 | } | ||
4217 | |||
4218 | static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) | ||
4219 | { | ||
4220 | sector_t sector; | ||
4221 | int size; | ||
4222 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4223 | |||
4224 | sector = be64_to_cpu(p->sector); | ||
4225 | size = be32_to_cpu(p->blksize); | ||
4226 | D_ASSERT(p->block_id == ID_SYNCER); | ||
4227 | |||
4228 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4229 | |||
4230 | dec_rs_pending(mdev); | ||
4231 | |||
4232 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
4233 | drbd_rs_complete_io(mdev, sector); | ||
4234 | drbd_rs_failed_io(mdev, sector, size); | ||
4235 | put_ldev(mdev); | ||
4236 | } | ||
4237 | |||
4238 | return TRUE; | ||
4239 | } | ||
4240 | |||
4241 | static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) | ||
4242 | { | ||
4243 | struct p_barrier_ack *p = (struct p_barrier_ack *)h; | ||
4244 | |||
4245 | tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); | ||
4246 | |||
4247 | return TRUE; | ||
4248 | } | ||
4249 | |||
4250 | static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) | ||
4251 | { | ||
4252 | struct p_block_ack *p = (struct p_block_ack *)h; | ||
4253 | struct drbd_work *w; | ||
4254 | sector_t sector; | ||
4255 | int size; | ||
4256 | |||
4257 | sector = be64_to_cpu(p->sector); | ||
4258 | size = be32_to_cpu(p->blksize); | ||
4259 | |||
4260 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | ||
4261 | |||
4262 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) | ||
4263 | drbd_ov_oos_found(mdev, sector, size); | ||
4264 | else | ||
4265 | ov_oos_print(mdev); | ||
4266 | |||
4267 | drbd_rs_complete_io(mdev, sector); | ||
4268 | dec_rs_pending(mdev); | ||
4269 | |||
4270 | if (--mdev->ov_left == 0) { | ||
4271 | w = kmalloc(sizeof(*w), GFP_NOIO); | ||
4272 | if (w) { | ||
4273 | w->cb = w_ov_finished; | ||
4274 | drbd_queue_work_front(&mdev->data.work, w); | ||
4275 | } else { | ||
4276 | dev_err(DEV, "kmalloc(w) failed."); | ||
4277 | ov_oos_print(mdev); | ||
4278 | drbd_resync_finished(mdev); | ||
4279 | } | ||
4280 | } | ||
4281 | return TRUE; | ||
4282 | } | ||
4283 | |||
/* One dispatch entry for a packet type the asender handles on the meta
 * socket; looked up by command number in get_asender_cmd(). */
struct asender_cmd {
	/* full packet size including the struct p_header; drbd_asender()
	 * keeps receiving until this many bytes have arrived */
	size_t pkt_size;
	/* handler called with the complete packet; drbd_asender() treats a
	 * zero return value as a failure and drops the connection */
	int (*process)(struct drbd_conf *mdev, struct p_header *h);
};
4288 | |||
4289 | static struct asender_cmd *get_asender_cmd(int cmd) | ||
4290 | { | ||
4291 | static struct asender_cmd asender_tbl[] = { | ||
4292 | /* anything missing from this table is in | ||
4293 | * the drbd_cmd_handler (drbd_default_handler) table, | ||
4294 | * see the beginning of drbdd() */ | ||
4295 | [P_PING] = { sizeof(struct p_header), got_Ping }, | ||
4296 | [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, | ||
4297 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4298 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4299 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4300 | [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | ||
4301 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, | ||
4302 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, | ||
4303 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, | ||
4304 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, | ||
4305 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | ||
4306 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | ||
4307 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | ||
4308 | [P_MAX_CMD] = { 0, NULL }, | ||
4309 | }; | ||
4310 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | ||
4311 | return NULL; | ||
4312 | return &asender_tbl[cmd]; | ||
4313 | } | ||
4314 | |||
4315 | int drbd_asender(struct drbd_thread *thi) | ||
4316 | { | ||
4317 | struct drbd_conf *mdev = thi->mdev; | ||
4318 | struct p_header *h = &mdev->meta.rbuf.header; | ||
4319 | struct asender_cmd *cmd = NULL; | ||
4320 | |||
4321 | int rv, len; | ||
4322 | void *buf = h; | ||
4323 | int received = 0; | ||
4324 | int expect = sizeof(struct p_header); | ||
4325 | int empty; | ||
4326 | |||
4327 | sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); | ||
4328 | |||
4329 | current->policy = SCHED_RR; /* Make this a realtime task! */ | ||
4330 | current->rt_priority = 2; /* more important than all other tasks */ | ||
4331 | |||
4332 | while (get_t_state(thi) == Running) { | ||
4333 | drbd_thread_current_set_cpu(mdev); | ||
4334 | if (test_and_clear_bit(SEND_PING, &mdev->flags)) { | ||
4335 | ERR_IF(!drbd_send_ping(mdev)) goto reconnect; | ||
4336 | mdev->meta.socket->sk->sk_rcvtimeo = | ||
4337 | mdev->net_conf->ping_timeo*HZ/10; | ||
4338 | } | ||
4339 | |||
4340 | /* conditionally cork; | ||
4341 | * it may hurt latency if we cork without much to send */ | ||
4342 | if (!mdev->net_conf->no_cork && | ||
4343 | 3 < atomic_read(&mdev->unacked_cnt)) | ||
4344 | drbd_tcp_cork(mdev->meta.socket); | ||
4345 | while (1) { | ||
4346 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4347 | flush_signals(current); | ||
4348 | if (!drbd_process_done_ee(mdev)) { | ||
4349 | dev_err(DEV, "process_done_ee() = NOT_OK\n"); | ||
4350 | goto reconnect; | ||
4351 | } | ||
4352 | /* to avoid race with newly queued ACKs */ | ||
4353 | set_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4354 | spin_lock_irq(&mdev->req_lock); | ||
4355 | empty = list_empty(&mdev->done_ee); | ||
4356 | spin_unlock_irq(&mdev->req_lock); | ||
4357 | /* new ack may have been queued right here, | ||
4358 | * but then there is also a signal pending, | ||
4359 | * and we start over... */ | ||
4360 | if (empty) | ||
4361 | break; | ||
4362 | } | ||
4363 | /* but unconditionally uncork unless disabled */ | ||
4364 | if (!mdev->net_conf->no_cork) | ||
4365 | drbd_tcp_uncork(mdev->meta.socket); | ||
4366 | |||
4367 | /* short circuit, recv_msg would return EINTR anyways. */ | ||
4368 | if (signal_pending(current)) | ||
4369 | continue; | ||
4370 | |||
4371 | rv = drbd_recv_short(mdev, mdev->meta.socket, | ||
4372 | buf, expect-received, 0); | ||
4373 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4374 | |||
4375 | flush_signals(current); | ||
4376 | |||
4377 | /* Note: | ||
4378 | * -EINTR (on meta) we got a signal | ||
4379 | * -EAGAIN (on meta) rcvtimeo expired | ||
4380 | * -ECONNRESET other side closed the connection | ||
4381 | * -ERESTARTSYS (on data) we got a signal | ||
4382 | * rv < 0 other than above: unexpected error! | ||
4383 | * rv == expected: full header or command | ||
4384 | * rv < expected: "woken" by signal during receive | ||
4385 | * rv == 0 : "connection shut down by peer" | ||
4386 | */ | ||
4387 | if (likely(rv > 0)) { | ||
4388 | received += rv; | ||
4389 | buf += rv; | ||
4390 | } else if (rv == 0) { | ||
4391 | dev_err(DEV, "meta connection shut down by peer.\n"); | ||
4392 | goto reconnect; | ||
4393 | } else if (rv == -EAGAIN) { | ||
4394 | if (mdev->meta.socket->sk->sk_rcvtimeo == | ||
4395 | mdev->net_conf->ping_timeo*HZ/10) { | ||
4396 | dev_err(DEV, "PingAck did not arrive in time.\n"); | ||
4397 | goto reconnect; | ||
4398 | } | ||
4399 | set_bit(SEND_PING, &mdev->flags); | ||
4400 | continue; | ||
4401 | } else if (rv == -EINTR) { | ||
4402 | continue; | ||
4403 | } else { | ||
4404 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
4405 | goto reconnect; | ||
4406 | } | ||
4407 | |||
4408 | if (received == expect && cmd == NULL) { | ||
4409 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | ||
4410 | dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", | ||
4411 | (long)be32_to_cpu(h->magic), | ||
4412 | h->command, h->length); | ||
4413 | goto reconnect; | ||
4414 | } | ||
4415 | cmd = get_asender_cmd(be16_to_cpu(h->command)); | ||
4416 | len = be16_to_cpu(h->length); | ||
4417 | if (unlikely(cmd == NULL)) { | ||
4418 | dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", | ||
4419 | (long)be32_to_cpu(h->magic), | ||
4420 | h->command, h->length); | ||
4421 | goto disconnect; | ||
4422 | } | ||
4423 | expect = cmd->pkt_size; | ||
4424 | ERR_IF(len != expect-sizeof(struct p_header)) { | ||
4425 | trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); | ||
4426 | goto reconnect; | ||
4427 | } | ||
4428 | } | ||
4429 | if (received == expect) { | ||
4430 | D_ASSERT(cmd != NULL); | ||
4431 | trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); | ||
4432 | if (!cmd->process(mdev, h)) | ||
4433 | goto reconnect; | ||
4434 | |||
4435 | buf = h; | ||
4436 | received = 0; | ||
4437 | expect = sizeof(struct p_header); | ||
4438 | cmd = NULL; | ||
4439 | } | ||
4440 | } | ||
4441 | |||
4442 | if (0) { | ||
4443 | reconnect: | ||
4444 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
4445 | } | ||
4446 | if (0) { | ||
4447 | disconnect: | ||
4448 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
4449 | } | ||
4450 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4451 | |||
4452 | D_ASSERT(mdev->state.conn < C_CONNECTED); | ||
4453 | dev_info(DEV, "asender terminated\n"); | ||
4454 | |||
4455 | return 0; | ||
4456 | } | ||
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 00000000000..0656cf1edd5 --- /dev/null +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -0,0 +1,1132 @@ | |||
1 | /* | ||
2 | drbd_req.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/autoconf.h> | ||
27 | #include <linux/module.h> | ||
28 | |||
29 | #include <linux/slab.h> | ||
30 | #include <linux/drbd.h> | ||
31 | #include "drbd_int.h" | ||
32 | #include "drbd_tracing.h" | ||
33 | #include "drbd_req.h" | ||
34 | |||
35 | |||
36 | /* Update disk stats at start of I/O request */ | ||
37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | ||
38 | { | ||
39 | const int rw = bio_data_dir(bio); | ||
40 | int cpu; | ||
41 | cpu = part_stat_lock(); | ||
42 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | ||
43 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | ||
44 | part_stat_unlock(); | ||
45 | mdev->vdisk->part0.in_flight[rw]++; | ||
46 | } | ||
47 | |||
48 | /* Update disk stats when completing request upwards */ | ||
49 | static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) | ||
50 | { | ||
51 | int rw = bio_data_dir(req->master_bio); | ||
52 | unsigned long duration = jiffies - req->start_time; | ||
53 | int cpu; | ||
54 | cpu = part_stat_lock(); | ||
55 | part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); | ||
56 | part_round_stats(cpu, &mdev->vdisk->part0); | ||
57 | part_stat_unlock(); | ||
58 | mdev->vdisk->part0.in_flight[rw]--; | ||
59 | } | ||
60 | |||
61 | static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) | ||
62 | { | ||
63 | const unsigned long s = req->rq_state; | ||
64 | /* if it was a write, we may have to set the corresponding | ||
65 | * bit(s) out-of-sync first. If it had a local part, we need to | ||
66 | * release the reference to the activity log. */ | ||
67 | if (rw == WRITE) { | ||
68 | /* remove it from the transfer log. | ||
69 | * well, only if it had been there in the first | ||
70 | * place... if it had not (local only or conflicting | ||
71 | * and never sent), it should still be "empty" as | ||
72 | * initialized in drbd_req_new(), so we can list_del() it | ||
73 | * here unconditionally */ | ||
74 | list_del(&req->tl_requests); | ||
75 | /* Set out-of-sync unless both OK flags are set | ||
76 | * (local only or remote failed). | ||
77 | * Other places where we set out-of-sync: | ||
78 | * READ with local io-error */ | ||
79 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
80 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
81 | |||
82 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | ||
83 | drbd_set_in_sync(mdev, req->sector, req->size); | ||
84 | |||
85 | /* one might be tempted to move the drbd_al_complete_io | ||
86 | * to the local io completion callback drbd_endio_pri. | ||
87 | * but, if this was a mirror write, we may only | ||
88 | * drbd_al_complete_io after this is RQ_NET_DONE, | ||
89 | * otherwise the extent could be dropped from the al | ||
90 | * before it has actually been written on the peer. | ||
91 | * if we crash before our peer knows about the request, | ||
92 | * but after the extent has been dropped from the al, | ||
93 | * we would forget to resync the corresponding extent. | ||
94 | */ | ||
95 | if (s & RQ_LOCAL_MASK) { | ||
96 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
97 | drbd_al_complete_io(mdev, req->sector); | ||
98 | put_ldev(mdev); | ||
99 | } else if (__ratelimit(&drbd_ratelimit_state)) { | ||
100 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " | ||
101 | "but my Disk seems to have failed :(\n", | ||
102 | (unsigned long long) req->sector); | ||
103 | } | ||
104 | } | ||
105 | } | ||
106 | |||
107 | /* if it was a local io error, we want to notify our | ||
108 | * peer about that, and see if we need to | ||
109 | * detach the disk and stuff. | ||
110 | * to avoid allocating some special work | ||
111 | * struct, reuse the request. */ | ||
112 | |||
113 | /* THINK | ||
114 | * why do we do this not when we detect the error, | ||
115 | * but delay it until it is "done", i.e. possibly | ||
116 | * until the next barrier ack? */ | ||
117 | |||
118 | if (rw == WRITE && | ||
119 | ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { | ||
120 | if (!(req->w.list.next == LIST_POISON1 || | ||
121 | list_empty(&req->w.list))) { | ||
122 | /* DEBUG ASSERT only; if this triggers, we | ||
123 | * probably corrupt the worker list here */ | ||
124 | dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); | ||
125 | dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); | ||
126 | } | ||
127 | req->w.cb = w_io_error; | ||
128 | drbd_queue_work(&mdev->data.work, &req->w); | ||
129 | /* drbd_req_free() is done in w_io_error */ | ||
130 | } else { | ||
131 | drbd_req_free(req); | ||
132 | } | ||
133 | } | ||
134 | |||
135 | static void queue_barrier(struct drbd_conf *mdev) | ||
136 | { | ||
137 | struct drbd_tl_epoch *b; | ||
138 | |||
139 | /* We are within the req_lock. Once we queued the barrier for sending, | ||
140 | * we set the CREATE_BARRIER bit. It is cleared as soon as a new | ||
141 | * barrier/epoch object is added. This is the only place this bit is | ||
142 | * set. It indicates that the barrier for this epoch is already queued, | ||
143 | * and no new epoch has been created yet. */ | ||
144 | if (test_bit(CREATE_BARRIER, &mdev->flags)) | ||
145 | return; | ||
146 | |||
147 | b = mdev->newest_tle; | ||
148 | b->w.cb = w_send_barrier; | ||
149 | /* inc_ap_pending done here, so we won't | ||
150 | * get imbalanced on connection loss. | ||
151 | * dec_ap_pending will be done in got_BarrierAck | ||
152 | * or (on connection loss) in tl_clear. */ | ||
153 | inc_ap_pending(mdev); | ||
154 | drbd_queue_work(&mdev->data.work, &b->w); | ||
155 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
156 | } | ||
157 | |||
158 | static void _about_to_complete_local_write(struct drbd_conf *mdev, | ||
159 | struct drbd_request *req) | ||
160 | { | ||
161 | const unsigned long s = req->rq_state; | ||
162 | struct drbd_request *i; | ||
163 | struct drbd_epoch_entry *e; | ||
164 | struct hlist_node *n; | ||
165 | struct hlist_head *slot; | ||
166 | |||
167 | /* before we can signal completion to the upper layers, | ||
168 | * we may need to close the current epoch */ | ||
169 | if (mdev->state.conn >= C_CONNECTED && | ||
170 | req->epoch == mdev->newest_tle->br_number) | ||
171 | queue_barrier(mdev); | ||
172 | |||
173 | /* we need to do the conflict detection stuff, | ||
174 | * if we have the ee_hash (two_primaries) and | ||
175 | * this has been on the network */ | ||
176 | if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { | ||
177 | const sector_t sector = req->sector; | ||
178 | const int size = req->size; | ||
179 | |||
180 | /* ASSERT: | ||
181 | * there must be no conflicting requests, since | ||
182 | * they must have been failed on the spot */ | ||
183 | #define OVERLAPS overlaps(sector, size, i->sector, i->size) | ||
184 | slot = tl_hash_slot(mdev, sector); | ||
185 | hlist_for_each_entry(i, n, slot, colision) { | ||
186 | if (OVERLAPS) { | ||
187 | dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " | ||
188 | "other: %p %llus +%u\n", | ||
189 | req, (unsigned long long)sector, size, | ||
190 | i, (unsigned long long)i->sector, i->size); | ||
191 | } | ||
192 | } | ||
193 | |||
194 | /* maybe "wake" those conflicting epoch entries | ||
195 | * that wait for this request to finish. | ||
196 | * | ||
197 | * currently, there can be only _one_ such ee | ||
198 | * (well, or some more, which would be pending | ||
199 | * P_DISCARD_ACK not yet sent by the asender...), | ||
200 | * since we block the receiver thread upon the | ||
201 | * first conflict detection, which will wait on | ||
202 | * misc_wait. maybe we want to assert that? | ||
203 | * | ||
204 | * anyways, if we found one, | ||
205 | * we just have to do a wake_up. */ | ||
206 | #undef OVERLAPS | ||
207 | #define OVERLAPS overlaps(sector, size, e->sector, e->size) | ||
208 | slot = ee_hash_slot(mdev, req->sector); | ||
209 | hlist_for_each_entry(e, n, slot, colision) { | ||
210 | if (OVERLAPS) { | ||
211 | wake_up(&mdev->misc_wait); | ||
212 | break; | ||
213 | } | ||
214 | } | ||
215 | } | ||
216 | #undef OVERLAPS | ||
217 | } | ||
218 | |||
219 | void complete_master_bio(struct drbd_conf *mdev, | ||
220 | struct bio_and_error *m) | ||
221 | { | ||
222 | trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL); | ||
223 | bio_endio(m->bio, m->error); | ||
224 | dec_ap_bio(mdev); | ||
225 | } | ||
226 | |||
227 | /* Helper for __req_mod(). | ||
228 | * Set m->bio to the master bio, if it is fit to be completed, | ||
229 | * or leave it alone (it is initialized to NULL in __req_mod), | ||
230 | * if it has already been completed, or cannot be completed yet. | ||
231 | * If m->bio is set, the error status to be returned is placed in m->error. | ||
232 | */ | ||
233 | void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | ||
234 | { | ||
235 | const unsigned long s = req->rq_state; | ||
236 | struct drbd_conf *mdev = req->mdev; | ||
237 | /* only WRITES may end up here without a master bio (on barrier ack) */ | ||
238 | int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; | ||
239 | |||
240 | trace_drbd_req(req, nothing, "_req_may_be_done"); | ||
241 | |||
242 | /* we must not complete the master bio, while it is | ||
243 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) | ||
244 | * not yet acknowledged by the peer | ||
245 | * not yet completed by the local io subsystem | ||
246 | * these flags may get cleared in any order by | ||
247 | * the worker, | ||
248 | * the receiver, | ||
249 | * the bio_endio completion callbacks. | ||
250 | */ | ||
251 | if (s & RQ_NET_QUEUED) | ||
252 | return; | ||
253 | if (s & RQ_NET_PENDING) | ||
254 | return; | ||
255 | if (s & RQ_LOCAL_PENDING) | ||
256 | return; | ||
257 | |||
258 | if (req->master_bio) { | ||
259 | /* this is data_received (remote read) | ||
260 | * or protocol C P_WRITE_ACK | ||
261 | * or protocol B P_RECV_ACK | ||
262 | * or protocol A "handed_over_to_network" (SendAck) | ||
263 | * or canceled or failed, | ||
264 | * or killed from the transfer log due to connection loss. | ||
265 | */ | ||
266 | |||
267 | /* | ||
268 | * figure out whether to report success or failure. | ||
269 | * | ||
270 | * report success when at least one of the operations succeeded. | ||
271 | * or, to put the other way, | ||
272 | * only report failure, when both operations failed. | ||
273 | * | ||
274 | * what to do about the failures is handled elsewhere. | ||
275 | * what we need to do here is just: complete the master_bio. | ||
276 | * | ||
277 | * local completion error, if any, has been stored as ERR_PTR | ||
278 | * in private_bio within drbd_endio_pri. | ||
279 | */ | ||
280 | int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); | ||
281 | int error = PTR_ERR(req->private_bio); | ||
282 | |||
283 | /* remove the request from the conflict detection | ||
284 | * respective block_id verification hash */ | ||
285 | if (!hlist_unhashed(&req->colision)) | ||
286 | hlist_del(&req->colision); | ||
287 | else | ||
288 | D_ASSERT((s & RQ_NET_MASK) == 0); | ||
289 | |||
290 | /* for writes we need to do some extra housekeeping */ | ||
291 | if (rw == WRITE) | ||
292 | _about_to_complete_local_write(mdev, req); | ||
293 | |||
294 | /* Update disk stats */ | ||
295 | _drbd_end_io_acct(mdev, req); | ||
296 | |||
297 | m->error = ok ? 0 : (error ?: -EIO); | ||
298 | m->bio = req->master_bio; | ||
299 | req->master_bio = NULL; | ||
300 | } | ||
301 | |||
302 | if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { | ||
303 | /* this is disconnected (local only) operation, | ||
304 | * or protocol C P_WRITE_ACK, | ||
305 | * or protocol A or B P_BARRIER_ACK, | ||
306 | * or killed from the transfer log due to connection loss. */ | ||
307 | _req_is_done(mdev, req, rw); | ||
308 | } | ||
309 | /* else: network part and not DONE yet. that is | ||
310 | * protocol A or B, barrier ack still pending... */ | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * checks whether there was an overlapping request | ||
315 | * or ee already registered. | ||
316 | * | ||
317 | * if so, return 1, in which case this request is completed on the spot, | ||
318 | * without ever being submitted or sent. | ||
319 | * | ||
320 | * return 0 if it is ok to submit this request. | ||
321 | * | ||
322 | * NOTE: | ||
323 | * paranoia: assume something above us is broken, and issues different write | ||
324 | * requests for the same block simultaneously... | ||
325 | * | ||
326 | * To ensure these won't be reordered differently on both nodes, resulting in | ||
327 | * diverging data sets, we discard the later one(s). Not that this is supposed | ||
328 | * to happen, but this is the rationale why we also have to check for | ||
329 | * conflicting requests with local origin, and why we have to do so regardless | ||
330 | * of whether we allowed multiple primaries. | ||
331 | * | ||
332 | * BTW, in case we only have one primary, the ee_hash is empty anyways, and the | ||
333 | * second hlist_for_each_entry becomes a noop. This is even simpler than to | ||
334 | * grab a reference on the net_conf, and check for the two_primaries flag... | ||
335 | */ | ||
336 | static int _req_conflicts(struct drbd_request *req) | ||
337 | { | ||
338 | struct drbd_conf *mdev = req->mdev; | ||
339 | const sector_t sector = req->sector; | ||
340 | const int size = req->size; | ||
341 | struct drbd_request *i; | ||
342 | struct drbd_epoch_entry *e; | ||
343 | struct hlist_node *n; | ||
344 | struct hlist_head *slot; | ||
345 | |||
346 | D_ASSERT(hlist_unhashed(&req->colision)); | ||
347 | |||
348 | if (!get_net_conf(mdev)) | ||
349 | return 0; | ||
350 | |||
351 | /* BUG_ON */ | ||
352 | ERR_IF (mdev->tl_hash_s == 0) | ||
353 | goto out_no_conflict; | ||
354 | BUG_ON(mdev->tl_hash == NULL); | ||
355 | |||
356 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
357 | slot = tl_hash_slot(mdev, sector); | ||
358 | hlist_for_each_entry(i, n, slot, colision) { | ||
359 | if (OVERLAPS) { | ||
360 | dev_alert(DEV, "%s[%u] Concurrent local write detected! " | ||
361 | "[DISCARD L] new: %llus +%u; " | ||
362 | "pending: %llus +%u\n", | ||
363 | current->comm, current->pid, | ||
364 | (unsigned long long)sector, size, | ||
365 | (unsigned long long)i->sector, i->size); | ||
366 | goto out_conflict; | ||
367 | } | ||
368 | } | ||
369 | |||
370 | if (mdev->ee_hash_s) { | ||
371 | /* now, check for overlapping requests with remote origin */ | ||
372 | BUG_ON(mdev->ee_hash == NULL); | ||
373 | #undef OVERLAPS | ||
374 | #define OVERLAPS overlaps(e->sector, e->size, sector, size) | ||
375 | slot = ee_hash_slot(mdev, sector); | ||
376 | hlist_for_each_entry(e, n, slot, colision) { | ||
377 | if (OVERLAPS) { | ||
378 | dev_alert(DEV, "%s[%u] Concurrent remote write detected!" | ||
379 | " [DISCARD L] new: %llus +%u; " | ||
380 | "pending: %llus +%u\n", | ||
381 | current->comm, current->pid, | ||
382 | (unsigned long long)sector, size, | ||
383 | (unsigned long long)e->sector, e->size); | ||
384 | goto out_conflict; | ||
385 | } | ||
386 | } | ||
387 | } | ||
388 | #undef OVERLAPS | ||
389 | |||
390 | out_no_conflict: | ||
391 | /* this is like it should be, and what we expected. | ||
392 | * our users do behave after all... */ | ||
393 | put_net_conf(mdev); | ||
394 | return 0; | ||
395 | |||
396 | out_conflict: | ||
397 | put_net_conf(mdev); | ||
398 | return 1; | ||
399 | } | ||
400 | |||
401 | /* obviously this could be coded as many single functions | ||
402 | * instead of one huge switch, | ||
403 | * or by putting the code directly in the respective locations | ||
404 | * (as it has been before). | ||
405 | * | ||
406 | * but having it this way | ||
407 | * enforces that it is all in this one place, where it is easier to audit, | ||
408 | * it makes it obvious that whatever "event" "happens" to a request should | ||
409 | * happen "atomically" within the req_lock, | ||
410 | * and it enforces that we have to think in a very structured manner | ||
411 | * about the "events" that may happen to a request during its life time ... | ||
412 | */ | ||
413 | void __req_mod(struct drbd_request *req, enum drbd_req_event what, | ||
414 | struct bio_and_error *m) | ||
415 | { | ||
416 | struct drbd_conf *mdev = req->mdev; | ||
417 | m->bio = NULL; | ||
418 | |||
419 | trace_drbd_req(req, what, NULL); | ||
420 | |||
421 | switch (what) { | ||
422 | default: | ||
423 | dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); | ||
424 | break; | ||
425 | |||
426 | /* does not happen... | ||
427 | * initialization done in drbd_req_new | ||
428 | case created: | ||
429 | break; | ||
430 | */ | ||
431 | |||
432 | case to_be_send: /* via network */ | ||
433 | /* reached via drbd_make_request_common | ||
434 | * and from w_read_retry_remote */ | ||
435 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
436 | req->rq_state |= RQ_NET_PENDING; | ||
437 | inc_ap_pending(mdev); | ||
438 | break; | ||
439 | |||
440 | case to_be_submitted: /* locally */ | ||
441 | /* reached via drbd_make_request_common */ | ||
442 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); | ||
443 | req->rq_state |= RQ_LOCAL_PENDING; | ||
444 | break; | ||
445 | |||
446 | case completed_ok: | ||
447 | if (bio_data_dir(req->master_bio) == WRITE) | ||
448 | mdev->writ_cnt += req->size>>9; | ||
449 | else | ||
450 | mdev->read_cnt += req->size>>9; | ||
451 | |||
452 | req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); | ||
453 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
454 | |||
455 | _req_may_be_done(req, m); | ||
456 | put_ldev(mdev); | ||
457 | break; | ||
458 | |||
459 | case write_completed_with_error: | ||
460 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
461 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
462 | |||
463 | dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", | ||
464 | (unsigned long long)req->sector, req->size); | ||
465 | /* and now: check how to handle local io error. */ | ||
466 | __drbd_chk_io_error(mdev, FALSE); | ||
467 | _req_may_be_done(req, m); | ||
468 | put_ldev(mdev); | ||
469 | break; | ||
470 | |||
471 | case read_ahead_completed_with_error: | ||
472 | /* it is legal to fail READA */ | ||
473 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
474 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
475 | _req_may_be_done(req, m); | ||
476 | put_ldev(mdev); | ||
477 | break; | ||
478 | |||
479 | case read_completed_with_error: | ||
480 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
481 | |||
482 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
483 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
484 | |||
485 | dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", | ||
486 | (unsigned long long)req->sector, req->size); | ||
487 | /* _req_mod(req,to_be_send); oops, recursion... */ | ||
488 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
489 | req->rq_state |= RQ_NET_PENDING; | ||
490 | inc_ap_pending(mdev); | ||
491 | |||
492 | __drbd_chk_io_error(mdev, FALSE); | ||
493 | put_ldev(mdev); | ||
494 | /* NOTE: if we have no connection, | ||
495 | * or know the peer has no good data either, | ||
496 | * then we don't actually need to "queue_for_net_read", | ||
497 | * but we do so anyways, since the drbd_io_error() | ||
498 | * and the potential state change to "Diskless" | ||
499 | * needs to be done from process context */ | ||
500 | |||
501 | /* fall through: _req_mod(req,queue_for_net_read); */ | ||
502 | |||
503 | case queue_for_net_read: | ||
504 | /* READ or READA, and | ||
505 | * no local disk, | ||
506 | * or target area marked as invalid, | ||
507 | * or just got an io-error. */ | ||
508 | /* from drbd_make_request_common | ||
509 | * or from bio_endio during read io-error recovery */ | ||
510 | |||
511 | /* so we can verify the handle in the answer packet | ||
512 | * corresponding hlist_del is in _req_may_be_done() */ | ||
513 | hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); | ||
514 | |||
515 | set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */ | ||
516 | |||
517 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
518 | req->rq_state |= RQ_NET_QUEUED; | ||
519 | req->w.cb = (req->rq_state & RQ_LOCAL_MASK) | ||
520 | ? w_read_retry_remote | ||
521 | : w_send_read_req; | ||
522 | drbd_queue_work(&mdev->data.work, &req->w); | ||
523 | break; | ||
524 | |||
525 | case queue_for_net_write: | ||
526 | /* assert something? */ | ||
527 | /* from drbd_make_request_common only */ | ||
528 | |||
529 | hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); | ||
530 | /* corresponding hlist_del is in _req_may_be_done() */ | ||
531 | |||
532 | /* NOTE | ||
533 | * In case the req ended up on the transfer log before being | ||
534 | * queued on the worker, it could lead to this request being | ||
535 | * missed during cleanup after connection loss. | ||
536 | * So we have to do both operations here, | ||
537 | * within the same lock that protects the transfer log. | ||
538 | * | ||
539 | * _req_add_to_epoch(req); this has to be after the | ||
540 | * _maybe_start_new_epoch(req); which happened in | ||
541 | * drbd_make_request_common, because we now may set the bit | ||
542 | * again ourselves to close the current epoch. | ||
543 | * | ||
544 | * Add req to the (now) current epoch (barrier). */ | ||
545 | |||
546 | /* see drbd_make_request_common, | ||
547 | * just after it grabs the req_lock */ | ||
548 | D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); | ||
549 | |||
550 | req->epoch = mdev->newest_tle->br_number; | ||
551 | list_add_tail(&req->tl_requests, | ||
552 | &mdev->newest_tle->requests); | ||
553 | |||
554 | /* increment size of current epoch */ | ||
555 | mdev->newest_tle->n_req++; | ||
556 | |||
557 | /* queue work item to send data */ | ||
558 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
559 | req->rq_state |= RQ_NET_QUEUED; | ||
560 | req->w.cb = w_send_dblock; | ||
561 | drbd_queue_work(&mdev->data.work, &req->w); | ||
562 | |||
563 | /* close the epoch, in case it outgrew the limit */ | ||
564 | if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) | ||
565 | queue_barrier(mdev); | ||
566 | |||
567 | break; | ||
568 | |||
569 | case send_canceled: | ||
570 | /* treat it the same */ | ||
571 | case send_failed: | ||
572 | /* real cleanup will be done from tl_clear. just update flags | ||
573 | * so it is no longer marked as on the worker queue */ | ||
574 | req->rq_state &= ~RQ_NET_QUEUED; | ||
575 | /* if we did it right, tl_clear should be scheduled only after | ||
576 | * this, so this should not be necessary! */ | ||
577 | _req_may_be_done(req, m); | ||
578 | break; | ||
579 | |||
580 | case handed_over_to_network: | ||
581 | /* assert something? */ | ||
582 | if (bio_data_dir(req->master_bio) == WRITE && | ||
583 | mdev->net_conf->wire_protocol == DRBD_PROT_A) { | ||
584 | /* this is what is dangerous about protocol A: | ||
585 | * pretend it was successfully written on the peer. */ | ||
586 | if (req->rq_state & RQ_NET_PENDING) { | ||
587 | dec_ap_pending(mdev); | ||
588 | req->rq_state &= ~RQ_NET_PENDING; | ||
589 | req->rq_state |= RQ_NET_OK; | ||
590 | } /* else: neg-ack was faster... */ | ||
591 | /* it is still not yet RQ_NET_DONE until the | ||
592 | * corresponding epoch barrier got acked as well, | ||
593 | * so we know what to dirty on connection loss */ | ||
594 | } | ||
595 | req->rq_state &= ~RQ_NET_QUEUED; | ||
596 | req->rq_state |= RQ_NET_SENT; | ||
597 | /* because _drbd_send_zc_bio could sleep, and may want to | ||
598 | * dereference the bio even after the "write_acked_by_peer" and | ||
599 | * "completed_ok" events came in, once we return from | ||
600 | * _drbd_send_zc_bio (drbd_send_dblock), we have to check | ||
601 | * whether it is done already, and end it. */ | ||
602 | _req_may_be_done(req, m); | ||
603 | break; | ||
604 | |||
605 | case connection_lost_while_pending: | ||
606 | /* transfer log cleanup after connection loss */ | ||
607 | /* assert something? */ | ||
608 | if (req->rq_state & RQ_NET_PENDING) | ||
609 | dec_ap_pending(mdev); | ||
610 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
611 | req->rq_state |= RQ_NET_DONE; | ||
612 | /* if it is still queued, we may not complete it here. | ||
613 | * it will be canceled soon. */ | ||
614 | if (!(req->rq_state & RQ_NET_QUEUED)) | ||
615 | _req_may_be_done(req, m); | ||
616 | break; | ||
617 | |||
618 | case write_acked_by_peer_and_sis: | ||
619 | req->rq_state |= RQ_NET_SIS; | ||
620 | case conflict_discarded_by_peer: | ||
621 | /* for discarded conflicting writes of multiple primaries, | ||
622 | * there is no need to keep anything in the tl, potential | ||
623 | * node crashes are covered by the activity log. */ | ||
624 | if (what == conflict_discarded_by_peer) | ||
625 | dev_alert(DEV, "Got DiscardAck packet %llus +%u!" | ||
626 | " DRBD is not a random data generator!\n", | ||
627 | (unsigned long long)req->sector, req->size); | ||
628 | req->rq_state |= RQ_NET_DONE; | ||
629 | /* fall through */ | ||
630 | case write_acked_by_peer: | ||
631 | /* protocol C; successfully written on peer. | ||
632 | * Nothing to do here. | ||
633 | * We want to keep the tl in place for all protocols, to cater | ||
634 | * for volatile write-back caches on lower level devices. | ||
635 | * | ||
636 | * A barrier request is expected to have forced all prior | ||
637 | * requests onto stable storage, so completion of a barrier | ||
638 | * request could set NET_DONE right here, and not wait for the | ||
639 | * P_BARRIER_ACK, but that is an unnecessary optimization. */ | ||
640 | |||
641 | /* this makes it effectively the same as for: */ | ||
642 | case recv_acked_by_peer: | ||
643 | /* protocol B; pretends to be successfully written on peer. | ||
644 | * see also notes above in handed_over_to_network about | ||
645 | * protocol != C */ | ||
646 | req->rq_state |= RQ_NET_OK; | ||
647 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
648 | dec_ap_pending(mdev); | ||
649 | req->rq_state &= ~RQ_NET_PENDING; | ||
650 | _req_may_be_done(req, m); | ||
651 | break; | ||
652 | |||
653 | case neg_acked: | ||
654 | /* assert something? */ | ||
655 | if (req->rq_state & RQ_NET_PENDING) | ||
656 | dec_ap_pending(mdev); | ||
657 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
658 | |||
659 | req->rq_state |= RQ_NET_DONE; | ||
660 | _req_may_be_done(req, m); | ||
661 | /* else: done by handed_over_to_network */ | ||
662 | break; | ||
663 | |||
664 | case barrier_acked: | ||
665 | if (req->rq_state & RQ_NET_PENDING) { | ||
666 | /* barrier came in before all requests have been acked. | ||
667 | * this is bad, because if the connection is lost now, | ||
668 | * we won't be able to clean them up... */ | ||
669 | dev_err(DEV, "FIXME (barrier_acked but pending)\n"); | ||
670 | trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)"); | ||
671 | list_move(&req->tl_requests, &mdev->out_of_sequence_requests); | ||
672 | } | ||
673 | D_ASSERT(req->rq_state & RQ_NET_SENT); | ||
674 | req->rq_state |= RQ_NET_DONE; | ||
675 | _req_may_be_done(req, m); | ||
676 | break; | ||
677 | |||
678 | case data_received: | ||
679 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | ||
680 | dec_ap_pending(mdev); | ||
681 | req->rq_state &= ~RQ_NET_PENDING; | ||
682 | req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); | ||
683 | _req_may_be_done(req, m); | ||
684 | break; | ||
685 | }; | ||
686 | } | ||
687 | |||
688 | /* we may do a local read if: | ||
689 | * - we are consistent (of course), | ||
690 | * - or we are generally inconsistent, | ||
691 | * BUT we are still/already IN SYNC for this area. | ||
692 | * since size may be bigger than BM_BLOCK_SIZE, | ||
693 | * we may need to check several bits. | ||
694 | */ | ||
695 | static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) | ||
696 | { | ||
697 | unsigned long sbnr, ebnr; | ||
698 | sector_t esector, nr_sectors; | ||
699 | |||
700 | if (mdev->state.disk == D_UP_TO_DATE) | ||
701 | return 1; | ||
702 | if (mdev->state.disk >= D_OUTDATED) | ||
703 | return 0; | ||
704 | if (mdev->state.disk < D_INCONSISTENT) | ||
705 | return 0; | ||
706 | /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ | ||
707 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
708 | esector = sector + (size >> 9) - 1; | ||
709 | |||
710 | D_ASSERT(sector < nr_sectors); | ||
711 | D_ASSERT(esector < nr_sectors); | ||
712 | |||
713 | sbnr = BM_SECT_TO_BIT(sector); | ||
714 | ebnr = BM_SECT_TO_BIT(esector); | ||
715 | |||
716 | return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); | ||
717 | } | ||
718 | |||
/*
 * drbd_make_request_common() - set up and dispatch one drbd_request for bio.
 *
 * Allocates a request, decides whether it is served locally (get_ldev()
 * succeeded and, for reads, drbd_may_do_local_read() agrees) and/or
 * remotely (every WRITE; reads only when not local and the peer disk state
 * allows it).  Then, under mdev->req_lock, the request is marked via
 * _req_mod(), and finally the private bio is submitted with
 * generic_make_request().
 *
 * Every path returns 0.  On the error paths the bio is completed right here
 * with bio_endio(): -ENOMEM if the request or the barrier object could not
 * be allocated, -EWOULDBLOCK for a READA that cannot be served locally,
 * -EIO if neither a local nor a remote disk is usable.
 *
 * Each exit that completes the bio here also calls dec_ap_bio(); presumably
 * that drops the reference the caller took with inc_ap_bio() -- TODO confirm.
 */
719 | static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) | ||
720 | { | ||
721 | const int rw = bio_rw(bio); | ||
722 | const int size = bio->bi_size; | ||
723 | const sector_t sector = bio->bi_sector; | ||
724 | struct drbd_tl_epoch *b = NULL; | ||
725 | struct drbd_request *req; | ||
726 | int local, remote; | ||
727 | int err = -EIO; | ||
728 | |||
729 | /* allocate outside of all locks; */ | ||
730 | req = drbd_req_new(mdev, bio); | ||
731 | if (!req) { | ||
732 | dec_ap_bio(mdev); | ||
733 | /* only pass the error to the upper layers. | ||
734 | * if user cannot handle io errors, that's not our business. */ | ||
735 | dev_err(DEV, "could not kmalloc() req\n"); | ||
736 | bio_endio(bio, -ENOMEM); | ||
737 | return 0; | ||
738 | } | ||
739 | |||
740 | trace_drbd_bio(mdev, "Rq", bio, 0, req); | ||
741 | |||
/* 'local' doubles as "we hold the get_ldev() reference": every path below
 * that gives up on local I/O after this point pairs it with put_ldev(). */
742 | local = get_ldev(mdev); | ||
743 | if (!local) { | ||
744 | bio_put(req->private_bio); /* or we get a bio leak */ | ||
745 | req->private_bio = NULL; | ||
746 | } | ||
747 | if (rw == WRITE) { | ||
748 | remote = 1; | ||
749 | } else { | ||
750 | /* READ || READA */ | ||
751 | if (local) { | ||
752 | if (!drbd_may_do_local_read(mdev, sector, size)) { | ||
753 | /* we could kick the syncer to | ||
754 | * sync this extent asap, wait for | ||
755 | * it, then continue locally. | ||
756 | * Or just issue the request remotely. | ||
757 | */ | ||
758 | local = 0; | ||
759 | bio_put(req->private_bio); | ||
760 | req->private_bio = NULL; | ||
761 | put_ldev(mdev); | ||
762 | } | ||
763 | } | ||
764 | remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; | ||
765 | } | ||
766 | |||
767 | /* If we have a disk, but a READA request is mapped to remote, | ||
768 | * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. | ||
769 | * Just fail that READA request right here. | ||
770 | * | ||
771 | * THINK: maybe fail all READA when not local? | ||
772 | * or make this configurable... | ||
773 | * if network is slow, READA won't do any good. | ||
774 | */ | ||
775 | if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { | ||
776 | err = -EWOULDBLOCK; | ||
777 | goto fail_and_free_req; | ||
778 | } | ||
779 | |||
780 | /* For WRITES going to the local disk, grab a reference on the target | ||
781 | * extent. This waits for any resync activity in the corresponding | ||
782 | * resync extent to finish, and, if necessary, pulls in the target | ||
783 | * extent into the activity log, which involves further disk io because | ||
784 | * of transactional on-disk meta data updates. */ | ||
785 | if (rw == WRITE && local) | ||
786 | drbd_al_begin_io(mdev, sector); | ||
787 | |||
788 | remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || | ||
789 | (mdev->state.pdsk == D_INCONSISTENT && | ||
790 | mdev->state.conn >= C_CONNECTED)); | ||
791 | |||
792 | if (!(local || remote)) { | ||
793 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
794 | goto fail_free_complete; | ||
795 | } | ||
796 | |||
797 | /* For WRITE request, we have to make sure that we have an | ||
798 | * unused_spare_tle, in case we need to start a new epoch. | ||
799 | * I try to be smart and avoid to pre-allocate always "just in case", | ||
800 | * but there is a race between testing the bit and pointer outside the | ||
801 | * spinlock, and grabbing the spinlock. | ||
802 | * if we lost that race, we retry. */ | ||
803 | if (rw == WRITE && remote && | ||
804 | mdev->unused_spare_tle == NULL && | ||
805 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
806 | allocate_barrier: | ||
807 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); | ||
808 | if (!b) { | ||
809 | dev_err(DEV, "Failed to alloc barrier.\n"); | ||
810 | err = -ENOMEM; | ||
811 | goto fail_free_complete; | ||
812 | } | ||
813 | } | ||
814 | |||
815 | /* GOOD, everything prepared, grab the spin_lock */ | ||
816 | spin_lock_irq(&mdev->req_lock); | ||
817 | |||
/* re-evaluate 'remote' now that we hold the lock: the peer state tested
 * above was sampled without it and may have changed in the meantime. */
818 | if (remote) { | ||
819 | remote = (mdev->state.pdsk == D_UP_TO_DATE || | ||
820 | (mdev->state.pdsk == D_INCONSISTENT && | ||
821 | mdev->state.conn >= C_CONNECTED)); | ||
822 | if (!remote) | ||
823 | dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); | ||
824 | if (!(local || remote)) { | ||
825 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
826 | spin_unlock_irq(&mdev->req_lock); | ||
827 | goto fail_free_complete; | ||
828 | } | ||
829 | } | ||
830 | |||
831 | if (b && mdev->unused_spare_tle == NULL) { | ||
832 | mdev->unused_spare_tle = b; | ||
833 | b = NULL; | ||
834 | } | ||
835 | if (rw == WRITE && remote && | ||
836 | mdev->unused_spare_tle == NULL && | ||
837 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
838 | /* someone closed the current epoch | ||
839 | * while we were grabbing the spinlock */ | ||
840 | spin_unlock_irq(&mdev->req_lock); | ||
841 | goto allocate_barrier; | ||
842 | } | ||
843 | |||
844 | |||
845 | /* Update disk stats */ | ||
846 | _drbd_start_io_acct(mdev, req, bio); | ||
847 | |||
848 | /* _maybe_start_new_epoch(mdev); | ||
849 | * If we need to generate a write barrier packet, we have to add the | ||
850 | * new epoch (barrier) object, and queue the barrier packet for sending, | ||
851 | * and queue the req's data after it _within the same lock_, otherwise | ||
852 | * we have race conditions where the reorder domains could be mixed up. | ||
853 | * | ||
854 | * Even read requests may start a new epoch and queue the corresponding | ||
855 | * barrier packet. To get the write ordering right, we only have to | ||
856 | * make sure that, if this is a write request and it triggered a | ||
857 | * barrier packet, this request is queued within the same spinlock. */ | ||
858 | if (remote && mdev->unused_spare_tle && | ||
859 | test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
860 | _tl_add_barrier(mdev, mdev->unused_spare_tle); | ||
861 | mdev->unused_spare_tle = NULL; | ||
862 | } else { | ||
863 | D_ASSERT(!(remote && rw == WRITE && | ||
864 | test_bit(CREATE_BARRIER, &mdev->flags))); | ||
865 | } | ||
866 | |||
867 | /* NOTE | ||
868 | * Actually, 'local' may be wrong here already, since we may have failed | ||
869 | * to write to the meta data, and may become wrong anytime because of | ||
870 | * local io-error for some other request, which would lead to us | ||
871 | * "detaching" the local disk. | ||
872 | * | ||
873 | * 'remote' may become wrong any time because the network could fail. | ||
874 | * | ||
875 | * This is a harmless race condition, though, since it is handled | ||
876 | * correctly at the appropriate places; so it just defers the failure | ||
877 | * of the respective operation. | ||
878 | */ | ||
879 | |||
880 | /* mark them early for readability. | ||
881 | * this just sets some state flags. */ | ||
882 | if (remote) | ||
883 | _req_mod(req, to_be_send); | ||
884 | if (local) | ||
885 | _req_mod(req, to_be_submitted); | ||
886 | |||
887 | /* check this request on the collision detection hash tables. | ||
888 | * if we have a conflict, just complete it here. | ||
889 | * THINK do we want to check reads, too? (I don't think so...) */ | ||
890 | if (rw == WRITE && _req_conflicts(req)) { | ||
891 | /* this is a conflicting request. | ||
892 | * even though it may have been only _partially_ | ||
893 | * overlapping with one of the currently pending requests, | ||
894 | * without even submitting or sending it, we will | ||
895 | * pretend that it was successfully served right now. | ||
896 | */ | ||
897 | if (local) { | ||
898 | bio_put(req->private_bio); | ||
899 | req->private_bio = NULL; | ||
900 | drbd_al_complete_io(mdev, req->sector); | ||
901 | put_ldev(mdev); | ||
902 | local = 0; | ||
903 | } | ||
904 | if (remote) | ||
905 | dec_ap_pending(mdev); | ||
906 | _drbd_end_io_acct(mdev, req); | ||
907 | /* THINK: do we want to fail it (-EIO), or pretend success? */ | ||
908 | bio_endio(req->master_bio, 0); | ||
909 | req->master_bio = NULL; | ||
910 | dec_ap_bio(mdev); | ||
911 | drbd_req_free(req); | ||
/* req has just been freed: with both 'local' and 'remote' cleared,
 * none of the code below dereferences it any more. */
912 | remote = 0; | ||
913 | } | ||
914 | |||
915 | /* NOTE remote first: to get the concurrent write detection right, | ||
916 | * we must register the request before start of local IO. */ | ||
917 | if (remote) { | ||
918 | /* either WRITE and C_CONNECTED, | ||
919 | * or READ, and no local disk, | ||
920 | * or READ, but not in sync. | ||
921 | */ | ||
922 | _req_mod(req, (rw == WRITE) | ||
923 | ? queue_for_net_write | ||
924 | : queue_for_net_read); | ||
925 | } | ||
926 | spin_unlock_irq(&mdev->req_lock); | ||
927 | kfree(b); /* if someone else has beaten us to it... */ | ||
928 | |||
929 | if (local) { | ||
930 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | ||
931 | |||
932 | trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL); | ||
933 | |||
934 | if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR | ||
935 | : rw == READ ? DRBD_FAULT_DT_RD | ||
936 | : DRBD_FAULT_DT_RA)) | ||
937 | bio_endio(req->private_bio, -EIO); | ||
938 | else | ||
939 | generic_make_request(req->private_bio); | ||
940 | } | ||
941 | |||
942 | /* we need to plug ALWAYS since we possibly need to kick lo_dev. | ||
943 | * we plug after submit, so we won't miss an unplug event */ | ||
944 | drbd_plug_device(mdev); | ||
945 | |||
946 | return 0; | ||
947 | |||
/* Error exits.  fail_free_complete additionally undoes drbd_al_begin_io()
 * for local writes; the READA case jumps straight to fail_and_free_req
 * because it bails out before drbd_al_begin_io() is reached. */
948 | fail_free_complete: | ||
949 | if (rw == WRITE && local) | ||
950 | drbd_al_complete_io(mdev, sector); | ||
951 | fail_and_free_req: | ||
952 | if (local) { | ||
953 | bio_put(req->private_bio); | ||
954 | req->private_bio = NULL; | ||
955 | put_ldev(mdev); | ||
956 | } | ||
957 | bio_endio(bio, err); | ||
958 | drbd_req_free(req); | ||
959 | dec_ap_bio(mdev); | ||
960 | kfree(b); | ||
961 | |||
962 | return 0; | ||
963 | } | ||
964 | |||
965 | /* helper function for drbd_make_request | ||
966 | * if we can determine just by the mdev (state) that this request will fail, | ||
967 | * return 1 | ||
968 | * otherwise return 0 | ||
969 | */ | ||
970 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | ||
971 | { | ||
972 | /* Unconfigured */ | ||
973 | if (mdev->state.conn == C_DISCONNECTING && | ||
974 | mdev->state.disk == D_DISKLESS) | ||
975 | return 1; | ||
976 | |||
977 | if (mdev->state.role != R_PRIMARY && | ||
978 | (!allow_oos || is_write)) { | ||
979 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
980 | dev_err(DEV, "Process %s[%u] tried to %s; " | ||
981 | "since we are not in Primary state, " | ||
982 | "we cannot allow this\n", | ||
983 | current->comm, current->pid, | ||
984 | is_write ? "WRITE" : "READ"); | ||
985 | } | ||
986 | return 1; | ||
987 | } | ||
988 | |||
989 | /* | ||
990 | * Paranoia: we might have been primary, but sync target, or | ||
991 | * even diskless, then lost the connection. | ||
992 | * This should have been handled (panic? suspend?) somewhere | ||
993 | * else. But maybe it was not, so check again here. | ||
994 | * Caution: as long as we do not have a read/write lock on mdev, | ||
995 | * to serialize state changes, this is racy, since we may lose | ||
996 | * the connection *after* we test for the cstate. | ||
997 | */ | ||
998 | if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { | ||
999 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1000 | dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); | ||
1001 | return 1; | ||
1002 | } | ||
1003 | |||
1004 | return 0; | ||
1005 | } | ||
1006 | |||
1007 | int drbd_make_request_26(struct request_queue *q, struct bio *bio) | ||
1008 | { | ||
1009 | unsigned int s_enr, e_enr; | ||
1010 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | ||
1011 | |||
1012 | if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { | ||
1013 | bio_endio(bio, -EPERM); | ||
1014 | return 0; | ||
1015 | } | ||
1016 | |||
1017 | /* Reject barrier requests if we know the underlying device does | ||
1018 | * not support them. | ||
1019 | * XXX: Need to get this info from peer as well some how so we | ||
1020 | * XXX: reject if EITHER side/data/metadata area does not support them. | ||
1021 | * | ||
1022 | * because of those XXX, this is not yet enabled, | ||
1023 | * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. | ||
1024 | */ | ||
1025 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { | ||
1026 | /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ | ||
1027 | bio_endio(bio, -EOPNOTSUPP); | ||
1028 | return 0; | ||
1029 | } | ||
1030 | |||
1031 | /* | ||
1032 | * what we "blindly" assume: | ||
1033 | */ | ||
1034 | D_ASSERT(bio->bi_size > 0); | ||
1035 | D_ASSERT((bio->bi_size & 0x1ff) == 0); | ||
1036 | D_ASSERT(bio->bi_idx == 0); | ||
1037 | |||
1038 | /* to make some things easier, force alignment of requests within the | ||
1039 | * granularity of our hash tables */ | ||
1040 | s_enr = bio->bi_sector >> HT_SHIFT; | ||
1041 | e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; | ||
1042 | |||
1043 | if (likely(s_enr == e_enr)) { | ||
1044 | inc_ap_bio(mdev, 1); | ||
1045 | return drbd_make_request_common(mdev, bio); | ||
1046 | } | ||
1047 | |||
1048 | /* can this bio be split generically? | ||
1049 | * Maybe add our own split-arbitrary-bios function. */ | ||
1050 | if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { | ||
1051 | /* rather error out here than BUG in bio_split */ | ||
1052 | dev_err(DEV, "bio would need to, but cannot, be split: " | ||
1053 | "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", | ||
1054 | bio->bi_vcnt, bio->bi_idx, bio->bi_size, | ||
1055 | (unsigned long long)bio->bi_sector); | ||
1056 | bio_endio(bio, -EINVAL); | ||
1057 | } else { | ||
1058 | /* This bio crosses some boundary, so we have to split it. */ | ||
1059 | struct bio_pair *bp; | ||
1060 | /* works for the "do not cross hash slot boundaries" case | ||
1061 | * e.g. sector 262269, size 4096 | ||
1062 | * s_enr = 262269 >> 6 = 4097 | ||
1063 | * e_enr = (262269+8-1) >> 6 = 4098 | ||
1064 | * HT_SHIFT = 6 | ||
1065 | * sps = 64, mask = 63 | ||
1066 | * first_sectors = 64 - (262269 & 63) = 3 | ||
1067 | */ | ||
1068 | const sector_t sect = bio->bi_sector; | ||
1069 | const int sps = 1 << HT_SHIFT; /* sectors per slot */ | ||
1070 | const int mask = sps - 1; | ||
1071 | const sector_t first_sectors = sps - (sect & mask); | ||
1072 | bp = bio_split(bio, | ||
1073 | #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) | ||
1074 | bio_split_pool, | ||
1075 | #endif | ||
1076 | first_sectors); | ||
1077 | |||
1078 | /* we need to get a "reference count" (ap_bio_cnt) | ||
1079 | * to avoid races with the disconnect/reconnect/suspend code. | ||
1080 | * In case we need to split the bio here, we need to get two references | ||
1081 | * atomically, otherwise we might deadlock when trying to submit the | ||
1082 | * second one! */ | ||
1083 | inc_ap_bio(mdev, 2); | ||
1084 | |||
1085 | D_ASSERT(e_enr == s_enr + 1); | ||
1086 | |||
1087 | drbd_make_request_common(mdev, &bp->bio1); | ||
1088 | drbd_make_request_common(mdev, &bp->bio2); | ||
1089 | bio_pair_release(bp); | ||
1090 | } | ||
1091 | return 0; | ||
1092 | } | ||
1093 | |||
1094 | /* This is called by bio_add_page(). With this function we reduce | ||
1095 | * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs | ||
1096 | * units (was AL_EXTENTs). | ||
1097 | * | ||
1098 | * we do the calculation within the lower 32bit of the byte offsets, | ||
1099 | * since we don't care for actual offset, but only check whether it | ||
1100 | * would cross "activity log extent" boundaries. | ||
1101 | * | ||
1102 | * As long as the BIO is empty we have to allow at least one bvec, | ||
1103 | * regardless of size and offset. so the resulting bio may still | ||
1104 | * cross extent boundaries. those are dealt with (bio_split) in | ||
1105 | * drbd_make_request_26. | ||
1106 | */ | ||
1107 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) | ||
1108 | { | ||
1109 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | ||
1110 | unsigned int bio_offset = | ||
1111 | (unsigned int)bvm->bi_sector << 9; /* 32 bit */ | ||
1112 | unsigned int bio_size = bvm->bi_size; | ||
1113 | int limit, backing_limit; | ||
1114 | |||
1115 | limit = DRBD_MAX_SEGMENT_SIZE | ||
1116 | - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); | ||
1117 | if (limit < 0) | ||
1118 | limit = 0; | ||
1119 | if (bio_size == 0) { | ||
1120 | if (limit <= bvec->bv_len) | ||
1121 | limit = bvec->bv_len; | ||
1122 | } else if (limit && get_ldev(mdev)) { | ||
1123 | struct request_queue * const b = | ||
1124 | mdev->ldev->backing_bdev->bd_disk->queue; | ||
1125 | if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { | ||
1126 | backing_limit = b->merge_bvec_fn(b, bvm, bvec); | ||
1127 | limit = min(limit, backing_limit); | ||
1128 | } | ||
1129 | put_ldev(mdev); | ||
1130 | } | ||
1131 | return limit; | ||
1132 | } | ||
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 00000000000..d37ab57f120 --- /dev/null +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -0,0 +1,327 @@ | |||
1 | /* | ||
2 | drbd_req.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
8 | Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
9 | |||
10 | DRBD is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | DRBD is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _DRBD_REQ_H | ||
26 | #define _DRBD_REQ_H | ||
27 | |||
28 | #include <linux/autoconf.h> | ||
29 | #include <linux/module.h> | ||
30 | |||
31 | #include <linux/slab.h> | ||
32 | #include <linux/drbd.h> | ||
33 | #include "drbd_int.h" | ||
34 | #include "drbd_wrappers.h" | ||
35 | |||
36 | /* The request callbacks will be called in irq context by the IDE drivers, | ||
37 | and in Softirqs/Tasklets/BH context by the SCSI drivers, | ||
38 | and by the receiver and worker in kernel-thread context. | ||
39 | Try to get the locking right :) */ | ||
40 | |||
41 | /* | ||
42 | * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are | ||
43 | * associated with IO requests originating from the block layer above us. | ||
44 | * | ||
45 | * There are quite a few things that may happen to a drbd request | ||
46 | * during its lifetime. | ||
47 | * | ||
48 | * It will be created. | ||
49 | * It will be marked with the intention to be | ||
50 | * submitted to local disk and/or | ||
51 | * sent via the network. | ||
52 | * | ||
53 | * It has to be placed on the transfer log and other housekeeping lists, | ||
54 | * In case we have a network connection. | ||
55 | * | ||
56 | * It may be identified as a concurrent (write) request | ||
57 | * and be handled accordingly. | ||
58 | * | ||
59 | * It may be handed over to the local disk subsystem. | ||
60 | * It may be completed by the local disk subsystem, | ||
61 | * either successfully or with io-error. | ||
62 | * In case it is a READ request, and it failed locally, | ||
63 | * it may be retried remotely. | ||
64 | * | ||
65 | * It may be queued for sending. | ||
66 | * It may be handed over to the network stack, | ||
67 | * which may fail. | ||
68 | * It may be acknowledged by the "peer" according to the wire_protocol in use. | ||
69 | * this may be a negative ack. | ||
70 | * It may receive a faked ack when the network connection is lost and the | ||
71 | * transfer log is cleaned up. | ||
72 | * Sending may be canceled due to network connection loss. | ||
73 | * When it finally has outlived its time, | ||
74 | * corresponding dirty bits in the resync-bitmap may be cleared or set, | ||
75 | * it will be destroyed, | ||
76 | * and completion will be signalled to the originator, | ||
77 | * with or without "success". | ||
78 | */ | ||
79 | |||
/*
 * What happened to (or shall be done with) a drbd_request.  Passed as the
 * 'what' argument to __req_mod(), _req_mod() and req_mod().
 */
80 | enum drbd_req_event { | ||
81 | created, | ||
82 | to_be_send, | ||
83 | to_be_submitted, | ||
84 | |||
85 | /* XXX yes, now I am inconsistent... | ||
86 | * these two are not "events" but "actions" | ||
87 | * oh, well... */ | ||
88 | queue_for_net_write, | ||
89 | queue_for_net_read, | ||
90 | |||
/* the events from here down to data_received concern the network part of a
 * request; __req_mod() handles send_canceled exactly like send_failed */
91 | send_canceled, | ||
92 | send_failed, | ||
93 | handed_over_to_network, | ||
94 | connection_lost_while_pending, | ||
95 | recv_acked_by_peer, | ||
96 | write_acked_by_peer, | ||
97 | write_acked_by_peer_and_sis, /* and set_in_sync */ | ||
98 | conflict_discarded_by_peer, | ||
99 | neg_acked, | ||
100 | barrier_acked, /* in protocol A and B */ | ||
101 | data_received, /* (remote read) */ | ||
102 | |||
/* presumably the completion results of the local disk I/O -- these are not
 * handled in the part of __req_mod() visible here, TODO confirm */
103 | read_completed_with_error, | ||
104 | read_ahead_completed_with_error, | ||
105 | write_completed_with_error, | ||
106 | completed_ok, | ||
107 | nothing, /* for tracing only */ | ||
108 | }; | ||
109 | |||
110 | /* encoding of request states for now. we don't actually need that many bits. | ||
111 | * we don't need to do atomic bit operations either, since most of the time we | ||
112 | * need to look at the connection state and/or manipulate some lists at the | ||
113 | * same time, so we should hold the request lock anyways. | ||
114 | */ | ||
/*
 * Bit numbers within a request's rq_state.  The RQ_* masks used by the code
 * are derived from these as (1UL << __RQ_xxx).  The first three entries
 * (bits 0-2) describe the local disk part, the following ones the network
 * part of a request.
 */
115 | enum drbd_req_state_bits { | ||
116 | /* 210 | ||
117 | * 000: no local possible | ||
118 | * 001: to be submitted | ||
119 | * UNUSED, we could map: 011: submitted, completion still pending | ||
120 | * 110: completed ok | ||
121 | * 010: completed with error | ||
122 | */ | ||
123 | __RQ_LOCAL_PENDING, | ||
124 | __RQ_LOCAL_COMPLETED, | ||
125 | __RQ_LOCAL_OK, | ||
126 | |||
127 | /* 76543 | ||
128 | * 00000: no network possible | ||
129 | * 00001: to be send | ||
130 | * 00011: to be send, on worker queue | ||
131 | * 00101: sent, expecting recv_ack (B) or write_ack (C) | ||
132 | * 11101: sent, | ||
133 | * recv_ack (B) or implicit "ack" (A), | ||
134 | * still waiting for the barrier ack. | ||
135 | * master_bio may already be completed and invalidated. | ||
136 | * 11100: write_acked (C), | ||
137 | * data_received (for remote read, any protocol) | ||
138 | * or finally the barrier ack has arrived (B,A)... | ||
139 | * request can be freed | ||
140 | * 01100: neg-acked (write, protocol C) | ||
141 | * or neg-d-acked (read, any protocol) | ||
142 | * or killed from the transfer log | ||
143 | * during cleanup after connection loss | ||
144 | * request can be freed | ||
145 | * 01000: canceled or send failed... | ||
146 | * request can be freed | ||
147 | */ | ||
148 | |||
149 | /* if "SENT" is not set, yet, this can still fail or be canceled. | ||
150 | * if "SENT" is set already, we still wait for an Ack packet. | ||
151 | * when cleared, the master_bio may be completed. | ||
152 | * in (B,A) the request object may still linger on the transaction log | ||
153 | * until the corresponding barrier ack comes in */ | ||
154 | __RQ_NET_PENDING, | ||
155 | |||
156 | /* If it is QUEUED, and it is a WRITE, it is also registered in the | ||
157 | * transfer log. Currently we need this flag to avoid conflicts between | ||
158 | * worker canceling the request and tl_clear_barrier killing it from | ||
159 | * transfer log. We should restructure the code so this conflict does | ||
160 | * no longer occur. */ | ||
161 | __RQ_NET_QUEUED, | ||
162 | |||
163 | /* well, actually only "handed over to the network stack". | ||
164 | * | ||
165 | * TODO can potentially be dropped because of the similar meaning | ||
166 | * of RQ_NET_SENT and ~RQ_NET_QUEUED. | ||
167 | * however it is not exactly the same. before we drop it | ||
168 | * we must ensure that we can tell a request with network part | ||
169 | * from a request without, regardless of what happens to it. */ | ||
170 | __RQ_NET_SENT, | ||
171 | |||
172 | /* when set, the request may be freed (if RQ_NET_QUEUED is clear). | ||
173 | * basically this means the corresponding P_BARRIER_ACK was received */ | ||
174 | __RQ_NET_DONE, | ||
175 | |||
176 | /* whether or not we know (C) or pretend (B,A) that the write | ||
177 | * was successfully written on the peer. | ||
178 | */ | ||
179 | __RQ_NET_OK, | ||
180 | |||
181 | /* peer called drbd_set_in_sync() for this write */ | ||
182 | __RQ_NET_SIS, | ||
183 | |||
184 | /* keep this last, it's for the RQ_NET_MASK */ | ||
185 | __RQ_NET_MAX, | ||
186 | }; | ||
187 | |||
/* Single-bit masks for rq_state, one per bit number in
 * enum drbd_req_state_bits: first the local disk part ... */
188 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) | ||
189 | #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) | ||
190 | #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) | ||
191 | |||
/* all bits up to and including RQ_LOCAL_OK */
192 | #define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ | ||
193 | |||
/* ... then the network part */
194 | #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) | ||
195 | #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) | ||
196 | #define RQ_NET_SENT (1UL << __RQ_NET_SENT) | ||
197 | #define RQ_NET_DONE (1UL << __RQ_NET_DONE) | ||
198 | #define RQ_NET_OK (1UL << __RQ_NET_OK) | ||
199 | #define RQ_NET_SIS (1UL << __RQ_NET_SIS) | ||
200 | |||
/* every bit below __RQ_NET_MAX that is not part of RQ_LOCAL_MASK */
201 | /* 0x1f8 */ | ||
202 | #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) | ||
203 | |||
204 | /* epoch entries */ | ||
205 | static inline | ||
206 | struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
207 | { | ||
208 | BUG_ON(mdev->ee_hash_s == 0); | ||
209 | return mdev->ee_hash + | ||
210 | ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); | ||
211 | } | ||
212 | |||
213 | /* transfer log (drbd_request objects) */ | ||
214 | static inline | ||
215 | struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
216 | { | ||
217 | BUG_ON(mdev->tl_hash_s == 0); | ||
218 | return mdev->tl_hash + | ||
219 | ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); | ||
220 | } | ||
221 | |||
222 | /* application reads (drbd_request objects) */ | ||
223 | static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
224 | { | ||
225 | return mdev->app_reads_hash | ||
226 | + ((unsigned int)(sector) % APP_R_HSIZE); | ||
227 | } | ||
228 | |||
229 | /* when we receive the answer for a read request, | ||
230 | * verify that we actually know about it */ | ||
231 | static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, | ||
232 | u64 id, sector_t sector) | ||
233 | { | ||
234 | struct hlist_head *slot = ar_hash_slot(mdev, sector); | ||
235 | struct hlist_node *n; | ||
236 | struct drbd_request *req; | ||
237 | |||
238 | hlist_for_each_entry(req, n, slot, colision) { | ||
239 | if ((unsigned long)req == (unsigned long)id) { | ||
240 | D_ASSERT(req->sector == sector); | ||
241 | return req; | ||
242 | } | ||
243 | } | ||
244 | return NULL; | ||
245 | } | ||
246 | |||
247 | static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, | ||
248 | struct bio *bio_src) | ||
249 | { | ||
250 | struct bio *bio; | ||
251 | struct drbd_request *req = | ||
252 | mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
253 | if (likely(req)) { | ||
254 | bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ | ||
255 | |||
256 | req->rq_state = 0; | ||
257 | req->mdev = mdev; | ||
258 | req->master_bio = bio_src; | ||
259 | req->private_bio = bio; | ||
260 | req->epoch = 0; | ||
261 | req->sector = bio->bi_sector; | ||
262 | req->size = bio->bi_size; | ||
263 | req->start_time = jiffies; | ||
264 | INIT_HLIST_NODE(&req->colision); | ||
265 | INIT_LIST_HEAD(&req->tl_requests); | ||
266 | INIT_LIST_HEAD(&req->w.list); | ||
267 | |||
268 | bio->bi_private = req; | ||
269 | bio->bi_end_io = drbd_endio_pri; | ||
270 | bio->bi_next = NULL; | ||
271 | } | ||
272 | return req; | ||
273 | } | ||
274 | |||
/* Give a request back to drbd_request_mempool, the pool drbd_req_new()
 * allocates from.  Only the request object itself is released here;
 * req->private_bio and req->master_bio are not touched. */
275 | static inline void drbd_req_free(struct drbd_request *req) | ||
276 | { | ||
277 | mempool_free(req, drbd_request_mempool); | ||
278 | } | ||
279 | |||
280 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
281 | { | ||
282 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
283 | } | ||
284 | |||
285 | /* Short lived temporary struct on the stack. | ||
286 | * We could squirrel the error to be returned into | ||
287 | * bio->bi_size, or similar. But that would be too ugly. */ | ||
288 | struct bio_and_error { | ||
/* bio to be completed; _req_mod()/req_mod() call complete_master_bio()
 * only if this is non-NULL after __req_mod() returned */
289 | struct bio *bio; | ||
/* presumably the status to complete that bio with -- TODO confirm against
 * complete_master_bio() */
290 | int error; | ||
291 | }; | ||
292 | |||
293 | extern void _req_may_be_done(struct drbd_request *req, | ||
294 | struct bio_and_error *m); | ||
295 | extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, | ||
296 | struct bio_and_error *m); | ||
297 | extern void complete_master_bio(struct drbd_conf *mdev, | ||
298 | struct bio_and_error *m); | ||
299 | |||
300 | /* use this if you don't want to deal with calling complete_master_bio() | ||
301 | * outside the spinlock, e.g. when walking some list on cleanup. */ | ||
302 | static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) | ||
303 | { | ||
304 | struct drbd_conf *mdev = req->mdev; | ||
305 | struct bio_and_error m; | ||
306 | |||
307 | /* __req_mod possibly frees req, do not touch req after that! */ | ||
308 | __req_mod(req, what, &m); | ||
309 | if (m.bio) | ||
310 | complete_master_bio(mdev, &m); | ||
311 | } | ||
312 | |||
313 | /* completion of master bio is outside of spinlock. | ||
314 | * If you need it irqsave, do it your self! */ | ||
315 | static inline void req_mod(struct drbd_request *req, | ||
316 | enum drbd_req_event what) | ||
317 | { | ||
318 | struct drbd_conf *mdev = req->mdev; | ||
319 | struct bio_and_error m; | ||
320 | spin_lock_irq(&mdev->req_lock); | ||
321 | __req_mod(req, what, &m); | ||
322 | spin_unlock_irq(&mdev->req_lock); | ||
323 | |||
324 | if (m.bio) | ||
325 | complete_master_bio(mdev, &m); | ||
326 | } | ||
327 | #endif | ||
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 00000000000..76863e3f05b --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -0,0 +1,113 @@ | |||
1 | /* | ||
2 | drbd_strings.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/drbd.h> | ||
27 | |||
28 | static const char *drbd_conn_s_names[] = { | ||
29 | [C_STANDALONE] = "StandAlone", | ||
30 | [C_DISCONNECTING] = "Disconnecting", | ||
31 | [C_UNCONNECTED] = "Unconnected", | ||
32 | [C_TIMEOUT] = "Timeout", | ||
33 | [C_BROKEN_PIPE] = "BrokenPipe", | ||
34 | [C_NETWORK_FAILURE] = "NetworkFailure", | ||
35 | [C_PROTOCOL_ERROR] = "ProtocolError", | ||
36 | [C_WF_CONNECTION] = "WFConnection", | ||
37 | [C_WF_REPORT_PARAMS] = "WFReportParams", | ||
38 | [C_TEAR_DOWN] = "TearDown", | ||
39 | [C_CONNECTED] = "Connected", | ||
40 | [C_STARTING_SYNC_S] = "StartingSyncS", | ||
41 | [C_STARTING_SYNC_T] = "StartingSyncT", | ||
42 | [C_WF_BITMAP_S] = "WFBitMapS", | ||
43 | [C_WF_BITMAP_T] = "WFBitMapT", | ||
44 | [C_WF_SYNC_UUID] = "WFSyncUUID", | ||
45 | [C_SYNC_SOURCE] = "SyncSource", | ||
46 | [C_SYNC_TARGET] = "SyncTarget", | ||
47 | [C_PAUSED_SYNC_S] = "PausedSyncS", | ||
48 | [C_PAUSED_SYNC_T] = "PausedSyncT", | ||
49 | [C_VERIFY_S] = "VerifyS", | ||
50 | [C_VERIFY_T] = "VerifyT", | ||
51 | }; | ||
52 | |||
53 | static const char *drbd_role_s_names[] = { | ||
54 | [R_PRIMARY] = "Primary", | ||
55 | [R_SECONDARY] = "Secondary", | ||
56 | [R_UNKNOWN] = "Unknown" | ||
57 | }; | ||
58 | |||
59 | static const char *drbd_disk_s_names[] = { | ||
60 | [D_DISKLESS] = "Diskless", | ||
61 | [D_ATTACHING] = "Attaching", | ||
62 | [D_FAILED] = "Failed", | ||
63 | [D_NEGOTIATING] = "Negotiating", | ||
64 | [D_INCONSISTENT] = "Inconsistent", | ||
65 | [D_OUTDATED] = "Outdated", | ||
66 | [D_UNKNOWN] = "DUnknown", | ||
67 | [D_CONSISTENT] = "Consistent", | ||
68 | [D_UP_TO_DATE] = "UpToDate", | ||
69 | }; | ||
70 | |||
71 | static const char *drbd_state_sw_errors[] = { | ||
72 | [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", | ||
73 | [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", | ||
74 | [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", | ||
75 | [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", | ||
76 | [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", | ||
77 | [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", | ||
78 | [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", | ||
79 | [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", | ||
80 | [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", | ||
81 | [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", | ||
82 | [-SS_DEVICE_IN_USE] = "Device is held open by someone", | ||
83 | [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", | ||
84 | [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", | ||
85 | [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", | ||
86 | [-SS_NOT_SUPPORTED] = "Peer does not support protocol", | ||
87 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | ||
88 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | ||
89 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | ||
90 | }; | ||
91 | |||
92 | const char *drbd_conn_str(enum drbd_conns s) | ||
93 | { | ||
94 | /* enums are unsigned... */ | ||
95 | return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; | ||
96 | } | ||
97 | |||
98 | const char *drbd_role_str(enum drbd_role s) | ||
99 | { | ||
100 | return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; | ||
101 | } | ||
102 | |||
103 | const char *drbd_disk_str(enum drbd_disk_state s) | ||
104 | { | ||
105 | return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; | ||
106 | } | ||
107 | |||
108 | const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) | ||
109 | { | ||
110 | return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : | ||
111 | err > SS_TWO_PRIMARIES ? "TOO_LARGE" | ||
112 | : drbd_state_sw_errors[-err]; | ||
113 | } | ||
diff --git a/drivers/block/drbd/drbd_tracing.c b/drivers/block/drbd/drbd_tracing.c new file mode 100644 index 00000000000..d18d4f7b4be --- /dev/null +++ b/drivers/block/drbd/drbd_tracing.c | |||
@@ -0,0 +1,752 @@ | |||
1 | /* | ||
2 | drbd_tracing.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/drbd.h> | ||
28 | #include <linux/ctype.h> | ||
29 | #include "drbd_int.h" | ||
30 | #include "drbd_tracing.h" | ||
31 | #include <linux/drbd_tag_magic.h> | ||
32 | |||
33 | MODULE_LICENSE("GPL"); | ||
34 | MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg"); | ||
35 | MODULE_DESCRIPTION("DRBD tracepoint probes"); | ||
36 | MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c"); | ||
37 | MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)"); | ||
38 | MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)"); | ||
39 | |||
40 | unsigned int trace_mask = 0; /* Bitmap of events to trace */ | ||
41 | int trace_level; /* Current trace level */ | ||
42 | int trace_devs; /* Bitmap of devices to trace */ | ||
43 | |||
44 | module_param(trace_mask, uint, 0444); | ||
45 | module_param(trace_level, int, 0644); | ||
46 | module_param(trace_devs, int, 0644); | ||
47 | |||
/* Event classes that can be selected in trace_mask, one bit each. */
enum {
	TRACE_PACKET	= 1 << 0,
	TRACE_RQ	= 1 << 1,
	TRACE_UUID	= 1 << 2,
	TRACE_RESYNC	= 1 << 3,
	TRACE_EE	= 1 << 4,
	TRACE_UNPLUG	= 1 << 5,
	TRACE_NL	= 1 << 6,
	TRACE_AL_EXT	= 1 << 7,
	TRACE_INT_RQ	= 1 << 8,
	TRACE_MD_IO	= 1 << 9,
	TRACE_EPOCH	= 1 << 10,
};
61 | |||
/*
 * Buffer printing support: values for the flags argument of
 * drbd_print_buffer().
 *
 * DBGPRINT_BUFFADDR: when set, every output line starts with the virtual
 * address of the data shown on that line; when clear, it starts with the
 * offset from the beginning of the buffer.
 */
enum dbg_print_flags {
	DBGPRINT_BUFFADDR = 1 << 0,
};
71 | |||
72 | /* Macro stuff */ | ||
73 | static char *nl_packet_name(int packet_type) | ||
74 | { | ||
75 | /* Generate packet type strings */ | ||
76 | #define NL_PACKET(name, number, fields) \ | ||
77 | [P_ ## name] = # name, | ||
78 | #define NL_INTEGER Argh! | ||
79 | #define NL_BIT Argh! | ||
80 | #define NL_INT64 Argh! | ||
81 | #define NL_STRING Argh! | ||
82 | |||
83 | static char *nl_tag_name[P_nl_after_last_packet] = { | ||
84 | #include "linux/drbd_nl.h" | ||
85 | }; | ||
86 | |||
87 | return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ? | ||
88 | nl_tag_name[packet_type] : "*Unknown*"; | ||
89 | } | ||
90 | /* /Macro stuff */ | ||
91 | |||
92 | static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level) | ||
93 | { | ||
94 | return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs); | ||
95 | } | ||
96 | |||
97 | static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg) | ||
98 | { | ||
99 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
100 | return; | ||
101 | |||
102 | dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt)); | ||
103 | } | ||
104 | |||
105 | static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index) | ||
106 | { | ||
107 | static char *uuid_str[UI_EXTENDED_SIZE] = { | ||
108 | [UI_CURRENT] = "CURRENT", | ||
109 | [UI_BITMAP] = "BITMAP", | ||
110 | [UI_HISTORY_START] = "HISTORY_START", | ||
111 | [UI_HISTORY_END] = "HISTORY_END", | ||
112 | [UI_SIZE] = "SIZE", | ||
113 | [UI_FLAGS] = "FLAGS", | ||
114 | }; | ||
115 | |||
116 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
117 | return; | ||
118 | |||
119 | if (index >= UI_EXTENDED_SIZE) { | ||
120 | dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n"); | ||
121 | return; | ||
122 | } | ||
123 | |||
124 | dev_info(DEV, " uuid[%s] now %016llX\n", | ||
125 | uuid_str[index], | ||
126 | (unsigned long long)mdev->ldev->md.uuid[index]); | ||
127 | } | ||
128 | |||
129 | static void probe_drbd_md_io(struct drbd_conf *mdev, int rw, | ||
130 | struct drbd_backing_dev *bdev) | ||
131 | { | ||
132 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
133 | return; | ||
134 | |||
135 | dev_info(DEV, " %s metadata superblock now\n", | ||
136 | rw == READ ? "Reading" : "Writing"); | ||
137 | } | ||
138 | |||
139 | static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg) | ||
140 | { | ||
141 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
142 | return; | ||
143 | |||
144 | dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n", | ||
145 | msg, (unsigned long long)e->sector, e->size, e); | ||
146 | } | ||
147 | |||
148 | static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch, | ||
149 | enum epoch_event ev) | ||
150 | { | ||
151 | static char *epoch_event_str[] = { | ||
152 | [EV_PUT] = "put", | ||
153 | [EV_GOT_BARRIER_NR] = "got_barrier_nr", | ||
154 | [EV_BARRIER_DONE] = "barrier_done", | ||
155 | [EV_BECAME_LAST] = "became_last", | ||
156 | [EV_TRACE_FLUSH] = "issuing_flush", | ||
157 | [EV_TRACE_ADD_BARRIER] = "added_barrier", | ||
158 | [EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch", | ||
159 | }; | ||
160 | |||
161 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
162 | return; | ||
163 | |||
164 | ev &= ~EV_CLEANUP; | ||
165 | |||
166 | switch (ev) { | ||
167 | case EV_TRACE_ALLOC: | ||
168 | dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs); | ||
169 | break; | ||
170 | case EV_TRACE_FREE: | ||
171 | dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n", | ||
172 | epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), | ||
173 | mdev->epochs); | ||
174 | break; | ||
175 | default: | ||
176 | dev_info(DEV, "Update epoch %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n", | ||
177 | epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), | ||
178 | atomic_read(&epoch->active), | ||
179 | test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-', | ||
180 | test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-', | ||
181 | test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-', | ||
182 | test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-', | ||
183 | epoch_event_str[ev]); | ||
184 | } | ||
185 | } | ||
186 | |||
187 | static void probe_drbd_netlink(void *data, int is_req) | ||
188 | { | ||
189 | struct cn_msg *msg = data; | ||
190 | |||
191 | if (is_req) { | ||
192 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data; | ||
193 | |||
194 | printk(KERN_INFO "drbd%d: " | ||
195 | "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n", | ||
196 | nlp->drbd_minor, | ||
197 | nl_packet_name(nlp->packet_type), | ||
198 | nlp->packet_type, | ||
199 | msg->seq, msg->ack, msg->len); | ||
200 | } else { | ||
201 | struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data; | ||
202 | |||
203 | printk(KERN_INFO "drbd%d: " | ||
204 | "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n", | ||
205 | nlp->minor, | ||
206 | nlp->packet_type == P_nl_after_last_packet ? | ||
207 | "Empty-Reply" : nl_packet_name(nlp->packet_type), | ||
208 | nlp->packet_type, | ||
209 | msg->seq, msg->ack, msg->len); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg) | ||
214 | { | ||
215 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | ||
216 | |||
217 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
218 | return; | ||
219 | |||
220 | dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n", | ||
221 | msg, (unsigned long long) sector, enr, | ||
222 | (int)BM_SECT_TO_EXT(sector)); | ||
223 | } | ||
224 | |||
225 | /** | ||
226 | * drbd_print_buffer() - Hexdump arbitrary binary data into a buffer | ||
227 | * @prefix: String is output at the beginning of each line output. | ||
228 | * @flags: Currently only defined flag: DBGPRINT_BUFFADDR; if set, each | ||
229 | * line starts with the virtual address of the line being | ||
230 | * output. If clear, each line starts with the offset from the | ||
231 | * beginning of the buffer. | ||
232 | * @size: Indicates the size of each entry in the buffer. Supported | ||
233 | * values are sizeof(char), sizeof(short) and sizeof(int) | ||
234 | * @buffer: Start address of buffer | ||
235 | * @buffer_va: Virtual address of start of buffer (normally the same | ||
236 | * as Buffer, but having it separate allows it to hold | ||
237 | * file address for example) | ||
238 | * @length: length of buffer | ||
239 | */ | ||
240 | static void drbd_print_buffer(const char *prefix, unsigned int flags, int size, | ||
241 | const void *buffer, const void *buffer_va, | ||
242 | unsigned int length) | ||
243 | |||
244 | #define LINE_SIZE 16 | ||
245 | #define LINE_ENTRIES (int)(LINE_SIZE/size) | ||
246 | { | ||
247 | const unsigned char *pstart; | ||
248 | const unsigned char *pstart_va; | ||
249 | const unsigned char *pend; | ||
250 | char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8]; | ||
251 | char *pbytes = bytes_str, *pascii = ascii_str; | ||
252 | int offset = 0; | ||
253 | long sizemask; | ||
254 | int field_width; | ||
255 | int index; | ||
256 | const unsigned char *pend_str; | ||
257 | const unsigned char *p; | ||
258 | int count; | ||
259 | |||
260 | /* verify size parameter */ | ||
261 | if (size != sizeof(char) && | ||
262 | size != sizeof(short) && | ||
263 | size != sizeof(int)) { | ||
264 | printk(KERN_DEBUG "drbd_print_buffer: " | ||
265 | "ERROR invalid size %d\n", size); | ||
266 | return; | ||
267 | } | ||
268 | |||
269 | sizemask = size-1; | ||
270 | field_width = size*2; | ||
271 | |||
272 | /* Adjust start/end to be on appropriate boundary for size */ | ||
273 | buffer = (const char *)((long)buffer & ~sizemask); | ||
274 | pend = (const unsigned char *) | ||
275 | (((long)buffer + length + sizemask) & ~sizemask); | ||
276 | |||
277 | if (flags & DBGPRINT_BUFFADDR) { | ||
278 | /* Move start back to nearest multiple of line size, | ||
279 | * if printing address. This results in nicely formatted output | ||
280 | * with addresses being on line size (16) byte boundaries */ | ||
281 | pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1)); | ||
282 | } else { | ||
283 | pstart = (const unsigned char *)buffer; | ||
284 | } | ||
285 | |||
286 | /* Set value of start VA to print if addresses asked for */ | ||
287 | pstart_va = (const unsigned char *)buffer_va | ||
288 | - ((const unsigned char *)buffer-pstart); | ||
289 | |||
290 | /* Calculate end position to nicely align right hand side */ | ||
291 | pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1)); | ||
292 | |||
293 | /* Init strings */ | ||
294 | *pbytes = *pascii = '\0'; | ||
295 | |||
296 | /* Start at beginning of first line */ | ||
297 | p = pstart; | ||
298 | count = 0; | ||
299 | |||
300 | while (p < pend_str) { | ||
301 | if (p < (const unsigned char *)buffer || p >= pend) { | ||
302 | /* Before start of buffer or after end- print spaces */ | ||
303 | pbytes += sprintf(pbytes, "%*c ", field_width, ' '); | ||
304 | pascii += sprintf(pascii, "%*c", size, ' '); | ||
305 | p += size; | ||
306 | } else { | ||
307 | /* Add hex and ascii to strings */ | ||
308 | int val; | ||
309 | switch (size) { | ||
310 | default: | ||
311 | case 1: | ||
312 | val = *(unsigned char *)p; | ||
313 | break; | ||
314 | case 2: | ||
315 | val = *(unsigned short *)p; | ||
316 | break; | ||
317 | case 4: | ||
318 | val = *(unsigned int *)p; | ||
319 | break; | ||
320 | } | ||
321 | |||
322 | pbytes += sprintf(pbytes, "%0*x ", field_width, val); | ||
323 | |||
324 | for (index = size; index; index--) { | ||
325 | *pascii++ = isprint(*p) ? *p : '.'; | ||
326 | p++; | ||
327 | } | ||
328 | } | ||
329 | |||
330 | count++; | ||
331 | |||
332 | if (count == LINE_ENTRIES || p >= pend_str) { | ||
333 | /* Null terminate and print record */ | ||
334 | *pascii = '\0'; | ||
335 | printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n", | ||
336 | prefix, | ||
337 | (flags & DBGPRINT_BUFFADDR) | ||
338 | ? (long)pstart_va:(long)offset, | ||
339 | LINE_ENTRIES*(field_width+1), bytes_str, | ||
340 | LINE_SIZE, ascii_str); | ||
341 | |||
342 | /* Move onto next line */ | ||
343 | pstart_va += (p-pstart); | ||
344 | pstart = p; | ||
345 | count = 0; | ||
346 | offset += LINE_SIZE; | ||
347 | |||
348 | /* Re-init strings */ | ||
349 | pbytes = bytes_str; | ||
350 | pascii = ascii_str; | ||
351 | *pbytes = *pascii = '\0'; | ||
352 | } | ||
353 | } | ||
354 | } | ||
355 | |||
356 | static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args) | ||
357 | { | ||
358 | char str[256]; | ||
359 | |||
360 | if (!is_mdev_trace(mdev, level)) | ||
361 | return; | ||
362 | |||
363 | if (vsnprintf(str, 256, fmt, args) >= 256) | ||
364 | str[255] = 0; | ||
365 | |||
366 | printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)), | ||
367 | dev_name(disk_to_dev(mdev->vdisk)), str); | ||
368 | } | ||
369 | |||
370 | static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, | ||
371 | struct drbd_request *r) | ||
372 | { | ||
373 | #if defined(CONFIG_LBDAF) || defined(CONFIG_LBD) | ||
374 | #define SECTOR_FORMAT "%Lx" | ||
375 | #else | ||
376 | #define SECTOR_FORMAT "%lx" | ||
377 | #endif | ||
378 | #define SECTOR_SHIFT 9 | ||
379 | |||
380 | unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT); | ||
381 | char *faddr = (char *)(lowaddr); | ||
382 | char rb[sizeof(void *)*2+6] = { 0, }; | ||
383 | struct bio_vec *bvec; | ||
384 | int segno; | ||
385 | |||
386 | const int rw = bio->bi_rw; | ||
387 | const int biorw = (rw & (RW_MASK|RWA_MASK)); | ||
388 | const int biobarrier = (rw & (1<<BIO_RW_BARRIER)); | ||
389 | const int biosync = (rw & ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO))); | ||
390 | |||
391 | if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) | ||
392 | return; | ||
393 | |||
394 | if (r) | ||
395 | sprintf(rb, "Req:%p ", r); | ||
396 | |||
397 | dev_info(DEV, "%s %s:%s%s%s Bio:%p %s- %soffset " SECTOR_FORMAT ", size %x\n", | ||
398 | complete ? "<<<" : ">>>", | ||
399 | pfx, | ||
400 | biorw == WRITE ? "Write" : "Read", | ||
401 | biobarrier ? " : B" : "", | ||
402 | biosync ? " : S" : "", | ||
403 | bio, | ||
404 | rb, | ||
405 | complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "", | ||
406 | bio->bi_sector << SECTOR_SHIFT, | ||
407 | bio->bi_size); | ||
408 | |||
409 | if (trace_level >= TRACE_LVL_METRICS && | ||
410 | ((biorw == WRITE) ^ complete)) { | ||
411 | printk(KERN_DEBUG " ind page offset length\n"); | ||
412 | __bio_for_each_segment(bvec, bio, segno, 0) { | ||
413 | printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n", segno, | ||
414 | bvec->bv_page, bvec->bv_offset, bvec->bv_len); | ||
415 | |||
416 | if (trace_level >= TRACE_LVL_ALL) { | ||
417 | char *bvec_buf; | ||
418 | unsigned long flags; | ||
419 | |||
420 | bvec_buf = bvec_kmap_irq(bvec, &flags); | ||
421 | |||
422 | drbd_print_buffer(" ", DBGPRINT_BUFFADDR, 1, | ||
423 | bvec_buf, | ||
424 | faddr, | ||
425 | (bvec->bv_len <= 0x80) | ||
426 | ? bvec->bv_len : 0x80); | ||
427 | |||
428 | bvec_kunmap_irq(bvec_buf, &flags); | ||
429 | |||
430 | if (bvec->bv_len > 0x40) | ||
431 | printk(KERN_DEBUG " ....\n"); | ||
432 | |||
433 | faddr += bvec->bv_len; | ||
434 | } | ||
435 | } | ||
436 | } | ||
437 | } | ||
438 | |||
439 | static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg) | ||
440 | { | ||
441 | static const char *rq_event_names[] = { | ||
442 | [created] = "created", | ||
443 | [to_be_send] = "to_be_send", | ||
444 | [to_be_submitted] = "to_be_submitted", | ||
445 | [queue_for_net_write] = "queue_for_net_write", | ||
446 | [queue_for_net_read] = "queue_for_net_read", | ||
447 | [send_canceled] = "send_canceled", | ||
448 | [send_failed] = "send_failed", | ||
449 | [handed_over_to_network] = "handed_over_to_network", | ||
450 | [connection_lost_while_pending] = | ||
451 | "connection_lost_while_pending", | ||
452 | [recv_acked_by_peer] = "recv_acked_by_peer", | ||
453 | [write_acked_by_peer] = "write_acked_by_peer", | ||
454 | [neg_acked] = "neg_acked", | ||
455 | [conflict_discarded_by_peer] = "conflict_discarded_by_peer", | ||
456 | [barrier_acked] = "barrier_acked", | ||
457 | [data_received] = "data_received", | ||
458 | [read_completed_with_error] = "read_completed_with_error", | ||
459 | [read_ahead_completed_with_error] = "reada_completed_with_error", | ||
460 | [write_completed_with_error] = "write_completed_with_error", | ||
461 | [completed_ok] = "completed_ok", | ||
462 | }; | ||
463 | |||
464 | struct drbd_conf *mdev = req->mdev; | ||
465 | |||
466 | const int rw = (req->master_bio == NULL || | ||
467 | bio_data_dir(req->master_bio) == WRITE) ? | ||
468 | 'W' : 'R'; | ||
469 | const unsigned long s = req->rq_state; | ||
470 | |||
471 | if (what != nothing) { | ||
472 | dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]); | ||
473 | } else { | ||
474 | dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n", | ||
475 | msg, req, rw, | ||
476 | s & RQ_LOCAL_PENDING ? 'p' : '-', | ||
477 | s & RQ_LOCAL_COMPLETED ? 'c' : '-', | ||
478 | s & RQ_LOCAL_OK ? 'o' : '-', | ||
479 | s & RQ_NET_PENDING ? 'p' : '-', | ||
480 | s & RQ_NET_QUEUED ? 'q' : '-', | ||
481 | s & RQ_NET_SENT ? 's' : '-', | ||
482 | s & RQ_NET_DONE ? 'd' : '-', | ||
483 | s & RQ_NET_OK ? 'o' : '-', | ||
484 | req->epoch, | ||
485 | (unsigned long long)req->sector, | ||
486 | req->size, | ||
487 | drbd_conn_str(mdev->state.conn)); | ||
488 | } | ||
489 | } | ||
490 | |||
491 | |||
492 | #define drbd_peer_str drbd_role_str | ||
493 | #define drbd_pdsk_str drbd_disk_str | ||
494 | |||
495 | #define PSM(A) \ | ||
496 | do { \ | ||
497 | if (mask.A) { \ | ||
498 | int i = snprintf(p, len, " " #A "( %s )", \ | ||
499 | drbd_##A##_str(val.A)); \ | ||
500 | if (i >= len) \ | ||
501 | return op; \ | ||
502 | p += i; \ | ||
503 | len -= i; \ | ||
504 | } \ | ||
505 | } while (0) | ||
506 | |||
507 | static char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val) | ||
508 | { | ||
509 | char *op = p; | ||
510 | *p = '\0'; | ||
511 | PSM(role); | ||
512 | PSM(peer); | ||
513 | PSM(conn); | ||
514 | PSM(disk); | ||
515 | PSM(pdsk); | ||
516 | |||
517 | return op; | ||
518 | } | ||
519 | |||
/*
 * Print one packet trace line.  Relies on the locals sockname, recv, file
 * and line of the calling function.  The direction is shown as "<<<" when
 * recv is non-zero and ">>>" otherwise; at TRACE_LVL_ALL and above, the
 * line additionally carries file:line and the current task's name and pid.
 */
#define INFOP(fmt, ...)							\
do {									\
	if (trace_level < TRACE_LVL_ALL) {				\
		dev_info(DEV, "%s %s " fmt, sockname,			\
			 recv ? "<<<" : ">>>" ,				\
			 ## __VA_ARGS__);				\
	} else {							\
		dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt ,		\
			 file, line, current->comm, current->pid,	\
			 sockname, recv ? "<<<" : ">>>" ,		\
			 ## __VA_ARGS__);				\
	}								\
} while (0)
533 | |||
534 | static char *_dump_block_id(u64 block_id, char *buff) | ||
535 | { | ||
536 | if (is_syncer_block_id(block_id)) | ||
537 | strcpy(buff, "SyncerId"); | ||
538 | else | ||
539 | sprintf(buff, "%llx", (unsigned long long)block_id); | ||
540 | |||
541 | return buff; | ||
542 | } | ||
543 | |||
544 | static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock, | ||
545 | int recv, union p_polymorph *p, char *file, int line) | ||
546 | { | ||
547 | char *sockname = sock == mdev->meta.socket ? "meta" : "data"; | ||
548 | int cmd = (recv == 2) ? p->header.command : be16_to_cpu(p->header.command); | ||
549 | char tmp[300]; | ||
550 | union drbd_state m, v; | ||
551 | |||
552 | switch (cmd) { | ||
553 | case P_HAND_SHAKE: | ||
554 | INFOP("%s (protocol %u-%u)\n", cmdname(cmd), | ||
555 | be32_to_cpu(p->handshake.protocol_min), | ||
556 | be32_to_cpu(p->handshake.protocol_max)); | ||
557 | break; | ||
558 | |||
559 | case P_BITMAP: /* don't report this */ | ||
560 | case P_COMPRESSED_BITMAP: /* don't report this */ | ||
561 | break; | ||
562 | |||
563 | case P_DATA: | ||
564 | INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd), | ||
565 | (unsigned long long)be64_to_cpu(p->data.sector), | ||
566 | _dump_block_id(p->data.block_id, tmp), | ||
567 | be32_to_cpu(p->data.seq_num), | ||
568 | be32_to_cpu(p->data.dp_flags) | ||
569 | ); | ||
570 | break; | ||
571 | |||
572 | case P_DATA_REPLY: | ||
573 | case P_RS_DATA_REPLY: | ||
574 | INFOP("%s (sector %llus, id %s)\n", cmdname(cmd), | ||
575 | (unsigned long long)be64_to_cpu(p->data.sector), | ||
576 | _dump_block_id(p->data.block_id, tmp) | ||
577 | ); | ||
578 | break; | ||
579 | |||
580 | case P_RECV_ACK: | ||
581 | case P_WRITE_ACK: | ||
582 | case P_RS_WRITE_ACK: | ||
583 | case P_DISCARD_ACK: | ||
584 | case P_NEG_ACK: | ||
585 | case P_NEG_RS_DREPLY: | ||
586 | INFOP("%s (sector %llus, size %u, id %s, seq %u)\n", | ||
587 | cmdname(cmd), | ||
588 | (long long)be64_to_cpu(p->block_ack.sector), | ||
589 | be32_to_cpu(p->block_ack.blksize), | ||
590 | _dump_block_id(p->block_ack.block_id, tmp), | ||
591 | be32_to_cpu(p->block_ack.seq_num) | ||
592 | ); | ||
593 | break; | ||
594 | |||
595 | case P_DATA_REQUEST: | ||
596 | case P_RS_DATA_REQUEST: | ||
597 | INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd), | ||
598 | (long long)be64_to_cpu(p->block_req.sector), | ||
599 | be32_to_cpu(p->block_req.blksize), | ||
600 | _dump_block_id(p->block_req.block_id, tmp) | ||
601 | ); | ||
602 | break; | ||
603 | |||
604 | case P_BARRIER: | ||
605 | case P_BARRIER_ACK: | ||
606 | INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier); | ||
607 | break; | ||
608 | |||
609 | case P_SYNC_PARAM: | ||
610 | case P_SYNC_PARAM89: | ||
611 | INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n", | ||
612 | cmdname(cmd), be32_to_cpu(p->rs_param_89.rate), | ||
613 | p->rs_param_89.verify_alg, p->rs_param_89.csums_alg); | ||
614 | break; | ||
615 | |||
616 | case P_UUIDS: | ||
617 | INFOP("%s Curr:%016llX, Bitmap:%016llX, " | ||
618 | "HisSt:%016llX, HisEnd:%016llX\n", | ||
619 | cmdname(cmd), | ||
620 | (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]), | ||
621 | (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]), | ||
622 | (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]), | ||
623 | (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END])); | ||
624 | break; | ||
625 | |||
626 | case P_SIZES: | ||
627 | INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, " | ||
628 | "max bio %x, q order %x)\n", | ||
629 | cmdname(cmd), | ||
630 | (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)), | ||
631 | (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)), | ||
632 | (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)), | ||
633 | be32_to_cpu(p->sizes.max_segment_size), | ||
634 | be32_to_cpu(p->sizes.queue_order_type)); | ||
635 | break; | ||
636 | |||
637 | case P_STATE: | ||
638 | v.i = be32_to_cpu(p->state.state); | ||
639 | m.i = 0xffffffff; | ||
640 | dump_st(tmp, sizeof(tmp), m, v); | ||
641 | INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp); | ||
642 | break; | ||
643 | |||
644 | case P_STATE_CHG_REQ: | ||
645 | m.i = be32_to_cpu(p->req_state.mask); | ||
646 | v.i = be32_to_cpu(p->req_state.val); | ||
647 | dump_st(tmp, sizeof(tmp), m, v); | ||
648 | INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp); | ||
649 | break; | ||
650 | |||
651 | case P_STATE_CHG_REPLY: | ||
652 | INFOP("%s (ret %x)\n", cmdname(cmd), | ||
653 | be32_to_cpu(p->req_state_reply.retcode)); | ||
654 | break; | ||
655 | |||
656 | case P_PING: | ||
657 | case P_PING_ACK: | ||
658 | /* | ||
659 | * Dont trace pings at summary level | ||
660 | */ | ||
661 | if (trace_level < TRACE_LVL_ALL) | ||
662 | break; | ||
663 | /* fall through... */ | ||
664 | default: | ||
665 | INFOP("%s (%u)\n", cmdname(cmd), cmd); | ||
666 | break; | ||
667 | } | ||
668 | } | ||
669 | |||
670 | |||
671 | static int __init drbd_trace_init(void) | ||
672 | { | ||
673 | int ret; | ||
674 | |||
675 | if (trace_mask & TRACE_UNPLUG) { | ||
676 | ret = register_trace_drbd_unplug(probe_drbd_unplug); | ||
677 | WARN_ON(ret); | ||
678 | } | ||
679 | if (trace_mask & TRACE_UUID) { | ||
680 | ret = register_trace_drbd_uuid(probe_drbd_uuid); | ||
681 | WARN_ON(ret); | ||
682 | } | ||
683 | if (trace_mask & TRACE_EE) { | ||
684 | ret = register_trace_drbd_ee(probe_drbd_ee); | ||
685 | WARN_ON(ret); | ||
686 | } | ||
687 | if (trace_mask & TRACE_PACKET) { | ||
688 | ret = register_trace_drbd_packet(probe_drbd_packet); | ||
689 | WARN_ON(ret); | ||
690 | } | ||
691 | if (trace_mask & TRACE_MD_IO) { | ||
692 | ret = register_trace_drbd_md_io(probe_drbd_md_io); | ||
693 | WARN_ON(ret); | ||
694 | } | ||
695 | if (trace_mask & TRACE_EPOCH) { | ||
696 | ret = register_trace_drbd_epoch(probe_drbd_epoch); | ||
697 | WARN_ON(ret); | ||
698 | } | ||
699 | if (trace_mask & TRACE_NL) { | ||
700 | ret = register_trace_drbd_netlink(probe_drbd_netlink); | ||
701 | WARN_ON(ret); | ||
702 | } | ||
703 | if (trace_mask & TRACE_AL_EXT) { | ||
704 | ret = register_trace_drbd_actlog(probe_drbd_actlog); | ||
705 | WARN_ON(ret); | ||
706 | } | ||
707 | if (trace_mask & TRACE_RQ) { | ||
708 | ret = register_trace_drbd_bio(probe_drbd_bio); | ||
709 | WARN_ON(ret); | ||
710 | } | ||
711 | if (trace_mask & TRACE_INT_RQ) { | ||
712 | ret = register_trace_drbd_req(probe_drbd_req); | ||
713 | WARN_ON(ret); | ||
714 | } | ||
715 | if (trace_mask & TRACE_RESYNC) { | ||
716 | ret = register_trace__drbd_resync(probe_drbd_resync); | ||
717 | WARN_ON(ret); | ||
718 | } | ||
719 | return 0; | ||
720 | } | ||
721 | |||
722 | module_init(drbd_trace_init); | ||
723 | |||
724 | static void __exit drbd_trace_exit(void) | ||
725 | { | ||
726 | if (trace_mask & TRACE_UNPLUG) | ||
727 | unregister_trace_drbd_unplug(probe_drbd_unplug); | ||
728 | if (trace_mask & TRACE_UUID) | ||
729 | unregister_trace_drbd_uuid(probe_drbd_uuid); | ||
730 | if (trace_mask & TRACE_EE) | ||
731 | unregister_trace_drbd_ee(probe_drbd_ee); | ||
732 | if (trace_mask & TRACE_PACKET) | ||
733 | unregister_trace_drbd_packet(probe_drbd_packet); | ||
734 | if (trace_mask & TRACE_MD_IO) | ||
735 | unregister_trace_drbd_md_io(probe_drbd_md_io); | ||
736 | if (trace_mask & TRACE_EPOCH) | ||
737 | unregister_trace_drbd_epoch(probe_drbd_epoch); | ||
738 | if (trace_mask & TRACE_NL) | ||
739 | unregister_trace_drbd_netlink(probe_drbd_netlink); | ||
740 | if (trace_mask & TRACE_AL_EXT) | ||
741 | unregister_trace_drbd_actlog(probe_drbd_actlog); | ||
742 | if (trace_mask & TRACE_RQ) | ||
743 | unregister_trace_drbd_bio(probe_drbd_bio); | ||
744 | if (trace_mask & TRACE_INT_RQ) | ||
745 | unregister_trace_drbd_req(probe_drbd_req); | ||
746 | if (trace_mask & TRACE_RESYNC) | ||
747 | unregister_trace__drbd_resync(probe_drbd_resync); | ||
748 | |||
749 | tracepoint_synchronize_unregister(); | ||
750 | } | ||
751 | |||
752 | module_exit(drbd_trace_exit); | ||
diff --git a/drivers/block/drbd/drbd_tracing.h b/drivers/block/drbd/drbd_tracing.h new file mode 100644 index 00000000000..c4531a137f6 --- /dev/null +++ b/drivers/block/drbd/drbd_tracing.h | |||
@@ -0,0 +1,87 @@ | |||
1 | /* | ||
2 | drbd_tracing.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #ifndef DRBD_TRACING_H | ||
27 | #define DRBD_TRACING_H | ||
28 | |||
29 | #include <linux/tracepoint.h> | ||
30 | #include "drbd_int.h" | ||
31 | #include "drbd_req.h" | ||
32 | |||
/* Trace levels (TRACE_LVL_*), numbered consecutively from 0;
 * TRACE_LVL_MAX is one past the last real level. */
enum {
	TRACE_LVL_ALWAYS  = 0,
	TRACE_LVL_SUMMARY = 1,
	TRACE_LVL_METRICS = 2,
	TRACE_LVL_ALL     = 3,
	TRACE_LVL_MAX     = 4
};
40 | |||
41 | DECLARE_TRACE(drbd_unplug, | ||
42 | TP_PROTO(struct drbd_conf *mdev, char* msg), | ||
43 | TP_ARGS(mdev, msg)); | ||
44 | |||
45 | DECLARE_TRACE(drbd_uuid, | ||
46 | TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index), | ||
47 | TP_ARGS(mdev, index)); | ||
48 | |||
49 | DECLARE_TRACE(drbd_ee, | ||
50 | TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg), | ||
51 | TP_ARGS(mdev, e, msg)); | ||
52 | |||
53 | DECLARE_TRACE(drbd_md_io, | ||
54 | TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev), | ||
55 | TP_ARGS(mdev, rw, bdev)); | ||
56 | |||
57 | DECLARE_TRACE(drbd_epoch, | ||
58 | TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev), | ||
59 | TP_ARGS(mdev, epoch, ev)); | ||
60 | |||
61 | DECLARE_TRACE(drbd_netlink, | ||
62 | TP_PROTO(void *data, int is_req), | ||
63 | TP_ARGS(data, is_req)); | ||
64 | |||
65 | DECLARE_TRACE(drbd_actlog, | ||
66 | TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg), | ||
67 | TP_ARGS(mdev, sector, msg)); | ||
68 | |||
69 | DECLARE_TRACE(drbd_bio, | ||
70 | TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, | ||
71 | struct drbd_request *r), | ||
72 | TP_ARGS(mdev, pfx, bio, complete, r)); | ||
73 | |||
74 | DECLARE_TRACE(drbd_req, | ||
75 | TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg), | ||
76 | TP_ARGS(req, what, msg)); | ||
77 | |||
78 | DECLARE_TRACE(drbd_packet, | ||
79 | TP_PROTO(struct drbd_conf *mdev, struct socket *sock, | ||
80 | int recv, union p_polymorph *p, char *file, int line), | ||
81 | TP_ARGS(mdev, sock, recv, p, file, line)); | ||
82 | |||
83 | DECLARE_TRACE(_drbd_resync, | ||
84 | TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args), | ||
85 | TP_ARGS(mdev, level, fmt, args)); | ||
86 | |||
87 | #endif | ||
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 00000000000..fc824006e72 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h | |||
@@ -0,0 +1,351 @@ | |||
1 | /* | ||
2 | -*- linux-c -*- | ||
3 | drbd_vli.h | ||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _DRBD_VLI_H | ||
26 | #define _DRBD_VLI_H | ||
27 | |||
28 | /* | ||
29 | * At a granularity of 4KiB storage represented per bit, | ||
30 | * and storage sizes of several TiB, | ||
31 | * and possibly small-bandwidth replication, | ||
32 | * the bitmap transfer time can take much too long, | ||
33 | * if transmitted in plain text. | ||
34 | * | ||
35 | * We try to reduce the transferred bitmap information | ||
36 | * by encoding runlengths of bit polarity. | ||
37 | * | ||
38 | * We never actually need to encode a "zero" (runlengths are positive). | ||
39 | * But then we have to store the value of the first bit. | ||
40 | * The first bit of information thus shall encode if the first runlength | ||
41 | * gives the number of set or unset bits. | ||
42 | * | ||
43 | * We assume that large areas are either completely set or unset, | ||
44 | * which gives good compression with any runlength method, | ||
45 | * even when encoding the runlength as fixed size 32bit/64bit integers. | ||
46 | * | ||
47 | * Still, there may be areas where the polarity flips every few bits, | ||
48 | * and encoding the runlength sequence of those areas with fix size | ||
49 | * integers would be much worse than plaintext. | ||
50 | * | ||
51 | * We want to encode small runlength values with minimum code length, | ||
52 | * while still being able to encode a Huge run of all zeros. | ||
53 | * | ||
54 | * Thus we need a Variable Length Integer encoding, VLI. | ||
55 | * | ||
56 | * For some cases, we produce more code bits than plaintext input. | ||
57 | * We need to send incompressible chunks as plaintext, skip over them | ||
58 | * and then see if the next chunk compresses better. | ||
59 | * | ||
60 | * We don't care too much about "excellent" compression ratio for large | ||
61 | * runlengths (all set/all clear): whether we achieve a factor of 100 | ||
62 | * or 1000 is not that much of an issue. | ||
63 | * We do not want to waste too much on short runlengths in the "noisy" | ||
64 | * parts of the bitmap, though. | ||
65 | * | ||
66 | * There are endless variants of VLI, we experimented with: | ||
67 | * * simple byte-based | ||
68 | * * various bit based with different code word length. | ||
69 | * | ||
70 | * To avoid yet another configuration parameter (choice of bitmap compression | ||
71 | * algorithm) which was difficult to explain and tune, we just chose the one | ||
72 | * variant that turned out best in all test cases. | ||
73 | * Based on real world usage patterns, with device sizes ranging from a few GiB | ||
74 | * to several TiB, file server/mailserver/webserver/mysql/postgres, | ||
75 | * mostly idle to really busy, the all time winner (though sometimes only | ||
76 | * marginally better) is: | ||
77 | */ | ||
78 | |||
79 | /* | ||
80 | * encoding is "visualised" as | ||
81 | * __little endian__ bitstream, least significant bit first (left most) | ||
82 | * | ||
83 | * this particular encoding is chosen so that the prefix code | ||
84 | * starts as unary encoding the level, then modified so that | ||
85 | * 10 levels can be described in 8bit, with minimal overhead | ||
86 | * for the smaller levels. | ||
87 | * | ||
88 | * Number of data bits follow fibonacci sequence, with the exception of the | ||
89 | * last level (+1 data bit, so it makes 64bit total). The only case where the | ||
90 | * code is longer than the plain text is a runlength of 1: 1 plain bit => 2 code bits. | ||
91 | prefix data bits max val Nº data bits | ||
92 | 0 x 0x2 1 | ||
93 | 10 x 0x4 1 | ||
94 | 110 xx 0x8 2 | ||
95 | 1110 xxx 0x10 3 | ||
96 | 11110 xxx xx 0x30 5 | ||
97 | 111110 xx xxxxxx 0x130 8 | ||
98 | 11111100 xxxxxxxx xxxxx 0x2130 13 | ||
99 | 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 | ||
100 | 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 | ||
101 | 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 | ||
102 | * maximum encodable value: 0x100000400202130 == 2**56 + some */ | ||
103 | |||
104 | /* compression "table": | ||
105 | transmitted x 0.29 | ||
106 | as plaintext x ........................ | ||
107 | x ........................ | ||
108 | x ........................ | ||
109 | x 0.59 0.21........................ | ||
110 | x ........................................................ | ||
111 | x .. c ................................................... | ||
112 | x 0.44.. o ................................................... | ||
113 | x .......... d ................................................... | ||
114 | x .......... e ................................................... | ||
115 | X............. ................................................... | ||
116 | x.............. b ................................................... | ||
117 | 2.0x............... i ................................................... | ||
118 | #X................ t ................................................... | ||
119 | #................. s ........................... plain bits .......... | ||
120 | -+----------------------------------------------------------------------- | ||
121 | 1 16 32 64 | ||
122 | */ | ||
123 | |||
124 | /* LEVEL: (total bits, prefix bits, prefix value), | ||
125 | * sorted ascending by number of total bits. | ||
126 | * The rest of the code table is calculated at compiletime from this. */ | ||
127 | |||
/* Code table, "fibonacci data 1, 1, ..." variant.
 * Expands to one LEVEL(total bits, prefix bits, prefix value) statement
 * per code level; the user has to #define LEVEL() before expanding this.
 * Data bits per level (total - prefix): 1, 1, 2, 3, 5, 8, 13, 21, 34, 56. */
#define VLI_L_1_1() do {	\
	LEVEL( 2, 1, 0x00);	\
	LEVEL( 3, 2, 0x01);	\
	LEVEL( 5, 3, 0x03);	\
	LEVEL( 7, 4, 0x07);	\
	LEVEL(10, 5, 0x0f);	\
	LEVEL(14, 6, 0x1f);	\
	LEVEL(21, 8, 0x3f);	\
	LEVEL(29, 8, 0x7f);	\
	LEVEL(42, 8, 0xbf);	\
	LEVEL(64, 8, 0xff);	\
} while (0)
141 | |||
142 | /* finds a suitable level to decode the least significant part of in. | ||
143 | * returns number of bits consumed. | ||
144 | * | ||
145 | * BUG() for bad input, as that would mean a buggy code table. */ | ||
146 | static inline int vli_decode_bits(u64 *out, const u64 in) | ||
147 | { | ||
148 | u64 adj = 1; | ||
149 | |||
150 | #define LEVEL(t,b,v) \ | ||
151 | do { \ | ||
152 | if ((in & ((1 << b) -1)) == v) { \ | ||
153 | *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ | ||
154 | return t; \ | ||
155 | } \ | ||
156 | adj += 1ULL << (t - b); \ | ||
157 | } while (0) | ||
158 | |||
159 | VLI_L_1_1(); | ||
160 | |||
161 | /* NOT REACHED, if VLI_LEVELS code table is defined properly */ | ||
162 | BUG(); | ||
163 | #undef LEVEL | ||
164 | } | ||
165 | |||
166 | /* return number of code bits needed, | ||
167 | * or negative error number */ | ||
168 | static inline int __vli_encode_bits(u64 *out, const u64 in) | ||
169 | { | ||
170 | u64 max = 0; | ||
171 | u64 adj = 1; | ||
172 | |||
173 | if (in == 0) | ||
174 | return -EINVAL; | ||
175 | |||
176 | #define LEVEL(t,b,v) do { \ | ||
177 | max += 1ULL << (t - b); \ | ||
178 | if (in <= max) { \ | ||
179 | if (out) \ | ||
180 | *out = ((in - adj) << b) | v; \ | ||
181 | return t; \ | ||
182 | } \ | ||
183 | adj = max + 1; \ | ||
184 | } while (0) | ||
185 | |||
186 | VLI_L_1_1(); | ||
187 | |||
188 | return -EOVERFLOW; | ||
189 | #undef LEVEL | ||
190 | } | ||
191 | |||
192 | #undef VLI_L_1_1 | ||
193 | |||
194 | /* code from here down is independent of the actually used bit code */ | ||
195 | |||
196 | /* | ||
197 | * Code length is determined by some unique (e.g. unary) prefix. | ||
198 | * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, | ||
199 | * not a byte stream. | ||
200 | */ | ||
201 | |||
202 | /* for the bitstream, we need a cursor */ | ||
203 | struct bitstream_cursor { | ||
204 | /* the current byte */ | ||
205 | u8 *b; | ||
206 | /* the current bit within *b, nomalized: 0..7 */ | ||
207 | unsigned int bit; | ||
208 | }; | ||
209 | |||
210 | /* initialize cursor to point to first bit of stream */ | ||
211 | static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) | ||
212 | { | ||
213 | cur->b = s; | ||
214 | cur->bit = 0; | ||
215 | } | ||
216 | |||
217 | /* advance cursor by that many bits; maximum expected input value: 64, | ||
218 | * but depending on VLI implementation, it may be more. */ | ||
219 | static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) | ||
220 | { | ||
221 | bits += cur->bit; | ||
222 | cur->b = cur->b + (bits >> 3); | ||
223 | cur->bit = bits & 7; | ||
224 | } | ||
225 | |||
226 | /* the bitstream itself knows its length */ | ||
227 | struct bitstream { | ||
228 | struct bitstream_cursor cur; | ||
229 | unsigned char *buf; | ||
230 | size_t buf_len; /* in bytes */ | ||
231 | |||
232 | /* for input stream: | ||
233 | * number of trailing 0 bits for padding | ||
234 | * total number of valid bits in stream: buf_len * 8 - pad_bits */ | ||
235 | unsigned int pad_bits; | ||
236 | }; | ||
237 | |||
238 | static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) | ||
239 | { | ||
240 | bs->buf = s; | ||
241 | bs->buf_len = len; | ||
242 | bs->pad_bits = pad_bits; | ||
243 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
244 | } | ||
245 | |||
246 | static inline void bitstream_rewind(struct bitstream *bs) | ||
247 | { | ||
248 | bitstream_cursor_reset(&bs->cur, bs->buf); | ||
249 | memset(bs->buf, 0, bs->buf_len); | ||
250 | } | ||
251 | |||
252 | /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. | ||
253 | * Ignores "pad_bits". | ||
254 | * Returns zero if bits == 0 (nothing to do). | ||
255 | * Returns number of bits used if successful. | ||
256 | * | ||
257 | * If there is not enough room left in bitstream, | ||
258 | * leaves bitstream unchanged and returns -ENOBUFS. | ||
259 | */ | ||
260 | static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) | ||
261 | { | ||
262 | unsigned char *b = bs->cur.b; | ||
263 | unsigned int tmp; | ||
264 | |||
265 | if (bits == 0) | ||
266 | return 0; | ||
267 | |||
268 | if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) | ||
269 | return -ENOBUFS; | ||
270 | |||
271 | /* paranoia: strip off hi bits; they should not be set anyways. */ | ||
272 | if (bits < 64) | ||
273 | val &= ~0ULL >> (64 - bits); | ||
274 | |||
275 | *b++ |= (val & 0xff) << bs->cur.bit; | ||
276 | |||
277 | for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) | ||
278 | *b++ |= (val >> tmp) & 0xff; | ||
279 | |||
280 | bitstream_cursor_advance(&bs->cur, bits); | ||
281 | return bits; | ||
282 | } | ||
283 | |||
284 | /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. | ||
285 | * | ||
286 | * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. | ||
287 | * | ||
288 | * If there are less than the requested number of valid bits left in the | ||
289 | * bitstream, still fetches all available bits. | ||
290 | * | ||
291 | * Returns number of actually fetched bits. | ||
292 | */ | ||
293 | static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) | ||
294 | { | ||
295 | u64 val; | ||
296 | unsigned int n; | ||
297 | |||
298 | if (bits > 64) | ||
299 | return -EINVAL; | ||
300 | |||
301 | if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) | ||
302 | bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) | ||
303 | - bs->cur.bit - bs->pad_bits; | ||
304 | |||
305 | if (bits == 0) { | ||
306 | *out = 0; | ||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | /* get the high bits */ | ||
311 | val = 0; | ||
312 | n = (bs->cur.bit + bits + 7) >> 3; | ||
313 | /* n may be at most 9, if cur.bit + bits > 64 */ | ||
314 | /* which means this copies at most 8 byte */ | ||
315 | if (n) { | ||
316 | memcpy(&val, bs->cur.b+1, n - 1); | ||
317 | val = le64_to_cpu(val) << (8 - bs->cur.bit); | ||
318 | } | ||
319 | |||
320 | /* we still need the low bits */ | ||
321 | val |= bs->cur.b[0] >> bs->cur.bit; | ||
322 | |||
323 | /* and mask out bits we don't want */ | ||
324 | val &= ~0ULL >> (64 - bits); | ||
325 | |||
326 | bitstream_cursor_advance(&bs->cur, bits); | ||
327 | *out = val; | ||
328 | |||
329 | return bits; | ||
330 | } | ||
331 | |||
332 | /* encodes @in as vli into @bs; | ||
333 | |||
334 | * return values | ||
335 | * > 0: number of bits successfully stored in bitstream | ||
336 | * -ENOBUFS @bs is full | ||
337 | * -EINVAL input zero (invalid) | ||
338 | * -EOVERFLOW input too large for this vli code (invalid) | ||
339 | */ | ||
340 | static inline int vli_encode_bits(struct bitstream *bs, u64 in) | ||
341 | { | ||
342 | u64 code = code; | ||
343 | int bits = __vli_encode_bits(&code, in); | ||
344 | |||
345 | if (bits <= 0) | ||
346 | return bits; | ||
347 | |||
348 | return bitstream_put_bits(bs, code, bits); | ||
349 | } | ||
350 | |||
351 | #endif | ||
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 00000000000..212e9545e63 --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -0,0 +1,1529 @@ | |||
1 | /* | ||
2 | drbd_worker.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #include <linux/autoconf.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/version.h> | ||
29 | #include <linux/drbd.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/wait.h> | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/memcontrol.h> | ||
35 | #include <linux/mm_inline.h> | ||
36 | #include <linux/slab.h> | ||
37 | #include <linux/random.h> | ||
38 | #include <linux/mm.h> | ||
39 | #include <linux/string.h> | ||
40 | #include <linux/scatterlist.h> | ||
41 | |||
42 | #include "drbd_int.h" | ||
43 | #include "drbd_req.h" | ||
44 | #include "drbd_tracing.h" | ||
45 | |||
46 | #define SLEEP_TIME (HZ/10) | ||
47 | |||
48 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); | ||
49 | |||
50 | |||
51 | |||
52 | /* defined here: | ||
53 | drbd_md_io_complete | ||
54 | drbd_endio_write_sec | ||
55 | drbd_endio_read_sec | ||
56 | drbd_endio_pri | ||
57 | |||
58 | * more endio handlers: | ||
59 | atodb_endio in drbd_actlog.c | ||
60 | drbd_bm_async_io_complete in drbd_bitmap.c | ||
61 | |||
62 | * For all these callbacks, note the following: | ||
63 | * The callbacks will be called in irq context by the IDE drivers, | ||
64 | * and in Softirqs/Tasklets/BH context by the SCSI drivers. | ||
65 | * Try to get the locking right :) | ||
66 | * | ||
67 | */ | ||
68 | |||
69 | |||
70 | /* About the global_state_lock | ||
71 | Each state transition on a device holds a read lock. In case we have | ||
72 | to evaluate the sync after dependencies, we grab a write lock, because | ||
73 | we need stable states on all devices for that. */ | ||
74 | rwlock_t global_state_lock; | ||
75 | |||
76 | /* used for synchronous meta data and bitmap IO | ||
77 | * submitted by drbd_md_sync_page_io() | ||
78 | */ | ||
79 | void drbd_md_io_complete(struct bio *bio, int error) | ||
80 | { | ||
81 | struct drbd_md_io *md_io; | ||
82 | |||
83 | md_io = (struct drbd_md_io *)bio->bi_private; | ||
84 | md_io->error = error; | ||
85 | |||
86 | trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL); | ||
87 | |||
88 | complete(&md_io->event); | ||
89 | } | ||
90 | |||
91 | /* reads on behalf of the partner, | ||
92 | * "submitted" by the receiver | ||
93 | */ | ||
94 | void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) | ||
95 | { | ||
96 | unsigned long flags = 0; | ||
97 | struct drbd_epoch_entry *e = NULL; | ||
98 | struct drbd_conf *mdev; | ||
99 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
100 | |||
101 | e = bio->bi_private; | ||
102 | mdev = e->mdev; | ||
103 | |||
104 | if (error) | ||
105 | dev_warn(DEV, "read: error=%d s=%llus\n", error, | ||
106 | (unsigned long long)e->sector); | ||
107 | if (!error && !uptodate) { | ||
108 | dev_warn(DEV, "read: setting error to -EIO s=%llus\n", | ||
109 | (unsigned long long)e->sector); | ||
110 | /* strange behavior of some lower level drivers... | ||
111 | * fail the request by clearing the uptodate flag, | ||
112 | * but do not return any error?! */ | ||
113 | error = -EIO; | ||
114 | } | ||
115 | |||
116 | D_ASSERT(e->block_id != ID_VACANT); | ||
117 | |||
118 | trace_drbd_bio(mdev, "Sec", bio, 1, NULL); | ||
119 | |||
120 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
121 | mdev->read_cnt += e->size >> 9; | ||
122 | list_del(&e->w.list); | ||
123 | if (list_empty(&mdev->read_ee)) | ||
124 | wake_up(&mdev->ee_wait); | ||
125 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
126 | |||
127 | drbd_chk_io_error(mdev, error, FALSE); | ||
128 | drbd_queue_work(&mdev->data.work, &e->w); | ||
129 | put_ldev(mdev); | ||
130 | |||
131 | trace_drbd_ee(mdev, e, "read completed"); | ||
132 | } | ||
133 | |||
134 | /* writes on behalf of the partner, or resync writes, | ||
135 | * "submitted" by the receiver. | ||
136 | */ | ||
137 | void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) | ||
138 | { | ||
139 | unsigned long flags = 0; | ||
140 | struct drbd_epoch_entry *e = NULL; | ||
141 | struct drbd_conf *mdev; | ||
142 | sector_t e_sector; | ||
143 | int do_wake; | ||
144 | int is_syncer_req; | ||
145 | int do_al_complete_io; | ||
146 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
147 | int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); | ||
148 | |||
149 | e = bio->bi_private; | ||
150 | mdev = e->mdev; | ||
151 | |||
152 | if (error) | ||
153 | dev_warn(DEV, "write: error=%d s=%llus\n", error, | ||
154 | (unsigned long long)e->sector); | ||
155 | if (!error && !uptodate) { | ||
156 | dev_warn(DEV, "write: setting error to -EIO s=%llus\n", | ||
157 | (unsigned long long)e->sector); | ||
158 | /* strange behavior of some lower level drivers... | ||
159 | * fail the request by clearing the uptodate flag, | ||
160 | * but do not return any error?! */ | ||
161 | error = -EIO; | ||
162 | } | ||
163 | |||
164 | /* error == -ENOTSUPP would be a better test, | ||
165 | * alas it is not reliable */ | ||
166 | if (error && is_barrier && e->flags & EE_IS_BARRIER) { | ||
167 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | ||
168 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
169 | list_del(&e->w.list); | ||
170 | e->w.cb = w_e_reissue; | ||
171 | /* put_ldev actually happens below, once we come here again. */ | ||
172 | __release(local); | ||
173 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
174 | drbd_queue_work(&mdev->data.work, &e->w); | ||
175 | return; | ||
176 | } | ||
177 | |||
178 | D_ASSERT(e->block_id != ID_VACANT); | ||
179 | |||
180 | trace_drbd_bio(mdev, "Sec", bio, 1, NULL); | ||
181 | |||
182 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
183 | mdev->writ_cnt += e->size >> 9; | ||
184 | is_syncer_req = is_syncer_block_id(e->block_id); | ||
185 | |||
186 | /* after we moved e to done_ee, | ||
187 | * we may no longer access it, | ||
188 | * it may be freed/reused already! | ||
189 | * (as soon as we release the req_lock) */ | ||
190 | e_sector = e->sector; | ||
191 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | ||
192 | |||
193 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | ||
194 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
195 | |||
196 | trace_drbd_ee(mdev, e, "write completed"); | ||
197 | |||
198 | /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, | ||
199 | * neither did we wake possibly waiting conflicting requests. | ||
200 | * done from "drbd_process_done_ee" within the appropriate w.cb | ||
201 | * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ | ||
202 | |||
203 | do_wake = is_syncer_req | ||
204 | ? list_empty(&mdev->sync_ee) | ||
205 | : list_empty(&mdev->active_ee); | ||
206 | |||
207 | if (error) | ||
208 | __drbd_chk_io_error(mdev, FALSE); | ||
209 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
210 | |||
211 | if (is_syncer_req) | ||
212 | drbd_rs_complete_io(mdev, e_sector); | ||
213 | |||
214 | if (do_wake) | ||
215 | wake_up(&mdev->ee_wait); | ||
216 | |||
217 | if (do_al_complete_io) | ||
218 | drbd_al_complete_io(mdev, e_sector); | ||
219 | |||
220 | wake_asender(mdev); | ||
221 | put_ldev(mdev); | ||
222 | |||
223 | } | ||
224 | |||
225 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | ||
226 | */ | ||
227 | void drbd_endio_pri(struct bio *bio, int error) | ||
228 | { | ||
229 | unsigned long flags; | ||
230 | struct drbd_request *req = bio->bi_private; | ||
231 | struct drbd_conf *mdev = req->mdev; | ||
232 | struct bio_and_error m; | ||
233 | enum drbd_req_event what; | ||
234 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | ||
235 | |||
236 | if (error) | ||
237 | dev_warn(DEV, "p %s: error=%d\n", | ||
238 | bio_data_dir(bio) == WRITE ? "write" : "read", error); | ||
239 | if (!error && !uptodate) { | ||
240 | dev_warn(DEV, "p %s: setting error to -EIO\n", | ||
241 | bio_data_dir(bio) == WRITE ? "write" : "read"); | ||
242 | /* strange behavior of some lower level drivers... | ||
243 | * fail the request by clearing the uptodate flag, | ||
244 | * but do not return any error?! */ | ||
245 | error = -EIO; | ||
246 | } | ||
247 | |||
248 | trace_drbd_bio(mdev, "Pri", bio, 1, NULL); | ||
249 | |||
250 | /* to avoid recursion in __req_mod */ | ||
251 | if (unlikely(error)) { | ||
252 | what = (bio_data_dir(bio) == WRITE) | ||
253 | ? write_completed_with_error | ||
254 | : (bio_rw(bio) == READA) | ||
255 | ? read_completed_with_error | ||
256 | : read_ahead_completed_with_error; | ||
257 | } else | ||
258 | what = completed_ok; | ||
259 | |||
260 | bio_put(req->private_bio); | ||
261 | req->private_bio = ERR_PTR(error); | ||
262 | |||
263 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
264 | __req_mod(req, what, &m); | ||
265 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
266 | |||
267 | if (m.bio) | ||
268 | complete_master_bio(mdev, &m); | ||
269 | } | ||
270 | |||
271 | int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
272 | { | ||
273 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
274 | |||
275 | /* NOTE: mdev->ldev can be NULL by the time we get here! */ | ||
276 | /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ | ||
277 | |||
278 | /* the only way this callback is scheduled is from _req_may_be_done, | ||
279 | * when it is done and had a local write error, see comments there */ | ||
280 | drbd_req_free(req); | ||
281 | |||
282 | return TRUE; | ||
283 | } | ||
284 | |||
285 | int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
286 | { | ||
287 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
288 | |||
289 | /* We should not detach for read io-error, | ||
290 | * but try to WRITE the P_DATA_REPLY to the failed location, | ||
291 | * to give the disk the chance to relocate that block */ | ||
292 | |||
293 | spin_lock_irq(&mdev->req_lock); | ||
294 | if (cancel || | ||
295 | mdev->state.conn < C_CONNECTED || | ||
296 | mdev->state.pdsk <= D_INCONSISTENT) { | ||
297 | _req_mod(req, send_canceled); | ||
298 | spin_unlock_irq(&mdev->req_lock); | ||
299 | dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); | ||
300 | return 1; | ||
301 | } | ||
302 | spin_unlock_irq(&mdev->req_lock); | ||
303 | |||
304 | return w_send_read_req(mdev, w, 0); | ||
305 | } | ||
306 | |||
307 | int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
308 | { | ||
309 | ERR_IF(cancel) return 1; | ||
310 | dev_err(DEV, "resync inactive, but callback triggered??\n"); | ||
311 | return 1; /* Simply ignore this! */ | ||
312 | } | ||
313 | |||
314 | void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) | ||
315 | { | ||
316 | struct hash_desc desc; | ||
317 | struct scatterlist sg; | ||
318 | struct bio_vec *bvec; | ||
319 | int i; | ||
320 | |||
321 | desc.tfm = tfm; | ||
322 | desc.flags = 0; | ||
323 | |||
324 | sg_init_table(&sg, 1); | ||
325 | crypto_hash_init(&desc); | ||
326 | |||
327 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
328 | sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); | ||
329 | crypto_hash_update(&desc, &sg, sg.length); | ||
330 | } | ||
331 | crypto_hash_final(&desc, digest); | ||
332 | } | ||
333 | |||
334 | static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
335 | { | ||
336 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
337 | int digest_size; | ||
338 | void *digest; | ||
339 | int ok; | ||
340 | |||
341 | D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); | ||
342 | |||
343 | if (unlikely(cancel)) { | ||
344 | drbd_free_ee(mdev, e); | ||
345 | return 1; | ||
346 | } | ||
347 | |||
348 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
349 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | ||
350 | digest = kmalloc(digest_size, GFP_NOIO); | ||
351 | if (digest) { | ||
352 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | ||
353 | |||
354 | inc_rs_pending(mdev); | ||
355 | ok = drbd_send_drequest_csum(mdev, | ||
356 | e->sector, | ||
357 | e->size, | ||
358 | digest, | ||
359 | digest_size, | ||
360 | P_CSUM_RS_REQUEST); | ||
361 | kfree(digest); | ||
362 | } else { | ||
363 | dev_err(DEV, "kmalloc() of digest failed.\n"); | ||
364 | ok = 0; | ||
365 | } | ||
366 | } else | ||
367 | ok = 1; | ||
368 | |||
369 | drbd_free_ee(mdev, e); | ||
370 | |||
371 | if (unlikely(!ok)) | ||
372 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); | ||
373 | return ok; | ||
374 | } | ||
375 | |||
376 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | ||
377 | |||
378 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | ||
379 | { | ||
380 | struct drbd_epoch_entry *e; | ||
381 | |||
382 | if (!get_ldev(mdev)) | ||
383 | return 0; | ||
384 | |||
385 | /* GFP_TRY, because if there is no memory available right now, this may | ||
386 | * be rescheduled for later. It is "only" background resync, after all. */ | ||
387 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | ||
388 | if (!e) { | ||
389 | put_ldev(mdev); | ||
390 | return 2; | ||
391 | } | ||
392 | |||
393 | spin_lock_irq(&mdev->req_lock); | ||
394 | list_add(&e->w.list, &mdev->read_ee); | ||
395 | spin_unlock_irq(&mdev->req_lock); | ||
396 | |||
397 | e->private_bio->bi_end_io = drbd_endio_read_sec; | ||
398 | e->private_bio->bi_rw = READ; | ||
399 | e->w.cb = w_e_send_csum; | ||
400 | |||
401 | mdev->read_cnt += size >> 9; | ||
402 | drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); | ||
403 | |||
404 | return 1; | ||
405 | } | ||
406 | |||
407 | void resync_timer_fn(unsigned long data) | ||
408 | { | ||
409 | unsigned long flags; | ||
410 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
411 | int queue; | ||
412 | |||
413 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
414 | |||
415 | if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { | ||
416 | queue = 1; | ||
417 | if (mdev->state.conn == C_VERIFY_S) | ||
418 | mdev->resync_work.cb = w_make_ov_request; | ||
419 | else | ||
420 | mdev->resync_work.cb = w_make_resync_request; | ||
421 | } else { | ||
422 | queue = 0; | ||
423 | mdev->resync_work.cb = w_resync_inactive; | ||
424 | } | ||
425 | |||
426 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
427 | |||
428 | /* harmless race: list_empty outside data.work.q_lock */ | ||
429 | if (list_empty(&mdev->resync_work.list) && queue) | ||
430 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | ||
431 | } | ||
432 | |||
433 | int w_make_resync_request(struct drbd_conf *mdev, | ||
434 | struct drbd_work *w, int cancel) | ||
435 | { | ||
436 | unsigned long bit; | ||
437 | sector_t sector; | ||
438 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
439 | int max_segment_size = queue_max_segment_size(mdev->rq_queue); | ||
440 | int number, i, size, pe, mx; | ||
441 | int align, queued, sndbuf; | ||
442 | |||
443 | if (unlikely(cancel)) | ||
444 | return 1; | ||
445 | |||
446 | if (unlikely(mdev->state.conn < C_CONNECTED)) { | ||
447 | dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | if (mdev->state.conn != C_SYNC_TARGET) | ||
452 | dev_err(DEV, "%s in w_make_resync_request\n", | ||
453 | drbd_conn_str(mdev->state.conn)); | ||
454 | |||
455 | if (!get_ldev(mdev)) { | ||
456 | /* Since we only need to access mdev->rsync a | ||
457 | get_ldev_if_state(mdev,D_FAILED) would be sufficient, but | ||
458 | to continue resync with a broken disk makes no sense at | ||
459 | all */ | ||
460 | dev_err(DEV, "Disk broke down during resync!\n"); | ||
461 | mdev->resync_work.cb = w_resync_inactive; | ||
462 | return 1; | ||
463 | } | ||
464 | |||
465 | number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | ||
466 | pe = atomic_read(&mdev->rs_pending_cnt); | ||
467 | |||
468 | mutex_lock(&mdev->data.mutex); | ||
469 | if (mdev->data.socket) | ||
470 | mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); | ||
471 | else | ||
472 | mx = 1; | ||
473 | mutex_unlock(&mdev->data.mutex); | ||
474 | |||
475 | /* For resync rates >160MB/sec, allow more pending RS requests */ | ||
476 | if (number > mx) | ||
477 | mx = number; | ||
478 | |||
479 | /* Limit the number of pending RS requests to no more than the peer's receive buffer */ | ||
480 | if ((pe + number) > mx) { | ||
481 | number = mx - pe; | ||
482 | } | ||
483 | |||
484 | for (i = 0; i < number; i++) { | ||
485 | /* Stop generating RS requests, when half of the send buffer is filled */ | ||
486 | mutex_lock(&mdev->data.mutex); | ||
487 | if (mdev->data.socket) { | ||
488 | queued = mdev->data.socket->sk->sk_wmem_queued; | ||
489 | sndbuf = mdev->data.socket->sk->sk_sndbuf; | ||
490 | } else { | ||
491 | queued = 1; | ||
492 | sndbuf = 0; | ||
493 | } | ||
494 | mutex_unlock(&mdev->data.mutex); | ||
495 | if (queued > sndbuf / 2) | ||
496 | goto requeue; | ||
497 | |||
498 | next_sector: | ||
499 | size = BM_BLOCK_SIZE; | ||
500 | bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); | ||
501 | |||
502 | if (bit == -1UL) { | ||
503 | mdev->bm_resync_fo = drbd_bm_bits(mdev); | ||
504 | mdev->resync_work.cb = w_resync_inactive; | ||
505 | put_ldev(mdev); | ||
506 | return 1; | ||
507 | } | ||
508 | |||
509 | sector = BM_BIT_TO_SECT(bit); | ||
510 | |||
511 | if (drbd_try_rs_begin_io(mdev, sector)) { | ||
512 | mdev->bm_resync_fo = bit; | ||
513 | goto requeue; | ||
514 | } | ||
515 | mdev->bm_resync_fo = bit + 1; | ||
516 | |||
517 | if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) { | ||
518 | drbd_rs_complete_io(mdev, sector); | ||
519 | goto next_sector; | ||
520 | } | ||
521 | |||
522 | #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE | ||
523 | /* try to find some adjacent bits. | ||
524 | * we stop if we have already the maximum req size. | ||
525 | * | ||
526 | * Additionally always align bigger requests, in order to | ||
527 | * be prepared for all stripe sizes of software RAIDs. | ||
528 | * | ||
529 | * we _do_ care about the agreed-upon q->max_segment_size | ||
530 | * here, as splitting up the requests on the other side is more | ||
531 | * difficult. the consequence is, that on lvm and md and other | ||
532 | * "indirect" devices, this is dead code, since | ||
533 | * q->max_segment_size will be PAGE_SIZE. | ||
534 | */ | ||
535 | align = 1; | ||
536 | for (;;) { | ||
537 | if (size + BM_BLOCK_SIZE > max_segment_size) | ||
538 | break; | ||
539 | |||
540 | /* Be always aligned */ | ||
541 | if (sector & ((1<<(align+3))-1)) | ||
542 | break; | ||
543 | |||
544 | /* do not cross extent boundaries */ | ||
545 | if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) | ||
546 | break; | ||
547 | /* now, is it actually dirty, after all? | ||
548 | * caution, drbd_bm_test_bit is tri-state for some | ||
549 | * obscure reason; ( b == 0 ) would get the out-of-band | ||
550 | * only accidentally right because of the "oddly sized" | ||
551 | * adjustment below */ | ||
552 | if (drbd_bm_test_bit(mdev, bit+1) != 1) | ||
553 | break; | ||
554 | bit++; | ||
555 | size += BM_BLOCK_SIZE; | ||
556 | if ((BM_BLOCK_SIZE << align) <= size) | ||
557 | align++; | ||
558 | i++; | ||
559 | } | ||
560 | /* if we merged some, | ||
561 | * reset the offset to start the next drbd_bm_find_next from */ | ||
562 | if (size > BM_BLOCK_SIZE) | ||
563 | mdev->bm_resync_fo = bit + 1; | ||
564 | #endif | ||
565 | |||
566 | /* adjust very last sectors, in case we are oddly sized */ | ||
567 | if (sector + (size>>9) > capacity) | ||
568 | size = (capacity-sector)<<9; | ||
569 | if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { | ||
570 | switch (read_for_csum(mdev, sector, size)) { | ||
571 | case 0: /* Disk failure*/ | ||
572 | put_ldev(mdev); | ||
573 | return 0; | ||
574 | case 2: /* Allocation failed */ | ||
575 | drbd_rs_complete_io(mdev, sector); | ||
576 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | ||
577 | goto requeue; | ||
578 | /* case 1: everything ok */ | ||
579 | } | ||
580 | } else { | ||
581 | inc_rs_pending(mdev); | ||
582 | if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, | ||
583 | sector, size, ID_SYNCER)) { | ||
584 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); | ||
585 | dec_rs_pending(mdev); | ||
586 | put_ldev(mdev); | ||
587 | return 0; | ||
588 | } | ||
589 | } | ||
590 | } | ||
591 | |||
592 | if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) { | ||
593 | /* last syncer _request_ was sent, | ||
594 | * but the P_RS_DATA_REPLY not yet received. sync will end (and | ||
595 | * next sync group will resume), as soon as we receive the last | ||
596 | * resync data block, and the last bit is cleared. | ||
597 | * until then resync "work" is "inactive" ... | ||
598 | */ | ||
599 | mdev->resync_work.cb = w_resync_inactive; | ||
600 | put_ldev(mdev); | ||
601 | return 1; | ||
602 | } | ||
603 | |||
604 | requeue: | ||
605 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | ||
606 | put_ldev(mdev); | ||
607 | return 1; | ||
608 | } | ||
609 | |||
610 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
611 | { | ||
612 | int number, i, size; | ||
613 | sector_t sector; | ||
614 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | ||
615 | |||
616 | if (unlikely(cancel)) | ||
617 | return 1; | ||
618 | |||
619 | if (unlikely(mdev->state.conn < C_CONNECTED)) { | ||
620 | dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected"); | ||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); | ||
625 | if (atomic_read(&mdev->rs_pending_cnt) > number) | ||
626 | goto requeue; | ||
627 | |||
628 | number -= atomic_read(&mdev->rs_pending_cnt); | ||
629 | |||
630 | sector = mdev->ov_position; | ||
631 | for (i = 0; i < number; i++) { | ||
632 | if (sector >= capacity) { | ||
633 | mdev->resync_work.cb = w_resync_inactive; | ||
634 | return 1; | ||
635 | } | ||
636 | |||
637 | size = BM_BLOCK_SIZE; | ||
638 | |||
639 | if (drbd_try_rs_begin_io(mdev, sector)) { | ||
640 | mdev->ov_position = sector; | ||
641 | goto requeue; | ||
642 | } | ||
643 | |||
644 | if (sector + (size>>9) > capacity) | ||
645 | size = (capacity-sector)<<9; | ||
646 | |||
647 | inc_rs_pending(mdev); | ||
648 | if (!drbd_send_ov_request(mdev, sector, size)) { | ||
649 | dec_rs_pending(mdev); | ||
650 | return 0; | ||
651 | } | ||
652 | sector += BM_SECT_PER_BIT; | ||
653 | } | ||
654 | mdev->ov_position = sector; | ||
655 | |||
656 | requeue: | ||
657 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | ||
658 | return 1; | ||
659 | } | ||
660 | |||
661 | |||
/*
 * Worker callback: online verify is done.  Frees the (separately allocated)
 * work object, prints the pending out-of-sync range and finishes the
 * resync/verify run.  Always returns 1.
 */
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work object is owned by this callback */
	kfree(w);

	ov_oos_print(mdev);
	drbd_resync_finished(mdev);
	return 1;
}
670 | |||
/*
 * Worker callback queued by drbd_resync_finished() to retry itself.
 * Frees the work object and calls drbd_resync_finished() again.
 * Always returns 1.
 */
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work object was kmalloc()ed by whoever queued it */
	kfree(w);
	drbd_resync_finished(mdev);
	return 1;
}
679 | |||
680 | int drbd_resync_finished(struct drbd_conf *mdev) | ||
681 | { | ||
682 | unsigned long db, dt, dbdt; | ||
683 | unsigned long n_oos; | ||
684 | union drbd_state os, ns; | ||
685 | struct drbd_work *w; | ||
686 | char *khelper_cmd = NULL; | ||
687 | |||
688 | /* Remove all elements from the resync LRU. Since future actions | ||
689 | * might set bits in the (main) bitmap, then the entries in the | ||
690 | * resync LRU would be wrong. */ | ||
691 | if (drbd_rs_del_all(mdev)) { | ||
692 | /* In case this is not possible now, most probably because | ||
693 | * there are P_RS_DATA_REPLY Packets lingering on the worker's | ||
694 | * queue (or even the read operations for those packets | ||
695 | * is not finished by now). Retry in 100ms. */ | ||
696 | |||
697 | drbd_kick_lo(mdev); | ||
698 | __set_current_state(TASK_INTERRUPTIBLE); | ||
699 | schedule_timeout(HZ / 10); | ||
700 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); | ||
701 | if (w) { | ||
702 | w->cb = w_resync_finished; | ||
703 | drbd_queue_work(&mdev->data.work, w); | ||
704 | return 1; | ||
705 | } | ||
706 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); | ||
707 | } | ||
708 | |||
709 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | ||
710 | if (dt <= 0) | ||
711 | dt = 1; | ||
712 | db = mdev->rs_total; | ||
713 | dbdt = Bit2KB(db/dt); | ||
714 | mdev->rs_paused /= HZ; | ||
715 | |||
716 | if (!get_ldev(mdev)) | ||
717 | goto out; | ||
718 | |||
719 | spin_lock_irq(&mdev->req_lock); | ||
720 | os = mdev->state; | ||
721 | |||
722 | /* This protects us against multiple calls (that can happen in the presence | ||
723 | of application IO), and against connectivity loss just before we arrive here. */ | ||
724 | if (os.conn <= C_CONNECTED) | ||
725 | goto out_unlock; | ||
726 | |||
727 | ns = os; | ||
728 | ns.conn = C_CONNECTED; | ||
729 | |||
730 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", | ||
731 | (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? | ||
732 | "Online verify " : "Resync", | ||
733 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); | ||
734 | |||
735 | n_oos = drbd_bm_total_weight(mdev); | ||
736 | |||
737 | if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { | ||
738 | if (n_oos) { | ||
739 | dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n", | ||
740 | n_oos, Bit2KB(1)); | ||
741 | khelper_cmd = "out-of-sync"; | ||
742 | } | ||
743 | } else { | ||
744 | D_ASSERT((n_oos - mdev->rs_failed) == 0); | ||
745 | |||
746 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) | ||
747 | khelper_cmd = "after-resync-target"; | ||
748 | |||
749 | if (mdev->csums_tfm && mdev->rs_total) { | ||
750 | const unsigned long s = mdev->rs_same_csum; | ||
751 | const unsigned long t = mdev->rs_total; | ||
752 | const int ratio = | ||
753 | (t == 0) ? 0 : | ||
754 | (t < 100000) ? ((s*100)/t) : (s/(t/100)); | ||
755 | dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " | ||
756 | "transferred %luK total %luK\n", | ||
757 | ratio, | ||
758 | Bit2KB(mdev->rs_same_csum), | ||
759 | Bit2KB(mdev->rs_total - mdev->rs_same_csum), | ||
760 | Bit2KB(mdev->rs_total)); | ||
761 | } | ||
762 | } | ||
763 | |||
764 | if (mdev->rs_failed) { | ||
765 | dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed); | ||
766 | |||
767 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { | ||
768 | ns.disk = D_INCONSISTENT; | ||
769 | ns.pdsk = D_UP_TO_DATE; | ||
770 | } else { | ||
771 | ns.disk = D_UP_TO_DATE; | ||
772 | ns.pdsk = D_INCONSISTENT; | ||
773 | } | ||
774 | } else { | ||
775 | ns.disk = D_UP_TO_DATE; | ||
776 | ns.pdsk = D_UP_TO_DATE; | ||
777 | |||
778 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { | ||
779 | if (mdev->p_uuid) { | ||
780 | int i; | ||
781 | for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) | ||
782 | _drbd_uuid_set(mdev, i, mdev->p_uuid[i]); | ||
783 | drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]); | ||
784 | _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]); | ||
785 | } else { | ||
786 | dev_err(DEV, "mdev->p_uuid is NULL! BUG\n"); | ||
787 | } | ||
788 | } | ||
789 | |||
790 | drbd_uuid_set_bm(mdev, 0UL); | ||
791 | |||
792 | if (mdev->p_uuid) { | ||
793 | /* Now the two UUID sets are equal, update what we | ||
794 | * know of the peer. */ | ||
795 | int i; | ||
796 | for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) | ||
797 | mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; | ||
798 | } | ||
799 | } | ||
800 | |||
801 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
802 | out_unlock: | ||
803 | spin_unlock_irq(&mdev->req_lock); | ||
804 | put_ldev(mdev); | ||
805 | out: | ||
806 | mdev->rs_total = 0; | ||
807 | mdev->rs_failed = 0; | ||
808 | mdev->rs_paused = 0; | ||
809 | mdev->ov_start_sector = 0; | ||
810 | |||
811 | if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { | ||
812 | dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); | ||
813 | drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); | ||
814 | } | ||
815 | |||
816 | if (khelper_cmd) | ||
817 | drbd_khelper(mdev, khelper_cmd); | ||
818 | |||
819 | return 1; | ||
820 | } | ||
821 | |||
822 | /* helper */ | ||
823 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | ||
824 | { | ||
825 | if (drbd_bio_has_active_page(e->private_bio)) { | ||
826 | /* This might happen if sendpage() has not finished */ | ||
827 | spin_lock_irq(&mdev->req_lock); | ||
828 | list_add_tail(&e->w.list, &mdev->net_ee); | ||
829 | spin_unlock_irq(&mdev->req_lock); | ||
830 | } else | ||
831 | drbd_free_ee(mdev, e); | ||
832 | } | ||
833 | |||
834 | /** | ||
835 | * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST | ||
836 | * @mdev: DRBD device. | ||
837 | * @w: work object. | ||
838 | * @cancel: The connection will be closed anyways | ||
839 | */ | ||
840 | int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
841 | { | ||
842 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
843 | int ok; | ||
844 | |||
845 | if (unlikely(cancel)) { | ||
846 | drbd_free_ee(mdev, e); | ||
847 | dec_unacked(mdev); | ||
848 | return 1; | ||
849 | } | ||
850 | |||
851 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
852 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | ||
853 | } else { | ||
854 | if (__ratelimit(&drbd_ratelimit_state)) | ||
855 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", | ||
856 | (unsigned long long)e->sector); | ||
857 | |||
858 | ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); | ||
859 | } | ||
860 | |||
861 | dec_unacked(mdev); | ||
862 | |||
863 | move_to_net_ee_or_free(mdev, e); | ||
864 | |||
865 | if (unlikely(!ok)) | ||
866 | dev_err(DEV, "drbd_send_block() failed\n"); | ||
867 | return ok; | ||
868 | } | ||
869 | |||
870 | /** | ||
871 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS | ||
872 | * @mdev: DRBD device. | ||
873 | * @w: work object. | ||
874 | * @cancel: The connection will be closed anyways | ||
875 | */ | ||
876 | int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
877 | { | ||
878 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
879 | int ok; | ||
880 | |||
881 | if (unlikely(cancel)) { | ||
882 | drbd_free_ee(mdev, e); | ||
883 | dec_unacked(mdev); | ||
884 | return 1; | ||
885 | } | ||
886 | |||
887 | if (get_ldev_if_state(mdev, D_FAILED)) { | ||
888 | drbd_rs_complete_io(mdev, e->sector); | ||
889 | put_ldev(mdev); | ||
890 | } | ||
891 | |||
892 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
893 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | ||
894 | inc_rs_pending(mdev); | ||
895 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | ||
896 | } else { | ||
897 | if (__ratelimit(&drbd_ratelimit_state)) | ||
898 | dev_err(DEV, "Not sending RSDataReply, " | ||
899 | "partner DISKLESS!\n"); | ||
900 | ok = 1; | ||
901 | } | ||
902 | } else { | ||
903 | if (__ratelimit(&drbd_ratelimit_state)) | ||
904 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", | ||
905 | (unsigned long long)e->sector); | ||
906 | |||
907 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
908 | |||
909 | /* update resync data with failure */ | ||
910 | drbd_rs_failed_io(mdev, e->sector, e->size); | ||
911 | } | ||
912 | |||
913 | dec_unacked(mdev); | ||
914 | |||
915 | move_to_net_ee_or_free(mdev, e); | ||
916 | |||
917 | if (unlikely(!ok)) | ||
918 | dev_err(DEV, "drbd_send_block() failed\n"); | ||
919 | return ok; | ||
920 | } | ||
921 | |||
922 | int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
923 | { | ||
924 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
925 | struct digest_info *di; | ||
926 | int digest_size; | ||
927 | void *digest = NULL; | ||
928 | int ok, eq = 0; | ||
929 | |||
930 | if (unlikely(cancel)) { | ||
931 | drbd_free_ee(mdev, e); | ||
932 | dec_unacked(mdev); | ||
933 | return 1; | ||
934 | } | ||
935 | |||
936 | drbd_rs_complete_io(mdev, e->sector); | ||
937 | |||
938 | di = (struct digest_info *)(unsigned long)e->block_id; | ||
939 | |||
940 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
941 | /* quick hack to try to avoid a race against reconfiguration. | ||
942 | * a real fix would be much more involved, | ||
943 | * introducing more locking mechanisms */ | ||
944 | if (mdev->csums_tfm) { | ||
945 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | ||
946 | D_ASSERT(digest_size == di->digest_size); | ||
947 | digest = kmalloc(digest_size, GFP_NOIO); | ||
948 | } | ||
949 | if (digest) { | ||
950 | drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); | ||
951 | eq = !memcmp(digest, di->digest, digest_size); | ||
952 | kfree(digest); | ||
953 | } | ||
954 | |||
955 | if (eq) { | ||
956 | drbd_set_in_sync(mdev, e->sector, e->size); | ||
957 | mdev->rs_same_csum++; | ||
958 | ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); | ||
959 | } else { | ||
960 | inc_rs_pending(mdev); | ||
961 | e->block_id = ID_SYNCER; | ||
962 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | ||
963 | } | ||
964 | } else { | ||
965 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
966 | if (__ratelimit(&drbd_ratelimit_state)) | ||
967 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | ||
968 | } | ||
969 | |||
970 | dec_unacked(mdev); | ||
971 | |||
972 | kfree(di); | ||
973 | |||
974 | move_to_net_ee_or_free(mdev, e); | ||
975 | |||
976 | if (unlikely(!ok)) | ||
977 | dev_err(DEV, "drbd_send_block/ack() failed\n"); | ||
978 | return ok; | ||
979 | } | ||
980 | |||
981 | int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
982 | { | ||
983 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
984 | int digest_size; | ||
985 | void *digest; | ||
986 | int ok = 1; | ||
987 | |||
988 | if (unlikely(cancel)) | ||
989 | goto out; | ||
990 | |||
991 | if (unlikely(!drbd_bio_uptodate(e->private_bio))) | ||
992 | goto out; | ||
993 | |||
994 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | ||
995 | /* FIXME if this allocation fails, online verify will not terminate! */ | ||
996 | digest = kmalloc(digest_size, GFP_NOIO); | ||
997 | if (digest) { | ||
998 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | ||
999 | inc_rs_pending(mdev); | ||
1000 | ok = drbd_send_drequest_csum(mdev, e->sector, e->size, | ||
1001 | digest, digest_size, P_OV_REPLY); | ||
1002 | if (!ok) | ||
1003 | dec_rs_pending(mdev); | ||
1004 | kfree(digest); | ||
1005 | } | ||
1006 | |||
1007 | out: | ||
1008 | drbd_free_ee(mdev, e); | ||
1009 | |||
1010 | dec_unacked(mdev); | ||
1011 | |||
1012 | return ok; | ||
1013 | } | ||
1014 | |||
1015 | void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | ||
1016 | { | ||
1017 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { | ||
1018 | mdev->ov_last_oos_size += size>>9; | ||
1019 | } else { | ||
1020 | mdev->ov_last_oos_start = sector; | ||
1021 | mdev->ov_last_oos_size = size>>9; | ||
1022 | } | ||
1023 | drbd_set_out_of_sync(mdev, sector, size); | ||
1024 | set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); | ||
1025 | } | ||
1026 | |||
1027 | int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1028 | { | ||
1029 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | ||
1030 | struct digest_info *di; | ||
1031 | int digest_size; | ||
1032 | void *digest; | ||
1033 | int ok, eq = 0; | ||
1034 | |||
1035 | if (unlikely(cancel)) { | ||
1036 | drbd_free_ee(mdev, e); | ||
1037 | dec_unacked(mdev); | ||
1038 | return 1; | ||
1039 | } | ||
1040 | |||
1041 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all | ||
1042 | * the resync lru has been cleaned up already */ | ||
1043 | drbd_rs_complete_io(mdev, e->sector); | ||
1044 | |||
1045 | di = (struct digest_info *)(unsigned long)e->block_id; | ||
1046 | |||
1047 | if (likely(drbd_bio_uptodate(e->private_bio))) { | ||
1048 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | ||
1049 | digest = kmalloc(digest_size, GFP_NOIO); | ||
1050 | if (digest) { | ||
1051 | drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); | ||
1052 | |||
1053 | D_ASSERT(digest_size == di->digest_size); | ||
1054 | eq = !memcmp(digest, di->digest, digest_size); | ||
1055 | kfree(digest); | ||
1056 | } | ||
1057 | } else { | ||
1058 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | ||
1059 | if (__ratelimit(&drbd_ratelimit_state)) | ||
1060 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | ||
1061 | } | ||
1062 | |||
1063 | dec_unacked(mdev); | ||
1064 | |||
1065 | kfree(di); | ||
1066 | |||
1067 | if (!eq) | ||
1068 | drbd_ov_oos_found(mdev, e->sector, e->size); | ||
1069 | else | ||
1070 | ov_oos_print(mdev); | ||
1071 | |||
1072 | ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, | ||
1073 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); | ||
1074 | |||
1075 | drbd_free_ee(mdev, e); | ||
1076 | |||
1077 | if (--mdev->ov_left == 0) { | ||
1078 | ov_oos_print(mdev); | ||
1079 | drbd_resync_finished(mdev); | ||
1080 | } | ||
1081 | |||
1082 | return ok; | ||
1083 | } | ||
1084 | |||
1085 | int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1086 | { | ||
1087 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); | ||
1088 | complete(&b->done); | ||
1089 | return 1; | ||
1090 | } | ||
1091 | |||
1092 | int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1093 | { | ||
1094 | struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); | ||
1095 | struct p_barrier *p = &mdev->data.sbuf.barrier; | ||
1096 | int ok = 1; | ||
1097 | |||
1098 | /* really avoid racing with tl_clear. w.cb may have been referenced | ||
1099 | * just before it was reassigned and re-queued, so double check that. | ||
1100 | * actually, this race was harmless, since we only try to send the | ||
1101 | * barrier packet here, and otherwise do nothing with the object. | ||
1102 | * but compare with the head of w_clear_epoch */ | ||
1103 | spin_lock_irq(&mdev->req_lock); | ||
1104 | if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) | ||
1105 | cancel = 1; | ||
1106 | spin_unlock_irq(&mdev->req_lock); | ||
1107 | if (cancel) | ||
1108 | return 1; | ||
1109 | |||
1110 | if (!drbd_get_data_sock(mdev)) | ||
1111 | return 0; | ||
1112 | p->barrier = b->br_number; | ||
1113 | /* inc_ap_pending was done where this was queued. | ||
1114 | * dec_ap_pending will be done in got_BarrierAck | ||
1115 | * or (on connection loss) in w_clear_epoch. */ | ||
1116 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, | ||
1117 | (struct p_header *)p, sizeof(*p), 0); | ||
1118 | drbd_put_data_sock(mdev); | ||
1119 | |||
1120 | return ok; | ||
1121 | } | ||
1122 | |||
1123 | int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1124 | { | ||
1125 | if (cancel) | ||
1126 | return 1; | ||
1127 | return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); | ||
1128 | } | ||
1129 | |||
1130 | /** | ||
1131 | * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request | ||
1132 | * @mdev: DRBD device. | ||
1133 | * @w: work object. | ||
1134 | * @cancel: The connection will be closed anyways | ||
1135 | */ | ||
1136 | int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1137 | { | ||
1138 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
1139 | int ok; | ||
1140 | |||
1141 | if (unlikely(cancel)) { | ||
1142 | req_mod(req, send_canceled); | ||
1143 | return 1; | ||
1144 | } | ||
1145 | |||
1146 | ok = drbd_send_dblock(mdev, req); | ||
1147 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1148 | |||
1149 | return ok; | ||
1150 | } | ||
1151 | |||
1152 | /** | ||
1153 | * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet | ||
1154 | * @mdev: DRBD device. | ||
1155 | * @w: work object. | ||
1156 | * @cancel: The connection will be closed anyways | ||
1157 | */ | ||
1158 | int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1159 | { | ||
1160 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
1161 | int ok; | ||
1162 | |||
1163 | if (unlikely(cancel)) { | ||
1164 | req_mod(req, send_canceled); | ||
1165 | return 1; | ||
1166 | } | ||
1167 | |||
1168 | ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, | ||
1169 | (unsigned long)req); | ||
1170 | |||
1171 | if (!ok) { | ||
1172 | /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); | ||
1173 | * so this is probably redundant */ | ||
1174 | if (mdev->state.conn >= C_CONNECTED) | ||
1175 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
1176 | } | ||
1177 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1178 | |||
1179 | return ok; | ||
1180 | } | ||
1181 | |||
1182 | static int _drbd_may_sync_now(struct drbd_conf *mdev) | ||
1183 | { | ||
1184 | struct drbd_conf *odev = mdev; | ||
1185 | |||
1186 | while (1) { | ||
1187 | if (odev->sync_conf.after == -1) | ||
1188 | return 1; | ||
1189 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1190 | ERR_IF(!odev) return 1; | ||
1191 | if ((odev->state.conn >= C_SYNC_SOURCE && | ||
1192 | odev->state.conn <= C_PAUSED_SYNC_T) || | ||
1193 | odev->state.aftr_isp || odev->state.peer_isp || | ||
1194 | odev->state.user_isp) | ||
1195 | return 0; | ||
1196 | } | ||
1197 | } | ||
1198 | |||
1199 | /** | ||
1200 | * _drbd_pause_after() - Pause resync on all devices that may not resync now | ||
1201 | * @mdev: DRBD device. | ||
1202 | * | ||
1203 | * Called from process context only (admin command and after_state_ch). | ||
1204 | */ | ||
1205 | static int _drbd_pause_after(struct drbd_conf *mdev) | ||
1206 | { | ||
1207 | struct drbd_conf *odev; | ||
1208 | int i, rv = 0; | ||
1209 | |||
1210 | for (i = 0; i < minor_count; i++) { | ||
1211 | odev = minor_to_mdev(i); | ||
1212 | if (!odev) | ||
1213 | continue; | ||
1214 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | ||
1215 | continue; | ||
1216 | if (!_drbd_may_sync_now(odev)) | ||
1217 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) | ||
1218 | != SS_NOTHING_TO_DO); | ||
1219 | } | ||
1220 | |||
1221 | return rv; | ||
1222 | } | ||
1223 | |||
1224 | /** | ||
1225 | * _drbd_resume_next() - Resume resync on all devices that may resync now | ||
1226 | * @mdev: DRBD device. | ||
1227 | * | ||
1228 | * Called from process context only (admin command and worker). | ||
1229 | */ | ||
1230 | static int _drbd_resume_next(struct drbd_conf *mdev) | ||
1231 | { | ||
1232 | struct drbd_conf *odev; | ||
1233 | int i, rv = 0; | ||
1234 | |||
1235 | for (i = 0; i < minor_count; i++) { | ||
1236 | odev = minor_to_mdev(i); | ||
1237 | if (!odev) | ||
1238 | continue; | ||
1239 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | ||
1240 | continue; | ||
1241 | if (odev->state.aftr_isp) { | ||
1242 | if (_drbd_may_sync_now(odev)) | ||
1243 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), | ||
1244 | CS_HARD, NULL) | ||
1245 | != SS_NOTHING_TO_DO) ; | ||
1246 | } | ||
1247 | } | ||
1248 | return rv; | ||
1249 | } | ||
1250 | |||
1251 | void resume_next_sg(struct drbd_conf *mdev) | ||
1252 | { | ||
1253 | write_lock_irq(&global_state_lock); | ||
1254 | _drbd_resume_next(mdev); | ||
1255 | write_unlock_irq(&global_state_lock); | ||
1256 | } | ||
1257 | |||
1258 | void suspend_other_sg(struct drbd_conf *mdev) | ||
1259 | { | ||
1260 | write_lock_irq(&global_state_lock); | ||
1261 | _drbd_pause_after(mdev); | ||
1262 | write_unlock_irq(&global_state_lock); | ||
1263 | } | ||
1264 | |||
1265 | static int sync_after_error(struct drbd_conf *mdev, int o_minor) | ||
1266 | { | ||
1267 | struct drbd_conf *odev; | ||
1268 | |||
1269 | if (o_minor == -1) | ||
1270 | return NO_ERROR; | ||
1271 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | ||
1272 | return ERR_SYNC_AFTER; | ||
1273 | |||
1274 | /* check for loops */ | ||
1275 | odev = minor_to_mdev(o_minor); | ||
1276 | while (1) { | ||
1277 | if (odev == mdev) | ||
1278 | return ERR_SYNC_AFTER_CYCLE; | ||
1279 | |||
1280 | /* dependency chain ends here, no cycles. */ | ||
1281 | if (odev->sync_conf.after == -1) | ||
1282 | return NO_ERROR; | ||
1283 | |||
1284 | /* follow the dependency chain */ | ||
1285 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1286 | } | ||
1287 | } | ||
1288 | |||
1289 | int drbd_alter_sa(struct drbd_conf *mdev, int na) | ||
1290 | { | ||
1291 | int changes; | ||
1292 | int retcode; | ||
1293 | |||
1294 | write_lock_irq(&global_state_lock); | ||
1295 | retcode = sync_after_error(mdev, na); | ||
1296 | if (retcode == NO_ERROR) { | ||
1297 | mdev->sync_conf.after = na; | ||
1298 | do { | ||
1299 | changes = _drbd_pause_after(mdev); | ||
1300 | changes |= _drbd_resume_next(mdev); | ||
1301 | } while (changes); | ||
1302 | } | ||
1303 | write_unlock_irq(&global_state_lock); | ||
1304 | return retcode; | ||
1305 | } | ||
1306 | |||
1307 | /** | ||
1308 | * drbd_start_resync() - Start the resync process | ||
1309 | * @mdev: DRBD device. | ||
1310 | * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET | ||
1311 | * | ||
1312 | * This function might bring you directly into one of the | ||
1313 | * C_PAUSED_SYNC_* states. | ||
1314 | */ | ||
1315 | void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | ||
1316 | { | ||
1317 | union drbd_state ns; | ||
1318 | int r; | ||
1319 | |||
1320 | if (mdev->state.conn >= C_SYNC_SOURCE) { | ||
1321 | dev_err(DEV, "Resync already running!\n"); | ||
1322 | return; | ||
1323 | } | ||
1324 | |||
1325 | trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n", | ||
1326 | side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource"); | ||
1327 | |||
1328 | /* In case a previous resync run was aborted by an IO error/detach on the peer. */ | ||
1329 | drbd_rs_cancel_all(mdev); | ||
1330 | |||
1331 | if (side == C_SYNC_TARGET) { | ||
1332 | /* Since application IO was locked out during C_WF_BITMAP_T and | ||
1333 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET | ||
1334 | we check that we might make the data inconsistent. */ | ||
1335 | r = drbd_khelper(mdev, "before-resync-target"); | ||
1336 | r = (r >> 8) & 0xff; | ||
1337 | if (r > 0) { | ||
1338 | dev_info(DEV, "before-resync-target handler returned %d, " | ||
1339 | "dropping connection.\n", r); | ||
1340 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1341 | return; | ||
1342 | } | ||
1343 | } | ||
1344 | |||
1345 | drbd_state_lock(mdev); | ||
1346 | |||
1347 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1348 | drbd_state_unlock(mdev); | ||
1349 | return; | ||
1350 | } | ||
1351 | |||
1352 | if (side == C_SYNC_TARGET) { | ||
1353 | mdev->bm_resync_fo = 0; | ||
1354 | } else /* side == C_SYNC_SOURCE */ { | ||
1355 | u64 uuid; | ||
1356 | |||
1357 | get_random_bytes(&uuid, sizeof(u64)); | ||
1358 | drbd_uuid_set(mdev, UI_BITMAP, uuid); | ||
1359 | drbd_send_sync_uuid(mdev, uuid); | ||
1360 | |||
1361 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); | ||
1362 | } | ||
1363 | |||
1364 | write_lock_irq(&global_state_lock); | ||
1365 | ns = mdev->state; | ||
1366 | |||
1367 | ns.aftr_isp = !_drbd_may_sync_now(mdev); | ||
1368 | |||
1369 | ns.conn = side; | ||
1370 | |||
1371 | if (side == C_SYNC_TARGET) | ||
1372 | ns.disk = D_INCONSISTENT; | ||
1373 | else /* side == C_SYNC_SOURCE */ | ||
1374 | ns.pdsk = D_INCONSISTENT; | ||
1375 | |||
1376 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
1377 | ns = mdev->state; | ||
1378 | |||
1379 | if (ns.conn < C_CONNECTED) | ||
1380 | r = SS_UNKNOWN_ERROR; | ||
1381 | |||
1382 | if (r == SS_SUCCESS) { | ||
1383 | mdev->rs_total = | ||
1384 | mdev->rs_mark_left = drbd_bm_total_weight(mdev); | ||
1385 | mdev->rs_failed = 0; | ||
1386 | mdev->rs_paused = 0; | ||
1387 | mdev->rs_start = | ||
1388 | mdev->rs_mark_time = jiffies; | ||
1389 | mdev->rs_same_csum = 0; | ||
1390 | _drbd_pause_after(mdev); | ||
1391 | } | ||
1392 | write_unlock_irq(&global_state_lock); | ||
1393 | drbd_state_unlock(mdev); | ||
1394 | put_ldev(mdev); | ||
1395 | |||
1396 | if (r == SS_SUCCESS) { | ||
1397 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", | ||
1398 | drbd_conn_str(ns.conn), | ||
1399 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | ||
1400 | (unsigned long) mdev->rs_total); | ||
1401 | |||
1402 | if (mdev->rs_total == 0) { | ||
1403 | /* Peer still reachable? Beware of failing before-resync-target handlers! */ | ||
1404 | request_ping(mdev); | ||
1405 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1406 | schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ | ||
1407 | drbd_resync_finished(mdev); | ||
1408 | return; | ||
1409 | } | ||
1410 | |||
1411 | /* ns.conn may already be != mdev->state.conn, | ||
1412 | * we may have been paused in between, or become paused until | ||
1413 | * the timer triggers. | ||
1414 | * No matter, that is handled in resync_timer_fn() */ | ||
1415 | if (ns.conn == C_SYNC_TARGET) | ||
1416 | mod_timer(&mdev->resync_timer, jiffies); | ||
1417 | |||
1418 | drbd_md_sync(mdev); | ||
1419 | } | ||
1420 | } | ||
1421 | |||
1422 | int drbd_worker(struct drbd_thread *thi) | ||
1423 | { | ||
1424 | struct drbd_conf *mdev = thi->mdev; | ||
1425 | struct drbd_work *w = NULL; | ||
1426 | LIST_HEAD(work_list); | ||
1427 | int intr = 0, i; | ||
1428 | |||
1429 | sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); | ||
1430 | |||
1431 | while (get_t_state(thi) == Running) { | ||
1432 | drbd_thread_current_set_cpu(mdev); | ||
1433 | |||
1434 | if (down_trylock(&mdev->data.work.s)) { | ||
1435 | mutex_lock(&mdev->data.mutex); | ||
1436 | if (mdev->data.socket && !mdev->net_conf->no_cork) | ||
1437 | drbd_tcp_uncork(mdev->data.socket); | ||
1438 | mutex_unlock(&mdev->data.mutex); | ||
1439 | |||
1440 | intr = down_interruptible(&mdev->data.work.s); | ||
1441 | |||
1442 | mutex_lock(&mdev->data.mutex); | ||
1443 | if (mdev->data.socket && !mdev->net_conf->no_cork) | ||
1444 | drbd_tcp_cork(mdev->data.socket); | ||
1445 | mutex_unlock(&mdev->data.mutex); | ||
1446 | } | ||
1447 | |||
1448 | if (intr) { | ||
1449 | D_ASSERT(intr == -EINTR); | ||
1450 | flush_signals(current); | ||
1451 | ERR_IF (get_t_state(thi) == Running) | ||
1452 | continue; | ||
1453 | break; | ||
1454 | } | ||
1455 | |||
1456 | if (get_t_state(thi) != Running) | ||
1457 | break; | ||
1458 | /* With this break, we have done a down() but not consumed | ||
1459 | the entry from the list. The cleanup code takes care of | ||
1460 | this... */ | ||
1461 | |||
1462 | w = NULL; | ||
1463 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1464 | ERR_IF(list_empty(&mdev->data.work.q)) { | ||
1465 | /* something terribly wrong in our logic. | ||
1466 | * we were able to down() the semaphore, | ||
1467 | * but the list is empty... doh. | ||
1468 | * | ||
1469 | * what is the best thing to do now? | ||
1470 | * try again from scratch, restarting the receiver, | ||
1471 | * asender, whatnot? could break even more ugly, | ||
1472 | * e.g. when we are primary, but no good local data. | ||
1473 | * | ||
1474 | * I'll try to get away just starting over this loop. | ||
1475 | */ | ||
1476 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1477 | continue; | ||
1478 | } | ||
1479 | w = list_entry(mdev->data.work.q.next, struct drbd_work, list); | ||
1480 | list_del_init(&w->list); | ||
1481 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1482 | |||
1483 | if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { | ||
1484 | /* dev_warn(DEV, "worker: a callback failed! \n"); */ | ||
1485 | if (mdev->state.conn >= C_CONNECTED) | ||
1486 | drbd_force_state(mdev, | ||
1487 | NS(conn, C_NETWORK_FAILURE)); | ||
1488 | } | ||
1489 | } | ||
1490 | D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); | ||
1491 | D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); | ||
1492 | |||
1493 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1494 | i = 0; | ||
1495 | while (!list_empty(&mdev->data.work.q)) { | ||
1496 | list_splice_init(&mdev->data.work.q, &work_list); | ||
1497 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1498 | |||
1499 | while (!list_empty(&work_list)) { | ||
1500 | w = list_entry(work_list.next, struct drbd_work, list); | ||
1501 | list_del_init(&w->list); | ||
1502 | w->cb(mdev, w, 1); | ||
1503 | i++; /* dead debugging code */ | ||
1504 | } | ||
1505 | |||
1506 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1507 | } | ||
1508 | sema_init(&mdev->data.work.s, 0); | ||
1509 | /* DANGEROUS race: if someone did queue his work within the spinlock, | ||
1510 | * but up() ed outside the spinlock, we could get an up() on the | ||
1511 | * semaphore without corresponding list entry. | ||
1512 | * So don't do that. | ||
1513 | */ | ||
1514 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1515 | |||
1516 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1517 | /* _drbd_set_state only uses stop_nowait. | ||
1518 | * wait here for the Exiting receiver. */ | ||
1519 | drbd_thread_stop(&mdev->receiver); | ||
1520 | drbd_mdev_cleanup(mdev); | ||
1521 | |||
1522 | dev_info(DEV, "worker terminated\n"); | ||
1523 | |||
1524 | clear_bit(DEVICE_DYING, &mdev->flags); | ||
1525 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
1526 | wake_up(&mdev->state_wait); | ||
1527 | |||
1528 | return 0; | ||
1529 | } | ||
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h new file mode 100644 index 00000000000..f93fa111ce5 --- /dev/null +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -0,0 +1,91 @@ | |||
1 | #ifndef _DRBD_WRAPPERS_H | ||
2 | #define _DRBD_WRAPPERS_H | ||
3 | |||
4 | #include <linux/ctype.h> | ||
5 | #include <linux/mm.h> | ||
6 | |||
/* see get_sb_bdev and bd_claim */
/* NOTE(review): defined elsewhere; presumably the holder token handed to
 * bd_claim() when claiming backing devices - confirm against the definition. */
extern char *drbd_sec_holder;
9 | |||
10 | /* sets the number of 512 byte sectors of our virtual device */ | ||
11 | static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | ||
12 | sector_t size) | ||
13 | { | ||
14 | /* set_capacity(mdev->this_bdev->bd_disk, size); */ | ||
15 | set_capacity(mdev->vdisk, size); | ||
16 | mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; | ||
17 | } | ||
18 | |||
/* true if the BIO_UPTODATE flag is set on @bio */
#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20 | |||
21 | static inline int drbd_bio_has_active_page(struct bio *bio) | ||
22 | { | ||
23 | struct bio_vec *bvec; | ||
24 | int i; | ||
25 | |||
26 | __bio_for_each_segment(bvec, bio, i, 0) { | ||
27 | if (page_count(bvec->bv_page) > 1) | ||
28 | return 1; | ||
29 | } | ||
30 | |||
31 | return 0; | ||
32 | } | ||
33 | |||
/* bi_end_io handlers, defined elsewhere; each takes the bio and an
 * error code */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_read_sec(struct bio *bio, int error);
extern void drbd_endio_write_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
39 | |||
40 | /* | ||
41 | * used to submit our private bio | ||
42 | */ | ||
43 | static inline void drbd_generic_make_request(struct drbd_conf *mdev, | ||
44 | int fault_type, struct bio *bio) | ||
45 | { | ||
46 | __release(local); | ||
47 | if (!bio->bi_bdev) { | ||
48 | printk(KERN_ERR "drbd%d: drbd_generic_make_request: " | ||
49 | "bio->bi_bdev == NULL\n", | ||
50 | mdev_to_minor(mdev)); | ||
51 | dump_stack(); | ||
52 | bio_endio(bio, -ENODEV); | ||
53 | return; | ||
54 | } | ||
55 | |||
56 | if (FAULT_ACTIVE(mdev, fault_type)) | ||
57 | bio_endio(bio, -EIO); | ||
58 | else | ||
59 | generic_make_request(bio); | ||
60 | } | ||
61 | |||
62 | static inline void drbd_plug_device(struct drbd_conf *mdev) | ||
63 | { | ||
64 | struct request_queue *q; | ||
65 | q = bdev_get_queue(mdev->this_bdev); | ||
66 | |||
67 | spin_lock_irq(q->queue_lock); | ||
68 | |||
69 | /* XXX the check on !blk_queue_plugged is redundant, | ||
70 | * implicitly checked in blk_plug_device */ | ||
71 | |||
72 | if (!blk_queue_plugged(q)) { | ||
73 | blk_plug_device(q); | ||
74 | del_timer(&q->unplug_timer); | ||
75 | /* unplugging should not happen automatically... */ | ||
76 | } | ||
77 | spin_unlock_irq(q->queue_lock); | ||
78 | } | ||
79 | |||
80 | static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) | ||
81 | { | ||
82 | return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) | ||
83 | == CRYPTO_ALG_TYPE_HASH; | ||
84 | } | ||
85 | |||
/* When __CHECKER__ is not defined, replace __cond_lock(x,c) by a version
 * that expands to just the condition (c). */
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
#endif
90 | |||
91 | #endif | ||